Python UrlGenerator.get_page_params示例，helpers.url_generator.UrlGenerator.get_page_params Python示例

示例#1

0

显示文件

文件： d_spider_5owe.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue item tasks and next-page tasks for one catalog listing page.

        Yields a ``parse_item`` task for each link under the catalog headers
        and a ``parse_page`` task for each "nextpage" pagination link. If
        ``check_body_errors`` flags the response, only the result of
        ``check_errors`` is yielded and the page is left unparsed.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                # the response was flagged, so its body must not be parsed
                return

            # parse items links
            items_links = grab.doc.select(
                '//div[@class="catalog"]//div[@class="header"]//a')

            for row in items_links:
                link = UrlGenerator.get_page_params(
                    self.domain, row.attr('href'), {})
                yield Task('parse_item', url=link, priority=100, raw=True)

            # parse next page
            items_next_page = grab.doc.select(
                '//div[@class="pagination"]//a[contains(@class, "nextpage")]')

            for row in items_next_page:
                link = UrlGenerator.get_page_params(
                    self.domain, row.attr('href'), {})
                yield Task('parse_page', url=link, priority=90, raw=True)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#2

0

显示文件

文件： d_spider_7rus.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue every product of a catalog section page, then its next page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            section = grab.doc.select('//div[@class="catalog_section "]')

            # each product anchor becomes a 'parse_item' task
            for anchor in section.select('.//div[@class="product_item"]//a'):
                item_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # '' is the fallback when no "next_page" anchor exists
            next_href = section.select(
                '//a[@class="next_page"]').attr('href', '')

            if not next_href:
                return

            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#3

0

显示文件

文件： d_spider_6ele.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue the products inside the ``js_ajax-catalog`` block and re-enter on the next page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # all lookups below are relative to the catalog container
            container = grab.doc.select('//div[@id="js_ajax-catalog"]')

            # one 'parse_item' task per product anchor
            for anchor in container.select('.//a[@class="bx_rcm_view_link"]'):
                item_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # the pager anchor is found by its title; '' means it is absent
            next_href = container.select(
                './/a[@title="След."]').attr('href', '')

            if not next_href:
                return

            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90, last=False)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#4

0

显示文件

    def task_parse_page(self, grab, task):
        """Queue the items listed in the catalog table body and the following page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # every anchor inside the table body is an item link
            anchors = grab.doc.select(
                '//div[@class="catalog-section-it-table-body"]//a')

            for anchor in anchors:
                item_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # '' is returned when the "next" pager anchor is missing
            pager = grab.doc.select('//a[@class="catalog-pagenav-next"]')
            next_href = pager.attr('href', '')

            if next_href:
                next_url = UrlGenerator.get_page_params(
                    self.domain, next_href, {})
                yield self.do_task('parse_page', next_url, 90)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#5

0

显示文件

文件： d_spider_7san.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue the titled products of a listing page and, if present, the next page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            title_anchors = grab.doc.select(
                '//div[contains(@class, "bx_catalog_list_home")]//div[@class="title"]//a'
            )

            for anchor in title_anchors:
                item_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # look up the "next" pager anchor; '' when there is none
            pager_next = grab.doc.select(
                '//div[@class="bx-pagination "]//li[@class="bx-pag-next"]/a'
            )
            next_href = pager_next.attr('href', '')

            if not next_href:
                return

            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#6

0

显示文件

文件： d_spider_6cab.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue the named products of a listing page and re-enter on the next page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # product name anchors inside the products wrapper
            product_anchors = grab.doc.select(
                '//div[@class="products-wrap"]//a[@itemprop="url" and @class="products-name"]'
            )

            for anchor in product_anchors:
                item_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # "pg-next" anchor of the first pagination block; '' if absent
            pager_next = grab.doc.select(
                '//div[contains(@class, "pagination")][1]//a[@class="pg-next" and contains(@href, "p=")]'
            )
            next_href = pager_next.attr('href', '')

            if next_href:
                next_url = UrlGenerator.get_page_params(
                    self.domain, next_href, {})
                yield self.do_task('initial', next_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#7

0

显示文件

    def task_parse_page(self, grab, task):
        """Fan out one ``parse_page_items`` task per additional catalog page.

        The largest page number found among the navigation links is used as
        the page count. A task is yielded for every page from 2 up to and
        including that number; each URL comes from
        ``UrlGenerator.get_page_params`` called with ``task.url`` and a
        ``PAGEN_1`` parameter.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            catalog = grab.doc.select('//div[@class="listcatalog"]')

            # generate new tasks
            links = catalog.select(
                './/div[@class="navigation"]/div[@class="nav"]//a')
            max_page = 1

            for link in links:
                page_number = link.text('')

                # ignore labels that Ree.number does not accept
                if page_number and Ree.number.match(page_number):
                    max_page = max(max_page, int(page_number))

            # range() excludes its stop value, hence "+ 1" to reach the
            # last page; the range is empty when there is only one page
            for page in range(2, max_page + 1):
                next_page = UrlGenerator.get_page_params(
                    task.url, '', {'PAGEN_1': page})
                yield self.do_task('parse_page_items', next_page, 90)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#8

0

显示文件

文件： d_spider_5sti.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue a ``parse_page`` task for each navbar category that is not a service link."""
        try:
            if self.check_body_errors(grab, task):
                self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
                return

            # labels of navbar entries that are not product categories
            stop_labels = [
                'Оплата',
                'Доставка',
                'Гарантия',
                'Акции',
                'Рекомендации по подбору',
                'Информация и реквизиты',
                'Новости',
                'Контакты',
                'Сервис-центр',
            ]

            # site-relative navbar links only: anchors (#) and external
            # links do not start with "/"
            nav_anchors = grab.doc.select('//div[@id="navbar"]//a[starts-with(@href, "/")]')

            # main categories are enough: they already list sub-category items
            for anchor in nav_anchors:
                label = anchor.text().strip()

                if label in stop_labels:
                    continue

                href = anchor.attr('href')

                # relative links are expanded against the spider domain
                if href[:1] == '/':
                    href = UrlGenerator.get_page_params(self.domain, href, {})

                yield Task('parse_page', url=href, priority=90, raw=True)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#9

0

显示文件

    def task_parse_page(self, grab, task):
        """Queue the usable rows of a price-table page, then the next page.

        A row is skipped (with a warning) when the text of its second cell
        is ``'0'`` or the text of its third cell is ``'под заказ'``. The
        next page is requested only if at least one row of this page was
        queued.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # parse items links
            items_list = grab.doc.select(
                '//table[@class="prod"]//tr[not(contains(@class, "white"))]')

            # if the whole page holds only useless records - skip other pages
            success_pages = 0

            for row in items_list:
                # check row: compare the cell *text* — a selector object
                # never equals a string, so the filter would never fire
                status = row.select('./td[2]').text('')
                price = row.select('./td[3]').text('')

                if status == '0' or price == 'под заказ':
                    self.log.warning(
                        task, f'Skip item, because status {status} / {price}')
                    continue

                link = row.select('./td[@class="name"]/a').attr('href')
                link = UrlGenerator.get_page_params(self.domain, link, {})

                success_pages += 1
                yield self.do_task('parse_item', link, 100, last=True)

            # parse next page if current is ok
            if success_pages > 0:
                next_page = grab.doc.select(
                    '//div[@class="pagination"][1]//a[@class="next_page_link"]'
                ).attr('href', '')

                if next_page:
                    next_page = UrlGenerator.get_page_params(
                        self.domain, next_page, {})
                    yield self.do_task('parse_page', next_page, 90)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#10

0

显示文件

文件： d_spider_5izi.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue the catalog items of a search-table page plus any "show more" page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # rows of the search table
            table_rows = grab.doc.select('//table[@class="table search_table list"]//tr')

            # item anchors found inside those rows
            for anchor in table_rows.select('.//a[starts-with(@href, "/catalog/catalog")]'):
                item_url = anchor.attr('href')

                # relative links are expanded against the spider domain
                if item_url[:1] == '/':
                    item_url = UrlGenerator.get_page_params(self.domain, item_url, {})

                yield self.do_task('parse_item', item_url, 100, last=True)

            # "show more" links; zero or one of them is expected
            for anchor in grab.doc.select('.//a[starts-with(@href, "/catalog/?")]'):
                page_url = anchor.attr('href')

                # relative links are expanded against the spider domain
                if page_url[:1] == '/':
                    page_url = UrlGenerator.get_page_params(self.domain, page_url, {})

                yield self.do_task('parse_page', page_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#11

0

显示文件

文件： d_spider_8tds.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue every titled product of a goods page and re-enter on the "»" page."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            title_anchors = grab.doc.select('//div[contains(@class, "goodsGoods")]//a[@class="textTitle"]')

            for anchor in title_anchors:
                item_url = UrlGenerator.get_page_params(self.domain, anchor.attr('href'), {})
                yield self.do_task('parse_item', item_url, 100, last=True)

            # the pager anchor is recognised by the "»" in its text; '' if absent
            next_href = grab.doc.select('//a[contains(text(), "»")]').attr('href', '')

            if not next_href:
                return

            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#12

0

显示文件

文件： d_spider_elev.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue a ``parse_items_v2`` task for every top-level shop category.

        Start and finish are logged; a response flagged by
        ``_check_body_errors`` is logged as fatal and nothing is queued.
        """
        self.logger.info('[{}] Initial url: {}'.format(task.name, task.url))

        body_is_broken = self._check_body_errors(grab, task)

        if body_is_broken:
            self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
                task.name, task.url, task.task_try_count))
            return

        try:
            category_anchors = grab.doc.select(
                '//div[@class="block-catalog"]//div[@class="tabs-content"]//a[contains(@href, "shop")]'
            )

            # main categories are enough: they already list sub-category items
            for anchor in category_anchors:
                category_url = anchor.attr('href')

                # more than three slashes marks a sub-category:
                #   category:      /shop/cat/      -> 3
                #   sub-category:  /shop/cat/foo/  -> 4
                if category_url.count('/') > 3:
                    continue

                # relative links are expanded and get the listing parameters
                if category_url[:1] == '/':
                    listing_params = {
                        'section': '0',
                        'count': '50',
                        'sort': 'alphabet',
                        'order': 'asc',
                    }
                    category_url = UrlGenerator.get_page_params(
                        self.domain, category_url, listing_params)

                # echo the final URL to stdout
                print(category_url)

                yield Task(
                    'parse_items_v2',
                    url=category_url,
                    priority=90,
                    raw=True,
                    d_base_url=category_url,
                    d_page=1,
                    d_need_update_pagination=True,
                )

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.logger.info('[{}] Finish: {}'.format(task.name, task.url))

示例#13

0

显示文件

文件： d_spider_5izi.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Seed the page loop with the ``catalog`` URL at ``curPos`` 0."""
        try:
            body_is_broken = self.check_body_errors(grab, task)

            if not body_is_broken:
                # first listing page; 'parse_page' takes over from here
                start_url = UrlGenerator.get_page_params(self.domain, 'catalog', {'curPos': 0})
                yield self.do_task('parse_page', start_url, 90)
            else:
                self.log.fatal(task, 'Err task, attempt {}'.format(task.task_try_count))

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#14

0

显示文件

文件： d_spider_5owe.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue a ``parse_page`` task for every link in the ``gsections`` lists.

        A response flagged by ``check_body_errors`` is logged as fatal and
        nothing is queued from it.
        """
        try:
            if self.check_body_errors(grab, task):
                self.log.fatal(task,
                               f'Err task, attempt {task.task_try_count}')
                # the response was flagged, so its body must not be parsed
                return

            links = grab.doc.select('//div[@class="gsections"]//ul//a')

            for link in links:
                url = UrlGenerator.get_page_params(self.domain,
                                                   link.attr('href'), {})
                yield Task('parse_page', url=url, priority=90, raw=True)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#15

0

显示文件

文件： d_spider_7rus.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue a ``parse_page`` task for every link of the ``categories`` block."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # category anchors
            category_anchors = grab.doc.select('//div[@id="categories"]//a')

            for anchor in category_anchors:
                href = anchor.attr('href')
                category_url = UrlGenerator.get_page_params(self.domain, href, {})
                yield self.do_task('parse_page', category_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#16

0

显示文件

文件： d_spider_7fre.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue a ``parse_item`` task for every item URL of the price-view listing."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # anchors carrying the item URL inside the price-view blocks
            item_anchors = grab.doc.select(
                '//div[@class="catalog-item-price-view"]//a[@itemprop="url"]')

            for anchor in item_anchors:
                href = anchor.attr('href')
                item_url = UrlGenerator.get_page_params(self.domain, href, {})
                yield self.do_task('parse_item', item_url, 100, last=True)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#17

0

显示文件

    def task_initial(self, grab, task):
        """Queue a ``parse_page`` task for each link of the first left-hand category block."""
        try:
            body_is_broken = self.check_body_errors(grab, task)

            if body_is_broken:
                self.log.fatal(task,
                               f'Err task, attempt {task.task_try_count}')
                return

            category_anchors = grab.doc.select(
                '//div[@id="categories_block_left"]/div[1]//a')

            for anchor in category_anchors:
                href = anchor.attr('href')
                category_url = UrlGenerator.get_page_params(self.domain, href, {})
                yield self.do_task('parse_page', category_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#18

0

显示文件

文件： d_spider_5int.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue a ``parse_item`` task for every named item of the catalog list.

        If ``check_body_errors`` flags the response, only the result of
        ``check_errors`` is yielded and the page is left unparsed.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                # the response was flagged, so its body must not be parsed
                return

            # parse items links
            items_links = grab.doc.select(
                '//div[@id="catalog-list"]//div[@class="catalog-items"]//a[@property="name"]'
            )

            for row in items_links:
                link = UrlGenerator.get_page_params(
                    self.domain, row.attr('href'), {})
                yield Task('parse_item', url=link, priority=100, raw=True)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#19

0

显示文件

文件： d_spider_6dui.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue a ``parse_item`` task for every product link of the basic product table."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # product anchors sit in the second column of the table
            product_anchors = grab.doc.select(
                '//div[@class="tovar-table tovar_basic"]//div[@class="tovar-col tovar2"]/a'
            )

            for anchor in product_anchors:
                href = anchor.attr('href')
                item_url = UrlGenerator.get_page_params(self.domain, href, {})
                yield self.do_task('parse_item', item_url, 100, last=True)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#20

0

显示文件

    def task_parse_page_items(self, grab, task):
        """Queue a ``parse_item`` task for every named row of the ``listcatalog`` tables."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            listing = grab.doc.select('//div[@class="listcatalog"]')

            # item anchors live in the "name" cell of each item table
            name_anchors = listing.select(
                './/table[@class="lclistitem"]//td[@class="name"]//a')

            for anchor in name_anchors:
                href = anchor.attr('href')
                item_url = UrlGenerator.get_page_params(self.domain, href, {})
                yield self.do_task('parse_item', item_url, 100, last=True)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#21

0

显示文件

文件： d_spider_6dui.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue one ``parse_page`` task per home-page sub-catalog link, passing ``SET_PAGE_COUNT`` = 99999."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # sub-catalog anchors of the home page
            category_anchors = grab.doc.select(
                '//div[@class="main-links"]//p[@class="home_subcatalog_links_box"]/a'
            )

            for anchor in category_anchors:
                # a fresh parameter dict is built for every link
                page_params = {'SET_PAGE_COUNT': '99999'}
                category_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), page_params)
                yield self.do_task('parse_page', category_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#22

0

显示文件

文件： d_spider_5sti.py 项目： Holovin/PythonParsersGrab

    def task_parse_page(self, grab, task):
        """Queue a ``parse_item`` task for every non-empty link of the product grid.

        If ``check_body_errors`` flags the response, only the result of
        ``check_errors`` is yielded and the page is left unparsed.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                # the response was flagged, so its body must not be parsed
                return

            # parse items
            items_list = grab.doc.select('//div[@class="prod-list-cell"]//a[.!=""]')

            for row in items_list:
                link = row.attr('href')

                # make absolute urls if needed
                if link[:1] == '/':
                    link = UrlGenerator.get_page_params(self.domain, link, {})

                yield Task('parse_item', url=link, priority=100, raw=True)

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#23

0

显示文件

文件： d_spider_7fre.py 项目： Holovin/PythonParsersGrab

    def task_initial(self, grab, task):
        """Queue one ``parse_page`` task per catalog section, passing ``limit`` = 900 and ``view`` = price."""
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # section title anchors of the work area
            section_anchors = grab.doc.select(
                '//div[@class="workarea"]//div[contains(@class, "catalog-section-title")]/a'
            )

            for anchor in section_anchors:
                # a fresh parameter dict is built for every link
                listing_params = {
                    'limit': 900,
                    'view': 'price',
                }
                section_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), listing_params)
                yield self.do_task('parse_page', section_url, 90)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#24

0

显示文件

    def task_initial(self, grab, task):
        """Queue one ``parse_page`` task per text-only ``/product_list/`` link of the nav menu."""
        try:
            body_is_broken = self.check_body_errors(grab, task)

            if body_is_broken:
                self.log.fatal(task,
                               f'Err task, attempt {task.task_try_count}')
                return

            # nav anchors without an image whose href matches /product_list/...
            list_anchors = grab.doc.select(
                '//nav//a[not(.//img) and re:match(@href, "/product_list/.+")]'
            )

            for anchor in list_anchors:
                # a fresh parameter dict is built for every link
                listing_params = {
                    'count': 999999,
                    'name': 'asc',
                }
                page_url = UrlGenerator.get_page_params(
                    self.domain, anchor.attr('href'), listing_params)
                yield Task('parse_page', url=page_url, priority=90, raw=True)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#25

0

显示文件

文件： d_spider_5sti.py 项目： Holovin/PythonParsersGrab

    def task_parse_item(self, grab, task):
        """Parse one product page and append the record to ``self.result``.

        The record holds name, quantity, delivery status, measure, price,
        sku, manufacturer, photo URL and a properties dict. Items whose
        extracted price is not all digits are skipped. If
        ``check_body_errors`` flags the response, only the result of
        ``check_errors`` is yielded and the page is left unparsed.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                # the response was flagged, so its body must not be parsed
                return

            # parse fields
            # A = name
            product_name = grab.doc.select('//h1').text()

            # B = count
            # C = status
            product_count_string = grab.doc.select('//span[@class="p-qty-wh"]').text()

            # NOTE(review): every branch of the original assigned '-1' to the
            # status, including the "in stock" ones — confirm this is intended
            product_status = '-1'

            if product_count_string == 'Под заказ':
                product_count = '-1'

            elif product_count_string == 'На складе: более 100':
                product_count = 100

            else:
                # a non-matching string raises here and is reported through
                # the generic error handler below
                product_count = DSpider.re_product_count.match(product_count_string).groupdict()['count']

            # D = unit [const = value]
            product_unit = 'ед.'

            # E = price
            price_text = grab.doc.select('//div[@class="ppage-product-price"]').text()
            price_match = DSpider.re_product_price.match(price_text)
            product_price = price_match.groupdict()['price'].replace(' ', '')

            # check if positive and correct price
            if not product_price.isdigit():
                self.log.debug(task, f'Skip item, cuz wrong price {product_price}')
                return

            # F = vendor code [const = skip for parsing]
            product_vendor_code = ''

            # G = vendor [const = value]
            product_vendor = 'Stiebel Eltron'

            # H = photo url
            photo_src = grab.doc.select('//img[@id="Image1"]').attr('src')
            product_photo_url = UrlGenerator.get_page_params(self.domain, photo_src, {})

            # I = description
            product_description = {'ОБЛАСТЬ ПРИМЕНЕНИЯ': grab.doc.select('//div[@class="col-md-14"]/p').text(default=' ')}

            table = grab.doc.select('//div[@class="col-md-14"]/table//tr')

            for row in table:
                key = row.select('./td[1]').text()
                value = row.select('./td[2]').text()

                # rows with an empty first cell carry no property name
                if key:
                    product_description[key] = value

            # save
            self.result.append({
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description
            })

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#26

0

显示文件

文件： d_spider_5int.py 项目： Holovin/PythonParsersGrab

    def task_parse_item(self, grab, task):
        """Parse one product page and append the record to ``self.result``.

        Items whose stock label is not ``'в наличии'`` or whose price is
        neither ``'Уточняйте'`` nor accepted by ``Ree.float`` are skipped
        with a warning. If ``check_body_errors`` flags the response, only
        the result of ``check_errors`` is yielded and the page is left
        unparsed.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                # the response was flagged, so its body must not be parsed
                return

            # common block with info
            product_info = grab.doc.select('//div[@id="product-info"]')

            # parse fields
            # A = name
            product_name = product_info.select('.//h1').text()

            # B = [const]
            # C = [const]
            # D = [const]
            product_count_string = product_info.select(
                './/div[@class="product-data-storehouse"]').text(
                    default='[not found]')
            product_count = '-1'
            product_status = '0'
            product_unit = 'ед.'

            if product_count_string != 'в наличии':
                self.log.warning(
                    task, 'Skip item, cuz wrong count {}'.format(
                        product_count_string))
                return

            # E = price
            # 'Уточняйте' is stored as '-1'; anything else must pass Ree.float
            product_price = product_info.select(
                './/span[@itemprop="price"]').text().replace(' ', '')

            if product_price == 'Уточняйте':
                product_price = '-1'

            elif not Ree.float.match(product_price):
                self.log.warning(
                    task, f'Skip item, cuz wrong price {product_price}')
                return

            # F = vendor code
            product_vendor_code = product_info.select(
                './/div[@class="product-data-articul"]').text()

            # G = vendor
            product_vendor = product_info.select(
                './/div[@class="product-data-producer"]').text()

            # H = photo url
            product_photo_url_raw = product_info.select(
                './/div[@id="product-images-list"]/div[1]/img[@itemprop="contentUrl"]'
            ).attr('src')
            product_photo_url = UrlGenerator.get_page_params(
                self.domain, product_photo_url_raw, {})

            # pre I: strip the js wrapper around the embedded description html
            product_description_part_raw = product_info.select('.//div[@class="product-description description"]/following-sibling::node()[2]')\
                .text(default='')\
                .replace('$(".description").html(\'', '')\
                .replace('\');', '')

            # I = description
            # this part inserts pure html with js, so all html tags and
            # &-symbols are cleared by taking the string value of the fragment
            product_description_part_list = html.fromstring(
                f'<div>{product_description_part_raw}</div>').xpath('string()')
            # join the pieces in one pass instead of repeated "+="
            product_description_part = ''.join(product_description_part_list)

            product_description = {'Описание': product_description_part}

            table = product_info.select(
                './/div[@class="product-description table"]/div')

            for row in table:
                key = row.select('./text()').text()
                value = row.select('./span').text()

                # rows without a label carry no property name
                if key:
                    product_description[key] = value

            # save
            row = {
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description
            }

            self.log.info(task, row)
            self.result.append(row)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task)

示例#27

0

显示文件

    def task_parse_item(self, grab, task):
        """Parse one product page and append its record to ``self.result``.

        The page is skipped (with a warning) when the availability text is not
        one of the known values or when the price is empty / does not match
        ``Ree.float``. Any exception is handed to ``self.process_error`` and
        ``self.process_finally(task, last=True)`` is always called.
        """
        try:
            # container that holds every field of the product card
            info_block = grab.doc.select('//div[@class="itemcatalog"]')

            # A = name
            product_name = info_block.select('.//h1').text()

            # B = count (quantity), C = status (delivery):
            # known availability text -> (quantity, delivery)
            availability = {
                'Имеется в наличии': ('-1', '0'),
                'Ожидается поступление': ('-1', '-1'),
                'Под заказ': ('-1', '-1'),
            }
            product_count_string = info_block.select(
                './/table[@id="hint-table"]//tr[1]//td[2]').text('')

            if product_count_string not in availability:
                self.log.warning(
                    task,
                    f'Unknown count status {product_count_string} skip...')
                return

            product_count, product_status = availability[product_count_string]

            # table with unit / sku / vendor rows
            props_table = info_block.select('.//table[@class="tintab"]')

            # D = unit (measure)
            product_unit = props_table.select('.//tr[2]/td[2]').text('ед.')

            # E = price
            price_text = info_block.select('.//span[@class="price"]').text('')
            product_price = price_text.replace(' руб.', '')

            if product_price == 'по запросу':
                product_price = '-1'

            if not (product_price and Ree.float.match(product_price)):
                self.log.warning(
                    task, f'Unknown price status {product_price}, skip...')
                return

            # F = vendor code (sku)
            product_vendor_code = props_table.select('.//tr[1]/td[2]').text('')

            # G = vendor (manufacture)
            product_vendor = props_table.select('.//tr[last()]/td[2]').text('')

            # H = photo url (empty string when the page has no image link)
            photo_href = info_block.select(
                './/a[@itemprop="image"]').attr('href', '')
            product_photo_url = (
                UrlGenerator.get_page_params(self.domain, photo_href, {})
                if photo_href else ''
            )

            # I = description (properties): every table row with a non-empty
            # first cell becomes a key/value pair
            product_description = {}

            for table_row in props_table.select('.//tr'):
                prop_name = table_row.select('./td[1]').text()
                prop_value = table_row.select('./td[2]').text()

                if prop_name:
                    product_description[prop_name] = prop_value

            # free-text description blocks, concatenated in document order
            description_nodes = grab.doc.select(
                '//div[@itemprop="description"]')
            item_description = ''.join(
                node.text('')
                for node in description_nodes
                if node.node().tag not in ('table', 'img')
            )

            if item_description:
                product_description['Техническое описание'] = item_description

            # save
            record = {
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description
            }
            self.result.append(record)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task, last=True)

示例#28

0

显示文件

文件： d_spider_6cab.py 项目： Holovin/PythonParsersGrab

    def task_parse_item(self, grab, task):
        """Parse one product page and append its record to ``self.result``.

        This is a generator: when ``self.check_body_errors`` reports a problem
        it yields whatever ``self.check_errors(task)`` returns and stops.

        The page is skipped (with a warning) when neither a numeric stock
        amount nor the "под заказ" marker is found, or when the price is
        missing / does not match ``Ree.float``. A missing product image is not
        an error: the ``photo`` field is then an empty string. Any exception is
        handed to ``self.process_error`` and
        ``self.process_finally(task, last=True)`` is always called.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # common block with info
            product_info = grab.doc.select('//div[@id="product"]')

            # parse fields
            # A = name
            product_name = product_info.select('.//h1').text()

            # B = count (quantity)
            # C = status (delivery)
            product_count_string_stock = product_info.select(
                './/span[contains(@class, "in-stock")]').text(default='')
            product_count_string_order = product_info.select(
                './/span[contains(@class, "under-order")]').text(default='')

            # D = unit (measure): a trailing 'м' on the stock text is the unit
            # and is stripped off before the amount is checked; otherwise the
            # unit is a constant
            if product_count_string_stock.endswith('м'):
                product_unit = 'м'
                product_count_string_stock = (
                    product_count_string_stock[:-1].strip())
            else:
                product_unit = 'ед.'

            if product_count_string_stock.isdigit():
                product_count = product_count_string_stock
                product_status = '0'

            elif product_count_string_order == 'под заказ':
                product_count = '-1'
                product_status = '-1'

            else:
                self.log.warning(
                    task,
                    f'Unknown count status {product_count_string_stock} or {product_count_string_order}, skip...'
                )
                return

            # E = price
            product_price_raw = product_info.select(
                './/p[@class="summ"]').text(default='')

            if not product_price_raw:
                self.log.warning(
                    task, f'Unknown price status {product_price_raw}, skip...')
                return

            if product_price_raw == 'по запросу':
                product_price = '-1'

            else:
                # parse number from child node
                product_price_raw = product_info.select(
                    './/p[@class="summ"]/span[@id="commmon_price"]').text(
                        default='')

                if not product_price_raw or not Ree.float.match(
                        product_price_raw):
                    self.log.warning(
                        task,
                        f'Unknown price status {product_price_raw}, skip...')
                    return

                product_price = product_price_raw

            # F = vendor code (sku) [const]
            product_vendor_code = ''

            # G = vendor (manufacture) [const]
            product_vendor = ''

            # H = photo url
            # Use a default so a product without an image is still saved
            # (with an empty photo), consistent with the sibling parsers,
            # instead of failing the whole item.
            product_photo_url_raw = product_info.select(
                './/img[@itemprop="image"]').attr('src', '')

            if product_photo_url_raw:
                product_photo_url = UrlGenerator.get_page_params(
                    self.domain, product_photo_url_raw, {})
            else:
                product_photo_url = ''

            # I = description (properties)
            product_description = {}

            # I :: Base (key/value spans; the availability row is excluded)
            table = product_info.select('.//div[@class="tab-content-list"]')

            for row in table:
                key = row.select('./span[1]').text(default=None)
                value = row.select('./span[2]').text(default=None)

                if key and value and key != 'Наличие':
                    product_description[key] = value

            # I :: description / using / tech -- each optional text section is
            # stored under its own title when it is non-empty
            text_sections = (
                ('.//div[@id="opisanie"]', 'Описание'),
                ('.//div[@id="primenenie"]', 'Применение'),
                ('.//div[@id="tehnicheskie_harakteristiki"]',
                 'Технические характеристики'),
            )

            for section_xpath, section_title in text_sections:
                description = product_info.select(section_xpath).text(
                    default='')
                if description:
                    product_description[section_title] = description

            # save
            self.result.append({
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description
            })

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task, last=True)

示例#29

0

显示文件

文件： d_spider_7san.py 项目： Holovin/PythonParsersGrab

    def task_parse_item(self, grab, task):
        """Parse one product page and append its record to ``self.result``.

        This is a generator: when ``self.check_body_errors`` reports a problem
        it yields whatever ``self.check_errors(task)`` returns and stops.

        Availability is read from the "presence" indicator: three filled
        squares mean in stock, three empty squares mean not in stock, anything
        else is logged and the page is skipped. The page is also skipped when
        ``Ree.extract_float`` does not match the price text. Any exception is
        handed to ``self.process_error`` and
        ``self.process_finally(task, last=True)`` is always called.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # common block with info
            product_info = grab.doc.select('//div[@id="content"]')

            # parse fields
            # A = name
            product_name = product_info.select('.//h1').text()

            # B = count (quantity)
            # C = status (delivery)
            product_count_string_full = product_info.select(
                './/div[@class="presence-box"]/span[@class="presence sel"]'
            ).count()
            product_count_string_empty = product_info.select(
                './/div[@class="presence-box"]/span[@class="presence"]'
            ).count()

            # 111 (filled squares)
            if product_count_string_full == 3:
                product_count = '-1'
                product_status = '0'

            # 000 (empty squares)
            elif product_count_string_empty == 3:
                product_count = '-1'
                product_status = '-1'

            # ???
            else:
                self.log.warning(
                    task,
                    f'Unknown count status {product_count_string_full}, {product_count_string_empty} skip...'
                )
                return

            # D = unit (measure) [const!]
            product_unit = 'ед.'

            # E = price
            # Keep the page text and the regex match in separate variables so
            # the warning below reports the actual text instead of ``None``.
            product_price_raw = product_info.select(
                './/div[@class="item_current_price"]').text('')
            product_price_match = Ree.extract_float.match(product_price_raw)

            if not product_price_match:
                self.log.warning(
                    task, f'Unknown price status {product_price_raw}, skip...')
                return

            product_price = product_price_match.groupdict()['float']

            # a zero price is stored as '-1'
            if product_price == '0':
                product_price = '-1'

            table = product_info.select('.//table[@class="prop-list"]//tr')

            product_vendor_code = ''
            product_vendor = ''

            for row in table:
                key = row.select('./td[1]').text('')
                value = row.select('./td[2]').text('')

                # G = vendor (manufacture)
                if 'Производитель' in key:
                    product_vendor = value

                # F = vendor code (sku)
                elif 'Артикул' in key:
                    product_vendor_code = value.strip(' .')

            # H = photo url
            product_photo_url_raw = product_info.select(
                './/a[@id="pos-big-photo"]').attr('href', '')

            if product_photo_url_raw:
                product_photo_url = UrlGenerator.get_page_params(
                    self.domain, product_photo_url_raw, {})
            else:
                product_photo_url = ''

            # I = description (properties)
            product_description = {
                'Описание':
                product_info.select('.//div[@id="detail-text-content"]').text(
                    '')
            }

            # save: build the record once so the logged and the stored data
            # cannot drift apart
            record = {
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description
            }

            self.log.info(task, 'Add: {}'.format(record))
            self.result.append(record)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task, last=True)

示例#30

0

显示文件

文件： d_spider_8tds.py 项目： Holovin/PythonParsersGrab

    def task_parse_item(self, grab, task):
        """Parse one product page and append its record to ``self.result``.

        This is a generator: when ``self.check_body_errors`` reports a problem
        it yields whatever ``self.check_errors(task)`` returns and stops.

        Only products whose availability text signals stock are saved. Known
        "not available" texts are logged at info level, unknown texts at
        warning level, and both are skipped. The page is also skipped when the
        price is empty or does not match ``Ree.float``. Any exception is handed
        to ``self.process_error`` and ``self.process_finally(task, last=True)``
        is always called.
        """
        try:
            if self.check_body_errors(grab, task):
                yield self.check_errors(task)
                return

            # container that holds every field of the product card
            card = grab.doc.select('//div[@id="product"]')

            # A = name
            product_name = card.select('.//h1').text()

            # B = count (quantity), C = status (delivery)
            product_count_string = card.select(
                './/span[@itemprop="availability"]').text('')

            in_stock = (
                any(unit in product_count_string for unit in ('шт.', 'м.п.'))
                or product_count_string in ('есть', 'в наличии')
            )

            if not in_stock:
                known_absent = (
                    'срок поставки' in product_count_string
                    or product_count_string in ('НЕТ', 'нет')
                )

                if known_absent:
                    self.log.info(
                        task,
                        f'Skip count status {product_count_string} skip...')
                else:
                    self.log.warning(
                        task,
                        f'Unknown count status {product_count_string} skip...')

                return

            product_count = '-1'
            product_status = '0'

            # D = unit (measure)
            unit_node = card.select(
                './/form[@class="form_addCart"]//span[@class="measure"]')
            product_unit = unit_node.text('ед.')

            # E = price
            price_node = card.select(
                './/form[@class="form_addCart"]//meta[@itemprop="price"]')
            product_price = price_node.attr('content', '')

            if not (product_price and Ree.float.match(product_price)):
                self.log.warning(
                    task, f'Unknown price status {product_price}, skip...')
                return

            # F = vendor code (sku)
            product_vendor_code = card.select(
                './/span[@class="articleValue"]').text('')

            # G = vendor (manufacture)
            product_vendor = card.select('.//a[@itemprop="brand"]').text('')

            # H = photo url (empty string when the page has no image)
            photo_src = card.select(
                './/img[@itemprop="image"]').attr('src', '')
            product_photo_url = (
                UrlGenerator.get_page_params(self.domain, photo_src, {})
                if photo_src else ''
            )

            # I = description (properties)
            description_text = card.select(
                './/div[@class="content"][1]').text('')
            product_description = {'Описание': description_text}

            # ID
            product_id = card.select(
                './/input[@name="addcart"]').attr('value', '')

            # save
            record = {
                'name': product_name,
                'quantity': product_count,
                'delivery': product_status,
                'measure': product_unit,
                'price': product_price,
                'sku': product_vendor_code,
                'manufacture': product_vendor,
                'photo': product_photo_url,
                'properties': product_description,
                'id': product_id,
            }
            self.result.append(record)

        except Exception as e:
            self.process_error(grab, task, e)

        finally:
            self.process_finally(task, last=True)