def task_initial(self, grab, task):
    """Entry task: walk the ajax catalog container, queue one task per
    product card, then follow the "next page" pagination link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # the whole catalog lives inside a single ajax container
        container = grab.doc.select('//div[@id="js_ajax-catalog"]')

        # queue every product card found in the container
        for anchor in container.select('.//a[@class="bx_rcm_view_link"]'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow pagination if a next-page anchor is present
        next_href = container.select('.//a[@title="След."]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90, last=False)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse one listing page: queue item tasks, then the next page link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # each product title anchor links to its item page
        anchors = grab.doc.select(
            '//div[contains(@class, "bx_catalog_list_home")]//div[@class="title"]//a'
        )
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow the "next" pagination control when present
        next_href = grab.doc.select(
            '//div[@class="bx-pagination "]//li[@class="bx-pag-next"]/a'
        ).attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue product items from the listing, then re-queue
    itself for the next page of results."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product anchors inside the listing wrapper
        product_anchors = grab.doc.select(
            '//div[@class="products-wrap"]//a[@itemprop="url" and @class="products-name"]'
        )
        for anchor in product_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # pagination: only the first pagination bar, "pg-next" with a page param
        next_href = grab.doc.select(
            '//div[contains(@class, "pagination")][1]//a[@class="pg-next" and contains(@href, "p=")]'
        ).attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog section page: queue item tasks and the next page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        section = grab.doc.select('//div[@class="catalog_section "]')

        # queue every product link found in this section
        for anchor in section.select('.//div[@class="product_item"]//a'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # NOTE: absolute xpath (searches the whole document, not the section)
        next_href = section.select('//a[@class="next_page"]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog listing page: queue item tasks and next-page tasks.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching every
    sibling page handler in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items links
        items_links = grab.doc.select(
            '//div[@class="catalog"]//div[@class="header"]//a')
        for row in items_links:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
        # parse next page
        items_next_page = grab.doc.select(
            '//div[@class="pagination"]//a[contains(@class, "nextpage")]')
        for row in items_next_page:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_page', url=link, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Detect the max pagination number and queue a parse task per page."""
    self.logger.debug('[{}] Initial url: {}'.format(task.name, task.url))

    # hard stop on a broken body — nothing useful can be parsed
    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return

    try:
        # pagination anchors carry the site's page query parameter
        page_anchors = grab.doc.select(
            '//div[contains(@class, "pagination")]//a[contains(@href, "{}")]'
            .format(Config.get('SITE_PAGE_PARAM')))
        max_page = get_max_page(page_anchors, 0, -1)
        self.logger.info('[{}] Task: {}, max_page: {}'.format(
            task.name, task.url, max_page))

        page_url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))
        # pages are numbered from 0 up to and including max_page
        for page_index in range(0, max_page + 1):
            page_url = page_url_gen.get_page(page_index)
            yield Task('parse_page', url=page_url, priority=90)
    except Exception as e:
        self._process_error(grab, task, e)

    self.logger.info('[{}] Tasks added...'.format(task.name))
def task_parse_page(self, grab, task):
    """Parse a section table page: queue item tasks, then the next page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # every anchor in the section table body is a product link
        anchors = grab.doc.select(
            '//div[@class="catalog-section-it-table-body"]//a')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow the "next" pagination anchor if the page rendered one
        next_href = grab.doc.select(
            '//a[@class="catalog-pagenav-next"]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Scan the pagination bar, find the highest page number, and queue a
    ``parse_page_items`` task for every page from 2 upward.

    Fix: the loop previously ran ``range(2, max_page)`` and therefore never
    queued the last page; it now includes ``max_page`` itself, matching the
    inclusive-range pattern used by the other pagination handlers here.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        catalog = grab.doc.select('//div[@class="listcatalog"]')
        # generate new tasks from the numbered pagination links
        links = catalog.select(
            './/div[@class="navigation"]/div[@class="nav"]//a')
        max_page = 1
        for link in links:
            page_number = link.text('')
            if page_number and Ree.number.match(page_number):
                max_page = max(max_page, int(page_number))
        if max_page > 1:
            # inclusive upper bound: page max_page must be fetched too
            for page in range(2, max_page + 1):
                next_page = UrlGenerator.get_page_params(
                    task.url, '', {'PAGEN_1': page})
                yield self.do_task('parse_page_items', next_page, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: collect category links from the navbar, skipping
    service pages by their labels, and queue a listing task per category."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # service-page labels that must not be crawled as categories
        skip_labels = ['Оплата', 'Доставка', 'Гарантия', 'Акции',
                       'Рекомендации по подбору', 'Информация и реквизиты',
                       'Новости', 'Контакты', 'Сервис-центр']

        # take all links from horizontal nav, exclude anchors (#) and external links
        nav_anchors = grab.doc.select('//div[@id="navbar"]//a[starts-with(@href, "/")]')

        # main categories already contain all sub-category items
        for anchor in nav_anchors:
            # skip if label is one of the stop words
            if anchor.text().strip() in skip_labels:
                continue

            href = anchor.attr('href')

            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})

            yield Task('parse_page', url=href, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def __init__(self, thread_number: int, try_limit: int = 0) -> None: super().__init__(thread_number=thread_number, network_try_limit=try_limit, priority_mode='const') # Logger self.log = Log(DSpiderCommon.logger) self.logger = DSpiderCommon.logger # Re module init Ree.init() # Work data self.single_task_mode = False self.tasks_store = {} self.result = [] self.cookie_jar = {} # Info self.info = StatCounter() self.info.add_task(StatCounter.TASK_FACTORY) # Common vars self.domain = UrlGenerator.get_host_from_url( Config.get_seq('SITE_URL')[0]) self.err_limit = try_limit # Cache cache_enabled = Config.get('APP_CACHE_ENABLED', '') cache_db_host = Config.get('APP_CACHE_DB_HOST', '') if cache_enabled and cache_db_host: cache_db_name = Config.get('APP_CACHE_DB_NAME', 'pythonparsers') cache_db_type = Config.get('APP_CACHE_DB_TYPE', 'mysql') cache_db_port = int(Config.get('APP_CACHE_DB_PORT', '3306')) cache_db_user = Config.get('APP_CACHE_DB_USER', 'root') cache_db_pass = Config.get('APP_CACHE_DB_PASS', '') if cache_db_user and cache_db_pass: self.setup_cache(backend=cache_db_type, database=cache_db_name, host=cache_db_host, port=cache_db_port, user=cache_db_user, password=cache_db_pass) else: self.setup_cache(backend=cache_db_type, database=cache_db_name, host=cache_db_host, port=cache_db_port) self.logger.info('!!! CACHE MODE ENABLED !!!') # Debug mode (only 1 iteration of each task) if Config.get('APP_SINGLE_TASK', ''): self.logger.info('!!! SINGLE MODE ENABLED !!!') self.single_task_mode = True self.logger.info('Init parser ok...')
def task_parse_page(self, grab, task):
    """Parse a product table page, skipping unavailable rows, and follow
    the next page only when at least one usable item was found.

    Fix: the old code compared the raw selector objects
    (``row.select(...) == '0'``) to strings — a comparison that is never
    true, so nothing was ever skipped and the log printed selector reprs.
    The cells' text is now extracted before comparison, matching the
    ``.text('')`` usage elsewhere in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # parse items links (rows with class "white" are headers)
        items_list = grab.doc.select(
            '//table[@class="prod"]//tr[not(contains(@class, "white"))]')
        # if all page with useless records - skip other pages
        success_pages = 0
        for index, row in enumerate(items_list):
            # check row — compare cell TEXT, not the selector objects
            status = row.select('./td[2]').text('')
            price = row.select('./td[3]').text('')
            if status == '0' or price == 'под заказ':
                self.log.warning(
                    task, f'Skip item, because status {status} / {price}')
                continue
            link = row.select('./td[@class="name"]/a').attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            success_pages += 1
            yield self.do_task('parse_item', link, 100, last=True)
        # parse next page if current is ok
        if success_pages > 0:
            next_page = grab.doc.select(
                '//div[@class="pagination"][1]//a[@class="next_page_link"]'
            ).attr('href', '')
            if next_page:
                next_page = UrlGenerator.get_page_params(
                    self.domain, next_page, {})
                yield self.do_task('parse_page', next_page, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a search-results table: queue item tasks for catalog links,
    then follow any "show more" continuation link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # the results table holds both item rows and service links
        results_table = grab.doc.select('//table[@class="table search_table list"]//tr')

        # anchors pointing into /catalog/catalog are item pages
        for idx, anchor in enumerate(results_table.select('.//a[starts-with(@href, "/catalog/catalog")]')):
            href = anchor.attr('href')
            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_item', href, 100, last=True)

        # "показать ещё" links — expected to be 0 or 1 of them
        for idx, anchor in enumerate(grab.doc.select('.//a[starts-with(@href, "/catalog/?")]')):
            href = anchor.attr('href')
            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_page', href, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue item tasks from the goods grid, then re-queue
    itself for the next results page (the "»" pagination link)."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product title anchors inside the goods blocks
        for anchor in grab.doc.select('//div[contains(@class, "goodsGoods")]//a[@class="textTitle"]'):
            item_url = UrlGenerator.get_page_params(self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # pagination: the "»" anchor points at the next page
        next_href = grab.doc.select('//a[contains(text(), "»")]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Collect top-level catalog category links and queue a listing task
    (``parse_items_v2``) for each one, with sorting/paging params applied.

    Fix: a leftover debug ``print(raw_link)`` was routed to the logger.
    """
    self.logger.info('[{}] Initial url: {}'.format(task.name, task.url))
    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return
    try:
        cat_list = grab.doc.select(
            '//div[@class="block-catalog"]//div[@class="tabs-content"]//a[contains(@href, "shop")]'
        )
        # take links only for main cats, because its already contain all sub-cats items
        for row in cat_list:
            raw_link = row.attr('href')
            # skip sub-cats by slash count:
            # cat: /shop/cat/ -> 3
            # sub-cat: /shop/cat/foo/ -> 4
            if raw_link.count('/') > 3:
                continue
            # make absolute urls if needed
            if raw_link[:1] == '/':
                raw_link = UrlGenerator.get_page_params(
                    self.domain, raw_link, {
                        'section': '0',
                        'count': '50',
                        'sort': 'alphabet',
                        'order': 'asc',
                    })
            # was a bare print() — debug output now goes through the logger
            self.logger.debug('[{}] Queue category: {}'.format(task.name, raw_link))
            yield Task('parse_items_v2', url=raw_link, priority=90, raw=True,
                       d_base_url=raw_link, d_page=1,
                       d_need_update_pagination=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
def task_initial(self, grab, task):
    """Entry task: build the catalog start URL and kick off the page loop."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task,
                           'Err task, attempt {}'.format(task.task_try_count))
            return

        # first catalog page starts at position 0
        start_url = UrlGenerator.get_page_params(
            self.domain, 'catalog', {'curPos': 0})

        # hand off to the page-loop parser
        yield self.do_task('parse_page', start_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task for every category link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # every anchor inside the categories block is a category page
        for anchor in grab.doc.select('//div[@id="categories"]//a'):
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task for every section link.

    Fix: after logging a fatal body error the method previously fell
    through and kept parsing the broken page; it now returns early,
    matching every sibling ``task_initial`` in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return  # broken body — nothing useful to parse
        links = grab.doc.select('//div[@class="gsections"]//ul//a')
        for link in links:
            url = UrlGenerator.get_page_params(self.domain,
                                               link.attr('href'), {})
            yield Task('parse_page', url=url, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a price-view listing page and queue one task per item URL."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # itemprop="url" anchors inside the price view are item links
        anchors = grab.doc.select(
            '//div[@class="catalog-item-price-view"]//a[@itemprop="url"]')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task per left-block category link."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # category anchors live in the first child of the left block
        anchors = grab.doc.select(
            '//div[@id="categories_block_left"]/div[1]//a')
        for anchor in anchors:
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a goods table page and queue one item task per product link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product anchors sit in the second column of the goods table
        anchors = grab.doc.select(
            '//div[@class="tovar-table tovar_basic"]//div[@class="tovar-col tovar2"]/a'
        )
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog listing page and queue one item task per product.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching the other
    page handlers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items links
        items_links = grab.doc.select(
            '//div[@id="catalog-list"]//div[@class="catalog-items"]//a[@property="name"]'
        )
        for row in items_links:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a product-list grid page and queue one item task per cell.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching the other
    page handlers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items (non-empty anchors inside list cells)
        items_list = grab.doc.select('//div[@class="prod-list-cell"]//a[.!=""]')
        for index, row in enumerate(items_list):
            link = row.attr('href')
            # make absolute urls if needed
            if link[:1] == '/':
                link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing task per sub-catalog category link,
    forcing a very large page size so each category fits on one page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # sub-catalog anchors inside the main links block
        anchors = grab.doc.select(
            '//div[@class="main-links"]//p[@class="home_subcatalog_links_box"]/a'
        )
        for anchor in anchors:
            # oversized page count keeps the whole category on one page
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'),
                {'SET_PAGE_COUNT': '99999'})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page_items(self, grab, task):
    """Parse one list-catalog page and queue one item task per product."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        listing = grab.doc.select('//div[@class="listcatalog"]')

        # product anchors sit in the "name" cells of the list table
        anchors = listing.select(
            './/table[@class="lclistitem"]//td[@class="name"]//a')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a price-view listing task per catalog section,
    with an oversized limit so each section fits on one page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # section title anchors inside the work area
        anchors = grab.doc.select(
            '//div[@class="workarea"]//div[contains(@class, "catalog-section-title")]/a'
        )
        for anchor in anchors:
            section_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {
                    'limit': 900,
                    'view': 'price'
                })
            yield self.do_task('parse_page', section_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing task per product-list nav link, with an
    oversized count so each listing fits on a single sorted page."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # textual nav anchors that point into /product_list/ (no image links)
        anchors = grab.doc.select(
            '//nav//a[not(.//img) and re:match(@href, "/product_list/.+")]'
        )
        for anchor in anchors:
            listing_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {
                    'count': 999999,
                    'name': 'asc'
                })
            yield Task('parse_page', url=listing_url, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_item(self, grab, task):
    """Parse one product page into a result record (name, stock, price,
    unit, vendor, photo, description table).

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early like the sibling
    item parsers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse fields
        # A = name
        product_name = grab.doc.select('//h1').text()
        # B = count
        # C = status
        # NOTE(review): status is '-1' in every branch — looks intentional
        # ("on request" semantics for this site) but worth confirming.
        product_count_string = grab.doc.select('//span[@class="p-qty-wh"]').text()
        if product_count_string == 'Под заказ':
            product_status = '-1'
            product_count = '-1'
        elif product_count_string == 'На складе: более 100':
            product_status = '-1'
            product_count = 100
        else:
            product_status = '-1'
            product_count = DSpider.re_product_count.match(product_count_string).groupdict()['count']
        # D = unit [const = value]
        product_unit = 'ед.'
        # E = price (strip spaces used as thousands separators)
        product_price = DSpider.re_product_price.match(grab.doc.select('//div[@class="ppage-product-price"]').text()).groupdict()['price'].replace(' ', '')
        # check if positive and correct price
        if not product_price.isdigit():
            self.log.debug(task, f'Skip item, cuz wrong price {product_price}')
            return
        # F = vendor code [const = skip for parsing]
        product_vendor_code = ''
        # G = vendor [const = value]
        product_vendor = 'Stiebel Eltron'
        # H = photo url
        product_photo_url = UrlGenerator.get_page_params(self.domain, grab.doc.select('//img[@id="Image1"]').attr('src'), {})
        # I = description: lead paragraph + spec table rows
        product_description = {'ОБЛАСТЬ ПРИМЕНЕНИЯ': grab.doc.select('//div[@class="col-md-14"]/p').text(default=' ')}
        table = grab.doc.select('//div[@class="col-md-14"]/table//tr')
        for row in table:
            key = row.select('./td[1]').text()
            value = row.select('./td[2]').text()
            if key:
                product_description[key] = value
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_items(self, grab, task):
    """Parse one listing page of the cart table into ``self.result`` rows.

    On a broken body the task re-queues itself up to ``self.err_limit``
    times. On first visit (no ``d_skip_page_check`` flag) it also detects
    the max pagination number and queues itself for pages 2..max_page.
    Each table row yields one or more result records depending on the
    stock text ("распродано", "под заказ", or a per-warehouse breakdown).
    """
    self.logger.info('[{}] Start: {}'.format(task.name, task.url))
    if self._check_body_errors(grab, task):
        # retry with a higher priority until the attempt limit is reached
        if task.task_try_count < self.err_limit:
            self.logger.error(
                '[{}] Restart task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
            yield Task('parse_items',
                       url=task.url,
                       priority=105,
                       task_try_count=task.task_try_count + 1,
                       raw=True)
        else:
            self.logger.error(
                '[{}] Skip task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
        return
    try:
        # parse pagination numbers (only on the first visit of a listing)
        if not task.get('d_skip_page_check'):
            items = grab.doc.select('//a[contains(@href, "{}")]'.format(
                Config.get('SITE_PAGE_PARAM')))
            max_page = get_max_page(items, 1)
            self.logger.info('[{}] Find max page: {}'.format(
                task.name, max_page))
            url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))
            # self-execute from 2 page (if needed)
            for p in range(2, max_page + 1):
                url = url_gen.get_page(p)
                yield Task('parse_items',
                           url=url,
                           priority=100,
                           d_skip_page_check=True,
                           raw=True)
        # parse items
        items_list = grab.doc.select(
            '//div[@class="cart_table"]/div/div/table/tbody/tr')
        for index, row in enumerate(items_list):
            try:
                # NAME
                item_name = row.select(
                    './td[1]//div[@class="description"]/div/a').text(
                    ).strip()
                # UNIT (default unit when the cell is empty)
                unit = row.select('./td[2]').text().strip()
                if unit == '':
                    unit = 'ед.'
                # PRICE — taken from the schema.org lowprice meta tag
                price_raw = row.select(
                    './td[6]//meta[@itemprop="lowprice"]').attr('content')
                match = Ree.float.match(price_raw)
                # check & fix: skip rows whose price is not numeric
                if not match:
                    self.logger.warning(
                        '[{}] Skip item, because price is {} (line: {})'.
                        format(task.name, price_raw, index))
                    continue
                price = match.groupdict()['price'].replace(',', '.')
                # COUNT — td[5] is either a status string or a breakdown
                count = row.select('./td[5]')
                count_text = count.text().strip()
                # case 1: "sold out" string — emit one record with the
                # price-on-request count constant
                if count_text == 'распродано':
                    item_count = self.const_price_on_request
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 2: "on order" string — emit one record with the
                # zero-stock constant
                elif count_text == 'под заказ':
                    item_count = self.const_stock_zero
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 3: a per-warehouse table — one record per place
                # with stock + expo quantities summed
                else:
                    count_rows = count.select(
                        './/div[@class="layer_info"]/table/tbody/tr')
                    for count_row in count_rows:
                        item_place = count_row.select(
                            './td[1]').text().strip()
                        item_count = 0
                        # add stock
                        # NOTE(review): reads td[1] again (same cell as the
                        # place name) — td[2]/td[3] may have been intended;
                        # left as-is, confirm against the site markup.
                        place_count_stock = count_row.select(
                            './td[1]').text().strip()
                        if Ree.float.match(place_count_stock):
                            item_count += float(place_count_stock)
                        # add expo
                        place_count_expo = count_row.select(
                            './td[2]').text().strip()
                        if Ree.float.match(place_count_expo):
                            item_count += float(place_count_expo)
                        if item_count > 0:
                            # OUTPUT
                            self.logger.debug(
                                '[{}] Item added, index {} at url {}'.
                                format(task.name, index, task.url))
                            self.result.append({
                                'name': item_name,
                                # 3.140 -> 3.14; 3.0 -> 3
                                'count': '{0:g}'.format(item_count),
                                'unit': unit,
                                'price': price,
                                'place': item_place
                            })
            except IndexError as e:
                # a malformed row is skipped; the rest of the page continues
                self.logger.warning('[{}] Skip item: {}, {}'.format(
                    task.name, type(e).__name__, task.url))
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
def task_parse_item(self, grab, task):
    """Parse one product page (``div#product``) into a result record.

    Extracts name, stock quantity/status, unit, price, photo URL and a
    properties dict (spec list + description/usage/tech tabs), then
    appends the record to ``self.result``. Items with unrecognized stock
    or price markup are logged and skipped.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # common block with info
        product_info = grab.doc.select('//div[@id="product"]')
        # parse fields
        # A = name
        product_name = product_info.select('.//h1').text()
        # B = count (quantity)
        # C = status (delivery)
        product_count_string_stock = product_info.select(
            './/span[contains(@class, "in-stock")]').text(default='')
        product_count_string_order = product_info.select(
            './/span[contains(@class, "under-order")]').text(default='')
        # D = unit (measure) [const if no stock, else parse]
        # a trailing 'м' marks metres; strip it so the rest is the number
        if product_count_string_stock and product_count_string_stock[
                -1] == 'м':
            product_unit = 'м'
            product_count_string_stock = product_count_string_stock[:
                                                                    -1].strip(
                                                                    )
        else:
            product_unit = 'ед.'
        # numeric stock text -> in stock; "под заказ" -> on order (-1/-1)
        if product_count_string_stock.isdigit():
            product_count = product_count_string_stock
            product_status = '0'
        elif product_count_string_order == 'под заказ':
            product_count = '-1'
            product_status = '-1'
        else:
            # unrecognized availability markup — skip this item
            self.log.warning(
                task,
                f'Unknown count status {product_count_string_stock} or {product_count_string_order}, skip...'
            )
            return
        # E = price
        product_price_raw = product_info.select(
            './/p[@class="summ"]').text(default='')
        if not product_price_raw:
            self.log.warning(
                task, f'Unknown price status {product_price_raw}, skip...')
            return
        # "по запросу" = price on request -> sentinel -1
        if product_price_raw == 'по запросу':
            product_price = '-1'
        else:
            # parse number from child node
            product_price_raw = product_info.select(
                './/p[@class="summ"]/span[@id="commmon_price"]').text(
                    default='')
            if not product_price_raw or not Ree.float.match(
                    product_price_raw):
                self.log.warning(
                    task,
                    f'Unknown price status {product_price_raw}, skip...')
                return
            product_price = product_price_raw
        # F = vendor code (sku) [const]
        product_vendor_code = ''
        # G = vendor (manufacture) [const]
        product_vendor = ''
        # H = photo url (made absolute against the site domain)
        product_photo_url_raw = product_info.select(
            './/img[@itemprop="image"]').attr('src')
        product_photo_url = UrlGenerator.get_page_params(
            self.domain, product_photo_url_raw, {})
        # I = description (properties)
        product_description = {}
        # I :: Base — key/value spec rows; "Наличие" (availability) is
        # excluded because it is already captured as quantity/status
        table = product_info.select('.//div[@class="tab-content-list"]')
        for row in table:
            key = row.select('./span[1]').text(default=None)
            value = row.select('./span[2]').text(default=None)
            if key and value and key != 'Наличие':
                product_description[key] = value
        # I :: description tab
        description = product_info.select('.//div[@id="opisanie"]').text(
            default='')
        if description:
            product_description['Описание'] = description
        # I :: usage tab
        description = product_info.select('.//div[@id="primenenie"]').text(
            default='')
        if description:
            product_description['Применение'] = description
        # I :: tech specs tab
        description = product_info.select(
            './/div[@id="tehnicheskie_harakteristiki"]').text(default='')
        if description:
            product_description['Технические характеристики'] = description
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task, last=True)
def task_parse_item(self, grab, task):
    """Parse one product page (``div#product``) into a result record.

    Extracts name, availability, unit, price (schema.org meta), sku,
    brand, photo URL, a description dict and the site's internal product
    id, then appends the record to ``self.result``. Items whose
    availability or price markup is unrecognized are logged and skipped.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # common block with info
        product_info = grab.doc.select('//div[@id="product"]')
        # parse fields
        # A = name
        product_name = product_info.select('.//h1').text()
        # B = count (quantity)
        # C = status (delivery)
        # availability text decides whether the item is kept or skipped
        product_count_string = product_info.select('.//span[@itemprop="availability"]').text('')
        if 'шт.' in product_count_string or 'м.п.' in product_count_string or product_count_string in ['есть', 'в наличии']:
            product_count = '-1'
            product_status = '0'
        elif 'срок поставки' in product_count_string or product_count_string in ['НЕТ', 'нет']:
            # explicit "unavailable / delivery time" markers — skip quietly
            self.log.info(task, f'Skip count status {product_count_string} skip...')
            return
        else:
            # anything else is unexpected markup — skip with a warning
            self.log.warning(task, f'Unknown count status {product_count_string} skip...')
            return
        # D = unit (measure), defaulting to 'ед.'
        product_unit = product_info.select('.//form[@class="form_addCart"]//span[@class="measure"]').text('ед.')
        # E = price — schema.org meta content must be a valid float
        product_price = product_info.select('.//form[@class="form_addCart"]//meta[@itemprop="price"]').attr('content', '')
        if not product_price or not Ree.float.match(product_price):
            self.log.warning(task, f'Unknown price status {product_price}, skip...')
            return
        # F = vendor code (sku)
        product_vendor_code = product_info.select('.//span[@class="articleValue"]').text('')
        # G = vendor (manufacture)
        product_vendor = product_info.select('.//a[@itemprop="brand"]').text('')
        # H = photo url (made absolute when present)
        product_photo_url_raw = product_info.select('.//img[@itemprop="image"]').attr('src', '')
        if product_photo_url_raw:
            product_photo_url = UrlGenerator.get_page_params(self.domain, product_photo_url_raw, {})
        else:
            product_photo_url = ''
        # I = description (properties)
        product_description = {'Описание': product_info.select('.//div[@class="content"][1]').text('')}
        # ID — the site's internal product id from the add-to-cart input
        product_id = product_info.select('.//input[@name="addcart"]').attr('value', '')
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description,
            'id': product_id,
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task, last=True)