def task_initial(self, grab, task):
    """Entry task: walk the ajax catalog container, queue one task per
    product card, then follow the "next page" pagination link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # the whole catalog lives inside a single ajax container
        container = grab.doc.select('//div[@id="js_ajax-catalog"]')

        # queue every product card found in the container
        for anchor in container.select('.//a[@class="bx_rcm_view_link"]'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow pagination if a next-page anchor is present
        next_href = container.select('.//a[@title="След."]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90, last=False)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse one listing page: queue item tasks, then the next page link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # each product title anchor links to its item page
        anchors = grab.doc.select(
            '//div[contains(@class, "bx_catalog_list_home")]//div[@class="title"]//a'
        )
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow the "next" pagination control when present
        next_href = grab.doc.select(
            '//div[@class="bx-pagination "]//li[@class="bx-pag-next"]/a'
        ).attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue product items from the listing, then re-queue
    itself for the next page of results."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product anchors inside the listing wrapper
        product_anchors = grab.doc.select(
            '//div[@class="products-wrap"]//a[@itemprop="url" and @class="products-name"]'
        )
        for anchor in product_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # pagination: only the first pagination bar, "pg-next" with a page param
        next_href = grab.doc.select(
            '//div[contains(@class, "pagination")][1]//a[@class="pg-next" and contains(@href, "p=")]'
        ).attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog section page: queue item tasks and the next page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        section = grab.doc.select('//div[@class="catalog_section "]')

        # queue every product link found in this section
        for anchor in section.select('.//div[@class="product_item"]//a'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # NOTE: absolute xpath (searches the whole document, not the section)
        next_href = section.select('//a[@class="next_page"]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog listing page: queue item tasks and next-page tasks.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching every
    sibling page handler in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items links
        items_links = grab.doc.select(
            '//div[@class="catalog"]//div[@class="header"]//a')
        for row in items_links:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
        # parse next page
        items_next_page = grab.doc.select(
            '//div[@class="pagination"]//a[contains(@class, "nextpage")]')
        for row in items_next_page:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_page', url=link, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Detect the max pagination number and queue a parse task per page."""
    self.logger.debug('[{}] Initial url: {}'.format(task.name, task.url))

    # hard stop on a broken body — nothing useful can be parsed
    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return

    try:
        # pagination anchors carry the site's page query parameter
        page_anchors = grab.doc.select(
            '//div[contains(@class, "pagination")]//a[contains(@href, "{}")]'
            .format(Config.get('SITE_PAGE_PARAM')))
        max_page = get_max_page(page_anchors, 0, -1)
        self.logger.info('[{}] Task: {}, max_page: {}'.format(
            task.name, task.url, max_page))

        page_url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))
        # pages are numbered from 0 up to and including max_page
        for page_index in range(0, max_page + 1):
            page_url = page_url_gen.get_page(page_index)
            yield Task('parse_page', url=page_url, priority=90)
    except Exception as e:
        self._process_error(grab, task, e)

    self.logger.info('[{}] Tasks added...'.format(task.name))
def task_parse_page(self, grab, task):
    """Parse a section table page: queue item tasks, then the next page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # every anchor in the section table body is a product link
        anchors = grab.doc.select(
            '//div[@class="catalog-section-it-table-body"]//a')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # follow the "next" pagination anchor if the page rendered one
        next_href = grab.doc.select(
            '//a[@class="catalog-pagenav-next"]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(
                self.domain, next_href, {})
            yield self.do_task('parse_page', next_url, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Scan the pagination bar, find the highest page number, and queue a
    ``parse_page_items`` task for every page from 2 upward.

    Fix: the loop previously ran ``range(2, max_page)`` and therefore never
    queued the last page; it now includes ``max_page`` itself, matching the
    inclusive-range pattern used by the other pagination handlers here.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        catalog = grab.doc.select('//div[@class="listcatalog"]')
        # generate new tasks from the numbered pagination links
        links = catalog.select(
            './/div[@class="navigation"]/div[@class="nav"]//a')
        max_page = 1
        for link in links:
            page_number = link.text('')
            if page_number and Ree.number.match(page_number):
                max_page = max(max_page, int(page_number))
        if max_page > 1:
            # inclusive upper bound: page max_page must be fetched too
            for page in range(2, max_page + 1):
                next_page = UrlGenerator.get_page_params(
                    task.url, '', {'PAGEN_1': page})
                yield self.do_task('parse_page_items', next_page, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: collect category links from the navbar, skipping
    service pages by their labels, and queue a listing task per category."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # service-page labels that must not be crawled as categories
        skip_labels = ['Оплата', 'Доставка', 'Гарантия', 'Акции',
                       'Рекомендации по подбору', 'Информация и реквизиты',
                       'Новости', 'Контакты', 'Сервис-центр']

        # take all links from horizontal nav, exclude anchors (#) and external links
        nav_anchors = grab.doc.select('//div[@id="navbar"]//a[starts-with(@href, "/")]')

        # main categories already contain all sub-category items
        for anchor in nav_anchors:
            # skip if label is one of the stop words
            if anchor.text().strip() in skip_labels:
                continue

            href = anchor.attr('href')

            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})

            yield Task('parse_page', url=href, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def __init__(self, thread_number: int, try_limit: int = 0) -> None: super().__init__(thread_number=thread_number, network_try_limit=try_limit, priority_mode='const') # Logger self.log = Log(DSpiderCommon.logger) self.logger = DSpiderCommon.logger # Re module init Ree.init() # Work data self.single_task_mode = False self.tasks_store = {} self.result = [] self.cookie_jar = {} # Info self.info = StatCounter() self.info.add_task(StatCounter.TASK_FACTORY) # Common vars self.domain = UrlGenerator.get_host_from_url( Config.get_seq('SITE_URL')[0]) self.err_limit = try_limit # Cache cache_enabled = Config.get('APP_CACHE_ENABLED', '') cache_db_host = Config.get('APP_CACHE_DB_HOST', '') if cache_enabled and cache_db_host: cache_db_name = Config.get('APP_CACHE_DB_NAME', 'pythonparsers') cache_db_type = Config.get('APP_CACHE_DB_TYPE', 'mysql') cache_db_port = int(Config.get('APP_CACHE_DB_PORT', '3306')) cache_db_user = Config.get('APP_CACHE_DB_USER', 'root') cache_db_pass = Config.get('APP_CACHE_DB_PASS', '') if cache_db_user and cache_db_pass: self.setup_cache(backend=cache_db_type, database=cache_db_name, host=cache_db_host, port=cache_db_port, user=cache_db_user, password=cache_db_pass) else: self.setup_cache(backend=cache_db_type, database=cache_db_name, host=cache_db_host, port=cache_db_port) self.logger.info('!!! CACHE MODE ENABLED !!!') # Debug mode (only 1 iteration of each task) if Config.get('APP_SINGLE_TASK', ''): self.logger.info('!!! SINGLE MODE ENABLED !!!') self.single_task_mode = True self.logger.info('Init parser ok...')
def task_parse_page(self, grab, task):
    """Parse a product table page, skipping unavailable rows, and follow
    the next page only when at least one usable item was found.

    Fix: the old code compared the raw selector objects
    (``row.select(...) == '0'``) to strings — a comparison that is never
    true, so nothing was ever skipped and the log printed selector reprs.
    The cells' text is now extracted before comparison, matching the
    ``.text('')`` usage elsewhere in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # parse items links (rows with class "white" are headers)
        items_list = grab.doc.select(
            '//table[@class="prod"]//tr[not(contains(@class, "white"))]')
        # if all page with useless records - skip other pages
        success_pages = 0
        for index, row in enumerate(items_list):
            # check row — compare cell TEXT, not the selector objects
            status = row.select('./td[2]').text('')
            price = row.select('./td[3]').text('')
            if status == '0' or price == 'под заказ':
                self.log.warning(
                    task, f'Skip item, because status {status} / {price}')
                continue
            link = row.select('./td[@class="name"]/a').attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            success_pages += 1
            yield self.do_task('parse_item', link, 100, last=True)
        # parse next page if current is ok
        if success_pages > 0:
            next_page = grab.doc.select(
                '//div[@class="pagination"][1]//a[@class="next_page_link"]'
            ).attr('href', '')
            if next_page:
                next_page = UrlGenerator.get_page_params(
                    self.domain, next_page, {})
                yield self.do_task('parse_page', next_page, 90)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a search-results table: queue item tasks for catalog links,
    then follow any "show more" continuation link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # the results table holds both item rows and service links
        results_table = grab.doc.select('//table[@class="table search_table list"]//tr')

        # anchors pointing into /catalog/catalog are item pages
        for idx, anchor in enumerate(results_table.select('.//a[starts-with(@href, "/catalog/catalog")]')):
            href = anchor.attr('href')
            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_item', href, 100, last=True)

        # "показать ещё" links — expected to be 0 or 1 of them
        for idx, anchor in enumerate(grab.doc.select('.//a[starts-with(@href, "/catalog/?")]')):
            href = anchor.attr('href')
            # make absolute urls if needed
            if href[:1] == '/':
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_page', href, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue item tasks from the goods grid, then re-queue
    itself for the next results page (the "»" pagination link)."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product title anchors inside the goods blocks
        for anchor in grab.doc.select('//div[contains(@class, "goodsGoods")]//a[@class="textTitle"]'):
            item_url = UrlGenerator.get_page_params(self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # pagination: the "»" anchor points at the next page
        next_href = grab.doc.select('//a[contains(text(), "»")]').attr('href', '')
        if next_href:
            next_url = UrlGenerator.get_page_params(self.domain, next_href, {})
            yield self.do_task('initial', next_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Collect top-level catalog category links and queue a listing task
    (``parse_items_v2``) for each one, with sorting/paging params applied.

    Fix: a leftover debug ``print(raw_link)`` was routed to the logger.
    """
    self.logger.info('[{}] Initial url: {}'.format(task.name, task.url))
    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return
    try:
        cat_list = grab.doc.select(
            '//div[@class="block-catalog"]//div[@class="tabs-content"]//a[contains(@href, "shop")]'
        )
        # take links only for main cats, because its already contain all sub-cats items
        for row in cat_list:
            raw_link = row.attr('href')
            # skip sub-cats by slash count:
            # cat: /shop/cat/ -> 3
            # sub-cat: /shop/cat/foo/ -> 4
            if raw_link.count('/') > 3:
                continue
            # make absolute urls if needed
            if raw_link[:1] == '/':
                raw_link = UrlGenerator.get_page_params(
                    self.domain, raw_link, {
                        'section': '0',
                        'count': '50',
                        'sort': 'alphabet',
                        'order': 'asc',
                    })
            # was a bare print() — debug output now goes through the logger
            self.logger.debug('[{}] Queue category: {}'.format(task.name, raw_link))
            yield Task('parse_items_v2', url=raw_link, priority=90, raw=True,
                       d_base_url=raw_link, d_page=1,
                       d_need_update_pagination=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
def task_initial(self, grab, task):
    """Entry task: build the catalog start URL and kick off the page loop."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task,
                           'Err task, attempt {}'.format(task.task_try_count))
            return

        # first catalog page starts at position 0
        start_url = UrlGenerator.get_page_params(
            self.domain, 'catalog', {'curPos': 0})

        # hand off to the page-loop parser
        yield self.do_task('parse_page', start_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task for every category link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # every anchor inside the categories block is a category page
        for anchor in grab.doc.select('//div[@id="categories"]//a'):
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task for every section link.

    Fix: after logging a fatal body error the method previously fell
    through and kept parsing the broken page; it now returns early,
    matching every sibling ``task_initial`` in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return  # broken body — nothing useful to parse
        links = grab.doc.select('//div[@class="gsections"]//ul//a')
        for link in links:
            url = UrlGenerator.get_page_params(self.domain,
                                               link.attr('href'), {})
            yield Task('parse_page', url=url, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a price-view listing page and queue one task per item URL."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # itemprop="url" anchors inside the price view are item links
        anchors = grab.doc.select(
            '//div[@class="catalog-item-price-view"]//a[@itemprop="url"]')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing-page task per left-block category link."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # category anchors live in the first child of the left block
        anchors = grab.doc.select(
            '//div[@id="categories_block_left"]/div[1]//a')
        for anchor in anchors:
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a goods table page and queue one item task per product link."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product anchors sit in the second column of the goods table
        anchors = grab.doc.select(
            '//div[@class="tovar-table tovar_basic"]//div[@class="tovar-col tovar2"]/a'
        )
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a catalog listing page and queue one item task per product.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching the other
    page handlers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items links
        items_links = grab.doc.select(
            '//div[@id="catalog-list"]//div[@class="catalog-items"]//a[@property="name"]'
        )
        for row in items_links:
            link = row.attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Parse a product-list grid page and queue one item task per cell.

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early, matching the other
    page handlers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse items (non-empty anchors inside list cells)
        items_list = grab.doc.select('//div[@class="prod-list-cell"]//a[.!=""]')
        for index, row in enumerate(items_list):
            link = row.attr('href')
            # make absolute urls if needed
            if link[:1] == '/':
                link = UrlGenerator.get_page_params(self.domain, link, {})
            yield Task('parse_item', url=link, priority=100, raw=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing task per sub-catalog category link,
    forcing a very large page size so each category fits on one page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # sub-catalog anchors inside the main links block
        anchors = grab.doc.select(
            '//div[@class="main-links"]//p[@class="home_subcatalog_links_box"]/a'
        )
        for anchor in anchors:
            # oversized page count keeps the whole category on one page
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'),
                {'SET_PAGE_COUNT': '99999'})
            yield self.do_task('parse_page', category_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_page_items(self, grab, task):
    """Parse one list-catalog page and queue one item task per product."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        listing = grab.doc.select('//div[@class="listcatalog"]')

        # product anchors sit in the "name" cells of the list table
        anchors = listing.select(
            './/table[@class="lclistitem"]//td[@class="name"]//a')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a price-view listing task per catalog section,
    with an oversized limit so each section fits on one page."""
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # section title anchors inside the work area
        anchors = grab.doc.select(
            '//div[@class="workarea"]//div[contains(@class, "catalog-section-title")]/a'
        )
        for anchor in anchors:
            section_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {
                    'limit': 900,
                    'view': 'price'
                })
            yield self.do_task('parse_page', section_url, 90)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Entry task: queue a listing task per product-list nav link, with an
    oversized count so each listing fits on a single sorted page."""
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # textual nav anchors that point into /product_list/ (no image links)
        anchors = grab.doc.select(
            '//nav//a[not(.//img) and re:match(@href, "/product_list/.+")]'
        )
        for anchor in anchors:
            listing_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {
                    'count': 999999,
                    'name': 'asc'
                })
            yield Task('parse_page', url=listing_url, priority=90, raw=True)
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_item(self, grab, task):
    """Parse one product page into a result record (name, stock, price,
    unit, vendor, photo, description table).

    Fix: previously the method kept parsing after ``check_errors`` was
    yielded for a broken body; it now returns early like the sibling
    item parsers in this file.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return  # body is broken — a retry task was already queued
        # parse fields
        # A = name
        product_name = grab.doc.select('//h1').text()
        # B = count
        # C = status
        # NOTE(review): status is '-1' in every branch — looks intentional
        # ("on request" semantics for this site) but worth confirming.
        product_count_string = grab.doc.select('//span[@class="p-qty-wh"]').text()
        if product_count_string == 'Под заказ':
            product_status = '-1'
            product_count = '-1'
        elif product_count_string == 'На складе: более 100':
            product_status = '-1'
            product_count = 100
        else:
            product_status = '-1'
            product_count = DSpider.re_product_count.match(product_count_string).groupdict()['count']
        # D = unit [const = value]
        product_unit = 'ед.'
        # E = price (strip spaces used as thousands separators)
        product_price = DSpider.re_product_price.match(grab.doc.select('//div[@class="ppage-product-price"]').text()).groupdict()['price'].replace(' ', '')
        # check if positive and correct price
        if not product_price.isdigit():
            self.log.debug(task, f'Skip item, cuz wrong price {product_price}')
            return
        # F = vendor code [const = skip for parsing]
        product_vendor_code = ''
        # G = vendor [const = value]
        product_vendor = 'Stiebel Eltron'
        # H = photo url
        product_photo_url = UrlGenerator.get_page_params(self.domain, grab.doc.select('//img[@id="Image1"]').attr('src'), {})
        # I = description: lead paragraph + spec table rows
        product_description = {'ОБЛАСТЬ ПРИМЕНЕНИЯ': grab.doc.select('//div[@class="col-md-14"]/p').text(default=' ')}
        table = grab.doc.select('//div[@class="col-md-14"]/table//tr')
        for row in table:
            key = row.select('./td[1]').text()
            value = row.select('./td[2]').text()
            if key:
                product_description[key] = value
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task)
def task_parse_items(self, grab, task):
    """Parse one listing page of the cart table into ``self.result`` rows.

    On a broken body the task re-queues itself up to ``self.err_limit``
    times. On first visit (no ``d_skip_page_check`` flag) it also detects
    the max pagination number and queues itself for pages 2..max_page.
    Each table row yields one or more result records depending on the
    stock text ("распродано", "под заказ", or a per-warehouse breakdown).
    """
    self.logger.info('[{}] Start: {}'.format(task.name, task.url))
    if self._check_body_errors(grab, task):
        # retry with a higher priority until the attempt limit is reached
        if task.task_try_count < self.err_limit:
            self.logger.error(
                '[{}] Restart task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
            yield Task('parse_items',
                       url=task.url,
                       priority=105,
                       task_try_count=task.task_try_count + 1,
                       raw=True)
        else:
            self.logger.error(
                '[{}] Skip task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
        return
    try:
        # parse pagination numbers (only on the first visit of a listing)
        if not task.get('d_skip_page_check'):
            items = grab.doc.select('//a[contains(@href, "{}")]'.format(
                Config.get('SITE_PAGE_PARAM')))
            max_page = get_max_page(items, 1)
            self.logger.info('[{}] Find max page: {}'.format(
                task.name, max_page))
            url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))
            # self-execute from 2 page (if needed)
            for p in range(2, max_page + 1):
                url = url_gen.get_page(p)
                yield Task('parse_items',
                           url=url,
                           priority=100,
                           d_skip_page_check=True,
                           raw=True)
        # parse items
        items_list = grab.doc.select(
            '//div[@class="cart_table"]/div/div/table/tbody/tr')
        for index, row in enumerate(items_list):
            try:
                # NAME
                item_name = row.select(
                    './td[1]//div[@class="description"]/div/a').text(
                    ).strip()
                # UNIT (default unit when the cell is empty)
                unit = row.select('./td[2]').text().strip()
                if unit == '':
                    unit = 'ед.'
                # PRICE — taken from the schema.org lowprice meta tag
                price_raw = row.select(
                    './td[6]//meta[@itemprop="lowprice"]').attr('content')
                match = Ree.float.match(price_raw)
                # check & fix: skip rows whose price is not numeric
                if not match:
                    self.logger.warning(
                        '[{}] Skip item, because price is {} (line: {})'.
                        format(task.name, price_raw, index))
                    continue
                price = match.groupdict()['price'].replace(',', '.')
                # COUNT — td[5] is either a status string or a breakdown
                count = row.select('./td[5]')
                count_text = count.text().strip()
                # case 1: "sold out" string — emit one record with the
                # price-on-request count constant
                if count_text == 'распродано':
                    item_count = self.const_price_on_request
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 2: "on order" string — emit one record with the
                # zero-stock constant
                elif count_text == 'под заказ':
                    item_count = self.const_stock_zero
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 3: a per-warehouse table — one record per place
                # with stock + expo quantities summed
                else:
                    count_rows = count.select(
                        './/div[@class="layer_info"]/table/tbody/tr')
                    for count_row in count_rows:
                        item_place = count_row.select(
                            './td[1]').text().strip()
                        item_count = 0
                        # add stock
                        # NOTE(review): reads td[1] again (same cell as the
                        # place name) — td[2]/td[3] may have been intended;
                        # left as-is, confirm against the site markup.
                        place_count_stock = count_row.select(
                            './td[1]').text().strip()
                        if Ree.float.match(place_count_stock):
                            item_count += float(place_count_stock)
                        # add expo
                        place_count_expo = count_row.select(
                            './td[2]').text().strip()
                        if Ree.float.match(place_count_expo):
                            item_count += float(place_count_expo)
                        if item_count > 0:
                            # OUTPUT
                            self.logger.debug(
                                '[{}] Item added, index {} at url {}'.
                                format(task.name, index, task.url))
                            self.result.append({
                                'name': item_name,
                                # 3.140 -> 3.14; 3.0 -> 3
                                'count': '{0:g}'.format(item_count),
                                'unit': unit,
                                'price': price,
                                'place': item_place
                            })
            except IndexError as e:
                # a malformed row is skipped; the rest of the page continues
                self.logger.warning('[{}] Skip item: {}, {}'.format(
                    task.name, type(e).__name__, task.url))
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
def task_parse_item(self, grab, task):
    """Parse one product page (``div#product``) into a result record.

    Extracts name, stock quantity/status, unit, price, photo URL and a
    properties dict (spec list + description/usage/tech tabs), then
    appends the record to ``self.result``. Items with unrecognized stock
    or price markup are logged and skipped.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # common block with info
        product_info = grab.doc.select('//div[@id="product"]')
        # parse fields
        # A = name
        product_name = product_info.select('.//h1').text()
        # B = count (quantity)
        # C = status (delivery)
        product_count_string_stock = product_info.select(
            './/span[contains(@class, "in-stock")]').text(default='')
        product_count_string_order = product_info.select(
            './/span[contains(@class, "under-order")]').text(default='')
        # D = unit (measure) [const if no stock, else parse]
        # a trailing 'м' marks metres; strip it so the rest is the number
        if product_count_string_stock and product_count_string_stock[
                -1] == 'м':
            product_unit = 'м'
            product_count_string_stock = product_count_string_stock[:
                                                                    -1].strip(
                                                                    )
        else:
            product_unit = 'ед.'
        # numeric stock text -> in stock; "под заказ" -> on order (-1/-1)
        if product_count_string_stock.isdigit():
            product_count = product_count_string_stock
            product_status = '0'
        elif product_count_string_order == 'под заказ':
            product_count = '-1'
            product_status = '-1'
        else:
            # unrecognized availability markup — skip this item
            self.log.warning(
                task,
                f'Unknown count status {product_count_string_stock} or {product_count_string_order}, skip...'
            )
            return
        # E = price
        product_price_raw = product_info.select(
            './/p[@class="summ"]').text(default='')
        if not product_price_raw:
            self.log.warning(
                task, f'Unknown price status {product_price_raw}, skip...')
            return
        # "по запросу" = price on request -> sentinel -1
        if product_price_raw == 'по запросу':
            product_price = '-1'
        else:
            # parse number from child node
            product_price_raw = product_info.select(
                './/p[@class="summ"]/span[@id="commmon_price"]').text(
                    default='')
            if not product_price_raw or not Ree.float.match(
                    product_price_raw):
                self.log.warning(
                    task,
                    f'Unknown price status {product_price_raw}, skip...')
                return
            product_price = product_price_raw
        # F = vendor code (sku) [const]
        product_vendor_code = ''
        # G = vendor (manufacture) [const]
        product_vendor = ''
        # H = photo url (made absolute against the site domain)
        product_photo_url_raw = product_info.select(
            './/img[@itemprop="image"]').attr('src')
        product_photo_url = UrlGenerator.get_page_params(
            self.domain, product_photo_url_raw, {})
        # I = description (properties)
        product_description = {}
        # I :: Base — key/value spec rows; "Наличие" (availability) is
        # excluded because it is already captured as quantity/status
        table = product_info.select('.//div[@class="tab-content-list"]')
        for row in table:
            key = row.select('./span[1]').text(default=None)
            value = row.select('./span[2]').text(default=None)
            if key and value and key != 'Наличие':
                product_description[key] = value
        # I :: description tab
        description = product_info.select('.//div[@id="opisanie"]').text(
            default='')
        if description:
            product_description['Описание'] = description
        # I :: usage tab
        description = product_info.select('.//div[@id="primenenie"]').text(
            default='')
        if description:
            product_description['Применение'] = description
        # I :: tech specs tab
        description = product_info.select(
            './/div[@id="tehnicheskie_harakteristiki"]').text(default='')
        if description:
            product_description['Технические характеристики'] = description
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task, last=True)
def task_parse_item(self, grab, task):
    """Parse one product page (``div#product``) into a result record.

    Extracts name, availability, unit, price (schema.org meta), sku,
    brand, photo URL, a description dict and the site's internal product
    id, then appends the record to ``self.result``. Items whose
    availability or price markup is unrecognized are logged and skipped.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return
        # common block with info
        product_info = grab.doc.select('//div[@id="product"]')
        # parse fields
        # A = name
        product_name = product_info.select('.//h1').text()
        # B = count (quantity)
        # C = status (delivery)
        # availability text decides whether the item is kept or skipped
        product_count_string = product_info.select('.//span[@itemprop="availability"]').text('')
        if 'шт.' in product_count_string or 'м.п.' in product_count_string or product_count_string in ['есть', 'в наличии']:
            product_count = '-1'
            product_status = '0'
        elif 'срок поставки' in product_count_string or product_count_string in ['НЕТ', 'нет']:
            # explicit "unavailable / delivery time" markers — skip quietly
            self.log.info(task, f'Skip count status {product_count_string} skip...')
            return
        else:
            # anything else is unexpected markup — skip with a warning
            self.log.warning(task, f'Unknown count status {product_count_string} skip...')
            return
        # D = unit (measure), defaulting to 'ед.'
        product_unit = product_info.select('.//form[@class="form_addCart"]//span[@class="measure"]').text('ед.')
        # E = price — schema.org meta content must be a valid float
        product_price = product_info.select('.//form[@class="form_addCart"]//meta[@itemprop="price"]').attr('content', '')
        if not product_price or not Ree.float.match(product_price):
            self.log.warning(task, f'Unknown price status {product_price}, skip...')
            return
        # F = vendor code (sku)
        product_vendor_code = product_info.select('.//span[@class="articleValue"]').text('')
        # G = vendor (manufacture)
        product_vendor = product_info.select('.//a[@itemprop="brand"]').text('')
        # H = photo url (made absolute when present)
        product_photo_url_raw = product_info.select('.//img[@itemprop="image"]').attr('src', '')
        if product_photo_url_raw:
            product_photo_url = UrlGenerator.get_page_params(self.domain, product_photo_url_raw, {})
        else:
            product_photo_url = ''
        # I = description (properties)
        product_description = {'Описание': product_info.select('.//div[@class="content"][1]').text('')}
        # ID — the site's internal product id from the add-to-cart input
        product_id = product_info.select('.//input[@name="addcart"]').attr('value', '')
        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description,
            'id': product_id,
        })
    except Exception as e:
        self.process_error(grab, task, e)
    finally:
        self.process_finally(task, last=True)