def task_parse_page(self, grab, task):
    """Queue item tasks and next-page tasks found on a catalog page.

    If ``self.check_body_errors`` flags the response, only the value
    returned by ``self.check_errors(task)`` is yielded and the page is
    left unparsed.

    Args:
        grab: object whose ``doc.select`` runs XPath queries on the page.
        task: the task being handled; passed through to the helpers.

    Yields:
        A ``parse_item`` ``Task`` (priority 100) per product link, then a
        ``parse_page`` ``Task`` (priority 90) per "nextpage" link.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            # Stop here: without this return the flagged body would still
            # be parsed and could emit bogus tasks or spurious errors.
            return

        # parse items links
        items_links = grab.doc.select(
            '//div[@class="catalog"]//div[@class="header"]//a')
        for row in items_links:
            link = UrlGenerator.get_page_params(
                self.domain, row.attr('href'), {})
            yield Task('parse_item', url=link, priority=100, raw=True)

        # parse next page
        items_next_page = grab.doc.select(
            '//div[@class="pagination"]//a[contains(@class, "nextpage")]')
        for row in items_next_page:
            link = UrlGenerator.get_page_params(
                self.domain, row.attr('href'), {})
            yield Task('parse_page', url=link, priority=90, raw=True)

    except Exception as e:
        # Any failure is delegated to the spider-level error handler.
        self.process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Emit item tasks for a catalog section and follow its next-page link.

    Yields the result of ``self.check_errors(task)`` and stops when
    ``self.check_body_errors`` reports a problem; otherwise yields one
    ``self.do_task('parse_item', ...)`` per product anchor and, if a
    next-page href exists, one ``self.do_task('parse_page', ...)``.
    """
    try:
        body_is_broken = self.check_body_errors(grab, task)
        if body_is_broken:
            yield self.check_errors(task)
            return

        section = grab.doc.select('//div[@class="catalog_section "]')

        # product links inside the section
        for anchor in section.select('.//div[@class="product_item"]//a'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # pagination: empty string means there is no next page
        next_href = section.select('//a[@class="next_page"]').attr('href', '')
        if not next_href:
            return
        page_url = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('parse_page', page_url, 90)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Emit item tasks from the ajax catalog block and re-queue the next page.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per product
    link and, when the link titled "След." has an href, an ``initial``
    task for it.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        block = grab.doc.select('//div[@id="js_ajax-catalog"]')

        # product links
        for anchor in block.select('.//a[@class="bx_rcm_view_link"]'):
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # next page link (empty default when the anchor is absent)
        next_href = block.select('.//a[@title="След."]').attr('href', '')
        if not next_href:
            return
        following = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('initial', following, 90, last=False)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Emit item tasks from the section table and follow the next-page link.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per anchor in
    the table body and a ``parse_page`` task when a next-page href exists.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # item links
        anchors = grab.doc.select(
            '//div[@class="catalog-section-it-table-body"]//a')
        for anchor in anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # next page, if the pager offers one
        pager = grab.doc.select('//a[@class="catalog-pagenav-next"]')
        next_href = pager.attr('href', '')
        if not next_href:
            return
        page_url = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('parse_page', page_url, 90)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Emit item tasks from the catalog list and follow the pager.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per title
    anchor and a ``parse_page`` task when the "bx-pag-next" link has an
    href.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        title_anchors = grab.doc.select(
            '//div[contains(@class, "bx_catalog_list_home")]//div[@class="title"]//a'
        )
        for anchor in title_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # next page link
        pager = grab.doc.select(
            '//div[@class="bx-pagination "]//li[@class="bx-pag-next"]/a'
        )
        next_href = pager.attr('href', '')
        if not next_href:
            return
        page_url = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('parse_page', page_url, 90)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Emit item tasks from the products block and re-queue the next page.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per product
    name link and an ``initial`` task when the first pagination block has
    a "pg-next" link whose href contains "p=".
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # product links
        name_anchors = grab.doc.select(
            '//div[@class="products-wrap"]//a[@itemprop="url" and @class="products-name"]'
        )
        for anchor in name_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # next page link
        pager = grab.doc.select(
            '//div[contains(@class, "pagination")][1]//a[@class="pg-next" and contains(@href, "p=")]'
        )
        next_href = pager.attr('href', '')
        if not next_href:
            return
        following = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('initial', following, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Schedule a ``parse_page_items`` task for every further catalog page.

    The highest page number is taken from the numeric link texts in the
    navigation block; one task is yielded for each page from 2 up to and
    including that number, built from ``task.url`` with a ``PAGEN_1``
    parameter.

    Yields the result of ``self.check_errors(task)`` and stops when
    ``self.check_body_errors`` reports a problem.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        catalog = grab.doc.select('//div[@class="listcatalog"]')

        # find the largest page number shown in the pager
        links = catalog.select(
            './/div[@class="navigation"]/div[@class="nav"]//a')
        max_page = 1
        for link in links:
            page_number = link.text('')
            if page_number and Ree.number.match(page_number):
                max_page = max(max_page, int(page_number))

        # generate new tasks; the upper bound is inclusive so the last
        # page is scheduled too (range(2, max_page) silently dropped it).
        # With max_page == 1 the range is empty and nothing is yielded.
        for page in range(2, max_page + 1):
            next_page = UrlGenerator.get_page_params(
                task.url, '', {'PAGEN_1': page})
            yield self.do_task('parse_page_items', next_page, 90)

    except Exception as e:
        self._process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every navbar category link.

    Logs a fatal message and yields nothing when the body check fails.
    Links whose stripped text is in the stop list are skipped.
    """
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # labels of service pages that are not product categories
        skipped_labels = [
            'Оплата',
            'Доставка',
            'Гарантия',
            'Акции',
            'Рекомендации по подбору',
            'Информация и реквизиты',
            'Новости',
            'Контакты',
            'Сервис-центр',
        ]

        # horizontal nav links; the XPath already excludes anchors (#)
        # and external links by requiring a leading "/"
        nav_anchors = grab.doc.select(
            '//div[@id="navbar"]//a[starts-with(@href, "/")]')

        # only main categories are taken (per the original note, they
        # already contain the sub-category items)
        for anchor in nav_anchors:
            label = anchor.text().strip()
            if label in skipped_labels:
                continue

            href = anchor.attr('href')
            if href.startswith('/'):
                href = UrlGenerator.get_page_params(self.domain, href, {})

            yield Task('parse_page', url=href, priority=90, raw=True)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue item tasks for usable table rows and follow the next page.

    A row is skipped (with a warning) when the text of its second cell is
    ``'0'`` or the text of its third cell is ``'под заказ'``. The next
    page is requested only if at least one row on this page was queued.

    Yields the result of ``self.check_errors(task)`` and stops when
    ``self.check_body_errors`` reports a problem.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # parse items links
        items_list = grab.doc.select(
            '//table[@class="prod"]//tr[not(contains(@class, "white"))]')

        # if the whole page holds useless records - skip the other pages
        success_pages = 0

        for row in items_list:
            # Compare the cell *text*: the selector objects returned by
            # select() never equal a string, so the filter never fired.
            status = row.select('./td[2]').text('')
            price = row.select('./td[3]').text('')

            if status == '0' or price == 'под заказ':
                self.log.warning(
                    task, f'Skip item, because status {status} / {price}')
                continue

            link = row.select('./td[@class="name"]/a').attr('href')
            link = UrlGenerator.get_page_params(self.domain, link, {})

            success_pages += 1

            yield self.do_task('parse_item', link, 100, last=True)

        # parse next page if current is ok
        if success_pages > 0:
            next_page = grab.doc.select(
                '//div[@class="pagination"][1]//a[@class="next_page_link"]'
            ).attr('href', '')

            if next_page:
                next_page = UrlGenerator.get_page_params(
                    self.domain, next_page, {})
                yield self.do_task('parse_page', next_page, 90)

    except Exception as e:
        self._process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue item tasks from the search table and follow "show more" links.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per catalog
    link in the table rows, then a ``parse_page`` task per link whose href
    starts with "/catalog/?".
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # table rows, then the item links inside them
        rows = grab.doc.select('//table[@class="table search_table list"]//tr')
        item_anchors = rows.select('.//a[starts-with(@href, "/catalog/catalog")]')
        for anchor in item_anchors:
            href = anchor.attr('href')
            # hrefs starting with "/" are passed through the URL builder
            if href.startswith('/'):
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_item', href, 100, last=True)

        # "показать ещё" links (the original author expected 0 or 1)
        more_anchors = grab.doc.select('.//a[starts-with(@href, "/catalog/?")]')
        for anchor in more_anchors:
            href = anchor.attr('href')
            if href.startswith('/'):
                href = UrlGenerator.get_page_params(self.domain, href, {})
            yield self.do_task('parse_page', href, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue item tasks from the goods list and re-queue the "»" page.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails; otherwise yields a ``parse_item`` task per title
    link and an ``initial`` task when a link containing "»" has an href.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        title_anchors = grab.doc.select(
            '//div[contains(@class, "goodsGoods")]//a[@class="textTitle"]')
        for anchor in title_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

        # next page link
        next_href = grab.doc.select('//a[contains(text(), "»")]').attr('href', '')
        if not next_href:
            return
        following = UrlGenerator.get_page_params(self.domain, next_href, {})
        yield self.do_task('initial', following, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_items_v2`` task for every main shop category.

    Logs the start of the task; when ``self._check_body_errors`` reports
    a problem, logs a fatal message and yields nothing. Otherwise links
    under the catalog tabs are scanned and sub-categories (more than three
    slashes in the href) are skipped.

    Yields:
        ``Task('parse_items_v2', ...)`` carrying the category URL both as
        ``url`` and ``d_base_url``, with ``d_page=1`` and
        ``d_need_update_pagination=True``.
    """
    self.logger.info('[{}] Initial url: {}'.format(task.name, task.url))

    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return

    try:
        cat_list = grab.doc.select(
            '//div[@class="block-catalog"]//div[@class="tabs-content"]//a[contains(@href, "shop")]'
        )

        # take links only for main cats, because they already contain all
        # sub-cats items
        for row in cat_list:
            raw_link = row.attr('href')

            # skip sub-cats
            # cat:     /shop/cat/     -> 3 slashes
            # sub-cat: /shop/cat/foo/ -> 4 slashes
            if raw_link.count('/') > 3:
                continue

            # hrefs starting with "/" are passed through the URL builder
            if raw_link[:1] == '/':
                raw_link = UrlGenerator.get_page_params(
                    self.domain, raw_link, {
                        'section': '0',
                        'count': '50',
                        'sort': 'alphabet',
                        'order': 'asc',
                    })

            # Use the spider logger instead of a stray print(), so the
            # trace follows the configured log level and handlers.
            self.logger.debug('[{}] Category url: {}'.format(
                task.name, raw_link))

            yield Task('parse_items_v2', url=raw_link, priority=90, raw=True,
                       d_base_url=raw_link, d_page=1,
                       d_need_update_pagination=True)

    except Exception as e:
        self._process_error(grab, task, e)

    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
def task_initial(self, grab, task):
    """Start the page loop by queueing the first catalog page.

    Logs a fatal message and yields nothing when the body check fails;
    otherwise yields a single ``parse_page`` task for the ``catalog`` path
    with ``curPos`` set to 0.
    """
    try:
        body_is_broken = self.check_body_errors(grab, task)
        if body_is_broken:
            attempt = task.task_try_count
            self.log.fatal(task, 'Err task, attempt {}'.format(attempt))
            return

        # first page of the catalog
        first_page = UrlGenerator.get_page_params(
            self.domain, 'catalog', dict(curPos=0))

        # hand over to the page parser
        yield self.do_task('parse_page', first_page, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every link in the sections list.

    When ``self.check_body_errors`` reports a problem, a fatal message is
    logged and nothing is yielded.

    Yields:
        ``Task('parse_page', ...)`` (priority 90) per section link.
    """
    try:
        if self.check_body_errors(grab, task):
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            # Stop here: without this return the flagged body was still
            # parsed after the fatal message had been logged.
            return

        links = grab.doc.select('//div[@class="gsections"]//ul//a')

        for link in links:
            url = UrlGenerator.get_page_params(
                self.domain, link.attr('href'), {})
            yield Task('parse_page', url=url, priority=90, raw=True)

    except Exception as e:
        self.process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every link in the categories block.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # category links
        category_anchors = grab.doc.select('//div[@id="categories"]//a')
        for anchor in category_anchors:
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue a ``parse_item`` task for every product URL on the page.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        url_anchors = grab.doc.select(
            '//div[@class="catalog-item-price-view"]//a[@itemprop="url"]')
        for anchor in url_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every link in the left category block.

    Logs a fatal message and yields nothing when the body check fails.
    """
    try:
        body_is_broken = self.check_body_errors(grab, task)
        if body_is_broken:
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        category_anchors = grab.doc.select(
            '//div[@id="categories_block_left"]/div[1]//a')
        for anchor in category_anchors:
            category_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_page', category_url, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue a ``parse_item`` task for every product link on the page.

    If ``self.check_body_errors`` flags the response, only the value
    returned by ``self.check_errors(task)`` is yielded and the page is
    left unparsed.

    Yields:
        ``Task('parse_item', ...)`` (priority 100) per product name link.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            # Stop here: without this return the flagged body would still
            # be parsed.
            return

        # parse items links
        items_links = grab.doc.select(
            '//div[@id="catalog-list"]//div[@class="catalog-items"]//a[@property="name"]'
        )

        for row in items_links:
            link = UrlGenerator.get_page_params(
                self.domain, row.attr('href'), {})
            yield Task('parse_item', url=link, priority=100, raw=True)

    except Exception as e:
        self.process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue a ``parse_item`` task for every product link in the goods table.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # item links
        product_anchors = grab.doc.select(
            '//div[@class="tovar-table tovar_basic"]//div[@class="tovar-col tovar2"]/a'
        )
        for anchor in product_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page_items(self, grab, task):
    """Queue a ``parse_item`` task for every product name link in the list.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        listing = grab.doc.select('//div[@class="listcatalog"]')

        # item links
        name_anchors = listing.select(
            './/table[@class="lclistitem"]//td[@class="name"]//a')
        for anchor in name_anchors:
            item_url = UrlGenerator.get_page_params(
                self.domain, anchor.attr('href'), {})
            yield self.do_task('parse_item', item_url, 100, last=True)

    except Exception as exc:
        self._process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every home-page sub-catalog link.

    Each URL is built with ``SET_PAGE_COUNT`` set to ``'99999'``. Yields
    the result of ``self.check_errors(task)`` and stops when the body
    check fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # category links
        category_anchors = grab.doc.select(
            '//div[@class="main-links"]//p[@class="home_subcatalog_links_box"]/a'
        )
        for anchor in category_anchors:
            category_url = UrlGenerator.get_page_params(
                self.domain,
                anchor.attr('href'),
                dict(SET_PAGE_COUNT='99999'),
            )
            yield self.do_task('parse_page', category_url, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_page(self, grab, task):
    """Queue a ``parse_item`` task for every non-empty product link.

    If ``self.check_body_errors`` flags the response, only the value
    returned by ``self.check_errors(task)`` is yielded and the page is
    left unparsed.

    Yields:
        ``Task('parse_item', ...)`` (priority 100) per link.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            # Stop here: without this return the flagged body would still
            # be parsed.
            return

        # parse items
        items_list = grab.doc.select('//div[@class="prod-list-cell"]//a[.!=""]')

        for row in items_list:
            link = row.attr('href')

            # hrefs starting with "/" are passed through the URL builder
            if link[:1] == '/':
                link = UrlGenerator.get_page_params(self.domain, link, {})

            yield Task('parse_item', url=link, priority=100, raw=True)

    except Exception as e:
        self._process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every catalog section title link.

    Each URL is built with ``limit=900`` and ``view='price'``. Yields the
    result of ``self.check_errors(task)`` and stops when the body check
    fails.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # section title links
        section_anchors = grab.doc.select(
            '//div[@class="workarea"]//div[contains(@class, "catalog-section-title")]/a'
        )
        for anchor in section_anchors:
            section_url = UrlGenerator.get_page_params(
                self.domain,
                anchor.attr('href'),
                dict(limit=900, view='price'),
            )
            yield self.do_task('parse_page', section_url, 90)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_initial(self, grab, task):
    """Queue a ``parse_page`` task for every product-list link in the nav.

    Each URL is built with ``count=999999`` and ``name='asc'``. Logs a
    fatal message and yields nothing when the body check fails.
    """
    try:
        body_is_broken = self.check_body_errors(grab, task)
        if body_is_broken:
            self.log.fatal(task, f'Err task, attempt {task.task_try_count}')
            return

        # text-only nav links pointing at a product list
        nav_anchors = grab.doc.select(
            '//nav//a[not(.//img) and re:match(@href, "/product_list/.+")]'
        )
        for anchor in nav_anchors:
            list_url = UrlGenerator.get_page_params(
                self.domain,
                anchor.attr('href'),
                dict(count=999999, name='asc'),
            )
            yield Task('parse_page', url=list_url, priority=90, raw=True)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task)
def task_parse_item(self, grab, task):
    """Parse one product page and append its record to ``self.result``.

    If ``self.check_body_errors`` flags the response, only the value
    returned by ``self.check_errors(task)`` is yielded and the page is
    left unparsed. Items whose parsed price is not all digits are skipped
    with a debug message.

    The appended dict has the keys ``name``, ``quantity``, ``delivery``,
    ``measure``, ``price``, ``sku``, ``manufacture``, ``photo`` and
    ``properties``.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            # Stop here: without this return the flagged body would still
            # be parsed and could store a bogus record.
            return

        # parse fields
        # A = name
        product_name = grab.doc.select('//h1').text()

        # B = count
        # C = status
        product_count_string = grab.doc.select('//span[@class="p-qty-wh"]').text()

        # NOTE(review): every availability branch reports status '-1',
        # including the "in stock" ones - confirm this is intended.
        product_status = '-1'

        if product_count_string == 'Под заказ':
            product_count = '-1'
        elif product_count_string == 'На складе: более 100':
            # NOTE(review): int here, strings in the other branches -
            # confirm the consumer accepts both.
            product_count = 100
        else:
            # A non-matching string raises here and is reported through
            # the except clause below.
            product_count = DSpider.re_product_count.match(
                product_count_string).groupdict()['count']

        # D = unit [const = value]
        product_unit = 'ед.'

        # E = price
        product_price_text = grab.doc.select(
            '//div[@class="ppage-product-price"]').text()
        product_price = DSpider.re_product_price.match(
            product_price_text).groupdict()['price'].replace(' ', '')

        # check if positive and correct price
        if not product_price.isdigit():
            self.log.debug(task, f'Skip item, cuz wrong price {product_price}')
            return

        # F = vendor code [const = skip for parsing]
        product_vendor_code = ''

        # G = vendor [const = value]
        product_vendor = 'Stiebel Eltron'

        # H = photo url
        product_photo_url = UrlGenerator.get_page_params(
            self.domain, grab.doc.select('//img[@id="Image1"]').attr('src'), {})

        # I = description: intro paragraph plus the two-column table
        product_description = {
            'ОБЛАСТЬ ПРИМЕНЕНИЯ': grab.doc.select(
                '//div[@class="col-md-14"]/p').text(default=' ')
        }

        table = grab.doc.select('//div[@class="col-md-14"]/table//tr')

        for row in table:
            key = row.select('./td[1]').text()
            value = row.select('./td[2]').text()

            if key:
                product_description[key] = value

        # save
        self.result.append({
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        })

    except Exception as e:
        self.process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_parse_item(self, grab, task):
    """Parse one product page and append its record to ``self.result``.

    If ``self.check_body_errors`` flags the response, only the value
    returned by ``self.check_errors(task)`` is yielded and the page is
    left unparsed. Items are skipped with a warning when the storehouse
    text is not ``'в наличии'`` or the price does not match
    ``Ree.float`` (the literal ``'Уточняйте'`` is stored as ``'-1'``).

    The appended dict has the keys ``name``, ``quantity``, ``delivery``,
    ``measure``, ``price``, ``sku``, ``manufacture``, ``photo`` and
    ``properties``; it is also passed to ``self.log.info``.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            # Stop here: without this return the flagged body would still
            # be parsed and could store a bogus record.
            return

        # common block with info
        product_info = grab.doc.select('//div[@id="product-info"]')

        # parse fields
        # A = name
        product_name = product_info.select('.//h1').text()

        # B = [const]
        # C = [const]
        # D = [const]
        product_count_string = product_info.select(
            './/div[@class="product-data-storehouse"]').text(
                default='[not found]')
        product_count = '-1'
        product_status = '0'
        product_unit = 'ед.'

        if product_count_string != 'в наличии':
            self.log.warning(
                task, 'Skip item, cuz wrong count {}'.format(
                    product_count_string))
            return

        # E = price
        # 'Уточняйте' => stored as '-1', otherwise must look like a float
        product_price = product_info.select(
            './/span[@itemprop="price"]').text().replace(' ', '')

        if product_price == 'Уточняйте':
            product_price = '-1'
        elif not Ree.float.match(product_price):
            self.log.warning(
                task, f'Skip item, cuz wrong price {product_price}')
            return

        # F = vendor code
        product_vendor_code = product_info.select(
            './/div[@class="product-data-articul"]').text()

        # G = vendor
        product_vendor = product_info.select(
            './/div[@class="product-data-producer"]').text()

        # H = photo url
        product_photo_url_raw = product_info.select(
            './/div[@id="product-images-list"]/div[1]/img[@itemprop="contentUrl"]'
        ).attr('src')
        product_photo_url = UrlGenerator.get_page_params(
            self.domain, product_photo_url_raw, {})

        # pre I: the description is injected by an inline script, so strip
        # the surrounding JS call from the node text
        product_description_part_raw = product_info.select(
            './/div[@class="product-description description"]/following-sibling::node()[2]'
        ).text(default='')\
            .replace('$(".description").html(\'', '')\
            .replace('\');', '')

        # I = description
        # this part inserts pure html with js, so clear all html tags and
        # &-symbols by taking the string value of the parsed fragment
        product_description_part = ''.join(html.fromstring(
            f'<div>{product_description_part_raw}</div>').xpath('string()'))

        product_description = {'Описание': product_description_part}

        table = product_info.select(
            './/div[@class="product-description table"]/div')

        for row in table:
            key = row.select('./text()').text()
            value = row.select('./span').text()

            if key:
                product_description[key] = value

        # save
        item = {
            'name': product_name,
            'quantity': product_count,
            'delivery': product_status,
            'measure': product_unit,
            'price': product_price,
            'sku': product_vendor_code,
            'manufacture': product_vendor,
            'photo': product_photo_url,
            'properties': product_description
        }

        self.log.info(task, item)
        self.result.append(item)

    except Exception as e:
        self.process_error(grab, task, e)

    finally:
        self.process_finally(task)
def task_parse_item(self, grab, task):
    """Parse one product page and append its record to ``self.result``.

    The item is skipped with a warning when the availability text is not
    one of the known values or when the price is empty or does not match
    ``Ree.float``. Any exception is passed to ``self.process_error`` and
    ``self.process_finally(task, last=True)`` always runs.
    """
    try:
        # common block with info
        info = grab.doc.select('//div[@class="itemcatalog"]')

        # A = name
        name = info.select('.//h1').text()

        # B = count (quantity), C = status (delivery)
        availability = info.select(
            './/table[@id="hint-table"]//tr[1]//td[2]').text('')
        if availability == 'Имеется в наличии':
            quantity, delivery = '-1', '0'
        elif availability in ['Ожидается поступление', 'Под заказ']:
            quantity, delivery = '-1', '-1'
        else:
            self.log.warning(
                task, f'Unknown count status {availability} skip...')
            return

        specs = info.select('.//table[@class="tintab"]')

        # D = unit (measure)
        measure = specs.select('.//tr[2]/td[2]').text('ед.')

        # E = price
        price = info.select('.//span[@class="price"]').text('')
        price = price.replace(' руб.', '')
        if price == 'по запросу':
            price = '-1'

        price_is_valid = bool(price) and bool(Ree.float.match(price))
        if not price_is_valid:
            self.log.warning(
                task, f'Unknown price status {price}, skip...')
            return

        # F = vendor code (sku)
        sku = specs.select('.//tr[1]/td[2]').text('')

        # G = vendor (manufacture)
        manufacture = specs.select('.//tr[last()]/td[2]').text('')

        # H = photo url
        photo_raw = info.select('.//a[@itemprop="image"]').attr('href', '')
        photo = ''
        if photo_raw:
            photo = UrlGenerator.get_page_params(self.domain, photo_raw, {})

        # I = description (properties): every row of the spec table
        properties = {}
        for spec_row in specs.select('.//tr'):
            key = spec_row.select('./td[1]').text()
            value = spec_row.select('./td[2]').text()
            if key:
                properties[key] = value

        # free-text description blocks, excluding table and img nodes
        description_nodes = grab.doc.select('//div[@itemprop="description"]')
        technical_text = ''.join(
            node.text('')
            for node in description_nodes
            if node.node().tag not in ['table', 'img']
        )
        if technical_text:
            properties['Техническое описание'] = technical_text

        # save
        record = {
            'name': name,
            'quantity': quantity,
            'delivery': delivery,
            'measure': measure,
            'price': price,
            'sku': sku,
            'manufacture': manufacture,
            'photo': photo,
            'properties': properties,
        }
        self.result.append(record)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task, last=True)
def task_parse_item(self, grab, task):
    """Parse one product page and append its record to ``self.result``.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails. The item is skipped with a warning when neither a
    numeric stock count nor the "под заказ" marker is found, or when the
    price is missing or does not match ``Ree.float``.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # common block with info
        info = grab.doc.select('//div[@id="product"]')

        # A = name
        name = info.select('.//h1').text()

        # B = count (quantity), C = status (delivery)
        stock_text = info.select(
            './/span[contains(@class, "in-stock")]').text(default='')
        order_text = info.select(
            './/span[contains(@class, "under-order")]').text(default='')

        # D = unit (measure): a trailing "м" on the stock text is the unit
        measure = 'ед.'
        if stock_text.endswith('м'):
            measure = 'м'
            stock_text = stock_text[:-1].strip()

        if stock_text.isdigit():
            quantity, delivery = stock_text, '0'
        elif order_text == 'под заказ':
            quantity, delivery = '-1', '-1'
        else:
            self.log.warning(
                task,
                f'Unknown count status {stock_text} or {order_text}, skip...'
            )
            return

        # E = price
        price_text = info.select('.//p[@class="summ"]').text(default='')
        if not price_text:
            self.log.warning(
                task, f'Unknown price status {price_text}, skip...')
            return

        if price_text == 'по запросу':
            price = '-1'
        else:
            # the number itself lives in a child node
            price_text = info.select(
                './/p[@class="summ"]/span[@id="commmon_price"]').text(
                    default='')
            if not (price_text and Ree.float.match(price_text)):
                self.log.warning(
                    task, f'Unknown price status {price_text}, skip...')
                return
            price = price_text

        # F = vendor code (sku) [const]
        sku = ''

        # G = vendor (manufacture) [const]
        manufacture = ''

        # H = photo url
        photo_raw = info.select('.//img[@itemprop="image"]').attr('src')
        photo = UrlGenerator.get_page_params(self.domain, photo_raw, {})

        # I = description (properties)
        properties = {}

        # I :: Base - key/value spans, without the availability row
        for entry in info.select('.//div[@class="tab-content-list"]'):
            key = entry.select('./span[1]').text(default=None)
            value = entry.select('./span[2]').text(default=None)
            if not key or not value or key == 'Наличие':
                continue
            properties[key] = value

        # I :: description / using / tech - stored only when non-empty
        text_sections = (
            ('.//div[@id="opisanie"]', 'Описание'),
            ('.//div[@id="primenenie"]', 'Применение'),
            ('.//div[@id="tehnicheskie_harakteristiki"]',
             'Технические характеристики'),
        )
        for xpath, title in text_sections:
            section_text = info.select(xpath).text(default='')
            if section_text:
                properties[title] = section_text

        # save
        record = {
            'name': name,
            'quantity': quantity,
            'delivery': delivery,
            'measure': measure,
            'price': price,
            'sku': sku,
            'manufacture': manufacture,
            'photo': photo,
            'properties': properties,
        }
        self.result.append(record)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task, last=True)
def task_parse_item(self, grab, task):
    """Parse one product page, log the record and append it to ``self.result``.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails. The item is skipped with a warning when the presence
    indicator shows neither three "sel" spans nor three plain spans, or
    when ``Ree.extract_float`` does not match the price text.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # common block with info
        info = grab.doc.select('//div[@id="content"]')

        # A = name
        name = info.select('.//h1').text()

        # B = count (quantity), C = status (delivery)
        filled = info.select(
            './/div[@class="presence-box"]/span[@class="presence sel"]'
        ).count()
        empty = info.select(
            './/div[@class="presence-box"]/span[@class="presence"]'
        ).count()

        if filled == 3:
            # 111 (filled squares)
            quantity, delivery = '-1', '0'
        elif empty == 3:
            # 000 (empty squares)
            quantity, delivery = '-1', '-1'
        else:
            # anything else is not understood
            self.log.warning(
                task,
                f'Unknown count status {filled}, {empty} skip...'
            )
            return

        # D = unit (measure) [const!]
        measure = 'ед.'

        # E = price
        price_text = info.select(
            './/div[@class="item_current_price"]').text('')
        price_match = Ree.extract_float.match(price_text)
        if not price_match:
            self.log.warning(
                task, f'Unknown price status {price_match}, skip...')
            return

        price = price_match.groupdict()['float']
        if price == '0':
            price = '-1'

        # F = vendor code (sku), G = vendor (manufacture): both are read
        # from the property table; the vendor check takes precedence
        sku = ''
        manufacture = ''
        for prop_row in info.select('.//table[@class="prop-list"]//tr'):
            key = prop_row.select('./td[1]').text('')
            value = prop_row.select('./td[2]').text('')

            if 'Производитель' in key:
                manufacture = value
            elif 'Артикул' in key:
                sku = value.strip(' .')

        # H = photo url
        photo_raw = info.select('.//a[@id="pos-big-photo"]').attr('href', '')
        photo = ''
        if photo_raw:
            photo = UrlGenerator.get_page_params(self.domain, photo_raw, {})

        # I = description (properties)
        properties = {
            'Описание': info.select(
                './/div[@id="detail-text-content"]').text('')
        }

        # save: log the record, then store an equal copy
        record = {
            'name': name,
            'quantity': quantity,
            'delivery': delivery,
            'measure': measure,
            'price': price,
            'sku': sku,
            'manufacture': manufacture,
            'photo': photo,
            'properties': properties,
        }

        self.log.info(task, 'Add: {}'.format(record))
        self.result.append(dict(record))

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task, last=True)
def task_parse_item(self, grab, task):
    """Parse one product page and append its record to ``self.result``.

    Yields the result of ``self.check_errors(task)`` and stops when the
    body check fails. The item is skipped when the availability text marks
    it as absent or delayed (info log), is not recognised (warning), or
    when the price is empty or does not match ``Ree.float`` (warning).
    The stored record also carries the cart ``id``.
    """
    try:
        if self.check_body_errors(grab, task):
            yield self.check_errors(task)
            return

        # common block with info
        info = grab.doc.select('//div[@id="product"]')

        # A = name
        name = info.select('.//h1').text()

        # B = count (quantity), C = status (delivery)
        availability = info.select(
            './/span[@itemprop="availability"]').text('')

        has_stock_marker = any(
            marker in availability for marker in ('шт.', 'м.п.'))
        if has_stock_marker or availability in ['есть', 'в наличии']:
            quantity, delivery = '-1', '0'
        elif 'срок поставки' in availability or availability in ['НЕТ', 'нет']:
            self.log.info(task, f'Skip count status {availability} skip...')
            return
        else:
            self.log.warning(task, f'Unknown count status {availability} skip...')
            return

        cart_form = './/form[@class="form_addCart"]'

        # D = unit (measure)
        measure = info.select(
            cart_form + '//span[@class="measure"]').text('ед.')

        # E = price
        price = info.select(
            cart_form + '//meta[@itemprop="price"]').attr('content', '')
        if not (price and Ree.float.match(price)):
            self.log.warning(task, f'Unknown price status {price}, skip...')
            return

        # F = vendor code (sku)
        sku = info.select('.//span[@class="articleValue"]').text('')

        # G = vendor (manufacture)
        manufacture = info.select('.//a[@itemprop="brand"]').text('')

        # H = photo url
        photo_raw = info.select('.//img[@itemprop="image"]').attr('src', '')
        photo = ''
        if photo_raw:
            photo = UrlGenerator.get_page_params(self.domain, photo_raw, {})

        # I = description (properties)
        properties = {
            'Описание': info.select('.//div[@class="content"][1]').text('')
        }

        # ID
        product_id = info.select('.//input[@name="addcart"]').attr('value', '')

        # save
        record = {
            'name': name,
            'quantity': quantity,
            'delivery': delivery,
            'measure': measure,
            'price': price,
            'sku': sku,
            'manufacture': manufacture,
            'photo': photo,
            'properties': properties,
            'id': product_id,
        }
        self.result.append(record)

    except Exception as exc:
        self.process_error(grab, task, exc)

    finally:
        self.process_finally(task, last=True)