def parse_item(self, response):
    """Parse a Jomashop product detail page into a JomashopItem."""
    self._logger.info('JomashopSpider#parse_item...')
    selector = Selector(response)
    item = JomashopItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)

    def joined(query):
        # Concatenate every node matched by the XPath expression.
        return ''.join(selector.xpath(query).extract())

    item['shipping_availability'] = format_html_string(
        joined('//*[@id="product_addtocart_form"]//li[@class="pdp-shipping-availability"]/span/text()'))
    # Gallery image URLs are embedded as data-href attributes inside the
    # MagicToolbox zoom container's inline markup.
    container_text = joined(
        '//div[@class="MagicToolboxContainer "]//span[@style="margin-top:8px;"]/text()')
    item['image_urls'] = re.findall(r'data-href="(.*?)"', container_text,
                                    re.MULTILINE | re.DOTALL)
    item['details'] = format_html_string(joined('//dd[@id="tab-container-details"]'))

    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    return item
def parse_item(self, response):
    """Parse a drugstore product page, walking the image pop-up pager to
    collect the full set of product image URLs.

    Fixes: removed the leftover debug print (the logger call right next to
    it already records the same message), close each urllib2 response so
    HTTP connections are not leaked, and skip pager pages that have no
    product image instead of appending an empty-string URL.
    """
    self._logger.debug("FinishlineSpider#parse_item ...")
    item = DrugstoreItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    sel = Selector(response)
    item['title'] = ' '.join(
        sel.xpath('//*[@id="divCaption"]/h1//text()').extract()).strip()
    item['product_details'] = format_html_string(''.join(
        sel.xpath('//*[@id="divPromosPDetail"]').extract()).strip())
    ingredients = ''.join(
        sel.xpath('//*[@id="TblProdForkFactsCntr"]').extract()).strip()
    if not ingredients:
        # Fall back to the alternate ingredients table used on some layouts.
        ingredients = ''.join(
            sel.xpath('//*[@id="TblProdForkIngredients"]').extract()).strip()
    item['ingredients'] = format_html_string(ingredients)
    s = ''.join(sel.xpath('//*[@id="largeProdImageLink"]/a/@href').extract())
    relative_image_url = re_search(r"popUp\(\'(.*?)\'", s)
    full_image_url = urljoin(response.url, relative_image_url)
    image_urls = []
    # Follow the "see next image" links synchronously until the grey
    # right-arrow "no image" marker appears (or the next link is missing).
    while True:
        request = urllib2.Request(full_image_url)
        response_image = urllib2.urlopen(request)
        try:
            image_html_str = response_image.read()
        finally:
            response_image.close()  # do not leak the HTTP connection
        node = lxml.html.fromstring(image_html_str)
        image_url = ''.join(node.xpath('//*[@id="productImage"]/img/@src'))
        if image_url:  # skip pages without an image instead of appending ''
            image_urls.append(image_url)
        no_next_image = node.xpath(
            '//img[contains(@src,"right_arrow_grey.gif") and @alt="no image"]')
        if no_next_image:
            break
        full_image_url = urljoin(
            full_image_url,
            ''.join(node.xpath('//img[@alt="see next image"]/../@href')))
        if not full_image_url:
            break
    item['image_urls'] = image_urls
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def parse_chinese_detail(self, response):
    """Complete the half-built item with the Chinese-language detail HTML."""
    self._logger.info('AshfordSpider#parse_chinese_detail...')
    item = response.meta['item_half']
    detail_html = ''.join(
        Selector(response).xpath('//div[@id="tab1_info"]').extract()).strip()
    item['chinese_detail'] = format_html_string(detail_html)
    return item
def parse_item(self, response):
    """Extract the English half of an Ashford product, then request the
    Chinese-language detail page to finish the item."""
    self._logger.info('AshfordSpider#parse_item...')
    item = AshfordItem()
    sel = Selector(response)
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)

    def joined_text(query):
        # Join and trim all matched text nodes.
        return ''.join(sel.xpath(query).extract()).strip()

    item['prodName'] = joined_text(' //*[@id="prodName"]/a/text()')
    item['prod_desc'] = joined_text('//*[@id="fstCont"]/h3/text()')
    item['detail'] = format_html_string(
        ''.join(sel.xpath('//div[@id="tab1_info"]').extract()).strip())
    item['Brand'] = joined_text(
        '//h1[@id="prodName"]/a[@id="sameBrandProduct"]/text()[1]')
    image_hrefs = sel.xpath(
        '//a[contains(@href,"/images/catalog/") and contains(@href,"XA.jpg")]/@href'
    ).extract()
    item['product_images'] = list(set(image_hrefs))
    item['image_urls'] = [urljoin(response.url, href)
                          for href in item['product_images']]
    # The Chinese version of the page lives on the zh. subdomain.
    chinese_url = response.url.replace('www.', 'zh.')
    response.meta['item_half'] = item
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    yield Request(url=chinese_url,
                  meta=response.meta,
                  callback=self.parse_chinese_detail,
                  dont_filter=True)
def parse_item(self, response):
    """Parse a Jomashop product detail page into a JomashopItem."""
    self._logger.info('JomashopSpider#parse_item...')
    selector = Selector(response)
    item = JomashopItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)

    def joined(query):
        # Concatenate every node matched by the XPath expression.
        return ''.join(selector.xpath(query).extract())

    item['shipping_availability'] = format_html_string(
        joined('//*[@id="product_addtocart_form"]//li[@class="pdp-shipping-availability"]/span/text()'))
    # Gallery image URLs live in data-href attributes inside the
    # MagicToolbox zoom container's inline markup.
    container_text = joined(
        '//div[@class="MagicToolboxContainer "]//span[@style="margin-top:8px;"]/text()')
    item['image_urls'] = re.findall(r'data-href="(.*?)"', container_text,
                                    re.MULTILINE | re.DOTALL)
    item['details'] = format_html_string(joined('//dd[@id="tab-container-details"]'))

    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    return item
def parse_item(self, response):
    """Parse a drugstore product page, walking the image pop-up pager to
    collect the full set of product image URLs.

    Fixes: removed the leftover debug print (the logger call right next to
    it already records the same message), close each urllib2 response so
    HTTP connections are not leaked, and skip pager pages that have no
    product image instead of appending an empty-string URL.
    """
    self._logger.debug("FinishlineSpider#parse_item ...")
    item = DrugstoreItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    sel = Selector(response)
    item['title'] = ' '.join(
        sel.xpath('//*[@id="divCaption"]/h1//text()').extract()).strip()
    item['product_details'] = format_html_string(''.join(
        sel.xpath('//*[@id="divPromosPDetail"]').extract()).strip())
    ingredients = ''.join(
        sel.xpath('//*[@id="TblProdForkFactsCntr"]').extract()).strip()
    if not ingredients:
        # Fall back to the alternate ingredients table used on some layouts.
        ingredients = ''.join(
            sel.xpath('//*[@id="TblProdForkIngredients"]').extract()).strip()
    item['ingredients'] = format_html_string(ingredients)
    s = ''.join(sel.xpath('//*[@id="largeProdImageLink"]/a/@href').extract())
    relative_image_url = re_search(r"popUp\(\'(.*?)\'", s)
    full_image_url = urljoin(response.url, relative_image_url)
    image_urls = []
    # Follow the "see next image" links synchronously until the grey
    # right-arrow "no image" marker appears (or the next link is missing).
    while True:
        request = urllib2.Request(full_image_url)
        response_image = urllib2.urlopen(request)
        try:
            image_html_str = response_image.read()
        finally:
            response_image.close()  # do not leak the HTTP connection
        node = lxml.html.fromstring(image_html_str)
        image_url = ''.join(node.xpath('//*[@id="productImage"]/img/@src'))
        if image_url:  # skip pages without an image instead of appending ''
            image_urls.append(image_url)
        no_next_image = node.xpath(
            '//img[contains(@src,"right_arrow_grey.gif") and @alt="no image"]')
        if no_next_image:
            break
        full_image_url = urljoin(
            full_image_url,
            ''.join(node.xpath('//img[@alt="see next image"]/../@href')))
        if not full_image_url:
            break
    item['image_urls'] = image_urls
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def _enrich_same_part(self, item, response):
    """Populate the name/price fields shared by the Jomashop parse paths.

    Each field is the joined text of an XPath match inside the
    add-to-cart form, run through format_html_string.
    """
    sel = Selector(response)
    field_queries = (
        ('brand_name',
         '//form[@id="product_addtocart_form"]//span[@class="brand-name"]/text()'),
        ('product_name',
         '//form[@id="product_addtocart_form"]//span[@class="product-name"]/text()'),
        ('product_ids',
         '//form[@id="product_addtocart_form"]//span[@class="product-ids"]/text()'),
        ('final_price',
         '//*[@id="product_addtocart_form"]//p[@class="final-price"]/meta[@itemprop="price"]/@content'),
        ('retail_price',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-retail-price"]/span/text()'),
        ('savings',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-savings"]/span/text()'),
        ('shipping',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-shipping"]/span/text()'),
    )
    for field, query in field_queries:
        item[field] = format_html_string(''.join(sel.xpath(query).extract()))
def _enrich_same_part(self, item, response):
    """Extract title, description and the inline-JS data blobs shared by
    both parse paths."""
    sel = Selector(response)
    title = ' '.join(sel.xpath('//*[@id="prdImage"]/h1/*//text()').extract()).strip()
    if len(title) < 2:
        # Some layouts render the heading inside the product stage instead.
        title = ' '.join(
            sel.xpath('//*[@id="productStage"]/h1/*/text()').extract()).strip()
    item['title'] = title
    description = format_html_string(
        ''.join(sel.xpath('//div[@id="prdInfoText"]').extract()).strip())
    if len(description) == 0:
        description = format_html_string(
            ''.join(sel.xpath('//div[@id="productDescription"]').extract()).strip())
    item['productDescription'] = description

    def js_var(pattern):
        # Pull a JS variable assignment out of inline <script> text and
        # parse it as JSON (spaces stripped, matching the original's
        # replace(' ', '') normalization).
        return json.loads(''.join(sel.re(pattern)).strip().replace(' ', ''))

    item['stockJSON'] = js_var(r'var stockJSON =(.*);')
    item['dimensions'] = js_var(r'var dimensions =(.*);')
    item['dimToUnitToValJSON'] = js_var(r'var dimToUnitToValJSON =(.*);')
    item['dimensionIdToNameJson'] = js_var(r'var dimensionIdToNameJson =(.*);')
    item['valueIdToNameJSON'] = js_var(r'var valueIdToNameJSON =(.*);')
    item['colorNames'] = json.loads(re_search(r'var colorNames =(.*?);', response.body))
    item['colorPrices'] = json.loads(re_search(r'var colorPrices =(.*?);', response.body))
    item['styleIds'] = json.loads(re_search(r'var styleIds =(.*?);', response.body))
    item['colorIds'] = json.loads(re_search(r'var colorIds =(.*?);', response.body))
def _enrich_same_part(self, item, response):
    """Populate the name/price fields shared by the Jomashop parse paths.

    Each field is the joined text of an XPath match inside the
    add-to-cart form, run through format_html_string.
    """
    sel = Selector(response)
    field_queries = (
        ('brand_name',
         '//form[@id="product_addtocart_form"]//span[@class="brand-name"]/text()'),
        ('product_name',
         '//form[@id="product_addtocart_form"]//span[@class="product-name"]/text()'),
        ('product_ids',
         '//form[@id="product_addtocart_form"]//span[@class="product-ids"]/text()'),
        ('final_price',
         '//*[@id="product_addtocart_form"]//p[@class="final-price"]/meta[@itemprop="price"]/@content'),
        ('retail_price',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-retail-price"]/span/text()'),
        ('savings',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-savings"]/span/text()'),
        ('shipping',
         '//*[@id="product_addtocart_form"]//li[@class="pdp-shipping"]/span/text()'),
    )
    for field, query in field_queries:
        item[field] = format_html_string(''.join(sel.xpath(query).extract()))
def parse_item(self, response):
    """Parse a Finishline product page and chain a request for its Scene7
    image set (parse_images receives the half-built item via meta).

    Fix: removed the two leftover debug print statements — the entry print
    duplicated the logger.debug call next to it, and the trailing
    'inc_crawled_pages' print was debug residue polluting stdout.
    """
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['title'] = ''.join(
        sel.xpath('//h1[@id="title"]/text()').extract()).strip()
    # Each size <div> contributes [id attribute, display text].
    list_size = []
    sizes = sel.xpath('//div[@id="productSizes"]/div[@class="size"]')
    for size in sizes:
        list_size.append([
            ''.join(size.xpath('@id').extract()),
            ''.join(size.xpath('text()').extract())
        ])
    item['size'] = list_size
    item['productDescription'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]').extract()))
    item['product_images'] = json.loads(''.join(
        sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(
        sel.xpath('//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
                  ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    # The Scene7 image-set API returns the gallery for this color/style pair.
    imageset_url = 'http://www.finishline.com/store/api/scene7/imageset/?colorId=%s&styleId=%s' % (
        colorid, styleid)
    meta = response.meta
    meta['item-half'] = item
    req = Request(url=imageset_url,
                  meta=meta,
                  callback=self.parse_images,
                  dont_filter=response.request.dont_filter)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    yield req
def parse_item(self, response):
    """Parse an Amazon product detail page into an AmazonItem.

    Fixes: the similar_node_ids comprehension called .group() on the result
    of node_id_re.search() without checking for None — any "Similar Items"
    href without a node= parameter raised AttributeError. It now uses the
    same `if node_id_re.search(x)` guard as node_ids above. Also removed a
    no-op `[links for links in ...]` wrapper and a duplicate regex search
    when extracting the brand.
    """
    self._logger.info("start response in parse_item -> response type:%s"
                      % type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # Breadcrumb category node ids.
    node_id_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [node_id_re.search(x).group('node_id')
                        for x in node_id_hrefs if node_id_re.search(x)]
    # "Look for Similar Items by Category" links, one list per paragraph.
    similar_node_id_links = [
        x.xpath('a/@href').extract()
        for x in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    item['similar_node_ids'] = [
        [node_id_re.search(x).group('node_id')
         for x in links if node_id_re.search(x)]
        for links in similar_node_id_links
    ]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        item['parent_asin'] = ''.join(
            sel.xpath('//form[@id="addToCart"]/input[@id="ASIN"]/@value'
                      ).extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(
        sel.xpath('//div[@id="technicalSpecifications_feature_div"]//table'
                  ).extract()).strip())
    item['product_description'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]//p/text()').extract()).strip())
    brand_href = ''.join(sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_re = re.compile(r'^/(?P<brand>.*)/b/')
    m = brand_re.search(brand_href)
    if m:
        brand = m.group('brand')  # reuse the match instead of searching twice
    else:
        brand = ''.join(sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['brand'] = brand
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def parse_item(self, response):
    """Parse a Jacobtime product page into a JacobtimeItem."""
    self._logger.info('JacobtimeSpider#parse_item...')
    item = JacobtimeItem()
    sel = Selector(response)
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['details'] = format_html_string(
        ''.join(sel.xpath('//div[@id="tab1"]').extract()))
    # Lightbox anchors hold relative links to the full-size images.
    lightbox_hrefs = sel.xpath('//a[@class="lightbox"]/@href').extract()
    item['image_urls'] = [urljoin(response.url, href) for href in lightbox_hrefs]
    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    return item
def parse_item(self, response):
    """Parse a Finishline product page and chain a request for its Scene7
    image set (parse_images receives the half-built item via meta).

    Fix: removed the two leftover debug print statements — the entry print
    duplicated the logger.debug call next to it, and the trailing
    'inc_crawled_pages' print was debug residue polluting stdout.
    """
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['title'] = ''.join(
        sel.xpath('//h1[@id="title"]/text()').extract()).strip()
    # Each size <div> contributes [id attribute, display text].
    list_size = []
    sizes = sel.xpath('//div[@id="productSizes"]/div[@class="size"]')
    for size in sizes:
        list_size.append([
            ''.join(size.xpath('@id').extract()),
            ''.join(size.xpath('text()').extract())
        ])
    item['size'] = list_size
    item['productDescription'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]').extract()))
    item['product_images'] = json.loads(''.join(
        sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(
        sel.xpath('//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
                  ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    # The Scene7 image-set API returns the gallery for this color/style pair.
    imageset_url = 'http://www.finishline.com/store/api/scene7/imageset/?colorId=%s&styleId=%s' % (
        colorid, styleid)
    meta = response.meta
    meta['item-half'] = item
    req = Request(url=imageset_url,
                  meta=meta,
                  callback=self.parse_images,
                  dont_filter=response.request.dont_filter)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    yield req
def parse_item(self, response):
    """Parse a Jacobtime product page into a JacobtimeItem."""
    self._logger.info('JacobtimeSpider#parse_item...')
    item = JacobtimeItem()
    sel = Selector(response)
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['details'] = format_html_string(
        ''.join(sel.xpath('//div[@id="tab1"]').extract()))
    # Lightbox anchors hold relative links to the full-size images.
    lightbox_hrefs = sel.xpath('//a[@class="lightbox"]/@href').extract()
    item['image_urls'] = [urljoin(response.url, href) for href in lightbox_hrefs]
    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    return item
def parse_item_update(self, response):
    """Re-crawl an Amazon product page to refresh price/availability fields.

    Returns the item directly when availability is decided here
    ('true' or the fallback 'false'), or returns a follow-up Request to
    parse_shipping_cost when the buy box says "available from" ('other').
    """
    self._logger.info("start response in parse_item_update -> response type:%s" % type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN is embedded in the product URL path.
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size variants for this ASIN, pulled out of the inline-JS variation map.
    item['size'] = re_search(
        r'\"%s\":\[(.*?)\]' % item['asin'],
        ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath('//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name, price and shipping cost.
    item['merchant_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()').extract()).strip()
    shipping_cost_3p_string = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()').extract()).strip()
    item['shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_3p_string)
    item['from_price'] = ''.join(sel.xpath('//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()').extract()).strip()
    # Availability text may appear in any of these three places; join them all.
    availability_divs = [
        ''.join(sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip().lower()
    # NOTE(review): `len(availability_divs) <= 0` is always False — the list
    # literal above always has exactly 3 elements. Presumably the intent was
    # to test `availability_str`; confirm before changing, since "fixing" it
    # would reclassify pages with no availability text at all from 'false'
    # (the final else) to 'true'.
    if (
        (len(availability_divs) <= 0) or
        availability_str.startswith('only') or
        availability_str.startswith('in stock') or
        availability_str.startswith('usually')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (
        merchant_info_str.startswith('ships from and sold by')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (
        availability_str.startswith('available from')
    ):
        # Sold only by third parties: shipping cost needs a second request.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # Several alternative price nodes are concatenated; presumably at
        # most one variant renders per page layout — TODO confirm.
        item['list_price'] = ''.join([
            ''.join(sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="listPriceValue"]/text()').extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(sel.xpath('//span[@id="priceblock_ourprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_saleprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_dealprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="actualPriceValue"]/b/text()').extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # In-stock page with no price found: dump the body for offline debugging.
            self._logger.info("response body ILLEGAL: %s, %d, %d. Dumping ..."
                              % (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string_saleprice = ''.join(sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item['shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_string)
        self._logger.info("Spiderid: %s Crawlid: %s yield item in parse, asin: %s"
                          % (response.meta['spiderid'], response.meta['crawlid'], item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
    elif item['availability'] in ['other']:
        item['price'] = ''.join([
            ''.join(sel.xpath('//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()').extract()).strip()
        ])
        # Follow the unqualified buy-box link to fetch the shipping cost.
        new_url = ''.join(sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(
            url=new_url,
            meta=meta,
            callback=self.parse_shipping_cost,
            dont_filter=response.request.dont_filter
        )
        self._logger.info("Spiderid: %s Crawlid: %s yield request in parse, asin: %s"
                          % (response.meta['spiderid'], response.meta['crawlid'], req.meta.get("asin", "unknow")))
        return req
    else:
        self._logger.info("yield item in parse, asin: %s" % item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
def parse_item_update(self, response):
    """Re-crawl an Amazon product page to refresh price/availability fields.

    Returns the item directly when availability is decided here
    ('true' or the fallback 'false'), or returns a follow-up Request to
    parse_shipping_cost when the buy box says "available from" ('other').
    """
    self._logger.info(
        "start response in parse_item_update -> response type:%s" %
        type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN is embedded in the product URL path.
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size variants for this ASIN, pulled out of the inline-JS variation map.
    item['size'] = re_search(
        r'\"%s\":\[(.*?)\]' % item['asin'],
        ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath(
        '//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name, price and shipping cost.
    item['merchant_3p'] = ''.join(
        sel.xpath('//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()'
        ).extract()).strip()
    shipping_cost_3p_string = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()'
        ).extract()).strip()
    item[
        'shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(
            shipping_cost_3p_string)
    item['from_price'] = ''.join(
        sel.xpath(
            '//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()'
        ).extract()).strip()
    # Availability text may appear in any of these three places; join them all.
    availability_divs = [
        ''.join(
            sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(
        sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip(
        ).lower()
    # NOTE(review): `len(availability_divs) <= 0` is always False — the list
    # literal above always has exactly 3 elements. Presumably the intent was
    # to test `availability_str`; confirm before changing, since "fixing" it
    # would reclassify pages with no availability text at all from 'false'
    # (the final else) to 'true'.
    if ((len(availability_divs) <= 0) or availability_str.startswith('only')
            or availability_str.startswith('in stock') or
            availability_str.startswith('usually')):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (merchant_info_str.startswith('ships from and sold by')):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (availability_str.startswith('available from')):
        # Sold only by third parties: shipping cost needs a second request.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # Several alternative price nodes are concatenated; presumably at
        # most one variant renders per page layout — TODO confirm.
        item['list_price'] = ''.join([
            ''.join(
                sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="listPriceValue"]/text()').
                extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(
                sel.xpath('//span[@id="priceblock_ourprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_saleprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_dealprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="actualPriceValue"]/b/text()').
                extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # In-stock page with no price found: dump the body for offline debugging.
            self._logger.info(
                "response body ILLEGAL: %s, %d, %d. Dumping ..." %
                (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(
            sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string_saleprice = ''.join(
            sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item[
            'shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(
                shipping_cost_string)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield item in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item
    elif item['availability'] in ['other']:
        item['price'] = ''.join([
            ''.join(
                sel.xpath(
                    '//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()'
                ).extract()).strip()
        ])
        # Follow the unqualified buy-box link to fetch the shipping cost.
        new_url = ''.join(
            sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').
            extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(url=new_url,
                      meta=meta,
                      callback=self.parse_shipping_cost,
                      dont_filter=response.request.dont_filter)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield request in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             req.meta.get("asin", "unknow")))
        return req
    else:
        self._logger.info("yield item in parse, asin: %s" %
                          item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item
def parse_item(self, response):
    """Parse an Amazon product detail page into an AmazonItem.

    Fixes: the similar_node_ids comprehension called .group() on the result
    of node_id_re.search() without checking for None — any "Similar Items"
    href without a node= parameter raised AttributeError. It now uses the
    same `if node_id_re.search(x)` guard as node_ids above. Also removed a
    no-op `[links for links in ...]` wrapper and a duplicate regex search
    when extracting the brand.
    """
    self._logger.info("start response in parse_item -> response type:%s"
                      % type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # Breadcrumb category node ids.
    node_id_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [node_id_re.search(x).group('node_id')
                        for x in node_id_hrefs if node_id_re.search(x)]
    # "Look for Similar Items by Category" links, one list per paragraph.
    similar_node_id_links = [
        x.xpath('a/@href').extract()
        for x in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    item['similar_node_ids'] = [
        [node_id_re.search(x).group('node_id')
         for x in links if node_id_re.search(x)]
        for links in similar_node_id_links
    ]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        item['parent_asin'] = ''.join(
            sel.xpath('//form[@id="addToCart"]/input[@id="ASIN"]/@value'
                      ).extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(
        sel.xpath('//div[@id="technicalSpecifications_feature_div"]//table'
                  ).extract()).strip())
    item['product_description'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]//p/text()').extract()).strip())
    brand_href = ''.join(sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_re = re.compile(r'^/(?P<brand>.*)/b/')
    m = brand_re.search(brand_href)
    if m:
        brand = m.group('brand')  # reuse the match instead of searching twice
    else:
        brand = ''.join(sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['brand'] = brand
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item