def _populate_from_js(self, response, product):
    """Set ``image_url`` from the JSON stored on the landing image tag.

    The ``data-a-dynamic-image`` attribute of ``#landingImage`` is parsed as
    a JSON object of ``url -> size`` entries; the entry whose first size
    component is the largest is handed to ``cond_set_value``.

    :param response: page response exposing ``css()``.
    :param product: item being populated.
    """
    # Images are not always on the same spot...
    img_jsons = response.css(
        '#landingImage ::attr(data-a-dynamic-image)').extract()
    if not img_jsons:
        return

    try:
        img_data = json.loads(img_jsons[0])
    except ValueError:
        # Malformed attribute value: leave image_url untouched.
        return
    if not img_data:
        # max() raises ValueError on an empty sequence.
        return

    # Index into the (url, size) pair instead of using tuple parameters,
    # which are not valid syntax on Python 3 (PEP 3113).
    cond_set_value(
        product,
        'image_url',
        max(img_data.items(), key=lambda item: item[1][0]),
        conv=lambda item: item[0])
def parse_product(self, response):
    """Populate the product from the page, or deal with a captcha page.

    :returns: the populated product when no captcha is present; ``None``
        once ``captcha_retries`` attempts have been used; otherwise the
        result of ``_handle_captcha``.
    """
    prod = response.meta['product']

    if self._has_captcha(response):
        attempts = response.meta.get('captch_solve_try', 0)
        if attempts >= self.captcha_retries:
            self.log("Giving up on trying to solve the captcha challenge after"
                     " %s tries for: %s" % (self.captcha_retries, prod['url']),
                     level=WARNING)
            return None
        return self._handle_captcha(response, self.parse_product)

    self._populate_from_js(response, prod)
    self._populate_from_html(response, prod)
    cond_set_value(prod, 'locale', 'en-US')  # Default locale.
    return prod
def _scrape_product_links(self, response):
    """Yield ``(None, product)`` pairs for the products on a listing page.

    Each product is built from one ``tesco.productData.push({...})`` JSON
    payload, paired positionally with one product link from the markup.

    :raises AssertionError: when a ``KeyError`` occurs while reading the
        payload's ``name`` or storing the derived brand/title.
    """
    # To populate the description, fetching the product page is necessary.
    page_url = response.url

    # This will contain everything except for the URL and description.
    script_texts = response.xpath("//script[@type='text/javascript']/text()")
    payloads = script_texts.re(
        r"\s*tesco\.productData\.push\((\{.+?\})\);")
    if not payloads:
        self.log("Found no product data on: %s" % page_url, ERROR)

    hrefs = response.css(
        ".product > .desc > h2 > a ::attr('href')").extract()
    if not hrefs:
        self.log("Found no product links on: %s" % page_url, ERROR)

    for payload, href in zip(payloads, hrefs):
        item = SiteProductItem()
        cond_set_value(item, 'url', urlparse.urljoin(page_url, href))

        details = json.loads(payload)
        cond_set_value(item, 'price', details.get('price'))
        cond_set_value(item, 'image_url', details.get('mediumImage'))

        try:
            brand, title = self.brand_from_title(details['name'])
            cond_set_value(item, 'brand', brand)
            cond_set_value(item, 'title', title)
        except KeyError:
            raise AssertionError(
                "Did not find title or brand from JS for product: %s"
                % href
            )

        yield None, item
def _populate_from_html(self, response, product):
    """Fill title, description, image, brand and price from the page HTML.

    ``price`` is always assigned: the amount found in the
    ``itemprop="price"`` span when there is one, otherwise ``'0.0'``.
    The currency is always USD.

    :param response: page response exposing ``xpath()``.
    :param product: item being populated.
    """
    # Drop an empty-string title before cond_set is asked to store the
    # scraped one.
    if 'title' in product and product['title'] == '':
        del product['title']

    cond_set(product, 'title',
             response.xpath('//h1[@itemprop="name"]/text()').extract(),
             conv=string.strip)
    cond_set(product, 'description',
             response.xpath('//div[@itemprop="description"]').extract(),
             conv=string.strip)

    image_url = is_empty(
        response.xpath('//div[@id="izView"]/noscript/img/@src').extract())
    if image_url:
        cond_set_value(product, 'image_url', 'http:' + image_url)

    json_data = is_empty(
        response.xpath('//script').re(r'jcpPPJSON\s?=\s?({.*});'))
    if json_data:
        try:
            data = json.loads(json_data)
            brand = is_empty(is_empty(data['products'])['lots']).get(
                'brandName', None)
        except (ValueError, KeyError, TypeError, AttributeError):
            # Malformed JSON, or no products/lots entry: no brand from JS.
            brand = None
        cond_set_value(product, 'brand', brand)

    # The dot is escaped and thousands separators are accepted, so that
    # "1,299.00" is read as 1299.00 rather than being cut off at "1,29".
    price = is_empty(
        response.xpath(
            '//span[@itemprop="price"]/a/text() |'
            '//span[@itemprop="price"]/text() ').re(
                r"\d[\d,]*(?:\.\d{1,2})?"))
    if price:
        product['price'] = Price(price=price.replace(',', ''),
                                 priceCurrency='USD')
    else:
        product['price'] = Price(price='0.0', priceCurrency='USD')
def parse_product_mobile(self, response):
    """Populate a product from the mobile product page and return it.

    :raises AssertionError: if a ``KeyError`` occurs while deriving the
        brand from the stored title.
    """
    prod = response.meta['product']
    prod['url'] = response.url

    id_matches = re.findall("id=([A-Z0-9\-]+)", prod.get('url', ''))
    cond_set_value(prod, "reseller_id",
                   id_matches[0] if id_matches else None)

    cond_set(prod, 'locale', ['en-GB'])

    cond_set(
        prod, 'title',
        response.xpath(
            '//div[contains(@class,"descriptionDetails")]//h1//span[@data-title="true"]//text()'
        ).extract())

    try:
        brand, _ = self.brand_from_title(prod['title'])
        cond_set_value(prod, 'brand', brand)
    except KeyError:
        raise AssertionError(
            "Did not find title or brand from JS for product: %s" %
            response.url)

    cond_set(prod, 'image_url',
             response.xpath('//*[@id="pdp_image"]/img/@src').extract())

    amounts = response.xpath('//div[contains(@class,"main_price")]'
                             '/text()').re(FLOATING_POINT_RGEX)
    if amounts:
        prod['price'] = Price(price=amounts[0], priceCurrency='GBP')

    cond_set(
        prod, "description",
        response.xpath('string(//p[@class="descriptionText"])').extract())
    return prod
def _populate_from_html(self, response, product):
    """Fill the product from the item page markup.

    Sets title, price, marketplace (when a seller name is present), image,
    description, canonical URL, brand, model and reseller id.
    """
    self._populate_hardcoded_fields(product)

    cond_set(product, 'title', response.css('#itemTitle::text').extract())

    price_texts = response.css('[itemprop=price]::text , '
                               '#mm-saleDscPrc::text').extract()
    cond_set(product, 'price', price_texts, self._unify_price)

    sellers = response.xpath('//div[@class="mbg"]/a/span/text()').extract()
    if sellers:
        product["marketplace"] = [{
            "name": sellers[0].strip(),
            "price": product.get("price", None)
        }]

    cond_replace(product, 'image_url',
                 response.css('[itemprop=image]::attr(src)').extract())

    description_nodes = response.xpath(
        '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]').extract()
    cond_set_value(product, 'description', description_nodes, ''.join)

    cond_replace(product, 'url',
                 response.css('[rel=canonical]::attr(href)').extract())

    brand_xpath = ('//td[@class="attrLabels" and contains(text(), "Brand:")]'
                   '/following-sibling::td/span/text()')
    cond_set(product, 'brand', response.xpath(brand_xpath).extract())
    if not product.get('brand', None):
        dump_url_to_file(response.url)

    model_xpath = ('//td[@class="attrLabels" and contains(text(), "Model:")]'
                   '/following-sibling::td/span/text()')
    cond_set(product, 'model', response.xpath(model_xpath).extract())

    id_matches = re.findall("-\/([^\/&?\.\s]+)", response.url)
    cond_set_value(product, 'reseller_id',
                   id_matches[0] if id_matches else None)
def populate_bestseller_rank(self, product, response):
    """Collect best-seller ranks and derive department / bestseller_rank.

    Ranks come from every ``.zg_hrsr_item`` entry plus the primary
    ``#SalesRank`` line. They are stored under ``category`` as a list of
    ``{'category': ..., 'rank': ...}`` dicts. ``amazon_parse_department`` is
    then asked for the department; when it returns nothing,
    ``product['department']`` is set to ``None``.

    Entries whose rank text contains no digits are skipped instead of
    raising ``IndexError`` / ``ValueError``.
    """

    def to_rank(text):
        # Strip separators; None when no digits remain.
        digits = re.sub('[ ,]', '', text)
        return int(digits) if digits else None

    ranks = {}
    for itm in response.css('.zg_hrsr_item'):
        rank = None
        for candidate in itm.css('.zg_hrsr_rank::text').re(r'([\d, ]+)'):
            rank = to_rank(candidate)
            if rank is not None:
                break
        if rank is None:
            continue
        ladder = ' > '.join(
            part.strip()
            for part in itm.css('.zg_hrsr_ladder a::text').extract())
        ranks[ladder] = rank

    # The two regex groups arrive flattened as [rank, category, ...].
    prim = response.css('#SalesRank::text, #SalesRank .value'
                        '::text').re(r'#([\d ,]+) .*in (.+)\(')
    if len(prim) >= 2:
        prim_rank = to_rank(prim[0])
        if prim_rank is not None:
            ranks[prim[1].strip()] = prim_rank

    ranks = [{'category': k, 'rank': v} for k, v in ranks.items()]
    cond_set_value(product, 'category', ranks)

    # parse department
    department = amazon_parse_department(ranks)
    if not department:
        product['department'] = None
    else:
        product['department'], product['bestseller_rank'] \
            = next(iter(department.items()))
def parse_product(self, response):
    """Populate URL, locale, HTML-derived fields and variants; return item."""
    prod = response.meta['product']

    for field, value in (('url', response.url), ('locale', 'en-IN')):
        cond_set_value(prod, field, value)

    self._populate_from_html(response, prod)

    variant_parser = PepperfryVariants()
    variant_parser.setupSC(response)
    cond_set_value(prod, 'variants', variant_parser._variants())

    return prod
def parse_product(self, response):
    """Populate the product from the product page and return it.

    Covers locale, title, image, brand, model, reseller id, stock flag,
    description and price, then delegates to the buyer-review and
    related-product populators.
    """
    product = response.meta['product']
    cond_set_value(product, 'locale', 'en-GB')

    first_value_fields = (
        ('title', response.css('.product-name h1')),
        ('image_url', response.css('#zoom1 img::attr(src)')),
        ('brand', response.css('.box-brand a img::attr(alt)')),
        ('model', response.xpath('//div[@itemprop="name"]/p/text()')),
        ('reseller_id',
         response.xpath('//*[@class="product-sku"]/text()')),
    )
    for field, selection in first_value_fields:
        cond_set(product, field, selection.extract())

    # Is_out_of_stock
    out_of_stock = response.xpath(
        '//span[@id="availability-box" and text()="Out of stock"]')
    cond_set_value(product, 'is_out_of_stock', out_of_stock, bool)

    # Description
    wrappers = response.css('.tabs-panels .std .content-wrapper')
    if wrappers:
        nodes = wrappers[0].xpath('node()[normalize-space()]')
    else:
        nodes = wrappers
    cond_set_value(product, 'description', nodes.extract(), u''.join)

    # Price
    amounts = response.css('[itemprop=price]::attr(content)')
    currencies = response.css('[itemprop=priceCurrency]::attr(content)')
    has_amount = bool(amounts) and bool(float(amounts[0].extract()))
    if has_amount and currencies:
        cond_set_value(
            product, 'price',
            Price(price=amounts[0].extract(),
                  priceCurrency=currencies[0].extract()))

    self._populate_buyer_reviews(response, product)
    self._populate_related_products(response, product)
    return product
def _populate_from_html(self, response, product):
    """Fill the product from the product page markup.

    Sets image, title, brand, the delivery-derived flag, price (as a GBP
    ``Price``), model, reseller id and description, and finally strips the
    ``#fragment`` from ``product['url']``.

    The price is taken from the value already stored on the product when it
    looks like a pound amount, otherwise from the last entry of the pricing
    list on the page. A missing price no longer raises ``TypeError``.
    """
    cond_set(product, 'image_url',
             response.css('.largeimage::attr(src)').extract())
    cond_set(product, 'title',
             response.css('.productname::text').extract())
    cond_set(product, 'brand',
             response.css('.productbrand [itemprop=name]::text').extract())

    delivery_opts = [bool(do.css('.available'))
                     for do in response.css('.deliverycallout li')]
    opt_len = sum(delivery_opts)  # number of available options
    if opt_len:
        # Guard the index: a single delivery option used to raise IndexError.
        in_store_only = (len(delivery_opts) > 1 and delivery_opts[1]
                         and opt_len == 1)
        cond_set_value(product, 'is_in_store_only', in_store_only)
    else:
        # NOTE(review): no available option stores is_out_of_stock=False;
        # confirm this is not meant to be True.
        cond_set_value(product, 'is_out_of_stock', False)

    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract(),
             unicode.strip)
    cond_set(product, 'model',
             response.css('[itemprop=model]::text').extract())

    regex = "\/(\d+)"
    reseller_id = re.findall(regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    pound_amount = u'\xa3 *\d[\d, .]*'
    matches = re.findall(pound_amount, product.get("price") or u'')
    if not matches:
        fallback = response.xpath(
            "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
        ).extract()
        if fallback:
            matches = re.findall(pound_amount, fallback[0].strip())
    if matches:
        amount = re.sub(u'[\xa3, ]+', '', matches[0])
        cond_replace_value(product, 'price',
                           Price(priceCurrency='GBP', price=amount))

    xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
    cond_set_value(product, 'description',
                   response.xpath(xpath).extract(), ''.join)

    product['url'] = product['url'].rsplit('#', 1)[0]
def _tires_parse_product(self, response):
    """Populate a tire product from its page and return it.

    Sets title, brand, upc (numeric product id with ``P`` removed), image,
    price (USD), description and locale.
    """
    product = response.meta['product']

    title_texts = response.xpath("//div[@class='productContent']/h1"
                                 "/div[@id='productName']/text()").extract()
    cond_set(product, 'title', [string.strip(t) for t in title_texts])

    brand_matches = response.xpath("//script[contains(text(),'dim7')]"
                                   "/text()").re(r'.*"dim7":"([^"]*)"}.*')
    cond_set(product, 'brand', brand_matches)

    id_nodes = response.xpath(
        "//p[@id='prodNo']/span[@id='metaProductID']/text()")
    if id_nodes:
        productid = id_nodes.extract()[0].strip().replace('P', '')
        try:
            product['upc'] = int(productid)
        except ValueError:
            self.log("Failed to parse upc number : %r" % productid, ERROR)

    cond_set(
        product, 'image_url',
        response.xpath(
            "//div[@class='bigImage']/img[@id='mainProductImage']"
            "/@src").extract())

    price_nodes = response.xpath(
        "//div[contains(@class,'bigPrice')]/div[@class='price']"
        "/descendant::*[text()]/text()")
    price = "".join([chunk.strip() for chunk in price_nodes.extract()])
    # Keep only the part between the dollar sign and the asterisk, when
    # the text has that shape.
    shaped = re.match(r'\$(.*)\*.*', price)
    if shaped:
        price = shaped.group(1)
    if price:
        price_value = Price('USD', price)
    else:
        price_value = None
    cond_set_value(product, 'price', price_value)

    info = response.xpath("//div[@id='features']/div[@class='tabContent']"
                          "/descendant::*[text()]/text()")
    if info:
        cond_set_value(product, 'description', " ".join(info.extract()))

    cond_set_value(product, 'locale', "en-US")
    return product
def _populate_related_products(self, response, product):
    """Collect related-product groups and store them on the product.

    Groups: carousel items for "also viewed" and "also bought", plus the
    "bought together" rows that have both a URL and a title. Empty groups
    are passed to ``cond_set_value`` as ``None``.
    """
    related_products = {}

    carousels = (
        ('Customers also viewed',
         response.css('.skuRightColInner .carouselInner')),
        ('Customers also bought',
         response.xpath('//h3[text()="Customers also bought"]/..')),
    )
    for label, selection in carousels:
        items = list(self._carousel_getitems(selection))
        cond_set_value(related_products, label, items or None)

    bought_together = []
    for row in response.css('.bTogether .formRow'):
        titles = row.css('.desc::text')
        links = row.css('.formLabel.SL_m::attr(href)')
        if not (links and titles):
            continue
        bought_together.append(
            RelatedProduct(url=urljoin(response.url, links[0].extract()),
                           title=_strip_non_ascii(titles[0].extract())))

    # The key below carries a trailing comma inside the string; it is kept
    # exactly as it was.
    cond_set_value(related_products, 'Frequently Bought Together,',
                   bought_together or None)

    cond_set_value(product, 'related_products', related_products)
def _populate_from_html(self, response, product):
    """Fill the product from the product page markup.

    Sets title, price (sale price first, then regular price, converted to a
    USD ``Price``), image, stock flag, brand, description, reseller id and
    model, then delegates to ``_populate_buyer_reviews``.
    """
    title = response.xpath(
        '//div[contains(@class, "product-title")]/h1/text()').extract()
    if isinstance(title, list):
        title = ''.join(title)
    cond_set(product, 'title', (title.strip(), ))

    cond_set(product, 'price',
             response.css('.saleprice span::text').re('\$([\d ,.]+)'))
    cond_set(product, 'price',
             response.css('.regprice span::text').re('\$([\d ,.]+)'))

    image_url = is_empty(response.css('.jqzoom img::attr(src)').extract())
    if image_url:
        # Drop everything from the last '?' on. A URL without a query
        # string is kept whole (the regex-based version produced None here
        # and then failed on slicing).
        head, sep, _ = image_url.rpartition('?')
        if sep:
            image_url = head
        if not image_url.startswith("http"):
            image_url = "http:" + image_url
        cond_set(product, 'image_url', (image_url, ))

    cond_set_value(
        product, 'is_out_of_stock',
        not (response.css(
            '.stockstatus .info::text').re('In Stock|Low Stock')))

    cond_set(product, 'brand',
             response.css('.alignBrandImageSpec::attr(alt)').extract(),
             lambda brand: brand.replace('_', ' '))

    xpath = '//td[@class="detailsText"]/text() | ' \
            '//div[contains(@class, "tab-info")]' \
            '/div[contains(@class, "tab-title")]' \
            '/h2[contains(text(), "details")]/../../div'
    cond_set_value(product, 'description',
                   response.xpath(xpath).extract(), u''.join)

    price = product.get('price', None)
    if price == 0:
        del product['price']
    elif price:
        product['price'] = Price(priceCurrency='USD',
                                 price=re.sub('[ ,]', '', price))

    reseller_id_regex = "i=(\d+)"
    reseller_id = re.findall(reseller_id_regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, 'reseller_id', reseller_id)

    model = response.xpath("//div[@class='item']/text()").re(
        FLOATING_POINT_RGEX)
    cond_set(product, 'model', model)

    self._populate_buyer_reviews(response, product)
def _parse_variants(self, response):
    """Build the variant list from the ``Product.Config(...)`` JSON blob.

    For every visible ``super_attribute[<id>]`` select, the option labels
    are mapped onto the child products they apply to. Each child product
    then becomes a dict with ``skuId``, ``price`` and ``properties`` (set
    through ``cond_set_value``).

    Attributes or child products missing from the JSON are skipped instead
    of raising ``KeyError``.

    :returns: list of variant dicts, or ``None`` when the blob is absent,
        is not valid JSON, or yields no variants.
    """
    variant_search = re.search('Product.Config\((.*)\)', response.body)
    if not variant_search:
        return None
    try:
        variants_json = json.loads(variant_search.group(1))
    except ValueError:
        return None

    attributes = variants_json.get('attributes') or {}
    variants_prop = {}  # child product id -> {attribute label: option label}
    for attr_id in response.xpath(
            '//div[not(contains(@class,"hidden"))]/div/'
            'select/@name').re('super_attribute\[(\d+)\]'):
        attribute = attributes.get(attr_id)
        if not attribute:
            # Select present in the markup but unknown to the JSON config.
            continue
        attribute_name = attribute['label']
        for option in attribute['options']:
            value = option['label']
            for product in option['products']:
                variants_prop.setdefault(product, {})[attribute_name] = value

    variants = []
    children = variants_json.get('childProducts') or {}
    for variant_id in children:
        variant = children[variant_id]
        vr = {}
        cond_set_value(vr, 'skuId', variant.get('productSku'))
        cond_set_value(vr, 'price', variant.get('finalPrice'))
        # A child not referenced by any visible option has no properties.
        cond_set_value(vr, 'properties', variants_prop.get(variant_id))
        variants.append(vr)

    return variants if variants else None
def _match_walmart_threadsafe(self, response):
    """Record the Walmart match on the product, then continue or finish.

    Stores ``walmart_url``, ``walmart_category`` and ``walmart_exists``.
    Yields a Target search request when ``self.match_target`` is set,
    otherwise yields the product.
    """
    product = response.meta.get('product')
    upc = product.get('upc')

    walmart_category = response.xpath(
        '//p[@class="dept-head-list-heading"]/a/text()').extract()
    walmart_url = response.xpath(
        '//a[@class="js-product-title"][1]/@href').extract()

    walmart_exists = bool(walmart_url)
    if walmart_exists:
        walmart_url = urlparse.urljoin('http://www.walmart.com/',
                                       walmart_url[0])

    cond_set_value(product, 'walmart_url', walmart_url)
    cond_set_value(product, 'walmart_category', walmart_category)
    cond_set_value(product, 'walmart_exists', walmart_exists)

    if not self.match_target:
        yield product
        return

    # This is for case when both flags are true
    target_url = 'http://tws.target.com/searchservice/item/search_results/v2/by_keyword?search_term={}&alt=json&' \
                 'pageCount=24&response_group=Items&zone=mobile&offset=0'
    req = Request(target_url.format(upc),
                  callback=self._match_target_threadsafe)
    req.meta['product'] = product
    yield req
def _populate_from_html(self, response, product):
    """Fill the product from the product page markup.

    Sets price, title, brand, image (dropping the ``noImage.gif``
    placeholder), stock flag, reseller id and description, delegates to the
    related-product and buyer-review populators, and finally converts the
    stored price text into a GBP ``Price``.
    """
    cond_set(product, 'price',
             response.css('.price span::text').re(u'\u00a3([\d, .]+)'))
    cond_set(product, 'title', _itemprop(response, 'model'), unicode.strip)
    cond_set(product, 'brand',
             _itemprop(_itemprop(response, 'brand', False), 'name'),
             unicode.strip)
    cond_set(product, 'image_url',
             _itemprop(response, 'image', False)
             .css('img::attr(src)').extract())

    image = product.get('image_url')
    if image and image.endswith('noImage.gif'):
        del product['image_url']

    cond_set_value(product, 'is_out_of_stock',
                   response.css('.stockMessaging::text').re(
                       'out of stock|Discontinued product'), bool)

    regex = "\/([0-9]+)[\/\?]"
    reseller_id = re.findall(regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    details = response.css('.prodDetailsContainer').xpath(
        'node()[normalize-space()]')
    details = [d.extract() for d in details if not d.css('form')]
    if details:
        cond_set_value(product, 'description', details, conv=''.join)

    self._populate_related_products(response, product)
    self._populate_buyer_reviews(response, product)

    price = product.get('price', None)
    if price == 0:
        del product['price']
    elif price:
        # Strip every comma and space. The former pattern ', ' only removed
        # the literal comma-space pair, so "1,299.00" kept its separator.
        price = re.sub('[, ]', '', price)
        cond_replace_value(product, 'price',
                           Price(priceCurrency='GBP', price=price))
def _populate_buyer_reviews(self, response, product):
    """Store buyer review statistics on the product.

    ``buyer_reviews`` becomes ``0`` when the review count or the average
    rating is missing, unparsable or zero. Otherwise it becomes a
    ``BuyerReviews`` with the total, the average and a per-star histogram.

    The zero check now runs *before* the ``BuyerReviews`` value is stored.
    It used to run afterwards, when ``buyer_reviews`` had already been
    given to ``cond_set_value``.
    """
    counts = response.css(
        '.pr-snapshot-average-based-on-text .count::text').re('[\d ,]+')
    averages = response.css(
        '.pr-rating.pr-rounded.average::text').extract()

    try:
        total = int(re.sub('[ ,]', '', counts[0]))
        avg = float(averages[0])
    except (IndexError, ValueError):
        # Count or average absent, or not a number.
        total, avg = 0, 0.0

    if not total or not avg:
        cond_set_value(product, 'buyer_reviews', 0)
        return

    by_star = response.css('.pr-histogram-count span::text')
    by_star = by_star.re('\(([\d, ]+)\)')
    # Histogram counts are read in reverse page order so that key 1 is the
    # last one listed.
    by_star = {
        i + 1: int(re.sub('[ ,]', '', c))
        for i, c in enumerate(reversed(by_star))
    }

    cond_set_value(
        product, 'buyer_reviews',
        BuyerReviews(num_of_reviews=total,
                     average_rating=avg,
                     rating_by_star=by_star))
def parse_product(self, response):
    """Populate the product and chain the buyer-reviews request.

    :returns: the result of ``send_next_request`` for the queued requests
        (a buyer-reviews request is always queued).
    """
    meta = response.meta.copy()
    product = meta['product']

    # Locale is assigned unconditionally.
    product['locale'] = 'en_GB'

    cond_set_value(product, 'title', self._parse_title(response),
                   conv=string.strip)
    cond_set_value(product, 'brand', self._parse_brand(response))
    cond_set_value(product, 'department', self._parse_department(response))
    cond_set_value(product, 'categories', self._parse_categories(response))

    amount, currency = self._parse_price(response)
    cond_set_value(product, 'price',
                   Price(price=float(amount), priceCurrency=currency))

    cond_set_value(product, 'special_pricing',
                   self._parse_special_pricing(response), conv=bool)
    cond_set_value(product, 'image_url', self._parse_image_url(response),
                   conv=string.strip)
    cond_set_value(product, 'description',
                   self._parse_description(response), conv=string.strip)
    cond_set_value(product, 'is_out_of_stock',
                   self._parse_stock_status(response))

    upc = self._parse_upc(response)
    cond_set_value(product, 'upc', upc)

    cond_set_value(product, 'variants', self._parse_variants(response))

    # Buyer reviews are fetched with a follow-up request keyed by upc.
    reqs = [
        Request(url=self.BUYER_REVIEWS_URL.format(upc=upc),
                dont_filter=True,
                callback=self.br.parse_buyer_reviews)
    ]

    cond_set_value(product, 'related_products',
                   self._parse_related_products(response))

    if not reqs:
        return product
    return self.send_next_request(reqs, response)
def parse_product(self, response):
    """Populate the product (including size variants) and return it.

    Fields: open-graph data, title, brand (guessed from the title when not
    already set), image, description, sku / reseller id, price (GBP),
    related products, locale and one variant per size option after the
    first ``<option>``.

    A missing title or an empty price list no longer raises; the affected
    values are simply left unset (variant price becomes ``None``).
    """
    product = response.meta['product']
    populate_from_open_graph(response, product)

    cond_set(
        product, 'title',
        response.xpath('//meta[@property="og:title"]/@content').extract(),
        conv=string.strip)

    if not product.get('brand', None):
        title = product.get('title', None)
        # Guessing a brand needs a title; .strip() on None used to raise.
        brand = guess_brand_from_first_words(title.strip()) if title else None
        if brand:
            product['brand'] = brand

    cond_replace(
        product, 'image_url',
        response.css(".main-image-container>img::attr(src)").extract(),
        lambda url: urlparse.urljoin(response.url, url))

    prod_description = response.css(
        ".product-details-container .description-copy p::text")
    cond_set_value(product, 'description', "\n".join(
        x.strip() for x in prod_description.extract() if x.strip()))

    sku = response.css(
        ".product-details-container .caption-2::text").extract()
    sku = re.findall('\d+', sku[0]) if sku else None
    cond_set(product, 'sku', sku, string.strip)
    cond_set(product, 'reseller_id', sku, string.strip)

    # Prefer the sale price; fall back to the regular price span.
    price = response.css(
        ".product-details-container .right-side .price .sale::text"
    ).extract()
    if not price:
        price = response.css(
            ".product-details-container .right-side .price span::text"
        ).extract()
    cond_set(product, 'price', price, conv=string.strip)

    stored_price = product.get('price')
    # Only rewrite a textual price; skip when nothing (or a non-string
    # value) is stored under 'price'.
    if price and hasattr(stored_price, 'replace'):
        product['price'] = Price(
            price=stored_price.replace(u'\xa3', '').strip(),
            priceCurrency='GBP')

    cond_set_value(product, 'related_products',
                   self._parse_related(response))
    cond_set_value(product, 'locale', 'en-GB')

    sample = response.xpath(
        '//select[@id="SizeKey"]/option/text()').extract()
    # The first option is skipped, as before.
    variants = [text.replace('Size ', '').strip() for text in sample[1:]]

    # Same price for every variant; computed once and safe when empty.
    variant_price = price[0].replace(u'\xa3', '').strip() if price else None

    variant_list = []
    for variant in variants:
        out_of_stock = 'out of stock' in variant
        if out_of_stock:
            size = variant.replace(' (out of stock)', '')
        else:
            size = variant
        variant_list.append({
            'price': variant_price,
            # NOTE(review): key is spelled 'in_sock'; kept as-is because
            # consumers are not visible here. Confirm whether 'in_stock'
            # was intended.
            'in_sock': not out_of_stock,
            'properties': {'size': size},
            'selected': False,
        })
    product['variants'] = variant_list

    return product
def parse_product(self, response):
    """Populate the product; yield the chained request and the product.

    This is a generator: it yields the result of ``send_next_request`` for
    the queued buyer-reviews request, then yields the product itself.
    """
    meta = response.meta.copy()
    product = meta['product']

    # Locale is assigned unconditionally.
    product['locale'] = 'en_GB'

    # Product id comes from the catCode input, defaulting to '0'.
    cat_code_values = response.xpath(
        '//input[@name="catCode"]/@value'
    ).extract()
    product_id = is_empty(cat_code_values, '0')
    response.meta['product_id'] = product_id

    cond_set_value(product, 'title', self._parse_title(response),
                   conv=string.strip)
    cond_set_value(product, 'price', self._parse_price(response))
    cond_set_value(product, 'special_pricing',
                   self._parse_special_pricing(response), conv=bool)
    cond_set_value(product, 'image_url', self._parse_image_url(response))

    category = self._parse_category(response)
    cond_set_value(product, 'category', category)
    if category:
        # Department is the last category entry.
        cond_set_value(product, 'department', category[-1])

    cond_set_value(product, 'variants', self._parse_variants(response))
    # (A separate variant-image request used to be queued here; it is
    # disabled.)

    cond_set_value(product, 'is_out_of_stock',
                   self._parse_stock_status(response), conv=bool)
    cond_set_value(product, 'description',
                   self._parse_description(response))
    cond_set_value(product, 'related_products',
                   self._parse_related_products(response))

    # Buyer reviews are fetched with a follow-up request.
    reqs = [
        Request(
            url=self.BUYER_REVIEWS_URL.format(product_id=product_id),
            dont_filter=True,
            callback=self.br.parse_buyer_reviews
        )
    ]

    if reqs:
        yield self.send_next_request(reqs, response)

    yield product
def parse_product(self, response):
    """Populate the product and queue the buyer-review requests.

    A 404 response, or a URL containing the site's error path, short-cuts
    to a product flagged ``not_found`` / ``no_longer_available``.

    Otherwise the product fields are filled from the page, and one request
    per review page is queued with ``parse_buyer_reviews`` as callback.
    The queue is then handed to ``send_next_request``.

    Note that ``product_id``, ``js_data`` and ``product_categories`` are
    stored on ``self`` (the spider instance), not on the response.
    """
    meta = response.meta.copy()
    product = meta.get('product', SiteProductItem())

    if response.status == 404 or "www.dockers.com/US/en_US/error" in response.url:
        product.update({"not_found": True})
        product.update({"no_longer_available": True})
        # This branch uses 'en-US' while the normal path below uses 'en_US'.
        product.update({"locale": 'en-US'})
        return product
    else:
        product.update({"no_longer_available": False})

    reqs = []
    # Stored on the *copy* of response.meta; `meta` is rebound to
    # response.meta further down, before the requests are built.
    meta['reqs'] = reqs

    product['ranking'] = response.meta.get('_ranking', None)
    product['total_matches'] = self.total_matches
    product['url'] = response.url
    product['site'] = self.allowed_domains[0]
    product['search_term'] = self.searchterms[0] if self.searchterms else None
    product['scraped_results_per_page'] = product['results_per_page'] = self.PAGINATE_BY

    # product id
    self.product_id = is_empty(response.xpath('//meta[@itemprop="model"]/@content').extract())

    # product data in json
    self.js_data = self.parse_data(response)

    # Parse locale
    locale = 'en_US'
    cond_set_value(product, 'locale', locale)

    # Parse model
    cond_set_value(product, 'model', self.product_id)

    # Reseller id: the URL path segment following "p/".
    reseller_id_regex = "p\/([^\/&?\.\s]+)"
    reseller_id = re.findall(reseller_id_regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, 'reseller_id', reseller_id)

    # Parse title
    title = self.parse_title(response)
    cond_set(product, 'title', title)

    # Parse image
    image = self.parse_image(response)
    cond_set_value(product, 'image_url', image)

    # Parse brand
    brand = self.parse_brand(response)
    cond_set_value(product, 'brand', brand)

    # Parse upc
    upc = self.parse_upc(response)
    cond_set_value(product, 'upc', upc)

    # Parse sku
    sku = self.parse_sku(response)
    cond_set_value(product, 'sku', sku)

    # Parse description
    description = self.parse_description(response)
    cond_set_value(product, 'description', description)

    # Parse price
    price = self.parse_price(response)
    cond_set_value(product, 'price', price)

    # Parse variants (assigned directly, replacing any existing value)
    variants = self._parse_variants(response)
    product['variants'] = variants

    # Parse product_categories
    self.product_categories = self._extract_categories(response.body_as_unicode())

    # Per-star counters, initialised to zero on response.meta.
    response.meta['marks'] = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}

    real_count = is_empty(re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                                     response.body_as_unicode()))

    response.meta['product'] = product
    # From here on `meta` is response.meta itself (with 'marks' and
    # 'product'), and that is what the review requests carry.
    meta = response.meta

    if real_count:
        # Parse buyer reviews
        if int(real_count) > 8:
            # One extra request per step of 30 beyond the first 8 reviews;
            # page indexes start at 2 (only `index` is used, `i` is not).
            for index, i in enumerate(xrange(9, int(real_count) + 1, 30)):
                reqs.append(
                    Request(
                        url=self.REVIEW_URL.format(product_id=self.product_id, index=index+2),
                        dont_filter=True,
                        callback=self.parse_buyer_reviews,
                        meta=meta
                    )
                )

    # NOTE(review): the nesting of this request was reconstructed from a
    # flattened source; it is assumed to be queued unconditionally. Confirm.
    reqs.append(
        Request(
            url=self.REVIEW_URL.format(product_id=self.product_id, index=0),
            dont_filter=True,
            callback=self.parse_buyer_reviews,
            meta=meta
        ))

    if reqs:
        return self.send_next_request(reqs, response)

    return product
def parse_product(self, response):
    """Populate the product and decide which request (if any) comes next.

    Fills description, price, title, upc, brand, reseller id and related
    products, and resolves ``&adurl=`` redirect URLs to their final target.

    :returns: a reviews ``Request`` when the product carries a review link;
        a stores ``Request`` for Google Shopping product URLs; otherwise
        the product itself.
    """
    product = response.meta['product']

    desc = response.xpath(
        '//div[@id="product-description-full"]/text()').extract()
    if desc:
        product['description'] = desc[0]

    if not product.get("price"):
        _prices = response.xpath('.//*[contains(@class, "price")]')
        price = get_price(_prices)
        if price:
            product["price"] = Price(price=price.replace("\xa3", ""),
                                     priceCurrency="GBP")

    if not product.get("title"):
        title = response.xpath("//h1[@itemprop='name']/text()").extract()
        if title:
            product["title"] = title[0]

    cond_set(product, 'upc', get_upc(response))
    cond_set(
        product, 'brand',
        response.xpath(
            '//div[@id="specs"]'
            '//div[@class="specs-row"]'
            '[contains(./*[@class="specs-name"]/text(), "Brand")]'
            '/*[@class="specs-value"]/text()').extract())

    if not product.get('brand', None):
        # A brand can only be guessed when a title exists; indexing a
        # missing title used to raise KeyError.
        known_title = product.get('title')
        brand = (guess_brand_from_first_words(known_title)
                 if known_title else None)
        if brand:
            product['brand'] = brand

    reseller_id_regex = "/(\d+)\??"
    reseller_id = re.findall(reseller_id_regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, 'reseller_id', reseller_id)

    recommended = []
    for rel in response.css('#related li.rel-item .rel-title a'):
        title = rel.xpath('text()').extract()
        url = rel.xpath('@href').extract()
        if title and url:
            recommended.append(
                RelatedProduct(title=title[0],
                               url=urlparse.urljoin(response.url, url[0])))
    product['related_products'] = {'recommended': recommended}

    # get right url if it redirect url
    redirect_targets = re.findall(r'&adurl=(.*)', product['url'])
    if redirect_targets:
        handle = None
        try:
            handle = urllib.urlopen(urllib.unquote(redirect_targets[0]))
            product['url'] = handle.geturl()
        except Exception:
            # Best effort: keep the original URL when the redirect cannot
            # be resolved. (No longer a bare except, so KeyboardInterrupt
            # and SystemExit propagate.)
            pass
        finally:
            if handle is not None:
                handle.close()

    review_link = product.get('buyer_reviews')
    if review_link:
        link = 'https://www.google.co.uk' + review_link
        return Request(link, callback=self.handle_reviews_request,
                       meta=response.meta)
    product['buyer_reviews'] = ZERO_REVIEWS_VALUE

    # strip GET data from only google urls
    if 'google.co.uk/shopping/product' in product['url']:
        self._populate_buyer_reviews(response, product)
        # Everything before the first '?'.
        bare_url = product['url'].partition('?')[0]
        product['url'] = bare_url
        product['google_source_site'] = "{}"
        stores_link = bare_url + '/online'
        return Request(stores_link, callback=self.populate_stores,
                       meta={
                           'product': product,
                           'page': 0
                       })

    return product
def _parse_bazaarv(self, response):
    """Parse the review payload and store ``buyer_reviews``.

    The response body is searched for ``var materials=<json>``; the HTML
    stored under ``BVRRSourceID`` provides the average rating, the review
    total and the per-star histogram. ``buyer_reviews`` is only set when at
    least one histogram row could be parsed.

    Rows whose star label or count cannot be read are skipped, and a
    payload that is not valid JSON (or lacks ``BVRRSourceID``) is ignored.

    :returns: the result of ``send_next_request`` when requests are still
        queued in ``response.meta['reqs']``, otherwise the product.
    """

    def first_as(values, cast, default):
        # cast(values[0]), or default when absent or unparsable.
        try:
            return cast(values[0])
        except (IndexError, ValueError):
            return default

    reqs = response.meta.get('reqs', [])
    product = response.meta['product']
    text = response.body_as_unicode().encode('utf-8')

    html = None
    if response.status == 200:
        found = re.search(r"var materials=(.*),\sinitializers=", text,
                          re.M | re.S)
        if found:
            try:
                html = json.loads(found.group(1))['BVRRSourceID']
            except (ValueError, KeyError, TypeError):
                html = None

    if html:
        sel = Selector(text=html)

        avrg = first_as(
            sel.xpath("//div[contains(@id,'BVRRRatingOverall')]"
                      "/div[@class='BVRRRatingNormalOutOf']"
                      "/span[contains(@class,'BVRRRatingNumber')]"
                      "/text()").extract(),
            float, 0.0)
        total = first_as(
            sel.xpath("//div[@class='BVRRHistogram']"
                      "/div[@class='BVRRHistogramTitle']"
                      "/span[contains(@class,'BVRRNonZeroCount')]"
                      "/span[@class='BVRRNumber']/text()").extract(),
            int, 0)

        rows = sel.xpath(
            "//div[@class='BVRRHistogram']"
            "/div[@class='BVRRHistogramContent']"
            "/div[contains(@class,'BVRRHistogramBarRow')]")
        distribution = {}
        for row in rows:
            star = first_as(
                row.xpath("span/span[@class='BVRRHistStarLabelText']"
                          "/text()").re("(\d) star"),
                int, None)
            count = first_as(
                row.xpath(
                    "span[@class='BVRRHistAbsLabel']/text()").extract(),
                int, None)
            # Record a row only when both parts parsed; an empty match list
            # must never end up as a dict key or value.
            if star is not None and count is not None:
                distribution[star] = count

        if distribution:
            reviews = BuyerReviews(total, avrg, distribution)
            cond_set_value(product, 'buyer_reviews', reviews)

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def parse_product(self, response):
    """Populate the product and chain the recommendation request.

    Fills title, brand, upc, price (GBP), image, reseller id, description
    (when ``DO_DESCRIPTION`` is set), locale and the stock flag.

    When ``_extract_rr_parms`` yields a payload, ``upc`` is overwritten
    with ``payload['p']``, a review request is prepared and stored under
    ``response.meta['reevoo']``, and the recommendation request is
    returned. Without a payload the product is returned as is; the payload
    is now checked *before* ``payload['p']`` is read.
    """
    product = response.meta['product']

    cond_set(
        product, 'title',
        response.xpath("//section[@itemscope]/h1"
                       "/span[@itemprop='name']/text()").extract())
    cond_set(
        product, 'brand',
        response.xpath("//section[@itemscope]/h1"
                       "/span[@itemprop='brand']/text()").extract())
    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set(
        product, 'upc',
        response.xpath("//section[@itemscope]/meta[@itemprop='identifier']"
                       "/@content").extract())

    price = response.xpath(
        "//section[@itemscope]/div[contains(@class,'productDetail')]"
        "/section[contains(@class,'description')]"
        "/div/div[contains(@class,'productPrices')]"
        "/span[@itemprop='price']/ins/text()").re(FLOATING_POINT_RGEX)
    if price:
        product['price'] = Price(price=price[0], priceCurrency='GBP')

    cond_set(
        product, 'image_url',
        response.xpath(
            "//section[@itemscope]/descendant::section[@class='productMedias']"
            "/div[@id='currentView']/a/img/@src").extract())

    regex = "(\d+)-pdt"
    reseller_id = re.findall(regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    if self.DO_DESCRIPTION:
        cond_set(product, 'description',
                 response.xpath("//section[@id='longDesc']").extract())

    cond_set_value(product, 'locale', "en-GB")

    out_of_stock = response.xpath(
        "//div[contains(@class,'productDetail')]"
        "/section[@class='col3']/div[@class='nested']"
        "/strong/text()").re(r"Out of stock")
    if out_of_stock:
        product['is_out_of_stock'] = True

    payload = self._extract_rr_parms(response)
    if not payload:
        self.log("No {rr} payload at %s" % response.url, DEBUG)
        return product

    productid = payload['p']
    product['upc'] = productid

    review_url = (
        'http://mark.reevoo.com/reevoomark/en-GB/product?sku={sku}'
        '&trkref=PCG').format(sku=productid)
    # The review request gets its own meta copy so that a 404 reaches the
    # callback.
    new_meta = response.meta.copy()
    new_meta['handle_httpstatus_list'] = [404]
    reevoo_request = Request(url=review_url,
                             callback=self._parse_reevoo,
                             meta=new_meta)
    response.meta['reevoo'] = reevoo_request

    # Copied after 'reevoo' was stored, so the recommendation request
    # carries the prepared review request along.
    new_meta = response.meta.copy()
    rr_url = urlparse.urljoin(self.SCRIPT_URL,
                              "?" + urllib.urlencode(payload))
    return Request(rr_url, self._parse_rr_json, meta=new_meta)
def parse_product(self, response):
    """Populate the product from the embedded page JSON or from the markup.

    The ``page_products`` object embedded in the page body is parsed as JSON
    and used as the primary data source; when it cannot be parsed the fields
    are scraped from the HTML instead.  If the page contains a
    "You May Also Like" section and the needed ids are available, a request
    to the recommendation API is returned instead of the item.

    :param response: product page response carrying the item in
        ``response.meta['product']``.
    :returns: a ``Request`` for related products, or the product item.
    """
    product = response.meta['product']

    raw_json = is_empty(
        re.findall("page_products\'\:\s+([^\}]*)",
                   response.body_as_unicode())) + "}"
    try:
        # The embedded object uses single quotes; swap them so json accepts it.
        data = json.loads(raw_json.strip().replace("'", "\""))
    except ValueError:
        data = {}

    description_nodes = response.xpath(
        "//div[contains(@class, 'prd-description')]").extract()
    product["description"] = is_empty(description_nodes)

    average = is_empty(
        response.xpath(
            "//span[contains(@class, 'b-rating-average')]/text()").extract())
    total = is_empty(
        response.xpath("//h2[@class='b-ttl-2']/span/text()").re(
            FLOATING_POINT_RGEX))
    if average and total:
        product["buyer_reviews"] = BuyerReviews(num_of_reviews=total,
                                                average_rating=average,
                                                rating_by_star={})

    if not data:
        # No usable JSON: fall back to scraping the visible markup.
        scraped_price = is_empty(
            response.xpath("//span[contains(@class, 'price')]/text()").re(
                FLOATING_POINT_RGEX), None)
        if scraped_price:
            product["price"] = Price(price=scraped_price,
                                     priceCurrency="GBP")
        title_nodes = response.xpath(
            "//h1[contains(@class, 'b-ttl-main')]/text()").extract()
        product["title"] = is_empty(title_nodes)
        image_nodes = response.xpath(
            "//*[@id='cart-form']/div[2]/div[1]/div/div/a/@href").extract()
        product["image_url"] = is_empty(image_nodes)
        product["url"] = response.url
        brand_nodes = response.xpath(
            "//span[@itemprop='brand']/text()").extract()
        product["brand"] = is_empty(brand_nodes)
    else:
        product["price"] = Price(price=data["prod_price"],
                                 priceCurrency=data["currency"])
        product["is_out_of_stock"] = not bool(int(data["stock_available"]))
        product["title"] = data["prod_name"]
        product["image_url"] = data["prod_image_url"]
        product["url"] = data["prod_url"]
        if not product["description"]:
            product["description"] = data["description"]
        product["brand"] = data["brand"]

    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set_value(product, 'locale', "en-GB")

    if "You May Also Like" not in response.body_as_unicode():
        return product

    cat_id = is_empty(
        re.findall("cat_id\'\:\s+(\d+)", response.body_as_unicode()))
    shop_id = is_empty(
        re.findall("sid\'\:\s+\"([^\"]*)", response.body_as_unicode()))
    if not (cat_id and shop_id and "item_id" in data):
        return product

    url = "http://www.rakuten.co.uk/api/recommendation?" \
          "category_id=%s" \
          "&item_id=%s" \
          "&shop_id=%s" % (cat_id, data["item_id"], shop_id)
    return Request(url=url,
                   callback=self._related_parse,
                   meta={"product": product})
def parse_product(self, response):
    """Populate the product via the ``_parse_*`` helpers and queue ratings.

    Each field is produced by the matching ``_parse_<field>`` method and
    stored with ``cond_set_value``.  A ratings request built from
    ``RATING_URL`` is appended to the pending request list; if any requests
    are pending they are dispatched through ``send_next_request``.

    :param response: product page response; ``response.meta['product']`` is
        the item and ``response.meta['reqs']`` the optional pending requests.
    :returns: the result of ``send_next_request`` when requests are pending,
        otherwise the product item.
    """
    product = response.meta['product']
    reqs = response.meta.get('reqs', [])

    product['locale'] = 'en_US'

    # Title and brand are the only fields passed through string.strip.
    for field in ('title', 'brand'):
        value = getattr(self, '_parse_' + field)(response)
        cond_set_value(product, field, value, conv=string.strip)

    # The remaining fields are stored as returned, in the original order.
    plain_fields = (
        'model',
        'categories',
        'category',
        'description',
        'price',
        'reseller_id',
        'image_url',
        'variants',
        'is_out_of_stock',
        'no_longer_available',
        'related_products',
    )
    for field in plain_fields:
        value = getattr(self, '_parse_' + field)(response)
        cond_set_value(product, field, value)

    # Reviews: prefer the id embedded in the page, else the last URL segment.
    page_ids = response.xpath('//*[@id="bvProductId"]/@value').extract()
    bv_product_id = ((page_ids[0] if page_ids else None)
                     or response.url.split('/')[-1])
    if bv_product_id:
        rating_request = Request(
            self.RATING_URL.format(prodid=bv_product_id),
            dont_filter=True,
            callback=self._parse_bazaarv,
            meta={'product': product, 'reqs': reqs})
        reqs.append(rating_request)

    if not reqs:
        return product
    return self.send_next_request(reqs, response)
def parse_coupon(self, response):
    """Build a ``DiscountCoupon`` from a coupon fragment.

    The promo code is looked up with a "using promo code: XXX" pattern,
    first in the stored conditions and then in the stored description.

    :param response: selector/response for a single coupon.
    :returns: the populated coupon item, or ``None`` when no description
        could be parsed.
    """
    item = DiscountCoupon()

    description = self._parse_description(response)
    cond_set_value(item, 'description', description)
    if not description:
        return None

    cond_set_value(item, 'category', self._parse_category(response))
    cond_set_value(item, 'discount', self._parse_discount(response))
    cond_set_value(item, 'conditions', self._parse_conditions(response))
    cond_set_value(item, 'start_date', self._parse_start_date(response))
    cond_set_value(item, 'end_date', self._parse_end_date(response))

    promo_code = None
    if not item.get('promo_code'):
        promo_regex = "[Uu]sing\s?[Pp]romo\s?[Cc]ode:\s?([A-Z0-9]+)"
        for field in ('conditions', 'description'):
            # A field that was never stored comes back as None, which
            # re.findall rejects with TypeError; treat it as empty text.
            matches = re.findall(promo_regex, item.get(field) or '')
            if matches:
                promo_code = matches[0]
                break
    cond_set_value(item, 'promo_code', promo_code)
    return item
def _parse_popup_promo(self, response):
    """Build a ``DiscountCoupon`` from the e-mail sign-up popup.

    The description comes from the ``subscribe_header`` element; discount
    and conditions come from the first paragraph of ``EmailSignupForm``.

    :param response: page response containing the popup markup.
    :returns: the coupon item, or ``None`` when the popup has no header text.
    """
    item = DiscountCoupon()

    header_texts = response.xpath(
        './/*[@class="subscribe_header"]/text()').extract()
    description = header_texts[0].strip() if header_texts else None
    if not description:
        return None

    signup_paragraph = response.xpath(
        ".//*[@id='EmailSignupForm']/p[1]/text()")
    field_values = (
        ('description', description),
        ('category', None),
        ('discount', ' '.join(signup_paragraph.re('\d+\%'))),
        ('conditions', ''.join(signup_paragraph.extract())),
        ('start_date', None),
        ('end_date', None),
        ('promo_code', None),
    )
    for field, value in field_values:
        cond_set_value(item, field, value)
    return item
def _parse_special_promo_code(self, response):
    """Build a ``DiscountCoupon`` from the ``mdl-jc-sale-campaign`` banner.

    Description, discount and conditions are stored as the extracted lists.
    The promo code is the last whitespace-separated token of the banner
    text containing ``'code '``.

    :param response: page response containing the banner markup.
    :returns: the coupon item, or ``None`` when the banner has no
        description paragraph.
    """
    item = DiscountCoupon()
    description = response.xpath(
        ".//*[@id='mdl-jc-sale-campaign']/p[1]/text()").extract()
    if not description:
        return None

    headline = response.xpath(".//*[@id='mdl-jc-sale-campaign']/h2/text()")
    cond_set_value(item, 'description', description)
    cond_set_value(item, 'category', None)
    cond_set_value(item, 'discount', headline.re('\d+\%'))
    cond_set_value(item, 'conditions', headline.extract())
    cond_set_value(item, 'start_date', None)
    cond_set_value(item, 'end_date', None)

    code_text = ''.join(response.xpath(
        ".//*[@id='mdl-jc-sale-campaign']"
        "/*[contains(text(), 'code ')]/text()").extract())
    # split() with no argument returns [] for empty or blank text and drops
    # trailing whitespace; split(' ') always returned at least [''], so the
    # None fallback never ran and '' could be stored as the promo code.
    tokens = code_text.split()
    promo_code = tokens[-1] if tokens else None
    cond_set_value(item, 'promo_code', promo_code)
    return item
def parse_product(self, response):
    """Populate a product item from a product page and yield follow-up work.

    This is a generator.  Depending on the page it yields, in order: the
    first entry popped from the variants result (when ``_parse_variants``
    reports links), the result of ``_get_stock_status``, up to two buyer
    review requests, a related-products request, and finally the item.

    :param response: product page response; ``response.meta`` may carry an
        existing ``product`` item and a ``_ranking`` value.
    """
    prod = response.meta.get('product', SiteProductItem())
    prod['_subitem'] = True
    _ranking = response.meta.get('_ranking', None)
    prod['ranking'] = _ranking
    prod['url'] = response.url
    cond_set(prod, 'title', response.css('h1 ::text').extract())
    prod['price'] = DellProductSpider._parse_price(response)
    prod['image_url'] = DellProductSpider._parse_image(response)
    prod['description'] = DellProductSpider._parse_description(response)
    prod['brand'] = DellProductSpider._parse_brand(response, prod.get('title', ''))
    prod['related_products'] = self._related_products(response)
    # Store the (possibly newly created) item back on the response meta.
    response.meta['product'] = prod
    is_links, variants = self._parse_variants(response)
    if is_links:
        # NOTE(review): presumably the variants are requests here and only
        # the first is emitted now - confirm against _parse_variants.
        yield variants.pop(0)
    else:
        cond_set_value(prod, 'variants', self._collect_variants_from_dict(variants))
    if 'This product is currently unavailable.' in response.body_as_unicode(
    ):
        prod['is_out_of_stock'] = True
    else:
        yield self._get_stock_status(response, prod)  # this should be OOS field
    meta = {'product': prod}
    prod_id = self._get_product_id(response)
    if prod_id:
        # first page type
        if response.css('#bazaarVoice').extract():
            meta.update({'br_page_type': 1})
            yield Request(  # reviews request
                url=self.REVIEW_URL.format(product_id=prod_id),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta)
    buyer_reviews_iframe_src = response.xpath(
        '//iframe[contains(@src,"reviews.htm")]/@src').extract()
    if buyer_reviews_iframe_src:
        # second page type
        meta.update({'br_page_type': 2})
        yield Request(  # reviews request
            url=buyer_reviews_iframe_src[0].replace('format=noscript', ''),
            dont_filter=True,
            callback=self.parse_buyer_reviews,
            meta=meta)
    # Try the V1 related-products data first; any failure while collecting
    # it switches to the V2 URL and collector.
    try:
        r_url, related_data = self.RELATED_PROD_URL_V1, self._collect_related_products_data_v1(
            response)
    except Exception:
        r_url, related_data = self.RELATED_PROD_URL_V2, self._collect_related_products_data_v2(
            response)
    yield Request(  # related products request
        r_url.format(**related_data),
        callback=self._parse_related_products,
        meta=meta)
    yield prod
def parse_product(self, response):
    """Populate the product item, chaining review and recommendation calls.

    On the first pass (``response.meta['reviewed']`` unset) a reviews
    request is returned when the page links to reviews; otherwise the item
    gets ``ZERO_REVIEWS_VALUE``.  The remaining fields are then scraped from
    the markup, and a recommendations request is returned when a product id
    is found in the page body.

    :param response: product page response carrying the item in
        ``response.meta['product']``.
    :returns: a ``Request`` (reviews or recommendations) or the product item.
    """
    reviewed = response.meta.get('reviewed')
    prod = response.meta['product']

    # If no review request has been made for this item yet, try to send it.
    if not reviewed:
        revs_a = response.xpath('//a[@class="read_reviews_action"]')
        if revs_a:
            avg = revs_a.xpath(
                './/span[@itemprop="ratingValue"]/text()').extract()
            total = revs_a.xpath(
                './/span[@itemprop="ratingCount"]/text()').extract()
            rev_url = response.url + '/reviewhtml/all'
            meta = response.meta.copy()
            meta['avg'] = avg
            meta['total'] = total
            meta['initial_response'] = response
            return Request(rev_url,
                           callback=self.populate_reviews,
                           meta=meta)
        else:
            cond_set_value(prod, 'buyer_reviews', ZERO_REVIEWS_VALUE)

    title = response.xpath(
        '//div[@class="product-summary"]/h1/text()').extract()
    cond_set(prod, 'title', title)

    # Keep the brand as a scalar while choosing it.  The previous code
    # wrapped it in a one-element list, which is always truthy, so the
    # title-based fallback below could never run.
    brand = is_empty(re.findall(r'"manufacturer":\s"(.*)",', response.body),
                     None)
    if not brand:
        if prod.get("title"):
            brand = is_empty([guess_brand_from_first_words(prod['title'])],
                             None)
    if brand:
        cond_set(prod, 'brand', [brand])

    if not prod.get('brand', None):
        dump_url_to_file(response.url)

    price = response.xpath(
        '//p[@class="new-price"]/meta[@itemprop="price"]/@content'
    ).extract()
    priceCurrency = response.xpath(
        '//p[@class="new-price"]/meta[@itemprop="priceCurrency"]/@content'
    ).extract()
    if price and priceCurrency and re.match("\d+(.\d+){0,1}", price[0]):
        prod["price"] = Price(priceCurrency=priceCurrency[0], price=price[0])
    else:
        # Missing or non-numeric price: store a zero GBP placeholder.
        prod["price"] = Price(priceCurrency="GBP", price=0.00)

    des = response.xpath('//div[@class="productDescription"]').extract()
    cond_set(prod, 'description', des)

    img_url = response.xpath(
        '//div[@class="product-images"]/img/@src').extract()
    cond_set(prod, 'image_url', img_url)

    cond_set(prod, 'locale', ['en-US'])

    if not prod.get("reseller_id"):
        reseller_id = response.xpath(
            './/*[@itemprop="sku"]/text()').extract()
        cond_set(prod, 'reseller_id', reseller_id)

    prod['url'] = response.url

    # The add-to-cart button label doubles as the availability signal.
    available = response.xpath(
        '//form[contains(@id,"addToCartForm")]/input[@type="submit"]/@value'
    ).extract()
    if available and 'Email when back in stock' in available[0]:
        cond_set(prod, 'is_out_of_stock', [True])
    if available and 'Last few in store' in available[0]:
        lim = LimitedStock(is_limited=True, items_left=[1])
        cond_set(prod, 'limited_stock', [lim])

    prod_id = re.findall(r'"id":\s"(.*)",', response.body)
    if prod_id:
        recomm_url = self.RECOMM_URL.format(prod_id=prod_id[0])
        return Request(recomm_url,
                       callback=self.populate_recommendations,
                       meta=response.meta.copy())
    return prod
def _get_products(self, response):
    """Yield items or follow-up requests for the products on a results page.

    This is a generator.  Per-page and total counts are scraped once and
    cached in ``response.meta``; each scraped product is then initialised
    with search-level fields and either yielded directly or wrapped in a
    request to ``parse_product``.

    :param response: results page response; ``response.meta`` must contain
        ``remaining`` and ``search_term``.
    """
    remaining = response.meta['remaining']
    search_term = response.meta['search_term']
    prods_per_page = response.meta.get('products_per_page')
    total_matches = response.meta.get('total_matches')
    scraped_results_per_page = response.meta.get('scraped_results_per_page')

    # Product links come from the deals helper when a deal URL list is set,
    # otherwise from the regular link scraper.
    if self.deal_product_url_list:
        prods = self._generate_goldbox_links_from_deals(response)
    else:
        prods = self._scrape_product_links(response)

    if prods_per_page is None:
        # Materialize prods to get its size.
        prods = list(prods)
        prods_per_page = len(prods)
        response.meta['products_per_page'] = prods_per_page

    if scraped_results_per_page is None:
        scraped_results_per_page = self._scrape_results_per_page(response)
        if scraped_results_per_page:
            self.log(
                "Found %s products at the first page" %scraped_results_per_page
                , INFO)
        else:
            # Fall back to the number of products actually found.
            scraped_results_per_page = prods_per_page
            if hasattr(self, 'is_nothing_found'):
                if not self.is_nothing_found(response):
                    self.log(
                        "Failed to scrape number of products per page", WARNING)
        response.meta['scraped_results_per_page'] = scraped_results_per_page

    if total_matches is None:
        total_matches = self._scrape_total_matches(response)
        if total_matches is not None:
            response.meta['total_matches'] = total_matches
            self.log("Found %d total matches." % total_matches, INFO)
        else:
            if hasattr(self, 'is_nothing_found'):
                if not self.is_nothing_found(response):
                    self.log(
                        "Failed to parse total matches for %s" % response.url,ERROR)

    if total_matches and not prods_per_page:
        # Parsing the page failed. Give up.
        self.log("Failed to get products for %s" % response.url, ERROR)
        return

    # Never emit more than the number of products still wanted.
    for i, (prod_url, prod_item) in enumerate(islice(prods, 0, remaining)):
        # Initialize the product as much as possible.
        prod_item['site'] = self.site_name
        prod_item['search_term'] = search_term
        prod_item['total_matches'] = total_matches
        prod_item['results_per_page'] = prods_per_page
        prod_item['scraped_results_per_page'] = scraped_results_per_page
        # The ranking is the position in this page plus the number of
        # products from other pages.
        prod_item['ranking'] = (i + 1) + (self.quantity - remaining)
        if self.user_agent_key not in ["desktop", "default"]:
            prod_item['is_mobile_agent'] = True

        if prod_url is None:
            # The product is complete, no need for another request.
            yield prod_item
        elif isinstance(prod_url, Request):
            # The scraper already built the request; pass it through as is.
            cond_set_value(prod_item, 'url', prod_url.url)  # Tentative.
            yield prod_url
        else:
            # Another request is necessary to complete the product.
            url = urlparse.urljoin(response.url, prod_url)
            cond_set_value(prod_item, 'url', url)  # Tentative.
            yield Request(
                url,
                callback=self.parse_product,
                meta={'product': prod_item},
            )
def parse_product(self, response):
    """Populate the product from the page's product JSON and markup.

    Title, categories, price, image, brand and original price come from the
    dict returned by ``extract_product_json``; description and availability
    come from the HTML.

    :param response: product page response carrying the item in
        ``response.meta['product']``.
    :returns: the populated product item.
    """
    product = response.meta['product']

    # locale
    product['locale'] = 'en_US'

    product_json = self.extract_product_json(response)
    id_json = product_json.get("id_json", {})
    style_data = product_json.get("style_data", {})

    # title
    cond_set_value(product, 'title', id_json.get("name", None))

    # categories
    categories = [category_info["value"]
                  for category_info in style_data.get("categories", [])]
    if categories:
        cond_set_value(product, 'categories', categories)
    if product.get('categories'):
        # The last entry of the list is used as the single category.
        product['category'] = product['categories'][-1]

    # description
    # Guarded: indexing [0] directly raised IndexError on pages that have
    # no description block.
    description = response.xpath("//div[@class='description']").extract()
    if description:
        cond_set_value(product, 'description', description[0])

    # price
    cond_set_value(product, 'price', style_data.get("price", None))

    # image
    image = id_json.get("image", None)
    if image:
        cond_set_value(product, 'image_url', image)

    # brand
    brand = id_json.get("brand", {}).get("name", None)
    cond_set_value(product, "brand", brand)

    # original price
    cond_set_value(product, 'price_original',
                   style_data.get("originalPrice", None))

    # no longer available
    availability = response.xpath(
        "//meta[@property='og:availability']/@content").extract()
    if availability:
        no_longer_avail = availability[0] != 'instock'
        cond_set_value(product, 'no_longer_available', no_longer_avail)
        # .get() so an unset field cannot raise on lookup.
        if product.get('no_longer_available'):
            product['is_out_of_stock'] = True

    return product