def parse_product(self, response):
    """Fill the product from ``response.meta`` and request its buyer reviews.

    Description, brand, locale and model are read from the page.  When a part
    number is found, a ``Request`` to the review API is returned with
    ``parse_buyer_reviews`` as callback; when none is found the problem is
    logged and ``None`` is returned.
    """
    item = response.meta['product']

    disclaimer_xpath = (
        '//div[@class="product_disclaimer"]/node()[normalize-space()]')
    disclaimer_nodes = response.xpath(disclaimer_xpath).extract()
    cond_set_value(item, 'description', disclaimer_nodes, ''.join)

    brand_matches = response.css('.at-a-glance span::text').re('Brand (.+)')
    cond_set(item, 'brand', brand_matches)
    cond_set_value(item, 'locale', u'en-GB')

    # The attribute is looked up under both spellings, lower-case first.
    part_numbers = (
        response.css('.product-detail::attr(partnumber)').extract()
        or response.css('.product-detail::attr(partNumber)').extract())
    if not part_numbers:
        self.log('Could not find partNumber')
        return

    item["model"] = is_empty(part_numbers)
    review_url = self.REVIEW_API_URL.format(
        model=part_numbers[0], apipass=self.REVIEW_API_PASS)
    if not review_url:
        # Only reachable when the URL template formats to an empty string.
        cond_set_value(item, 'buyer_reviews', ZERO_REVIEWS_VALUE)
        return item

    return Request(
        review_url,
        callback=self.parse_buyer_reviews,
        dont_filter=True,
        meta={"product": item},
    )
def _scrape_product_links(self, response):
    """Yield ``(url, product)`` pairs for the product boxes of a listing page.

    :param response: listing page response.
    :return: generator of ``(href, SiteProductItem)`` tuples; the brand is
        pre-filled from the box's ``p`` text via ``cond_set``.

    Boxes that carry no ``h3/a`` link are skipped.  Previously such a box
    raised ``IndexError`` and aborted the generator, dropping every product
    that followed it on the page.
    """
    for box in response.css('.product-description'):
        links = box.xpath('h3/a/@href').extract()
        if not links:
            continue
        product = SiteProductItem()
        cond_set(product, 'brand', box.xpath('p/text()').extract())
        yield links[0], product
def _populate_from_html(self, response, product):
    """Populate title, brand, description, image, model and reseller id.

    Title, brand and description come from XPath queries; image URL and model
    are regex-matched in the raw response body; the reseller id is taken from
    the ``currentSKUNbr`` URL parameter.  Related products are delegated to
    ``_populate_related_products``.
    """
    title_nodes = response.xpath(
        '//h1[contains(@class, "search-prod-desc")]/text()').extract()
    cond_set(product, 'title', title_nodes)

    xpath = '//div[@id="dotcombrand"]/../preceding-sibling::li[1]/text()'
    brand_nodes = response.xpath(xpath).extract()
    if not brand_nodes:
        brand_nodes = response.xpath(
            '//p[@class="brand-name"]/text()').extract()
    if brand_nodes:
        # "label:name" keeps the part after the first colon; text without a
        # colon is used unchanged.
        pieces = brand_nodes[0].split(':')
        brand_nodes = [pieces[0] if len(pieces) == 1 else pieces[1]]
    cond_set(product, 'brand', brand_nodes)

    xpath = (
        '//h3[text()="Description"]'
        '/following-sibling::p[normalize-space()] |'
        '//div[contains(@class, "product-details-desc")]'
    )
    cond_set(product, 'description', response.xpath(xpath).extract())

    body = response.body
    cond_set(product, 'image_url',
             re.findall("enlargedImageURL = '([^']*)'", body))
    cond_set(product, 'model', re.findall('"model" : "([^"]*)"', body))

    regex = "currentSKUNbr=(\d+)"
    sku_matches = re.findall(regex, response.url)
    if sku_matches:
        reseller_id = sku_matches[0]
    else:
        reseller_id = None
    cond_set_value(product, "reseller_id", reseller_id)

    self._populate_related_products(response, product)
def _populate_from_html(self, response, product):
    """Populate the product from a detail page.

    Starts from the Open Graph data, then fills title, reseller id (URL path
    after ``/_/``), price (always stored as USD), and model ("Store Style #").
    Finishes by delegating to ``_populate_related_products`` and
    ``_populate_hardcoded_fields``.
    """
    _populate_from_open_graph_product(response, product)

    cont = '#productDetailsLeftSidebar .inner-container '
    sidebar_titles = response.css(cont + 'h1::text').extract()
    cond_set(product, 'title', sidebar_titles, unicode.strip)
    if not product.get("title"):
        # Fall back to the alternative heading.
        fallback_titles = response.xpath(
            "//h1[contains(@class, 'prod_name')]/text()").extract()
        if fallback_titles:
            cond_set(product, 'title', fallback_titles, unicode.strip)

    regex = "\/_\/([^?$\s]+)"
    id_matches = re.findall(regex, response.url)
    if id_matches:
        reseller_id = id_matches[0]
    else:
        reseller_id = None
    cond_set_value(product, "reseller_id", reseller_id)

    price_nodes = response.xpath(
        '//div[@id="productPrice"]'
        '/div[contains(@class, "display_price")]/input/@value |'
        '//div[@id="productPrice"]/span[last()]/text()'
    ).extract()
    if price_nodes:
        amount = price_nodes[0].replace("$", "").strip()
        product["price"] = Price(priceCurrency='USD', price=amount)

    style_nodes = response.css('#storeStyleNumber::text').extract()
    if style_nodes:
        # re.search may yield None here; the value is handed to
        # cond_set_value as-is, exactly like before.
        match = re.search(r'Store Style #:\xa0(.+)', style_nodes[0])
        cond_set_value(product, 'model', match, lambda m: m.group(1))

    self._populate_related_products(response, product)
    self._populate_hardcoded_fields(product)
def _populate_from_box(self, response, box, product):
    """Fill title, price and availability flags from a search-result box.

    ``is_in_store_only`` is true when exactly one ``.available`` node exists
    under ``.availability``; ``is_out_of_stock`` is true when there is none.
    """
    titles = box.css('.productTitle a::text').extract()
    cond_set(product, 'title', titles)

    prices = box.css('.currentPrice ins::text').extract()
    cond_set(product, 'price', prices, unicode.strip)

    available = box.css('.availability .available')
    cond_set_value(product, 'is_in_store_only', len(available) == 1)
    cond_set_value(product, 'is_out_of_stock', not available)
def parse_product(self, response):
    """Populate the product from its page and queue the follow-up API calls.

    :param response: product page response; ``response.meta['product']`` is
        the item to fill and ``response.meta['reqs']`` an optional list of
        already pending requests.
    :return: whatever ``send_next_request`` returns when requests are
        pending, otherwise the product itself.

    The review and price/variant requests need the product id embedded in
    the URL (``ID=<id>-``).  Previously a URL without that part raised
    ``IndexError``; now the two requests are simply not queued and the
    product is returned with the data scraped from the HTML.
    """
    prod = response.meta['product']
    reqs = response.meta.get('reqs', [])

    title = response.xpath('//*[@id="productName"]/text()'
                           '|//*[@class="wag-prod-title"]/text()').extract()
    title = [x.strip() for x in title if x.strip()]
    cond_set(prod, 'title', title)

    no_longer_available = bool(response.xpath(
        '//*[@role="alert"]/span[contains'
        '(text(),"no longer available")]'))
    cond_set_value(prod, 'no_longer_available', no_longer_available)

    img_url = response.xpath(
        '//img[@id="main-product-image"]/@data-src').extract()
    if img_url:
        prod['image_url'] = urlparse.urljoin(self.site, img_url[0])

    prod['url'] = response.url
    prod['locale'] = 'en-US'

    cond_set_value(
        prod, 'description',
        ''.join(response.xpath('//div[@id="description-content"]').extract()))

    cond_set(
        prod, 'model',
        response.xpath(
            '//section[@class="panel-body wag-colornone"]/text()'
        ).re(r'Item Code: (\d+)'))

    reseller_id = re.findall(r"[Ii][Dd]=prod(\d+)", response.url)
    cond_set_value(prod, "reseller_id",
                   reseller_id[0] if reseller_id else None)

    prod_ids = re.findall('ID=(.*)-', response.url)
    if prod_ids:
        prod_id = prod_ids[0]
        reqs.append(Request(self.REVIEW_API_URL.format(prod_id=prod_id),
                            meta=response.meta,
                            callback=self._parse_review_api))
        reqs.append(Request(self.PRICE_VARI_API_URL.format(prod_id=prod_id),
                            meta=response.meta,
                            callback=self._parse_price_and_variants))
    else:
        self.log('Could not find product ID in %s' % response.url)

    if reqs:
        return self.send_next_request(reqs, response)
    return prod
def parse_product(self, response):
    """Populate the product in ``response.meta`` from a detail page.

    Sets title, price (only when the text carries a pound sign, otherwise an
    error is logged), image URL, description, locale, reseller id and brand,
    then returns the product.
    """
    product = response.meta['product']

    title_parts = response.xpath(
        "//h1[@class='productTitle'][1]//text()").extract()
    if len(title_parts) >= 2:
        # Only the last two text nodes of the heading are used.
        cond_set_value(product, 'title', self.clear_desc(title_parts[-2:]))

    price_nodes = response.xpath(
        "//div[@id='bopRight']//meta[@itemprop='price']/@content").extract()
    cond_set(product, 'price', price_nodes)

    if product.get('price', None):
        if isinstance(product['price'], str):
            product['price'] = product['price'].decode('utf8')
        if u'£' in product['price']:
            amount = product['price'].replace(u'£', '')
            amount = amount.replace(' ', '').replace(',', '').strip()
            product['price'] = Price(priceCurrency='GBP', price=amount)
        else:
            self.log('Unknown currency at %s' % response.url, level=ERROR)

    gallery_links = response.xpath(
        "//ul[@id='galleryImages']/li[1]/a/@href").extract()
    if gallery_links:
        absolute = urlparse.urljoin(response.url, gallery_links[0])
        cond_set_value(product, 'image_url', absolute)

    description_xpath = (
        "//div[@id='bopBottom']"
        "//h2[@class='bopSectionHeader' and text()[1]='Product Description'][1]"
        "/following-sibling::*[@class='bopSection']"
        "//text()"
    )
    description_texts = response.xpath(description_xpath).extract()
    cond_set_value(product, 'description', self.clear_desc(description_texts))

    cond_set_value(product, 'locale', "en_GB")

    regex = "\/(\d+)"
    id_matches = re.findall(regex, response.url)
    if id_matches:
        reseller_id = id_matches[0]
    else:
        reseller_id = None
    cond_set_value(product, "reseller_id", reseller_id)

    brand_values = response.xpath(
        "string(//div[@id='bopBottom']//*[@itemprop='brand'])").extract()
    cond_set(product, 'brand', brand_values, string.strip)

    return product
def _scrape_product_links(self, response):
    """Yield ``(url, product)`` pairs for the ``.product`` boxes of a page.

    :param response: listing page response.
    :return: generator of ``(href, SiteProductItem)`` tuples with title and
        price pre-filled from the box via ``cond_set``.

    Boxes without a ``.title a`` link are skipped.  Previously indexing the
    empty result raised ``IndexError`` and aborted the generator, dropping
    every product that followed on the page.
    """
    for box in response.css('.product'):
        links = box.css('.title a::attr(href)').extract()
        if not links:
            continue
        product = SiteProductItem()
        cond_set(product, 'title', box.css('.title a::text').extract())
        cond_set(product, 'price',
                 box.css('.price .main::text').extract(), string.strip)
        yield links[0], product
def _populate_from_box(self, response, box, product):
    """Fill stock status, title and price from a search-result box.

    The stock flag is only touched when a ``span.red`` text exists near the
    box; it is true when that text contains "Currently out of stock".  The
    title is always rewritten with non-ASCII characters stripped.
    """
    red_texts = box.xpath('..//span[@class="red"]/text()').extract()
    if red_texts:
        sold_out = 'Currently out of stock' in red_texts[0]
        cond_set_value(product, 'is_out_of_stock', sold_out)

    cond_set(product, 'title', _itemprop(box, 'name'), unicode.strip)
    current_title = product.get('title', '')
    product['title'] = _strip_non_ascii(current_title)

    cond_set(product, 'price', _itemprop(box, 'price'))
def parse_product(self, response):
    """Populate the product from its page and request the product JS data.

    :param response: product page response; ``response.meta['product']`` is
        the item to fill, ``response.meta['vid']`` an optional variant id
        (defaults to 1).
    :return: ``None`` for out-of-stock redirect pages, otherwise a
        ``Request`` for ``PRODUCT_URL_JS`` handled by ``_parse_product_js``.

    The brand lookup in the title is now skipped when no title could be
    determined; previously ``title.find`` was called unconditionally, which
    fails with ``AttributeError`` when the page lookup yields nothing.
    """
    product = response.meta['product']
    vid = response.meta.get('vid', 1)

    if 'OutOfStockNoResults' in response.url:
        self.log("Product OutOfStock %s %s" % (response.url, product), DEBUG)
        return

    if not product.get("price"):
        price = is_empty(
            response.xpath(
                "//span[@id='priceText']/text() |"
                "//div[@id='tabWindow']/noscript"
            ).extract(), ""
        )
        # The last decimal number in the text is taken as the price.
        price = is_empty(re.findall(r"\d+\.\d+", price)[::-1])
        if price:
            product["price"] = Price(price=price, priceCurrency="USD")

    title = product.get('title')
    if isinstance(title, str):
        # Byte-string titles are normalised to unicode.
        title = title.decode('utf-8', 'ignore')
        product['title'] = title
    else:
        title = is_empty(
            response.xpath(
                "//div[@id='productNameText']/h1/text()").extract())
        if title:
            product["title"] = title

    if title:
        # Text in front of the trademark sign is treated as the brand.
        brindex = title.find("™")
        if brindex > 1:
            cond_set_value(product, 'brand', title[:brindex])
    cond_set_value(product, 'brand', self.BRAND)

    cond_set(product, 'description',
             response.xpath("//div[@id='tabWindow']").extract())
    product['locale'] = "en-US"

    new_meta = response.meta.copy()
    pid = product.get('upc')
    if not pid:
        # NOTE(review): when the URL has no "pid=" either, pid stays an
        # empty list and is formatted into the URL as-is - confirm whether
        # that case needs handling.
        pid = re.findall(r"pid=(\d+)", response.url)
        if pid:
            pid = pid[0]

    url = self.PRODUCT_URL_JS.format(pid=pid, vid=vid)
    return Request(url, callback=self._parse_product_js, meta=new_meta,
                   priority=100)
def _populate_from_js(self, response, product):
    """Read UPC, brand and price from the inline ``utag_data`` script.

    Logs a warning and leaves the product untouched when no such script is
    present.  A found price is always stored with currency ``TRY``.
    """
    utag_scripts = response.xpath(
        "//script[contains(text(), 'var utag_data=')]")
    if not utag_scripts:
        self.log("No JS matched in %s." % response.url, WARNING)
        return

    sku_matches = utag_scripts.re("product_sku:'(.+)[']")
    cond_set(product, 'upc', sku_matches)

    brand_matches = utag_scripts.re("product_brand:'(.+)[']")
    cond_set(product, 'brand', brand_matches)

    price_matches = utag_scripts.re("product_price:'(.+)[']")
    if price_matches:
        product['price'] = Price(price=price_matches[0], priceCurrency='TRY')
def parse_product(self, response):
    """Set the product title from the page and return the product.

    Responses whose URL equals ``self.product_url`` are ignored and yield
    ``None``.
    """
    if response.url == self.product_url:
        return None

    product = response.meta['product']
    name_xpath = (
        "//div[contains(@class,'prodTitle')]/h1/span[@itemprop='name']"
        "/text()"
    )
    cond_set(product, 'title', response.xpath(name_xpath).extract())

    # The title key has to exist even when nothing was found on the page.
    cond_set_value(product, 'title', "")
    return product
def _populate_from_html(self, response, product):
    """Populate reseller id, title, brand, price, image and description.

    A textual price containing "Rp" is converted to a ``Price`` in IDR; any
    other textual price is reported through ``self.log`` and left as-is.  The
    first ``.spesifications`` block is appended to the current description.
    """
    sku_matches = re.findall('\/sku(\d+)', response.url)
    cond_set(product, 'reseller_id', sku_matches)

    cond_set(product, 'title',
             response.css('[itemprop=name]::text').extract())
    cond_set(product, 'brand',
             response.css('#ctl00_content_lnkBrand::text').extract())
    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract())

    if product.get('price', '') and not isinstance(product['price'], Price):
        if 'Rp' in product['price']:
            amount = product['price'].lower().replace('rp', '')
            amount = amount.replace(',', '').strip()
            product['price'] = Price(price=amount, priceCurrency='IDR')
        else:
            self.log('Unrecognized currency at %s' % response.url)

    cond_replace(product, 'image_url',
                 response.css('#prodMedia img::attr(src)').extract())

    spec_blocks = response.css('.spesifications').extract()
    if spec_blocks:
        specs = spec_blocks[0]
    else:
        specs = ''
    description = product.get('description', '') + specs.strip()
    cond_replace_value(product, 'description', description)

    self._get_model_from_title(product)
def _populate_from_html(self, response, product):
    """Populate title, price, description, image, reseller id and brand.

    :param response: product page response.
    :param product: item to fill in place.

    The brand is guessed from the first words of the title.  Previously the
    title was read with ``product['title']``, which raises ``KeyError`` when
    neither the page nor an earlier step supplied a title; the guess is now
    skipped in that case.  The price is normalised by ``_unify_price`` and
    the locale defaults to ``en_GB``.
    """
    cond_set(product, 'title',
             response.css('.productSummary h1::text').extract())

    cond_set(product, 'price',
             response.css('.pricePerUnit::text').extract(), unicode.strip)
    cond_set(product, 'price',
             response.css('.pricing [class*=pricePer]').extract(),
             unicode.strip)

    xpath = ('//*[@id="information"]'
             '/node()[not(@class="access")][normalize-space()]')
    cond_set_value(product, 'description',
                   response.xpath(xpath).extract(), ''.join)

    cond_replace(
        product, 'image_url',
        response.css('#productImageHolder img::attr(src)').extract(),
        lambda url: urlparse.urljoin(response.url, url))

    reseller_id = response.xpath('.//*[@class="skuCode"]/text()').extract()
    cond_set(product, 'reseller_id', reseller_id, string.strip)

    title = product.get('title')
    if title is not None:
        brand = guess_brand_from_first_words(title, max_words=15)
        cond_set_value(product, 'brand', brand)

    self._unify_price(product)

    if not product.get("locale"):
        product["locale"] = "en_GB"
def _populate_from_html(self, response, product):
    """Populate price, description, UPC, title and image URL from the page.

    The description markup differs between pages, so two containers are
    tried in turn.  Protocol-relative image URLs get an ``http:`` scheme.
    """
    price_texts = response.xpath(
        "//*[@id='priceDivClass']/span/text()").extract()
    cond_set(product, 'price', price_texts)

    primary_desc = response.xpath("//*[@class='pIdDesContent']").extract()
    cond_set_value(product, 'description', primary_desc, conv=''.join)
    if not primary_desc:
        fallback_desc = response.xpath(
            "//div[@class='descriptContent']").extract()
        if fallback_desc:
            # Drop the current value so the fallback can be stored.
            del product['description']
            cond_set(product, 'description', fallback_desc)

    cond_set(product, 'upc',
             response.xpath("//*[@class='skuHidden']/@value").extract())

    # NOTE(review): the original comment called this an override of titles
    # from other sources, but the value is passed through cond_set - confirm
    # the helper really replaces an existing title.
    cond_set(product, 'title',
             response.css('.productTitle h1 ::text').extract())

    self._unify_price(product)

    image_sources = response.xpath(
        "//div[contains(@class,'productDetailPic')]"
        "/div/a/img/@src").extract()
    if not image_sources:
        return
    image_url = image_sources[0]
    if image_url.startswith("//"):
        image_url = 'http:' + image_url
    product['image_url'] = image_url
def _populate_from_html(self, response, product):
    """Populate image, price, brand, title, description and reseller id.

    Open Graph data is applied after the image lookup.  When no brand ends up
    on the product the page URL is passed to ``dump_url_to_file``.  The
    reseller id is the number in front of ``-pdt`` in the URL.
    """
    image_sources = response.css('[itemprop=image]::attr(src)').extract()
    cond_set(product, 'image_url', image_sources,
             lambda src: urlparse.urljoin(response.url, src))

    _populate_from_open_graph_product(response, product)

    price_texts = response.css('.currentPrice ins::text').extract()
    cond_set(product, 'price', price_texts, unicode.strip)

    cond_set(product, 'brand',
             response.css('[itemprop=brand]::text').extract())
    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set(product, 'title',
             response.css('[itemprop=name]::text').extract())

    articles = response.css('#longDesc article').extract()
    if articles:
        description = articles[0]
    else:
        description = None
    cond_set_value(product, 'description', description)

    reseller_id_regex = "(\d+)-pdt"
    id_matches = re.findall(reseller_id_regex, response.url)
    if id_matches:
        reseller_id = id_matches[0]
    else:
        reseller_id = None
    cond_set_value(product, 'reseller_id', reseller_id)

    self._unify_price(product)
def parse_buyer_reviews(self, response):
    """Read the ratings summary and attach it to the product.

    :param response: review page response; ``response.meta['product']`` is
        the item to update.
    :return: the product.

    Two defects are fixed here.  The collected ``reviews`` dict used to be
    thrown away (``cond_set`` was called with ``None``); it is now passed on
    to ``cond_set``.  A page without a ratings summary used to raise
    ``IndexError``; it now leaves the product unchanged.

    NOTE(review): the stored value is the plain dict built below - confirm
    it matches the format expected for ``buyer_reviews`` elsewhere.
    """
    product = response.meta.get("product")

    summary = "//div[contains(@class,'ratings-summary')]"
    avg = response.xpath(
        summary + "//p[@class='ratingNumber']/span/text()").extract()
    total_data = response.xpath(
        summary + "//p[@itemprop='reviewCount']/text()").extract()
    total = []
    if total_data:
        total = re.findall(r"Based on ([\d,]+) Ratings", total_data[0])

    if not (avg and total):
        self.log('No ratings summary found at %s' % response.url)
        return product

    reviews = {
        'num_of_reviews': total[0].replace(",", ""),
        'average_rating': avg[0],
        'buyer_reviews': {},
    }
    # cond_set takes an iterable of candidates, hence the one-element list.
    cond_set(product, 'buyer_reviews', [reviews])
    return product
def _price_from_html(self, response, product):
    """Read the price from the page and wrap it in a ``Price``.

    The currency comes from the ``priceCurrency`` itemprop and defaults to
    ``EUR``.  Decimal commas are turned into dots, and only a plain number
    (optionally padded with spaces) is converted.
    """
    bol_css = '.product-price-bol [itemprop=price]::attr(content)'
    cond_replace(product, 'price', response.css(bol_css).extract())

    offer_prices = response.xpath(
        "//span[@class='offer_price']/meta[@itemprop='price']/@content"
    ).extract()
    cond_set(product, 'price', offer_prices)

    currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
    if currency_sel:
        currency = currency_sel.extract()[0]
    else:
        currency = 'EUR'

    amount = product.get('price', '').replace(',', '.')
    if amount and re.match(' *\d+\.?\d* *\Z', amount):
        cond_replace_value(product, 'price', Price(currency, amount))
def _populate_from_box(self, response, box, product):
    """Fill title and price from a search-result box.

    The price is looked up in three places, in order, each time capturing
    the amount that follows a dollar sign.
    """
    titles = box.css('a[data-item-number]::attr(title)').extract()
    cond_set(product, 'title', titles)

    price_selectors = (
        '.price-point font::text',
        '.red-message.price-point::text',
        '.price-point::text',
    )
    for selector in price_selectors:
        amounts = box.css(selector).re('\$([\d ,.]+)')
        cond_set(product, 'price', amounts)
def _get_price(self, response, product):
    """ Parses and sets the product price, with all possible variations

    CSS locations are tried first and a list of XPath alternatives second.
    A price text with a dollar sign becomes a USD ``Price``; without one it
    becomes ``0.00`` USD when it contains "FREE" or a space, otherwise an
    error is logged.

    :param response: Scrapy's Response obj
    :param product: Scrapy's Item (dict, basically)
    :return: None
    """
    css_locations = ', '.join([
        '#priceblock_ourprice ::text',
        '#unqualifiedBuyBox .a-color-price ::text',
        '#priceblock_saleprice ::text',
        '#actualPriceValue ::text',
        '#buyNewSection .offer-price ::text',
    ])
    cond_set(product, 'price', response.css(css_locations).extract())

    if not product.get('price', None):
        xpath_alternatives = ' |'.join([
            '//td/b[@class="priceLarge"]/text()',
            '//span[@class="olp-padding-right"]'
            '/span[@class="a-color-price"]/text()',
            '//div[contains(@data-reftag,"atv_dp_bb_est_hd_movie")]'
            '/button/text()',
            '//span[@id="priceblock_saleprice"]/text()',
            '//li[@class="swatchElement selected"]'
            '//span[@class="a-color-price"]/text()',
            '//div[contains(@data-reftag,"atv_dp_bb_est_sd_movie")]'
            '/button/text()',
            '//div[@id="mocaBBRegularPrice"]'
            '/div/text()[normalize-space()]',
        ])
        cond_set(product, 'price',
                 response.xpath(xpath_alternatives).extract())

    raw_price = product.get('price', None)
    if not raw_price:
        return

    if '$' in raw_price:
        number_runs = re.findall('[\d ,.]+\d', raw_price)
        amount = re.sub('[, ]', '', number_runs[0])
        amount = amount.replace('$', '').strip().replace(',', '')
        product['price'] = Price(priceCurrency='USD', price=amount)
    elif 'FREE' in raw_price or ' ' in raw_price:
        product['price'] = Price(priceCurrency='USD', price='0.00')
    else:
        self.log('Currency symbol not recognized: %s' % response.url,
                 level=ERROR)
def parse_product(self, response):
    """Populate the product in ``response.meta`` from a detail page.

    Sets locale, title, image, brand, model, reseller id, stock flag,
    description and price, then delegates buyer reviews and related products
    to their helpers and returns the product.
    """
    product = response.meta['product']
    cond_set_value(product, 'locale', 'en-GB')

    cond_set(product, 'title', response.css('.product-name h1').extract())
    cond_set(product, 'image_url',
             response.css('#zoom1 img::attr(src)').extract())
    cond_set(product, 'brand',
             response.css('.box-brand a img::attr(alt)').extract())
    cond_set(product, 'model',
             response.xpath('//div[@itemprop="name"]/p/text()').extract())
    cond_set(product, 'reseller_id',
             response.xpath('//*[@class="product-sku"]/text()').extract())

    # Out of stock when the availability box says so.
    stock_xpath = '//span[@id="availability-box" and text()="Out of stock"]'
    cond_set_value(product, 'is_out_of_stock',
                   response.xpath(stock_xpath), bool)

    # Description: the non-blank child nodes of the first content wrapper.
    wrappers = response.css('.tabs-panels .std .content-wrapper')
    if wrappers:
        nodes = wrappers[0].xpath('node()[normalize-space()]')
        cond_set_value(product, 'description', nodes.extract(), u''.join)

    # Price: needs a non-zero amount and a currency.
    price_sel = response.css('[itemprop=price]::attr(content)')
    currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
    if price_sel:
        amount = price_sel[0].extract()
        if float(amount) and currency_sel:
            cond_set_value(
                product, 'price',
                Price(price=amount,
                      priceCurrency=currency_sel[0].extract()))

    self._populate_buyer_reviews(response, product)
    self._populate_related_products(response, product)
    return product
def _populate_from_html(self, response, product):
    """Populate image, title, brand, availability, price, model and more.

    :param response: product page response.
    :param product: item to fill in place; its ``url`` must already be set.

    Fixes compared to the previous version:

    * a product without a price no longer raises ``TypeError`` when the
      pound-sign pattern is applied; the fallback price location is used;
    * a page with a single delivery option no longer raises ``IndexError``
      when the second option is inspected;
    * available options are counted with ``sum`` instead of
      ``len(filter(...))``, which only works where ``filter`` returns a list.
    """
    cond_set(product, 'image_url',
             response.css('.largeimage::attr(src)').extract())
    cond_set(product, 'title',
             response.css('.productname::text').extract())
    cond_set(product, 'brand',
             response.css('.productbrand [itemprop=name]::text').extract())

    availability = [bool(option.css('.available'))
                    for option in response.css('.deliverycallout li')]
    available_count = sum(availability)
    if available_count:
        # In-store only: the second option is the sole available one.
        store_only = (len(availability) > 1
                      and availability[1]
                      and available_count == 1)
        cond_set_value(product, 'is_in_store_only', store_only)
    else:
        # NOTE(review): no available option marks the product as NOT out of
        # stock - kept as found, but this looks inverted; confirm.
        cond_set_value(product, 'is_out_of_stock', False)

    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract(),
             unicode.strip)
    cond_set(product, 'model',
             response.css('[itemprop=model]::text').extract())

    reseller_id = re.findall(r"\/(\d+)", response.url)
    cond_set_value(product, "reseller_id",
                   reseller_id[0] if reseller_id else None)

    gbp_pattern = u'\xa3 *\d[\d, .]*'
    price = product.get("price")
    if not price or not re.findall(gbp_pattern, price):
        # Missing or not in pounds: take the last entry of the pricing list.
        price = response.xpath(
            "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
        ).extract()
        if price:
            price = price[0].strip()
    if price:
        price = re.findall(gbp_pattern, price)
        if price:
            price = re.sub(u'[\xa3, ]+', '', price[0])
            cond_replace_value(product, 'price',
                               Price(priceCurrency='GBP', price=price))

    xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
    cond_set_value(product, 'description',
                   response.xpath(xpath).extract(), ''.join)

    # Drop the URL fragment.
    product['url'] = product['url'].rsplit('#', 1)[0]
def parse_product_old(self, response):
    """Populate the product from Open Graph metadata and page markup.

    :param response: product page response; ``response.meta['product']`` is
        the item to fill.
    :return: the product.

    Fixes compared to the previous version:

    * the ``?$browse_thumbnail$`` marker is removed as a suffix.
      ``str.rstrip`` treats its argument as a set of characters and could
      also eat the tail of the image name itself;
    * a missing ``image`` entry no longer raises ``AttributeError``;
    * thousands separators are removed before the price number is matched,
      so "1,299.00" is read as "1299.00" instead of "1".
    """
    prod = response.meta['product']

    # populate_from_open_graph is not usable here: no type=product metadata.
    metadata = _extract_open_graph_metadata(response)

    description = response.xpath(
        '//p[@itemprop="description"]//text()').extract()
    if description:
        cond_set_value(prod, 'description', description[0])
    else:
        cond_set_value(prod, 'description', metadata.get('description'))

    cond_set_value(prod, 'title', metadata.get('title'))
    cond_replace_value(prod, 'url', metadata.get('url'))

    img_url = metadata.get('image')
    if img_url:
        thumbnail_suffix = '?$browse_thumbnail$'
        if img_url.endswith(thumbnail_suffix):
            img_url = img_url[:-len(thumbnail_suffix)]
        cond_set_value(prod, 'image_url', img_url)

    locale = response.xpath(
        '//meta[@name="gwt:property"]/@content'
    ).re(r'locale=\s*(.*)')
    if locale:
        cond_set_value(prod, 'locale', locale[0])

    price_texts = response.xpath(
        '//span[@itemprop="price"]'
        '//span[contains(@class,"price-sales")]//text()'
    ).extract()
    amounts = []
    if price_texts:
        amounts = re.findall(r'[\d.]+', price_texts[0].replace(',', ''))
    # Only the sale price is handled; items with a regular price get none.
    if amounts:
        prod['price'] = Price(priceCurrency='USD', price=amounts[0])

    brand = response.xpath('//meta[@itemprop="brand"]/@content').extract()
    cond_set(prod, 'brand', brand)

    return prod
def _populate_from_html(self, response, product):
    """Populate brand, title, reviews, UPC and model from the page.

    :param response: product page response.
    :param product: item to fill in place.

    The brand is the part of the title in front of the first
    " - " separator.  Previously the presence check and the actual split
    used different patterns, so a title such as "Foo -Bar" passed the check
    and then failed tuple unpacking with ``ValueError``; the title is now
    split once and the result inspected.  A page without ``#sku-title`` text
    no longer raises ``IndexError``.
    """
    self._populate_from_schemaorg(response, product)

    title_texts = response.css("#sku-title ::text").extract()
    if title_texts:
        title = title_texts[0]
        pieces = re.split(r'\s+-\s+', title, maxsplit=1)
        if len(pieces) > 1:
            cond_set(product, 'brand', [pieces[0]])
        cond_set(product, 'title', [title])

    cond_set_value(product, 'buyer_reviews',
                   self._get_buyer_reviews(response))
    cond_set(product, 'upc', response.css("#sku-value ::text").extract())
    cond_set(product, 'model',
             response.css("#model-value ::text").extract())

    self._unify_price(product)
def _populate_from_html(self, response, product):
    """Populate title, description, image, brand and price from the page.

    An empty title is removed first.  The brand is read from the
    ``jcpPPJSON`` object embedded in a script tag.  The price is always
    written, falling back to ``0.0`` USD when nothing is found.
    """
    if product.get('title') == '':
        del product['title']

    cond_set(product, 'title',
             response.xpath('//h1[@itemprop="name"]/text()').extract(),
             conv=string.strip)
    cond_set(product, 'description',
             response.xpath('//div[@itemprop="description"]').extract(),
             conv=string.strip)

    image_src = is_empty(
        response.xpath('//div[@id="izView"]/noscript/img/@src').extract())
    if image_src:
        cond_set_value(product, 'image_url', 'http:' + image_src)

    embedded_json = is_empty(
        response.xpath('//script').re('jcpPPJSON\s?=\s?({.*});'))
    if embedded_json:
        data = json.loads(embedded_json)
        first_product = is_empty(data['products'])
        first_lot = is_empty(first_product['lots'])
        cond_set_value(product, 'brand', first_lot.get('brandName', None))

    amount = is_empty(
        response.xpath(
            '//span[@itemprop="price"]/a/text() |'
            '//span[@itemprop="price"]/text() ').re("\d+.?\d{0,2}"))
    if not amount:
        amount = '0.0'
    product['price'] = Price(price=amount, priceCurrency='USD')
def parse_product(self, response):
    """Populate the product in ``response.meta`` and return it.

    A limited-stock record is added when the stock-count span contains a
    number; the remaining fields come from the Open Graph and JS helpers.
    The locale is always set to ``tr-TR``.
    """
    product = response.meta['product']

    stock_xpath = (
        '//span[@id="ctl00_ContentPlaceHolder1_ProductControl1_'
        'MainControl1_ProductMain1_spanLimitedStockCount"]/text()'
    )
    stock_texts = response.xpath(stock_xpath).extract()
    if stock_texts:
        counts = re.findall("(\d+)", stock_texts[0])
        if counts:
            limited = LimitedStock(is_limited=True,
                                   items_left=int(counts[0]))
            cond_set(product, 'limited_stock', [limited])

    self._populate_from_open_graph(response, product)
    self._populate_from_js(response, product)

    product['locale'] = "tr-TR"
    return product
def parse_product(self, response):
    """Populate the product from its page and request its buyer reviews.

    :param response: product page response; ``response.meta['product']`` is
        the item to fill (a new ``SiteProductItem`` is created if absent).
    :return: whatever ``send_next_request`` returns when a review request
        was queued, otherwise the product.

    Fixes compared to the previous version:

    * the review request is only built when the product has a SKU; a page
      without one used to fail on ``product['sku'].replace``;
    * a newly created product is stored in the meta handed to the review
      request, so the callback receives the item that was filled here.
    """
    meta = response.meta.copy()
    product = meta.get('product')
    if product is None:
        product = SiteProductItem()
        meta['product'] = product

    reqs = []
    meta['reqs'] = reqs

    cond_set_value(product, 'locale', 'en_US')
    cond_set(product, 'title', self.parse_title(response))
    cond_set(product, 'image_url', self.parse_image(response))

    # The SKU doubles as the reseller id.
    sku = self.parse_sku(response)
    cond_set_value(product, 'sku', sku)
    cond_set_value(product, "reseller_id", sku)

    cond_set_value(product, 'price', self.parse_price(response))
    cond_set(product, 'description', self.parse_description(response))

    product['related_products'] = self.parse_related_product(response)

    otv = OrientaltradingVariants()
    otv.setupSC(response)
    _variants = otv._variants()
    if _variants:
        product['variants'] = _variants

    review_sku = product.get('sku')
    if review_sku:
        reqs.append(
            Request(url=self.REVIEW_URL.format(
                        product_id=review_sku.replace('/', '_'), index=0),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta))

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _populate_from_html(self, response, product):
    """Populate title, brand, description, image and in-store flag.

    An empty title is removed first.  Title and brand each have a primary
    location and an itemprop-based fallback.  The Open Graph image is
    prefixed with ``http:`` and ``is_in_store_only`` is always written.
    """
    if product.get('title') == '':
        del product['title']

    primary_titles = response.xpath(
        '//div[@class="product-detail-content"]/h3/text()').extract()
    cond_set(product, 'title', primary_titles, conv=string.strip)
    if not product.get('title', ''):
        fallback_titles = response.xpath(
            '//h1[contains(@itemprop, "name")]//text()').extract()
        if fallback_titles:
            product['title'] = fallback_titles[0].strip()

    primary_brands = response.xpath(
        '//div[@class="product-detail-content"]/h5/a/text()').extract()
    cond_set(product, 'brand', primary_brands, conv=string.strip)
    if not product.get('brand', ""):
        fallback_brands = response.xpath(
            '//h2[contains(@itemprop, "brand")]/a/text()').extract()
        if fallback_brands:
            product['brand'] = fallback_brands[0].strip()

    catalog_blocks = response.xpath(
        '//div[@class="product-catalog-content"]').extract()
    cond_set(product, 'description', catalog_blocks, conv=string.strip)

    og_images = response.xpath(
        "//meta[@property='og:image']/@content").extract()
    if og_images:
        product['image_url'] = 'http:' + og_images[0]

    instore_badge = response.xpath(
        '//div[@id="productBadge"]/img/@data-blzsrc[contains(.,"instore")]')
    product['is_in_store_only'] = True if instore_badge else False
def _populate_from_box(self, response, box, product):
    """Fill title, price, image and brand from a search-result box.

    The brand is the first entry of ``response.meta['brands']`` that the
    title starts with, or ``None`` when there is no such entry.
    """
    cond_set(product, 'title',
             box.css('.productInfo h3 a::text').extract(), unicode.strip)

    for price_css in ('.pricePerUnit::text', '.pricing [class*=pricePer]'):
        cond_set(product, 'price', box.css(price_css).extract(),
                 unicode.strip)

    cond_set(product, 'image_url',
             box.css('.productInfo h3 a img::attr(src)').extract(),
             lambda src: urlparse.urljoin(response.url, src))

    # Look for a known brand name at the very start of the title.
    matched_brand = None
    for candidate in response.meta.get('brands', []):
        if product.get('title', '').find(candidate) == 0:
            matched_brand = candidate
            break
    cond_set_value(product, 'brand', matched_brand)
def _populate_from_html(self, response, prod):
    """Populate the product from a detail page.

    :param response: product page response.
    :param prod: item to fill in place.

    Sets title, price, stock flag, image, description (merged with the
    details list), brand, reseller id and related products.

    Fixes compared to the previous version:

    * a page without a price node no longer raises ``IndexError``; the
      price is simply left unset;
    * the related-products check no longer indexes ``dict.values()``, which
      only works where ``values()`` returns a list.
    """
    # title
    title = response.css('h2[itemprop=name]::text')
    cond_set(prod, 'title', title.extract())

    # price
    price_divs = response.css('[itemprop=offers] > [itemprop=price]')
    if price_divs:
        price_div = price_divs[0]
        currency = price_div.css('[itemprop=priceCurrency]::attr(content)')
        price = price_div.css('[itemprop=price]::attr(content)')
        if currency and price:
            prod['price'] = Price(currency[0].extract(), price[0].extract())

    # out of stock
    cond_set_value(prod, 'is_out_of_stock',
                   response.css('.out_of_stock_box'), bool)

    # image
    img = response.css('.vip_gallery [itemprop=image] ::attr(src)')
    cond_set(prod, 'image_url', img.extract())

    # description, merged with details
    desc = response.xpath('//div[@itemprop="description"]/p | '
                          '//ul[@class="linear_list"]')
    cond_set_value(prod, 'description', ''.join(desc.extract()))

    # brand
    brand = response.css('input[name=brand_name] ::attr(value)')
    cond_set(prod, 'brand', brand.extract())

    # reseller_id
    reseller_id = re.findall(r"-(\d+)\.", response.url)
    cond_set_value(prod, "reseller_id",
                   reseller_id[0] if reseller_id else None)

    # related products, keyed by the heading of the "more by brand" block
    rel_key = ' '.join(
        response.xpath('//div[@class="moreby_brand"]'
                       '/a/h2//text()').extract())
    related = []
    for rel_item in response.css('#morefrom_slider > ul > li'):
        r_hr = rel_item.css('a::attr(href)')
        r_t = rel_item.css('a > span::text')
        if not r_hr or not r_t:
            continue
        related.append(RelatedProduct(r_t[0].extract(), r_hr[0].extract()))
    if related:
        cond_set_value(prod, 'related_products', {rel_key: related})
def _populate_from_html(self, response, product):
    """Populate brand, price, description, image, title, UPC and model.

    UPC and model are read from the bullet list in the details bucket.  An
    "Item model number" entry always supplies the model, while an "ASIN"
    entry only does so when no model has been seen yet.
    """
    simple_fields = (
        ('brand', '#brand ::text'),
        ('price', '#priceblock_ourprice ::text'),
        ('description', '.productDescriptionWrapper'),
        ('image_url', '#imgTagWrapperId > img ::attr(data-old-hires)'),
        ('title', '#productTitle ::text'),
    )
    for field, css in simple_fields:
        cond_set(product, field, response.css(css).extract())

    model = None
    for entry in response.css('td.bucket > .content > ul > li'):
        labels = entry.xpath('b/text()').extract()
        if not labels:
            # Not a "label: value" entry.
            continue

        label = labels[0].strip(' :').upper()
        if label == 'UPC':
            # Several UPCs may be listed; cond_set receives all of them.
            upc_text = entry.xpath('text()').extract()[0]
            cond_set(product, 'upc', upc_text.strip().split(' '), conv=int)
        elif label == 'ITEM MODEL NUMBER' or (
                label == 'ASIN' and model is None):
            model = entry.xpath('text()').extract()

    cond_set(product, 'model', model, conv=string.strip)