def _populate_from_html(self, response, product):
    """Fill ``product`` with the fields scraped from a product page.

    Sets reseller id, title, brand, price, image URL and description, and
    finally asks ``self._get_model_from_title`` to derive the model.

    :param response: the product page response.
    :param product: the item to populate; modified in place.
    """
    # Raw string: '\/' and '\d' are not valid escapes in a plain literal.
    reseller_id = re.findall(r'/sku(\d+)', response.url)
    cond_set(product, 'reseller_id', reseller_id)

    cond_set(product, 'title',
             response.css('[itemprop=name]::text').extract())
    cond_set(product, 'brand',
             response.css('#ctl00_content_lnkBrand::text').extract())
    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract())

    # Only convert a raw price string; leave an existing Price untouched.
    price = product.get('price', '')
    if price and not isinstance(price, Price):
        if 'Rp' not in price:
            self.log('Unrecognized currency at %s' % response.url)
        else:
            product['price'] = Price(
                price=price.lower().replace('rp', '').replace(
                    ',', '').strip(),
                priceCurrency='IDR'
            )

    cond_replace(product, 'image_url',
                 response.css('#prodMedia img::attr(src)').extract())

    specs = response.css('.spesifications').extract()
    specs = specs[0] if specs else ''
    # ``or ''`` guards against a description that is present but None,
    # which would otherwise raise TypeError on concatenation.
    description = (product.get('description') or '') + specs.strip()
    cond_replace_value(product, 'description', description)

    self._get_model_from_title(product)
def _populate_from_html(self, response, product):
    """Fill ``product`` with the fields scraped from a product page.

    Sets title, price, description, image URL, reseller id and brand,
    then normalises the price via ``self._unify_price`` and defaults the
    locale to ``en_GB`` when none is set.

    :param response: the product page response.
    :param product: the item to populate; modified in place.
    """
    cond_set(product, 'title',
             response.css('.productSummary h1::text').extract())

    # Two price selectors are tried in order; the second call is passed
    # the same key, so it acts as a fallback to the first.
    cond_set(product, 'price',
             response.css('.pricePerUnit::text').extract(),
             unicode.strip)
    cond_set(product, 'price',
             response.css('.pricing [class*=pricePer]').extract(),
             unicode.strip)

    xpath = '//*[@id="information"]' \
            '/node()[not(@class="access")][normalize-space()]'
    cond_set_value(product, 'description',
                   response.xpath(xpath).extract(), ''.join)

    cond_replace(
        product, 'image_url',
        response.css('#productImageHolder img::attr(src)').extract(),
        lambda url: urlparse.urljoin(response.url, url))

    reseller_id = response.xpath('.//*[@class="skuCode"]/text()').extract()
    cond_set(product, 'reseller_id', reseller_id, string.strip)

    # The title may be missing when the selector above matched nothing;
    # skip brand guessing then instead of raising KeyError, so the price
    # and locale steps below still run.
    title = product.get('title')
    if title:
        brand = guess_brand_from_first_words(title, max_words=15)
        cond_set_value(product, 'brand', brand)

    self._unify_price(product)

    if not product.get("locale"):
        product["locale"] = "en_GB"
def _price_from_html(self, response, product):
    """Scrape the price from the page and store it as a ``Price``.

    The raw price is read from one of two microdata locations. It is
    converted to a ``Price`` only when it looks like a plain decimal
    number. The currency is read from the ``priceCurrency`` microdata
    and defaults to ``EUR``.

    :param response: the product page response.
    :param product: the item to populate; modified in place.
    """
    css = '.product-price-bol [itemprop=price]::attr(content)'
    cond_replace(product, 'price', response.css(css).extract())
    cond_set(
        product,
        'price',
        response.xpath(
            "//span[@class='offer_price']/meta[@itemprop='price']/@content"
        ).extract())

    price = product.get('price', '')
    # Nothing to convert, or already converted: calling ``.replace`` on
    # a Price object would fail.
    if not price or isinstance(price, Price):
        return

    currency = response.css('[itemprop=priceCurrency]::attr(content)')
    currency = currency.extract()[0] if currency else 'EUR'

    # Use a dot as the decimal separator.
    price = price.replace(',', '.')
    if re.match(r' *\d+\.?\d* *\Z', price):
        cond_replace_value(product, 'price', Price(currency, price))
def _populate_from_html(self, response, product):
    """Fill ``product`` with the fields scraped from a listing page.

    Populates hard-coded fields first, then title, price, marketplace
    (seller), image URL, description, canonical URL, brand, model and
    reseller id. Pages without a brand have their URL passed to
    ``dump_url_to_file``.

    :param response: the listing page response.
    :param product: the item to populate; modified in place.
    """
    # Shared XPath for the "<label>: <value>" rows of the attribute table.
    attr_xpath = '//td[@class="attrLabels" and contains(text(), "%s")]' \
                 '/following-sibling::td/span/text()'

    self._populate_hardcoded_fields(product)

    title_texts = response.css('#itemTitle::text').extract()
    cond_set(product, 'title', title_texts)

    price_texts = response.css(
        '[itemprop=price]::text , #mm-saleDscPrc::text').extract()
    cond_set(product, 'price', price_texts, self._unify_price)

    # The seller entry reuses the price that was just stored.
    seller_names = response.xpath(
        '//div[@class="mbg"]/a/span/text()').extract()
    if seller_names:
        seller_name = seller_names[0].strip()
        product["marketplace"] = [
            {"name": seller_name, "price": product.get("price", None)},
        ]

    image_sources = response.css('[itemprop=image]::attr(src)').extract()
    cond_replace(product, 'image_url', image_sources)

    description_nodes = response.xpath(
        '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]').extract()
    cond_set_value(product, 'description', description_nodes, ''.join)

    canonical_links = response.css('[rel=canonical]::attr(href)').extract()
    cond_replace(product, 'url', canonical_links)

    cond_set(product, 'brand',
             response.xpath(attr_xpath % "Brand:").extract())
    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set(product, 'model',
             response.xpath(attr_xpath % "Model:").extract())

    id_matches = re.findall("-\/([^\/&?\.\s]+)", response.url)
    cond_set_value(product, 'reseller_id',
                   id_matches[0] if id_matches else None)
def parse_product(self, response):
    """Parse a product page into the item carried in ``response.meta``.

    Populates open-graph fields, title, brand, image URL, description,
    sku / reseller id, price (GBP), related products, locale and the
    size variants.

    :param response: the product page response; ``response.meta``
        must contain the ``'product'`` item.
    :returns: the populated product item.
    """
    product = response.meta['product']

    populate_from_open_graph(response, product)

    cond_set(
        product,
        'title',
        response.xpath('//meta[@property="og:title"]/@content').extract(),
        conv=string.strip)

    # Guess the brand only when there is a title to guess from; calling
    # ``.strip()`` on a missing (None) title would raise AttributeError.
    title = product.get('title')
    if not product.get('brand') and title:
        brand = guess_brand_from_first_words(title.strip())
        if brand:
            product['brand'] = brand

    cond_replace(
        product,
        'image_url',
        response.css(".main-image-container>img::attr(src)").extract(),
        lambda url: urlparse.urljoin(response.url, url)
    )

    prod_description = response.css(
        ".product-details-container .description-copy p::text"
    )
    cond_set_value(product, 'description', "\n".join(
        x.strip() for x in prod_description.extract() if x.strip()))

    sku_text = response.css(
        ".product-details-container .caption-2::text").extract()
    sku = re.findall(r'\d+', sku_text[0]) if sku_text else None
    cond_set(product, 'sku', sku, string.strip)
    cond_set(product, 'reseller_id', sku, string.strip)

    # Prefer the sale price; fall back to any price span.
    price = response.css(
        ".product-details-container .right-side .price .sale::text"
    ).extract()
    if not price:
        price = response.css(
            ".product-details-container .right-side .price span::text"
        ).extract()
    cond_set(product, 'price', price, conv=string.strip)

    # Convert only a raw string price; an existing Price object (or a
    # missing value) has no ``.replace`` to call.
    current_price = product.get('price')
    if price and current_price and not isinstance(current_price, Price):
        product['price'] = Price(
            price=current_price.replace(u'\xa3', '').strip(),
            priceCurrency='GBP')

    related_products = self._parse_related(response)
    cond_set_value(product, 'related_products', related_products)

    cond_set_value(product, 'locale', 'en-GB')

    size_options = response.xpath(
        '//select[@id="SizeKey"]/option/text()').extract()
    # Variants share the page price; None when no price was found
    # (indexing the empty list used to raise IndexError).
    variant_price = price[0].replace(u'\xa3', '').strip() if price else None

    variant_list = []
    # The first <option> is skipped, as in the original code
    # (presumably a placeholder entry — TODO confirm).
    for option in size_options[1:]:
        label = option.replace('Size ', '').strip()
        sold_out = 'out of stock' in label
        size = label.replace(' (out of stock)', '') if sold_out else label
        variant_list.append({
            'price': variant_price,
            # Key was misspelled 'in_sock' before.
            'in_stock': not sold_out,
            'properties': {'size': size},
            'selected': False,
        })

    product['variants'] = variant_list

    return product
def _populate_from_html(self, response, product):
    """Populate the product item from a product page.

    @returns items 1 1
    @scrapes title description locale
    """
    # NOTE: the item stored in the request meta replaces the ``product``
    # argument; a fresh item is used when the meta carries none.
    product = response.meta.get('product', SiteProductItem())

    product['reseller_id'] = self._parse_reseller_id(response.url)

    page_text = response.body_as_unicode().lower()
    if u'>this product is currently unavailable' in page_text:
        product['no_longer_available'] = True
        return

    mv = MacysVariants()
    mv.setupSC(response)
    product['variants'] = mv._variants()

    variants = product.get('variants')
    if variants and len(variants) == 1:
        # One-variation product
        product['upc'] = variants[0]['upc']

    # --- Price: try several page layouts in turn -----------------------
    if response.xpath('//li[@id="memberItemsTab"]').extract():
        price = response.xpath(
            "//div[@id='memberProductList']/div[1]/"
            "div[@class='productPriceSection']/div/span[last()]/text()"
        ).re(FLOATING_POINT_RGEX)
    else:
        price = response.xpath(
            "//div[@id='priceInfo']/div/span/text()"
        ).re(FLOATING_POINT_RGEX)

    sale_price_sel = response.css('.priceSale::text')
    if sale_price_sel:
        price = sale_price_sel.re(FLOATING_POINT_RGEX)

    if not price:
        price = response.xpath('//*[contains(@id, "priceInfo")]').re(
            FLOATING_POINT_RGEX)
    if not price:
        price = response.xpath(
            '//*[contains(@class, "singlePrice")][contains(text(), "$")]'
        ).re(FLOATING_POINT_RGEX)
    if not price:
        # TODO Move to another method, populate_from_json
        json_product_data = response.xpath(
            './/script[@id="productMainData"]/text()').extract()
        json_product_data = json.loads(
            json_product_data[0]) if json_product_data else None
        if json_product_data:
            # Only use salePrice when present; ``[None]`` is a truthy
            # list and would hand None to Price below.
            sale_price = json_product_data.get('salePrice')
            if sale_price is not None:
                price = [sale_price]
            in_stock = json_product_data.get('inStock', None)
            if in_stock is not None:
                product['is_out_of_stock'] = in_stock != "true"

    if price:
        product['price'] = Price(price=price[0], priceCurrency='USD')

    # --- Image URL: replace missing or inline "data:" placeholders -----
    if not product.get("image_url") or \
            "data:image" in product.get("image_url"):
        image_url = response.xpath(
            "//img[contains(@id, 'mainView')]/@src").extract()
        if image_url:
            product["image_url"] = image_url[0]
    if not product.get('image_url'):
        cond_set(
            product, 'image_url',
            response.xpath('//*[contains(@class,'
                           ' "productImageSection")]//img/@src').extract())
    if not product.get('image_url'):
        cond_set(
            product, 'image_url',
            response.xpath(
                '//*[contains(@class, "mainImages")]'
                '//*[contains(@class, "imageItem")]//img/@src').extract())
    if not product.get("image_url") or \
            "data:image" in product.get("image_url"):
        img_src = response.xpath(
            '//*[contains(@class, "imageItem") '
            'and contains(@class, "selected")]/img/@src').extract()
        if img_src:
            product['image_url'] = img_src[0]

    # --- Title ---------------------------------------------------------
    title = response.css('#productTitle::text').extract()
    if not title:
        title = response.xpath(
            '//*[contains(@class, "productTitle")]'
            '[contains(@itemprop, "name")]/text()').extract()
    if title:
        cond_replace(product, 'title', [''.join(title).strip()])
    if not product.get('title', None):
        title = response.xpath(
            '//h1[contains(@class,"productName")]//text()').extract()
        if title:
            product['title'] = title[0].strip()

    # --- Description ---------------------------------------------------
    path = '//*[@id="memberProductDetails"]/node()[normalize-space()]'
    desc = response.xpath(path).extract()
    if not desc:
        desc = response.xpath(
            '//*[@id="productDetails"]/node()[normalize-space()]'
        ).extract()
    if desc:
        # Drop the ad block embedded in the details markup.
        desc = [d for d in desc if 'id="adPool"' not in d]
    cond_set_value(product, 'description', desc, ''.join)
    if not product.get('description', ''):
        product['description'] = (' '.join(
            response.css('#product-detail-control ::text').extract()))
    if not product.get('description', ''):
        desc = response.xpath(
            ".//*[@id='longDescription']/text()").extract()
        product['description'] = desc[0] if desc else ''

    # --- Locale --------------------------------------------------------
    locale = response.css('#headerCountryFlag::attr(title)').extract()
    if not locale:
        locale = response.xpath(
            '//meta[@property="og:locale"]/@content').extract()
    cond_set(product, 'locale', locale)

    # --- Brand ---------------------------------------------------------
    brand = response.css('#brandLogo img::attr(alt)').extract()
    if not brand:
        brand = response.xpath(
            './/*[@class="productTitle"]/a[@class="brandNameLink"]/text()'
        ).extract()
    if not brand:
        # The title may be missing; ``product['title']`` would raise.
        known_title = product.get('title') or u''
        brand = [guess_brand_from_first_words(
            known_title.replace(u'®', ''))]
    cond_set(product, 'brand', brand)

    if (product.get('brand') or '').lower() == 'levis':
        product['brand'] = "Levi's"

    # --- Product id ----------------------------------------------------
    product_id = response.css('#productId::attr(value)').extract()
    if not product_id:
        product_id = response.xpath(
            '//*[contains(@class,"productID")]'
            '[contains(text(), "Web ID:")]/text()').extract()
        if product_id:
            # Keep only the digits of the "Web ID: ..." label.
            product_id = [
                ''.join([c for c in product_id[0] if c.isdigit()])
            ]

    # --- Reviews -------------------------------------------------------
    # NOTE(review): requests.get/post below block inside the callback.
    if product_id:
        url = "http://macys.ugc.bazaarvoice.com/7129aa/%s" \
              "/reviews.djs?format=embeddedhtml" % (product_id[0],)
        r = requests.get(url)
        materials = re.findall("var materials=(.*)", r.text)
        if materials:
            # The last character of the match is cut off before parsing
            # (presumably a trailing statement terminator — TODO confirm).
            data = json.loads(materials[0][0:-1])
            hxs = HtmlXPathSelector(text=data["BVRRSourceID"])
            num_of_reviews = hxs.xpath(
                '//div[@id="BVRRQuickTakeSummaryID"]'
                '/div/div/div/div/div/div/div/div/span'
                '/span[contains(@class, "BVRRNumber")]/text()').extract()
            if num_of_reviews:
                num_of_reviews = int(num_of_reviews[0].replace(',', ''))
                histogram = hxs.xpath(
                    '//div/span[@class="BVRRHistAbsLabel"]/text()'
                ).extract()
                # A full 1..5 histogram is required; a shorter list used
                # to raise IndexError.
                if len(histogram) >= 5:
                    rating_by_star = {}
                    count = 0
                    review_sum = 0
                    # Labels are read in reverse page order and numbered
                    # from 1 star upwards.
                    for star, label in enumerate(histogram[::-1][:5], 1):
                        votes = label.replace(',', '')
                        rating_by_star[star] = votes
                        count += int(votes)
                        review_sum += star * int(votes)
                    # Avoid ZeroDivisionError on an all-zero histogram.
                    if count:
                        average_rating = round(
                            float(review_sum) / float(count), 2)
                        br = BuyerReviews(num_of_reviews, average_rating,
                                          rating_by_star)
                        cond_set_value(product, 'buyer_reviews', br)
    # Fallback used whenever no reviews were stored above.
    cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)

    # --- Related Products ----------------------------------------------
    if product_id:
        aj_url = "http://www1.macys.com/sdp/rto/request/recommendations"
        headers = {
            'Content-type': 'application/x-www-form-urlencoded',
        }
        aj_body = {
            'productId': product_id[0],
            'visitorId': '0',
            'requester': 'MCOM-NAVAPP',
            'context': 'PDP_ZONE_A'
        }
        r = requests.post(aj_url,
                          data=urllib.urlencode(aj_body),
                          headers=headers)
        data = json.loads(r.text)
        rp = []
        for el in data.get('recommendedItems') or []:
            link = "http://www1.macys.com/shop/catalog/" \
                   "product/newthumbnail/json?" \
                   "productId=%s&source=118" % (el["productId"],)
            # Separate name: the loop no longer rebinds ``data`` while
            # iterating over it.
            thumb = json.loads(requests.get(link).text)
            rel_title, rel_url = "", ""
            try:
                rel_title = thumb["productThumbnail"]["productDescription"]
                rel_url = "http://www1.macys.com/" + \
                    thumb["productThumbnail"]["semanticURL"]
            except (KeyError, TypeError):
                # Thumbnail payload incomplete; keep whatever was read.
                pass
            if rel_title or rel_url:
                rp.append(RelatedProduct(rel_title, rel_url))
        if rp:
            product["related_products"] = {'Customers Also Shopped': rp}
def _populate_from_html(self, response, product):
    """Populate the product item from a product page.

    @returns items 1 1
    @scrapes title description locale
    """
    # NOTE: the item stored in the request meta replaces the ``product``
    # argument; a fresh item is used when the meta carries none.
    product = response.meta.get('product', SiteProductItem())

    mv = MacysVariants()
    mv.setupSC(response)
    product['variants'] = mv._variants()

    # --- Price: try several page layouts in turn -----------------------
    if response.xpath('//li[@id="memberItemsTab"]').extract():
        price = response.xpath(
            "//div[@id='memberProductList']/div[1]/"
            "div[@class='productPriceSection']/div/span[last()]/text()"
        ).re(FLOATING_POINT_RGEX)
    else:
        price = response.xpath(
            "//div[@id='priceInfo']/div/span/text()"
        ).re(FLOATING_POINT_RGEX)

    sale_price_sel = response.css('.priceSale::text')
    if sale_price_sel:
        price = sale_price_sel.re(FLOATING_POINT_RGEX)

    if not price:
        price = [
            p.strip()
            for p in response.xpath(
                '//*[@id="priceInfo"]//text()').extract()
            if p.strip()
        ]
    if not price:
        price = response.xpath(
            '//*[contains(@id, "priceInfo")]').re(FLOATING_POINT_RGEX)
    if not price:
        # ``.re(...)`` was missing here, so a Selector object rather
        # than the matched number ended up in Price below.
        price = response.xpath(
            '//*[contains(@class, "singlePrice")][contains(text(), "$")]'
        ).re(FLOATING_POINT_RGEX)

    if price:
        product['price'] = Price(price=price[0], priceCurrency='USD')

    # --- Image URL: replace missing or inline "data:" placeholders -----
    if not product.get("image_url") or \
            "data:image" in product.get("image_url"):
        image_url = response.xpath(
            "//img[contains(@id, 'mainView')]/@src").extract()
        if image_url:
            product["image_url"] = image_url[0]
    if not product.get('image_url'):
        cond_set(
            product, 'image_url',
            response.xpath('//*[contains(@class,'
                           ' "productImageSection")]//img/@src').extract()
        )
    if not product.get('image_url'):
        cond_set(
            product, 'image_url',
            response.xpath(
                '//*[contains(@class, "mainImages")]'
                '//*[contains(@class, "imageItem")]//img/@src').extract()
        )
    if not product.get("image_url") or \
            "data:image" in product.get("image_url"):
        img_src = response.xpath(
            '//*[contains(@class, "imageItem") '
            'and contains(@class, "selected")]/img/@src').extract()
        if img_src:
            product['image_url'] = img_src[0]

    # --- Title ---------------------------------------------------------
    title = response.css('#productTitle::text').extract()
    if not title:
        title = response.xpath(
            '//*[contains(@class, "productTitle")]'
            '[contains(@itemprop, "name")]/text()').extract()
    title = title[0].strip() if title else ''
    if not product.get('title', None):
        h1_title = response.xpath(
            '//h1[contains(@class,"productName")]//text()').extract()
        h1_title = h1_title[0].strip() if h1_title else ''
        # Prefer the <h1> text when present, but keep the title found
        # above otherwise; it used to be overwritten with ''.
        if h1_title:
            title = h1_title
    if title:
        cond_replace(product, 'title', [title])

    # --- Description ---------------------------------------------------
    path = '//*[@id="memberProductDetails"]/node()[normalize-space()]'
    desc = response.xpath(path).extract()
    if not desc:
        desc = response.xpath(
            '//*[@id="productDetails"]/node()[normalize-space()]'
        ).extract()
    if desc:
        # Drop the ad block embedded in the details markup.
        desc = [d for d in desc if 'id="adPool"' not in d]
    cond_set_value(product, 'description', desc, ''.join)

    # --- Locale --------------------------------------------------------
    locale = response.css('#headerCountryFlag::attr(title)').extract()
    if not locale:
        locale = response.xpath(
            '//meta[@property="og:locale"]/@content'
        ).extract()
    cond_set(product, 'locale', locale)

    # --- Brand ---------------------------------------------------------
    brand = response.css('#brandLogo img::attr(alt)').extract()
    if not brand:
        brand = response.xpath(
            './/*[@class="productTitle"]/a[@class="brandNameLink"]/text()'
        ).extract()
    if not brand:
        # The title may be missing; ``product['title']`` would raise.
        known_title = product.get('title') or u''
        brand = [guess_brand_from_first_words(
            known_title.replace(u'®', ''))]
    cond_set(product, 'brand', brand)

    if (product.get('brand') or '').lower() == 'levis':
        product['brand'] = "Levi's"

    product_id = response.css('#productId::attr(value)').extract()

    self._parse_reviews(response, product)

    # --- Related Products ----------------------------------------------
    # NOTE(review): requests.post/get below block inside the callback.
    if product_id:
        aj_url = "http://www1.macys.com/sdp/rto/request/recommendations"
        headers = {
            'Content-type': 'application/x-www-form-urlencoded',
        }
        aj_body = {
            'productId': product_id[0],
            'visitorId': '0',
            'requester': 'MCOM-NAVAPP',
            'context': 'PDP_ZONE_A'
        }
        r = requests.post(
            aj_url,
            data=urllib.urlencode(aj_body),
            headers=headers
        )
        data = json.loads(r.text)
        rp = []
        for el in data.get('recommendedItems') or []:
            link = "http://www1.macys.com/shop/catalog/" \
                   "product/newthumbnail/json?" \
                   "productId=%s&source=118" % (el["productId"],)
            # Separate name: the loop no longer rebinds ``data`` while
            # iterating over it.
            thumb = json.loads(requests.get(link).text)
            rel_title, rel_url = "", ""
            try:
                rel_title = thumb["productThumbnail"]["productDescription"]
                rel_url = "http://www1.macys.com/" + \
                    thumb["productThumbnail"]["semanticURL"]
            except (KeyError, TypeError):
                # Thumbnail payload incomplete; keep whatever was read.
                pass
            if rel_title or rel_url:
                rp.append(RelatedProduct(rel_title, rel_url))
        if rp:
            product["related_products"] = {'Customers Also Shopped': rp}