def parse_product(self, response):
    """Scrape a single Le Creuset product detail page.

    Yields one Product item with fixed brand/category and a shipping cost
    that depends on the extracted price (flat 4.75 under 50, free above).
    """
    selector = HtmlXPathSelector(response)
    product = ProductLoader(item=Product(), response=response)
    product.add_xpath('name', '//div[@class="product-name"]/h1/text()')
    product.add_value('url', response.url)
    product.add_value('brand', 'Le Creuset')
    product.add_value('category', 'Le Creuset')
    # The hidden "product" input serves as both SKU and unique identifier.
    product.add_xpath('sku', '//input[@name="product"]/@value')
    product.add_xpath('identifier', '//input[@name="product"]/@value')
    images = selector.select(
        '//div[@class="product-img-box"]/a/@href').extract()
    if images:
        product.add_value('image_url', images[0])
    product.add_xpath('price', '//div[@class="prodPriceWrap"]/h2/text()')
    # Orders under 50 pay a flat delivery charge; otherwise shipping is free.
    below_threshold = product.get_output_value('price') < 50
    product.add_value('shipping_cost', '4.75' if below_threshold else '0')
    product.add_value('stock', '1')
    item = product.load_item()
    item['metadata'] = LeCreusetMeta()
    yield item
def parse_product(self, response):
    """Follow listing links, or scrape the page itself as one product.

    If the page contains further product links, one request per link is
    yielded (re-entering this callback) and parsing stops. Otherwise the
    page is treated as a product detail page.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    product_links = hxs.select(
        '//div[contains(@class, "item-details")]/div/header/h3/a/@href'
    ).extract()
    product_links += hxs.select(
        '//div[contains(@class, "setProduct")]/div/h5/a/@href').extract()
    if product_links:
        base = get_base_url(response)
        for link in product_links:
            yield Request(urljoin_rfc(base, link),
                          callback=self.parse_product, meta=meta)
        return
    # The deepest breadcrumb entry names the product's category.
    category = hxs.select(
        '//ol[contains(@class, "hof-breadcrumbs")]/li/a[@itemprop="breadcrumb"]/text()'
    ).extract()[-1]
    sku = hxs.select('//div[@class="product-code"]/text()').re(
        r'Product code:(.*)')[0].strip()
    name = hxs.select(
        '//span[@itemprop="name"]/text()').extract()[0].strip()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta['brand'])
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)
    image_url = hxs.select(
        '//img[contains(@class, " featuredProductImage")]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    # Prefer the discounted "now" price; fall back to the plain price block.
    price = hxs.select(
        '//div[@id="productDetailsRefinementBlock"]/div/span/p[@class="priceNow"]/span[@class="value"]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            '//span[@id="productPriceContainer"]/p[@class="price"]/text()'
        ).extract()
    loader.add_value('price', price[0])
    # Pre-discount ("was") price text, whitespace-normalised, recorded as
    # promotion metadata.
    was_parts = hxs.select(
        '//div[@id="productDetailsRefinementBlock"]//p[@class="priceWas"]/span//text()'
    ).extract()
    price_was = ' '.join(part.strip() for part in was_parts).strip()
    item = loader.load_item()
    metadata = LeCreusetMeta()
    metadata['promotion'] = price_was
    item['metadata'] = metadata
    yield item
def parse(self, response):
    """Crawl product-list links, then scrape the page if it is a product.

    Requests are yielded for every link inside the product list; the rest
    of the method only runs when the page has a product heading.
    """
    selector = HtmlXPathSelector(response)
    base = get_base_url(response)
    for href in selector.select(
            '//div[@class="product_list"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href))
    # Not a product detail page — nothing more to extract.
    if not selector.select('//span[@class="product"]/h1/text()'):
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//span[@class="product"]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', 'Le Creuset')
    loader.add_xpath(
        'category',
        '//div[@class="text_breadcrumbs"]/a[position()>1]//text()')
    # "Ref: XXXX" text supplies both SKU and identifier.
    ref_xpath = ('substring-after(//font[@size="1" and '
                 'contains(text(), "Ref:")]/text(), ": ")')
    loader.add_xpath('sku', ref_xpath)
    loader.add_xpath('identifier', ref_xpath)
    images = selector.select('//img[@class="fullimage1"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base, images[0]))
    loader.add_xpath('price',
                     '//h3[@class="product_price"]/prices/span[2]/text()')
    # Fallback: some pages render the price as loose text in the heading.
    if not loader.get_output_value('price'):
        loader.add_xpath('price', '//h3[@class="product_price"]//text()')
    # Flat delivery charge below 50; free above.
    cheap = loader.get_output_value('price') < 50
    loader.add_value('shipping_cost', '4.95' if cheap else '0')
    in_stock = selector.select(
        '//div[@class="stock-message"]/span[contains(.//text(), "In stock") or contains(.//text(), "plenty of stock in")]'
    )
    loader.add_value('stock', '1' if in_stock else '0')
    item = loader.load_item()
    metadata = LeCreusetMeta()
    metadata['promotion'] = ''.join(selector.select(
        '//div[@class="special-offer-message"]/span/text()').extract())
    item['metadata'] = metadata
    yield item
def parse(self, response):
    """Yield one Product per row of the Le Creuset CSV feed.

    The response body is parsed as CSV; each row supplies identifier,
    SKU, name, price, image URL, and an ASIN stored in metadata.
    """
    for row in csv.DictReader(StringIO(response.body)):
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', row['Barcode'].lower())
        loader.add_value('sku', row['SKU'])
        # NOTE(review): 'Descritpion' appears to mirror the feed's own
        # misspelled column header — confirm against the actual CSV
        # before "fixing" the key.
        loader.add_value('name', row['Descritpion'].decode('utf'))
        loader.add_value('price', row['Price'])
        loader.add_value('brand', 'Le Creuset')
        loader.add_value('image_url', row['Product images'])
        product = loader.load_item()
        metadata = LeCreusetMeta()
        metadata['asin'] = row['ASIN']
        product['metadata'] = metadata
        yield product
def parse_product(self, response):
    """Scrape one product page; stock is inferred from the add-to-cart link.

    Shipping is a flat 2.99 below a price of 25, free otherwise. A sale
    badge image, when present, supplies the promotion metadata.
    """
    sel = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1/span/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', 'Le Creuset')
    loader.add_xpath(
        'category',
        '//ul[@class="breadcrumbs"]/li[position()>1]//span[@itemprop="title"]/text()'
    )
    # Hidden "productID" input serves as both SKU and identifier.
    loader.add_xpath('sku', '//input[@name="productID"]/@value')
    loader.add_xpath('identifier', '//input[@name="productID"]/@value')
    thumbs = sel.select('//div[@id="thumbnails"]/a/img/@src').extract()
    if thumbs:
        loader.add_value('image_url', thumbs[0])
    loader.add_xpath('price', '//div[@id="price"]/text()')
    # Free delivery from 25 upwards; otherwise a flat 2.99 charge.
    cheap = loader.get_output_value('price') < 25
    loader.add_value('shipping_cost', '2.99' if cheap else '0')
    # Presence of the add-to-cart link marks the product as in stock.
    available = sel.select('//a[@id="addcartlink"]')
    loader.add_value('stock', '1' if available else '0')
    item = loader.load_item()
    metadata = LeCreusetMeta()
    # The sale badge image filename encodes the discount percentage.
    discount = sel.select(
        '//img[contains(@src,"percent/sale_")]/@src').re('\d+')
    if discount:
        metadata['promotion'] = discount[0] + '% off retail price'
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Scrape product variants from JS data embedded in the page.

    The page ships variant data as inline JavaScript object literals
    ('childMap', 'prices', 'skus', 'stockStatuses'); these are extracted
    with regexes and parsed as JSON. One item is yielded per valid
    combination of the page's option <select> values.
    """
    hxs = HtmlXPathSelector(response)
    # Maps an option-value key (values joined with '_') to a child product
    # code.
    childMap = json.loads(
        re.search('\'childMap\': (.*),', response.body).group(1))
    prices = json.loads(
        re.search('\'prices\': (.*),', response.body).group(1))
    skus = json.loads(re.search('\'skus\': (.*),', response.body).group(1))
    stockStatuses = json.loads(
        re.search('\'stockStatuses\': (.*),', response.body).group(1))
    # Collect (value, label) pairs for every option dropdown, skipping
    # placeholder options with an empty value.
    selects = []
    for sel in hxs.select('//div[@class="product-options"]//select'):
        s = []
        for opt in sel.select('.//option'):
            if opt.select('./@value').extract()[0]:
                s.append((
                    opt.select('./@value').extract()[0],
                    opt.select('./text()').extract()[0],
                ))
        if s:
            selects.append(s)
    # No dropdowns on the page: fall back to sentinel keys ('' and '%')
    # so the product loop below still runs once against childMap.
    if not selects:
        selects = [[('', ''), ('%', '')]]
    # Alias '..._%' childMap keys to their bare form so both key shapes
    # resolve. list() is needed because the dict is mutated while iterating.
    for k, v in list(childMap.items()):
        if '_%' in k:
            childMap[k.replace('_%', '')] = v
    found = False
    # One candidate per cartesian product of the option dropdowns.
    for c in itertools.product(*selects):
        key = [x[0] for x in c]
        name = [x[1] for x in c]
        code = childMap.get('_'.join(key))
        if not code:
            continue
        code = str(code)
        loader = ProductLoader(item=Product(), response=response)
        # Base name from the page heading plus the option labels.
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('name', name)
        loader.add_value('sku', skus[code])
        loader.add_value('identifier', skus[code])
        loader.add_value('price', prices[code][0]['purchase'])
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        if 'In stock' in stockStatuses.get(code, ''):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')
        # Flat delivery charge below 45; free above.
        if loader.get_output_value('price') < 45:
            loader.add_value('shipping_cost', '4.95')
        else:
            loader.add_value('shipping_cost', '0')
        loader.add_xpath('category',
                         '//div[@class="crumbs"]/a[position()>2]/text()')
        image_url = hxs.select(
            '//div[@id="product-image"]//img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_url[0]))
        item = loader.load_item()
        metadata = LeCreusetMeta()
        item['metadata'] = metadata
        found = True
        yield item
    if not found:
        self.log('No products on %s' % response.url)
def _process_product_info_product_details(self, response, product_info):
    """
    This needs to be in separate function because used by two methods:
    parse_product_details and parse_ajax_price

    Dispatches a scraped Amazon product dict down one of three collection
    modes (amazon_direct / only_buybox / all_sellers-or-lowest), possibly
    yielding follow-up Requests (options, reviews, MBC seller lists) or
    collecting a constructed product.
    """
    hxs = HtmlXPathSelector(response)
    # Whitespace-normalised "regular price savings" text, stored as the
    # promotion metadata.
    promotion = ' '.join(''.join(
        hxs.select(
            '//tr[@id="regularprice_savings"]//text()').extract()).split())
    metadata = LeCreusetMeta()
    metadata['promotion'] = promotion
    # NOTE(review): metadata is only attached to the product in the
    # only_buybox branch below — confirm whether the other branches
    # should carry it too.
    if response.meta.get(
            'seller_identifier', None) and not product_info.get(
                'seller_identifier', None):
        product_info['seller_identifier'] = response.meta[
            'seller_identifier']
    # Bail out early when matching is enabled and the product does not
    # match the current search item.
    check_match = response.meta.get('check_match', True)
    match = self.match(response.meta, self.current_search_item,
                       product_info)
    if check_match and not match:
        self.log("[AMAZON] WARNING: product does not match: %s" %
                 response.url)
        return
    if self.parse_options:
        if product_info['options'] and response.meta.get(
                'parse_options', True):
            # Fan out one request per option variant; 'parse_options'
            # is cleared in the meta to avoid re-fanning on the variants.
            self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)
            for option in product_info['options']:
                new_meta = response.meta.copy()
                new_meta.update({
                    'parse_options': False,
                    'search_string': self.current_search,
                    'search_item': self.current_search_item,
                    'check_match': check_match
                })
                yield Request(option['url'],
                              self.parse_product,
                              meta=new_meta,
                              dont_filter=True)
            return
        else:
            # No fan-out: fold option information into the product name.
            if product_info['name_with_options']:
                product_info['name'] = product_info['name_with_options']
            elif product_info['option_texts']:
                product_info['name'] += ' [' + ', '.join(
                    product_info['option_texts']) + ']'
    # In 'asins' mode the ASIN embedded in the URL must agree with the
    # scraped ASIN (case-insensitively).
    if self.type == 'asins':
        url_asin = AmazonUrlCreator.get_product_asin_from_url(
            product_info['url'])
        if product_info['asin'].lower() != url_asin.lower():
            self.log(
                "[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s"
                % (product_info['asin'], url_asin, response.url))
            return
    # Amazon Direct
    if self.amazon_direct:
        # Optionally detour through the reviews page first; the product
        # dict rides along in meta as 'found_item'.
        if self.collect_reviews and product_info.get(
                'reviews_url') and response.meta.get(
                    'collect_reviews', True):
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['reviews_url'],
                          callback=self.parse_reviews,
                          meta=new_meta)
        else:
            product = self.construct_product(product_info,
                                             meta=response.meta)
            self.log("[AMAZON] collect parse product: %s" %
                     product['identifier'])
            if self.type == 'category':
                yield product
            else:
                self._collect_amazon_direct(product, response.meta)
    # Buy Box
    elif self.only_buybox:
        # Accept the buy-box seller if it passes the seller filter, or if
        # products without a dealer are explicitly allowed.
        if (product_info['vendor']
                and self._seller_ok(product_info['vendor'])) or \
                self.collect_products_with_no_dealer:
            if self.collect_reviews and product_info.get(
                    'reviews_url') and response.meta.get(
                        'collect_reviews', True):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(product_info['reviews_url'],
                              callback=self.parse_reviews,
                              meta=new_meta)
            else:
                product = self.construct_product(product_info,
                                                 meta=response.meta)
                # Only this branch attaches the promotion metadata.
                product['metadata'] = metadata
                self.log("[AMAZON] collect parse product: %s" %
                         product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_buybox(product, response.meta)
        elif not product_info['vendor']:
            # TODO: collect vendor from vendor details page
            self.log("[AMAZON] WARNING: product with no vendor: %s" %
                     response.url)
        else:
            self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                     response.url)
    # all sellers / lowest price
    elif self.all_sellers or self.lowest_product_and_seller:
        # Go to MBC lists to get dealers prices
        collect_mbc = response.meta.get('collect_mbc', True)
        if collect_mbc and product_info.get(
                'mbc_list_url_new') and self.collect_new_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        elif collect_mbc and product_info.get(
                'mbc_list_url_used') and self.collect_used_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_used'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        else:
            # No MBC detour: collect directly, mirroring the buy-box
            # seller checks above.
            if (product_info['vendor']
                    and self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get(
                        'reviews_url') and response.meta.get(
                            'collect_reviews', True):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string':
                            response.meta['search_string'],
                            'search_item': self.current_search_item,
                        })
                    yield Request(product_info['reviews_url'],
                                  callback=self.parse_reviews,
                                  meta=new_meta)
                else:
                    # In lowest-seller mode the seller id is omitted from
                    # the identifier unless explicitly requested.
                    use_seller_id_in_identifier = False \
                        if self.lowest_product_and_seller and not self.lowest_seller_collect_dealer_identifier else True
                    product = self.construct_product(
                        product_info,
                        meta=response.meta,
                        use_seller_id_in_identifier=
                        use_seller_id_in_identifier)
                    self.log("[AMAZON] collect parse product: %s" %
                             product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect(product)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log(
                    "[AMAZON] WARNING: Could not scrape vendor from product details: %s"
                    % response.url)
                self.errors.append(
                    "Could not scrape vendor from product details: %s" %
                    response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                         response.url)