def parse_option(self, response):
    """Build the item for one product option from a JSON response.

    The response body is decoded as JSON.  The parent item found in
    ``response.meta['item']`` is copied into a fresh loader, after which
    identifier, sku, price, url and stock are replaced with the option's
    own values and the option sub-title is added to the name field.  Any
    struck-through price text is passed through ``extract_price`` and
    stored in the item's metadata under ``'Promotions'`` (empty string
    when that result is falsy).
    """
    parent_item = response.meta['item']
    payload = json.loads(response.body)

    option_loader = ProductLoader(item=Product(), response=response)
    # Start from every field of the parent item, then override per option.
    option_loader.add_value(None, parent_item)
    for field in ('identifier', 'sku'):
        option_loader.replace_value(field, payload['sku'])

    # 'productPrices' is an HTML fragment holding current and old prices.
    prices_html = Selector(text=payload['productPrices'])
    current_price_parts = prices_html.css('.price-group').xpath(
        'span/span//text()').extract()
    option_loader.replace_value('price', ''.join(current_price_parts))
    struck_price_parts = prices_html.css(
        '.price-group p span.text--strikethrough ::text').extract()

    option_loader.replace_value('url', payload['url'])

    # An "inactive" stock indicator in the fragment means out of stock.
    stock_html = Selector(text=payload['stock'])
    inactive_marker = stock_html.css('.stock-indicator__status--inactive')
    option_loader.replace_value('stock', 0 if inactive_marker else 1)

    title_html = Selector(text=payload['navigationTitle'])
    option_loader.add_value(
        'name', title_html.css('.sub-title::text').extract_first())

    product = option_loader.load_item()

    previous_price = extract_price(''.join(struck_price_parts))
    promo_meta = MetaData()
    promo_meta['Promotions'] = previous_price or ''
    product['metadata'] = promo_meta
    yield product
def parse_sub(self, response):
    """Process a product listing page.

    For each entry in the ``products-list`` list:

    * if its numeric product id is a key of ``self.previous_crawl_data``,
      an item is rebuilt from the cached fields with the price re-read
      from the listing and stock set to 0 when the entry is marked out
      of stock;
    * otherwise a request for the product page is yielded with
      ``self.parse_product`` as callback.

    Finally every distinct pagination link is requested with this method
    as callback.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for entry in hxs.select('//ol[@id="products-list"]/li'):
        id_matches = entry.select(
            './/span[contains(@id, "product-price-")]/@id'
        ).re(r'product-price-(\d+)')
        product_id = id_matches[0] if id_matches else ''

        if not (product_id and product_id in self.previous_crawl_data):
            # Unknown product: fetch its page for a full parse.
            links = entry.select(
                './/h2[@class="product-name"]/a/@href').extract()
            if links:
                yield Request(urljoin_rfc(base_url, links[0]),
                              callback=self.parse_product)
            continue

        self.log('>>> CACHED PRODUCT => %s' % product_id)
        cached_fields = self.previous_crawl_data[product_id]
        loader = ProductLoader(item=Product(**cached_fields), selector=entry)
        price_xpath = './/span[@id="product-price-%s"]//text()' % product_id
        loader.replace_xpath('price', price_xpath, re=r'([\d.,]+)')
        if entry.select('.//p[@class="availability out-of-stock"]'):
            loader.replace_value('stock', 0)
        yield loader.load_item()

    # set() collapses pager links that appear more than once on the page.
    pager_links = hxs.select('//div[@class="pages"]//a/@href').extract()
    for link in set(pager_links):
        yield Request(urljoin_rfc(base_url, link), callback=self.parse_sub)
def parse_product(self, response):
    """Scrape one product page that was requested for a feed row.

    ``response.meta['row']`` supplies identifier/sku (``'Unique product
    code'``), brand, category, name, cost price and EAN.  The page supplies
    the image, the price (regular price first, special price as fallback)
    and the out-of-stock flag.

    When the page embeds both a ``spConfig`` product configuration and a
    ``care_attribs`` SKU table, the child product whose SKU equals the
    row's product code overrides price and stock.

    Yields a single item with cost price and EAN attached as metadata.
    """
    row = response.meta['row']
    product_code = row['Unique product code']

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', product_code)
    loader.add_value('sku', product_code)
    loader.add_value('url', response.url)
    loader.add_value('brand', row['Brand'])
    loader.add_value('category', row['Category'])
    loader.add_value('name', row['Product name'])

    image_url = response.xpath(
        '//p[contains(@class, "product-image")]/a/@href').extract()
    loader.add_value('image_url', image_url[0] if image_url else '')

    price = response.xpath(
        '//div[@class="add_to_cart"]//span[@class="regular-price"]//span/text()'
    ).extract()
    if not price:
        price = response.xpath(
            '//div[@class="add_to_cart"]//p[@class="special-price"]//span[@class="price"]/text()'
        ).extract()
    # Neither price block may be present (the price can still come from the
    # child-product data below); indexing unconditionally raised IndexError.
    if price:
        loader.add_value('price', price[0])

    if response.xpath('//p[@class="availability out-of-stock"]'):
        loader.add_value('stock', 0)

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    sku_config = None
    if options_config:
        sku_config = re.search('var care_attribs = (.*);', response.body)

    # Both script variables are needed to map child products to SKUs; a
    # page missing ``care_attribs`` keeps the page-level price and stock.
    if options_config and sku_config:
        product_data = json.loads(options_config.group(1))
        sku_data = json.loads(sku_config.group(1))

        # Every child product id referenced by any attribute option.
        child_ids = set()
        for attr in product_data['attributes'].values():
            for option in attr['options']:
                child_ids.update(option['products'])

        for identifier in child_ids:
            if sku_data[identifier]['attrib_sku']['value'] != product_code:
                continue
            loader.replace_value(
                'price', product_data['childProducts'][identifier]['finalPrice'])
            if not product_data['stockInfo'][identifier]['stockQty']:
                loader.replace_value('stock', 0)
            break

    item = loader.load_item()
    metadata = EbeddingMeta()
    metadata['cost_price'] = row['Cost price']
    metadata['ean'] = row['EAN']
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield one product item built mostly from the page's meta tags.

    Name, image, price and availability are read from ``og:`` /
    ``product:`` meta tags; identifier and sku are both the
    ``PRODUCT_NUMBER`` entry of ``response.meta['row']``.  The price
    defaults to ``'0.00'`` when the price tag is absent, shipping cost is
    always ``'3.99'`` and stock is set to 0 unless the availability tag
    says ``instock``.  Breadcrumb links after the first become categories.

    If the page lists options (ignoring "Please Choose" entries) only the
    first one is used: its text is appended to the name.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)  # not referenced again in this method
    loader = ProductLoader(item=Product(), selector=hxs)

    title_nodes = hxs.select('//meta[@property="og:title"]/@content')
    main_name = title_nodes[0].extract()
    loader.add_value('name', main_name)
    loader.add_value('url', response.url)

    product_number = response.meta.get('row').get('PRODUCT_NUMBER')
    loader.add_value('identifier', product_number)
    loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')

    price_meta = hxs.select(
        '//meta[@property="product:price:amount"]/@content').extract()
    if price_meta:
        price_value = format_price(Decimal(price_meta[0]))
    else:
        price_value = '0.00'
    loader.add_value('price', price_value)

    loader.add_value('sku', product_number)
    loader.add_xpath(
        'brand', '//div[@itemprop="brand"]/div[@class="Value"]/a/span/text()')
    loader.add_value('shipping_cost', '3.99')

    in_stock = hxs.select(
        '//meta[@property="og:availability" and @content="instock"]')
    if not in_stock:
        loader.add_value('stock', 0)

    # The first breadcrumb link is skipped; the rest are added in order.
    breadcrumb_links = hxs.select(
        '//div[@id="ProductBreadcrumb"]/ul/li/a/text()')
    for crumb in breadcrumb_links[1:].extract():
        loader.add_value('category', crumb)

    option_labels = hxs.select(
        '//div[@class="productOptionViewSelect"]/select/option[not(contains(text(),"Please Choose"))]/text()'
    ).extract()
    if not option_labels:
        yield loader.load_item()
        return

    loader.replace_value('name', '{} {}'.format(main_name, option_labels[0]))
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per size option, or a single item when there are none.

    Name, price and brand are pulled from the ``ecommerce`` JavaScript
    object embedded in the page body; identifier and sku default to the
    ``PRODUCT_NUMBER`` entry of ``response.meta['row']``.  Breadcrumb links
    after the first become categories.

    Each ``select_size`` option replaces the identifier with the option's
    ``value`` attribute, and its ``"<name> - <price>"`` label replaces the
    name suffix and the price.

    Raises:
        AttributeError: if the ``ecommerce`` name/price/brand patterns are
            not found in the body.
        IndexError: if an option has no ``value`` attribute or no text.
    """
    hxs = HtmlXPathSelector(response)

    main_name = re.search('ecommerce.*name\': \'(.*?)\'', response.body, re.DOTALL).group(1)
    main_price = re.search('ecommerce.*price\': \'(.*?)\'', response.body, re.DOTALL).group(1)
    brand = re.search('ecommerce.*?brand\': \'(.*?)\'', response.body, re.DOTALL).group(1)

    product_number = response.meta.get('row').get('PRODUCT_NUMBER')

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('name', main_name)
    loader.add_value('url', response.url)
    # The page-level price; previously the URL was stored here by mistake,
    # which only went unnoticed because options overwrite the price.
    loader.add_value('price', main_price)
    loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
    loader.add_value('identifier', product_number)
    loader.add_value('sku', product_number)
    loader.add_value('brand', brand)

    breadcrumb_links = hxs.select(
        '//div[@id="breadcrumb"]/ul[@id="crumbs"]/li/a/text()')
    for category in breadcrumb_links[1:].extract():
        loader.add_value('category', category)

    options = hxs.select('//select[@name="ProductID" and @id="select_size"]/option')
    if not options:
        yield loader.load_item()
        return

    for option in options:
        identifier = option.select('./@value')[0].extract()
        label = option.select('./text()')[0].extract().strip()

        # Split on the LAST ' - ' so option names that themselves contain
        # the separator no longer break the two-way unpacking.
        option_name, separator, option_price = label.rpartition(' - ')
        if not separator:
            # Label carries no price part: keep the page-level price.
            option_name, option_price = label, main_price

        loader.replace_value('identifier', identifier)
        loader.replace_value('name', '{} {}'.format(main_name, option_name))
        loader.replace_value('price', option_price)
        yield loader.load_item()
def parse_product(self, response):
    """Yield product items for a product page, expanding its options.

    Three outcomes, checked in this order:

    1. The page has custom options (``product-custom-option`` select):
       one item per option with a non-empty ``value``; the option's
       ``price`` attribute is added to the base price and the brand is
       left empty.
    2. The page embeds a ``Product.Config`` JSON: one item per child
       product, named with the joined option labels and priced at base
       price plus the summed option prices.
    3. Otherwise: a single item for the product itself.

    The base price prefers the special price over the regular price and
    falls back to 0; stock is set to 0 whenever the loader reports no
    price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_identifier = hxs.select(
        '//input[@name="product"]/@value')[0].extract()
    product_name = hxs.select(
        '//div[@class="product-name"]/span/text()')[0].extract().strip()

    price_texts = response.xpath(
        '//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price_texts:
        price_texts = response.xpath(
            '//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    if price_texts:
        base_price = extract_price(price_texts[0])
    else:
        base_price = 0

    image_sources = hxs.select('//img[@id="image-main"]/@src').extract()
    if image_sources:
        image_url = urljoin_rfc(base_url, image_sources[0])
    else:
        image_url = ''

    breadcrumbs = hxs.select(
        '//span[@typeof="v:Breadcrumb"]/a/text()').extract()
    category = breadcrumbs[-1] if breadcrumbs else ''

    brand = hxs.select(
        '//ul[@id="productDetailsList"]/li[contains(text(),"Manufactured")]/text()'
    ).re('Manufactured by: (.*)')
    options = hxs.select(
        '//select[@class=" required-entry product-custom-option"]/option')
    data_config = response.xpath('//script/text()').re(
        'new Product.Config\((.+)\);')

    # --- 1. custom options -------------------------------------------------
    if options:
        for option in options:
            value_attr = option.select('./@value').extract()
            option_value = value_attr[0] if value_attr else ''
            if option_value == '':
                # Placeholder entries carry no value.
                continue

            option_label = option.select('./text()').extract()[0]
            # Drop the "+£x.xx" surcharge suffix from the visible label.
            option_label = option_label.split(u'+\xa3')[0].strip()
            full_name = ' '.join((product_name, option_label))
            surcharge = extract_price(option.select('@price').extract()[0])

            option_loader = ProductLoader(response=response, item=Product())
            option_loader.add_value(
                'identifier', '-'.join((product_identifier, option_value)))
            option_loader.add_value('sku', product_identifier)
            option_loader.add_value('price', base_price + surcharge)
            option_loader.add_value('brand', '')
            option_loader.add_value('url', response.url)
            option_loader.add_value('name', full_name)
            option_loader.add_value('image_url', image_url)
            option_loader.add_value('category', category)
            if not option_loader.get_output_value('price'):
                option_loader.add_value('stock', 0)
            yield option_loader.load_item()
        return

    # --- base item (also the template for configurable children) -----------
    base_loader = ProductLoader(item=Product(), selector=hxs)
    base_loader.add_value('identifier', product_identifier)
    base_loader.add_value('sku', product_identifier)
    base_loader.add_value('url', response.url)
    base_loader.add_value('name', product_name)
    base_loader.add_value('image_url', image_url)
    base_loader.add_value('brand', brand)
    base_loader.add_value('category', category)
    base_loader.add_value('price', base_price)
    if not base_loader.get_output_value('price'):
        base_loader.add_value('stock', 0)
    item = base_loader.load_item()

    # --- 3. plain product ---------------------------------------------------
    if not data_config:
        yield item
        return

    # --- 2. configurable product -------------------------------------------
    attributes = json.loads(data_config[0])['attributes']
    variants = {}
    for attribute_id in sorted(attributes):
        for option in attributes[attribute_id]['options']:
            for child_id in option['products']:
                variant = variants.get(child_id)
                if variant:
                    # Child seen under an earlier attribute: accumulate.
                    variant['label'] += ' ' + option['label']
                    variant['price'] += extract_price(option['price'])
                else:
                    variants[child_id] = {
                        'label': option['label'],
                        'price': extract_price(option['price']),
                    }

    for child_id in variants:
        variant = variants[child_id]
        child_loader = ProductLoader(item=Product(), response=response)
        child_loader.add_value(None, item)
        child_loader.add_value('name', variant['label'])
        child_loader.replace_value(
            'identifier', product_identifier + '-' + child_id)
        child_loader.replace_value('sku', child_id)
        child_loader.replace_value('price', base_price + variant['price'])
        yield child_loader.load_item()
def parse_product(self, response):
    """Yield one item per product option, or one item when there are none.

    Non-HTML responses are ignored.  The price is the first of sale price,
    special price and plain price text that is present.  Shipping cost
    (``self.shipping_cost``) is charged when the price is below 100.  The
    sku is the GTIN when listed, otherwise the model code; the category
    comes from ``response.meta['category']`` with ``/`` shown as `` > ``.

    For each option the identifier becomes ``<products_id>_<option value>``
    and the option text is appended to the product name.  An option whose
    text contains a ``( +<amount>`` surcharge is priced at base price plus
    that amount; any other option is priced at the base price.

    Raises:
        IndexError: if the page has no ``products_id`` input, or an option
            lacks a value or text.
    """
    if not isinstance(response, HtmlResponse):
        return

    free_shipping_from = Decimal('100')

    options = response.xpath('//div[@class="optionValues"]/select/option')
    name = response.xpath(
        '//div[@id="productGeneral"]/form//h1[@id="productName"]/text()'
    ).extract()

    # Preference order: sale price, special price, plain price text.
    price_xpaths = (
        '//div[@id="productGeneral"]/form//h2[@id="productPrices"]/span[@class="productSalePrice"]/text()',
        '//div[@id="productGeneral"]/form//h2[@id="productPrices"]/span[@class="productSpecialPrice"]/text()',
        '//div[@id="productGeneral"]/form//h2[@id="productPrices"]/text()',
    )
    raw_price = []
    for price_xpath in price_xpaths:
        raw_price = response.xpath(price_xpath).extract()
        if raw_price:
            break
    if raw_price:
        raw_price = raw_price[0].replace(',', '').strip()

    # The add-to-cart button is only rendered for purchasable products.
    in_stock = bool(response.xpath(
        '//div[@id="cartAdd"]/input[@class="cssButton button_in_cart"]'))

    category = response.meta['category'].replace(u'/', u' > ')

    details = response.xpath('//ul[@id="productDetailsList"]/li/text()')
    brand = details.re('Manufactured by: (.*)')
    gtin_code = details.re('GTIN: (.*)')
    model_code = details.re('Model: (.*)')

    image_url = response.xpath(
        '//div[@class="MagicToolboxContainer"]/a/img/@src').extract()
    if image_url:
        image_url = response.urljoin(image_url[0])

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', raw_price)
    price = Decimal(loader.get_output_value('price'))
    if price < free_shipping_from:
        loader.add_value('shipping_cost', self.shipping_cost)
    if not in_stock:
        loader.add_value('stock', 0)
    loader.add_value('category', category)
    loader.add_value('brand', brand)

    identifier = response.xpath(
        '//input[@name="products_id"]/@value')[0].extract()

    # Some pages list neither GTIN nor Model; leave the sku unset instead
    # of failing on an empty list.
    sku_candidates = gtin_code or model_code
    if sku_candidates:
        loader.add_value('sku', sku_candidates[0])

    loader.add_value('url', response.url)
    loader.add_value('image_url', image_url)
    loader.add_value('name', name)
    loader.add_value('identifier', identifier)

    if not options:
        yield loader.load_item()
        return

    # ``name`` is a list of text nodes; formatting the list itself would put
    # its repr (brackets and quotes) into every option name.
    base_name = name[0].strip() if name else u''

    for option in options:
        option_id = option.xpath('@value')[0].extract()
        option_name = option.xpath('text()')[0].extract()
        loader.replace_value('identifier', identifier + u'_' + option_id)
        loader.replace_value('name', u'{} {}'.format(base_name, option_name))

        surcharge = None
        surcharge_text = re.search(r'\( \+(.*)', option_name)
        if surcharge_text:
            amount = re.search(
                r'([\.\d]+)', surcharge_text.group(1).replace(',', ''))
            if amount:
                surcharge = Decimal(amount.group(1))

        # The loader is shared by all options, so price and shipping must
        # be set on every pass; otherwise a surcharged option's values
        # would carry over to the options that follow it.
        if surcharge is None:
            loader.replace_value('price', raw_price)
            if price < free_shipping_from:
                loader.replace_value('shipping_cost', self.shipping_cost)
            # At or above the threshold the shipping cost is either unset
            # or already 0.00, so nothing needs restoring.
        else:
            option_total = price + surcharge
            if option_total < free_shipping_from:
                loader.replace_value('shipping_cost', self.shipping_cost)
            else:
                loader.replace_value('shipping_cost', Decimal('0.00'))
            loader.replace_value('price', option_total)

        yield loader.load_item()