def parse_product(self, response):
    """Scrape one product page and yield a single item with a guessed SKU.

    The SKU starts out as the ``product_id`` form value and is then
    overridden by the first rule that applies:

    1. the last word of the name, when it equals the shortest keyword
       (case-insensitive);
    2. the shortest keyword, when it contains no lower-case letters;
    3. the last word of the name, when it contains no lower-case letters;
    4. the longest digit-bearing token found in the name.

    A candidate that contains ``(`` is discarded in favour of the
    identifier.

    :param response: product page response.
    :returns: generator yielding one loaded item.
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath('//input[@name="product_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    # The first breadcrumb link is skipped.
    category = response.xpath('//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
    if not response.xpath('//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
        loader.add_value('stock', 0)

    # A page without a name must not abort the callback.
    name = loader.get_output_value('name') or ''
    tail = re.search(r'\S+$', name)
    name_end = tail.group(0).strip(' ()') if tail else ''

    # Strip BEFORE filtering: a keyword such as " " would otherwise survive
    # as '' and always win min(..., key=len), yielding an empty SKU.
    raw_keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first() or ''
    keywords = [word.strip() for word in raw_keywords.split(',')]
    keywords = [word for word in keywords if word]
    shortest_keyword = min(keywords, key=len) if keywords else 'none'
    from_name = re.findall(r'\S*\d+\S*', name)

    sku = identifier
    if name_end and shortest_keyword.lower() == name_end.lower():
        sku = name_end
    elif shortest_keyword.upper() == shortest_keyword:
        sku = shortest_keyword
    elif name_end and name_end.upper() == name_end:
        sku = name_end
    elif from_name:
        sku = max(from_name, key=len)
    if sku and '(' in sku:
        sku = identifier
    loader.replace_value('sku', sku)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per option row, or a zero-priced item if none exist.

    Page-level fields (identifier taken from four consecutive digits in the
    URL, name, category, image) are loaded once. Each row matched under
    ``.tbl`` then gets its own identifier, SKU, price and name on top of
    that base item.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    base_loader = ProductLoader(Product(), response=response)
    page_id = re.search('\d\d\d\d', response.url).group(0)
    for field in ('identifier', 'sku'):
        base_loader.add_value(field, page_id)
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')

    # Linked crumbs minus the first one, followed by the text of the last
    # (unlinked) crumb.
    linked_crumbs = response.css('.bread li a::text').extract()[1:]
    last_crumb = response.css('.bread li:last-child::text').extract()
    base_loader.add_value('category', linked_crumbs + last_crumb)

    main_image = response.css('.detimg a::attr(href)').extract_first()
    if main_image:
        base_loader.add_value('image_url', response.urljoin(main_image))
    base_item = base_loader.load_item()

    rows = response.css('.tbl').xpath('.//*[@class="tr"]')
    if rows:
        for row in rows:
            row_loader = ProductLoader(Product(), selector=row)
            row_loader.add_value(None, base_item)
            row_id = row.xpath('.//input/@name').extract_first()
            row_loader.replace_value('identifier', row_id)
            row_loader.replace_value('sku', row_id)
            # The ".pr-now" price is collected first; the plain cell text is
            # appended after it.
            row_loader.replace_css('price', '.tc-price .pr-now::text')
            row_loader.add_css('price', '.tc-price::text')
            row_loader.replace_css('name', '.tc-title::text')
            yield row_loader.load_item()
    else:
        base_item['price'] = 0
        yield base_item
def parse_product(self, response):
    """Yield items for a single-product page and for every bundle article.

    A shared loader carries url, brand and category. For the main
    ``article#product`` and for each ``article.bdp-item`` the image is
    updated and the embedded option scripts are handed to
    ``self.parse_options``, whose results are re-yielded.

    A missing image no longer raises ``IndexError`` (which used to abort the
    whole page); the image field is simply left as it was.

    :param response: product or bundle page response.
    :returns: generator of whatever ``self.parse_options`` produces.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath(
        'brand', './/dt[text()="Brand"]/following-sibling::dd[1]/text()')
    loader.add_xpath(
        'category', './/div[contains(@class, "breadcrumbs")]//a/text()')

    if hxs.select('//article[@id="product"]'):
        image_urls = hxs.select(
            './/div[@id="amplienceContent"]//img/@src').extract()
        if image_urls:
            loader.replace_value('image_url', urljoin(base_url, image_urls[0]))
        options = hxs.select(
            '//script[@type="text/javascript"]/text()[contains(., "productData")]'
        ).extract()
        for item in self.parse_options(hxs, base_url, loader, options):
            yield item

    for product in hxs.select('//article[@class="bdp-item"]'):
        image_urls = product.select(
            './/a[contains(@id, "mainImage")]/img/@src').extract()
        # NOTE(review): when a bundle article has no image the loader keeps
        # the previously set image - confirm that is acceptable.
        if image_urls:
            loader.replace_value('image_url', urljoin(base_url, image_urls[0]))
        options = product.select(
            './div/div[1]//script[@type="text/javascript"]/text()'
        ).extract()
        for item in self.parse_options(product, base_url, loader, options):
            yield item
def parse_price_from_cart(self, response):
    """Re-price the product carried in ``response.meta`` from the cart page.

    The price is replaced with the first ``prodetail-price`` cell, and the
    shipping cost is set to 9.9 when that price is below 200, otherwise 0.

    :param response: cart page response with ``meta['product']`` set.
    :returns: generator yielding the updated item.
    """
    product = response.meta['product']
    loader = ProductLoader(item=product, response=response)
    loader.replace_xpath(
        'price',
        '//td[@class="right"]/div[@class="prodetail-price"][1]/text()')

    if loader.get_output_value('price') < 200:
        shipping_cost = 9.9
    else:
        shipping_cost = 0
    loader.replace_value('shipping_cost', shipping_cost)

    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per option combination whose SKU is in ``self.skus``.

    The site's ``get_product_options`` AJAX endpoint is queried once for the
    attribute list and once more per combination. Combinations whose
    reference is not a key of ``self.skus`` are skipped. Every yielded item
    carries an ``AndrewJamesMeta`` with the matching ASIN.

    Changes from the previous version:

    * each ``urlopen`` response is closed after decoding (it was leaked);
    * the first entry of ``data['selection']`` is read in a way that works
      on both Python 2 and 3;
    * an empty image list falls back to the page image and no longer raises
      ``IndexError``.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    from contextlib import closing

    def fetch_json(endpoint):
        # closing() releases the HTTP response even if decoding fails.
        with closing(urlopen(endpoint)) as reply:
            return json.load(reply)

    base_sku = response.xpath('//@data-ref').extract_first()
    identifier = re.search(r'p(\d+)$', url_query_cleaner(response.url)).group(1)
    options_url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
        identifier)

    data = fetch_json(options_url)
    attributes = [attr['values'] for attr in data['attributes']]
    if [] in attributes:
        # Some attribute has no values yet: re-query with the first value of
        # the first attribute selected.
        preselected_url = add_or_replace_parameter(
            options_url, 'attributes[1]', attributes[0][0]['value_id'])
        data = fetch_json(preselected_url)
        attributes = [attr['values'] for attr in data['attributes']]

    # Loop-invariant: the breadcrumb does not depend on the variant.
    category = response.css('div.breadcrumb a::attr(title)').extract()

    for variant in itertools.product(*attributes):
        url = options_url
        for idx, option in enumerate(variant, 1):
            url = add_or_replace_parameter(
                url, 'attributes[{0}]'.format(idx), option['value_id'])
        data = fetch_json(url)

        selection = list(data['selection'].values())[0]
        sku = selection['reference'].strip()
        # Fall back to the page-level reference, but only once.
        if not sku and base_sku not in self.skus_found:
            sku = base_sku
        if sku not in self.skus:
            continue
        if sku in self.skus_found:
            # Duplicates are only reported; the item is still emitted.
            self.logger.info('Duplicated SKU is found: %s' % sku)
        self.skus_found.add(sku)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('sku', sku)
        loader.add_value('identifier', selection['product_id'])
        loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
        loader.add_value('name', [option['value'] for option in variant])
        loader.replace_value('name', selection['title'])
        loader.add_value('url', response.url)
        loader.add_value('price', selection['price_inc'])
        loader.add_value('category', category[1:])

        try:
            image_url = [
                attr['images'][0]['image']
                for attr in data['attributes'][-1]['values']
            ]
        except IndexError:
            image_url = []
        if not image_url:
            image_url = response.xpath(
                '//div[@id="js-product-image"]//@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))

        loader.add_value('brand', "Andrew James")
        item = loader.load_item()
        metadata = AndrewJamesMeta()
        metadata['asin'] = self.skus[sku]['ASIN']
        item['metadata'] = metadata
        yield item
def parse_product(self, response):
    """Yield one item per entry of the page's ``stockMatrix`` script.

    Each matrix entry is consumed positionally: one value per
    ``.skuAttribute`` element (appended to the name), then the identifier,
    the stock text and the price. Entries whose stock text starts with
    ``Unavailable`` are skipped.

    Changes from the previous version:

    * a page without a ``stockMatrix`` script is logged and skipped rather
      than raising ``IndexError``;
    * the builtin ``next()`` replaces the Python-2-only ``.next()``;
    * the ``.skuAttribute`` query runs once, not once per option.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '//span[@id="productName"]//text()')
    loader.add_xpath('sku', '//span[@id="productEAN"]/text()[last()]')
    loader.add_xpath(
        'category', '//div[@id="breadcrumb"]/ul/li[position()>1]/a/span/text()')
    loader.add_css('image_url', '.productImageItem ::attr(href)')
    brand = response.css('.brand ::text').extract_first()
    # The page renders the literal text "null" when there is no brand.
    if brand != "null":
        loader.add_value('brand', brand)
    item = loader.load_item()

    pattern = re.compile('stockMatrix = (.+?);', re.DOTALL)
    data = response.xpath('//script/text()').re(pattern)
    if not data:
        self.logger.warning('No stockMatrix found on %s', response.url)
        return
    options = json.loads(data[0])

    attribute_count = len(response.css('.skuAttribute'))
    for option in options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(None, item)
        opt_iter = iter(option)
        opt_name = ''
        for _ in range(attribute_count):
            opt_name = next(opt_iter)
            loader.add_value('name', opt_name)
        # The colour image is looked up by the LAST attribute value read.
        colour_url = response.xpath(
            '//input[@class="colourImageUrl"][@name="%s"]/@value' % opt_name
        ).extract_first()
        if colour_url:
            loader.replace_value(
                'image_url',
                'http://media.littlewoods.com/i/littlewoods/%s?$1064x1416_standard$' % colour_url)
        loader.replace_value('identifier', next(opt_iter))
        stock = next(opt_iter)
        if stock.startswith('Unavailable'):
            continue
        loader.replace_value('stock', int('Out of stock' not in stock))
        loader.replace_value('price', next(opt_iter))
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per combination of select options.

    A response whose URL contains ``aspxerrorpath`` is retried from the
    first URL in the request's redirect chain. Otherwise the base item is
    built and, if the sidebar has ``select`` elements, one variant per
    element of their cartesian product is yielded, with the option values
    appended to the identifier and the option texts to the name.

    Changes from the previous version:

    * a missing stock node is treated as empty text (so the product is
      marked out of stock) and no longer raises ``AttributeError``;
    * each variant gets its own copy of the metadata dict.

    :param response: product page response.
    :returns: generator of requests or loaded items.
    """
    if 'aspxerrorpath' in response.url:
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product, dont_filter=True)
        return

    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath('//@data-feefo-vendor-ref').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_css('name', 'header.page-title h1::text')
    loader.add_css('price', 'header.product-sidebar__price h2::text')
    loader.add_value('sku', identifier)
    category = response.css('.breadcrumb a::text').extract()
    # The first and last breadcrumb entries are dropped.
    loader.add_value('category', category[1:-1])
    image_url = response.css(
        '.product-gallery__main-image img::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    stock = response.css('.product-sidebar__stock::text').extract_first() or ''
    stock_title = stock.title()
    if 'Order Now' not in stock_title:
        loader.add_value('stock', 0)
    item = loader.load_item()
    if 'Discontinued' in stock_title:
        item['metadata'] = {"Discontinued?": "Yes"}

    option_types = response.css('.product-sidebar select')
    if not option_types:
        yield item
        return

    options = [option_type.xpath('option[@value!="Select"]')
               for option_type in option_types]
    for variant in itertools.product(*options):
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        for option in variant:
            loader.add_value('name', option.xpath('text()').extract())
            identifier += '-' + option.xpath('@value').extract_first()
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        option_item = loader.load_item()
        # Copy so variants never share one mutable dict.
        option_item['metadata'] = dict(item.get('metadata', {}))
        yield option_item
def parse_product(self, response):
    """Yield one item per row of the page's product table.

    Page-level fields are loaded once. Each table row then supplies the SKU
    (the text inside parentheses in ``span.product-code``), the price and a
    row name. The identifier is ``<sku>-<md5 of base name + row name>``.

    Changes from the previous version:

    * the text hashed for the identifier is UTF-8 encoded first, so
      non-ASCII names no longer fail (ASCII names hash exactly as before);
    * rows without a product code or a name are skipped and no longer raise
      ``IndexError`` / ``TypeError``.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    sku = response.xpath(
        '//div[@itemprop="description"]/div/div[last()]/text()').extract_first()
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    category = response.css('.breadcrumbs a::text').extract()[1:]
    category += response.css('.breadcrumbs li:last-of-type::text').extract()
    loader.add_value('category', category)
    image_url = response.css('img.gallery-main-image::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    if not response.css('.in-stock'):
        loader.add_value('stock', 0)
    item = loader.load_item()
    base_name = item.get('name') or ''

    for option in response.css('table.product-table tbody tr'):
        codes = option.css('span.product-code::text').re(r'\((.+)\)')
        name = option.css('span.product-name::text').extract_first()
        if not codes or name is None:
            continue
        sku = codes[0]

        loader = ProductLoader(Product(), selector=option)
        loader.add_value(None, item)
        digest = hashlib.md5((base_name + name).encode('utf-8')).hexdigest()
        loader.replace_value('identifier', '-'.join((sku, digest)))
        loader.replace_value('sku', sku)
        loader.add_css('price', 'span.product-price-rrp')
        # The last text node of the price cell replaces whatever was
        # collected above.
        price = option.css('td.product-price').xpath('text()[last()]').extract_first()
        loader.replace_value('price', price)
        if name not in base_name:
            loader.add_value('name', name)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per entry of its variations JSON.

    When the page has no ``.variations`` element the base item is yielded
    unchanged. Otherwise the form's ``data-product_variations`` attribute is
    decoded and each variation overrides name, price and identifier on a
    copy of the base item.

    :param response: product page response with ``meta['brand']`` set.
    :returns: generator of loaded items.
    """
    product_ids = response.xpath('//div[@itemscope]/@id').re('product-(.+)')

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    breadcrumb = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', breadcrumb[1:])
    loader.add_value('brand', response.meta['brand'])
    loader.add_xpath('image_url', '//div/@data-original-img')
    loader.add_value('identifier', product_ids)
    base = loader.load_item()

    if response.css('.variations'):
        raw_variations = response.xpath(
            '//form/@data-product_variations').extract_first()
        for variation in json.loads(raw_variations):
            variation_loader = ProductLoader(item=Product(base), response=response)
            # Start from the base name, then append the label of each
            # selected attribute value.
            variation_loader.replace_value('name', base['name'])
            for attribute in variation['attributes'].values():
                variation_loader.add_xpath(
                    'name', '//option[@value="%s"]/text()' % attribute)
            variation_loader.replace_value('price', variation['display_price'])
            variation_loader.replace_value('identifier', variation['variation_id'])
            yield variation_loader.load_item()
    else:
        yield base
def parse_product(self, response):
    """Parse a frame page; multi-size products are stored, not yielded.

    * If the page is not in GBP, a currency-switch request is yielded.
    * Contact-lens URLs are delegated to ``self.parse_lenses``.
    * A page with exactly one ``id_calibre`` input yields a single item.
    * Otherwise each ``id_calibre`` input becomes an option item saved in
      ``self.option_items``. An existing entry is kept when the input is
      not checked.

    :param response: product page response.
    :returns: generator of requests or loaded items.
    """
    if not response.css('.currency_gbp'):
        yield Request('https://www.granoptic.com/setCurrency/GBP',
                      self.set_currency, dont_filter=True)
        return

    if '/contact-lenses/' in response.url:
        for lens_item in self.parse_lenses(response):
            yield lens_item
        return

    calibre_ids = response.xpath('//input[@name="id_calibre"]/@value').extract()
    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_value('url', response.url)
    base_loader.add_css('name', '.nombre ::text')
    base_loader.add_xpath('name', '//p[contains(., "Frame:")]/text()')
    price_tokens = response.css('.nombre ~ .precio::text').re('\S+')
    base_loader.add_value('price', price_tokens)
    base_loader.add_css('category', '.breadcrumb a::text')
    base_loader.add_css('image_url', '.pag_producto img::attr(src)')
    base_loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    for field in ('identifier', 'sku'):
        base_loader.add_value(field, calibre_ids)
    base_item = base_loader.load_item()

    if len(calibre_ids) == 1:
        yield base_item
        return

    for radio in response.xpath('//input[@name="id_calibre"]'):
        option_loader = ProductLoader(item=Product(), selector=radio)
        option_loader.add_value(None, base_item)
        option_id = radio.xpath('@value').extract_first()
        option_loader.replace_value('identifier', option_id)
        option_loader.replace_value('sku', option_id)
        option_loader.add_xpath('name', './../following-sibling::td[1]/text()')
        option_item = option_loader.load_item()
        # Store when the input is checked or nothing is stored yet for it.
        if radio.xpath('@checked') or not self.option_items.get(option_id):
            self.option_items[option_id] = option_item
def parse_product(self, response):
    """Yield the product, or one item per entry of the simple-SKU selector.

    Without ``#js-simple-selector`` options the base item is yielded only
    when it has an identifier. With options, each one overrides identifier,
    name, price, image and stock on a copy of the base item.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    breadcrumb = response.xpath(
        '//li[@class="blockBreadcrumb__item"]/a/text()').extract()
    categories = breadcrumb[-3:]

    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_xpath('identifier', '//input[@name="simpleSku"]/@value')
    base_loader.add_xpath('sku', '//input[@id="configSku"]/@value')
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//h1[contains(@class, "__heading")]/text()')
    base_loader.add_xpath('name', '//input[@name="simpleSku"]/../span/text()')
    base_loader.add_xpath('image_url', '//div[@class="layoutImage"]//img/@src')
    base_loader.add_xpath('price', '//input[@id="price"]/@value')
    base_loader.add_xpath('brand', '//input[@id="brand"]/@value')
    base_loader.add_value('category', categories)
    base_loader.add_xpath('stock', '//@data-instock')
    base_item = base_loader.load_item()

    selector_options = response.xpath('//select[@id="js-simple-selector"]/option')
    if not selector_options:
        if base_loader.get_output_value('identifier'):
            yield base_item
        return

    price_xpath = ('//div[@data-simple-sku="%s"]'
                   '//span[contains(@class, "actualPrice")]/text()')
    image_xpath = ('//div[@data-simple-sku="%s"]'
                   '/a[contains(@class, "link_selected")]/@data-product-image')
    for entry in selector_options:
        option_loader = ProductLoader(item=Product(base_item), selector=entry)
        option_loader.replace_xpath('identifier', './@value')
        option_loader.add_xpath('name', './text()')
        simple_sku = option_loader.get_output_value('identifier')
        # Price and image live in page-level blocks keyed by the simple SKU.
        option_loader.replace_value(
            'price', response.xpath(price_xpath % simple_sku).extract())
        option_loader.replace_value(
            'image_url', response.xpath(image_xpath % simple_sku).extract())
        option_loader.replace_xpath('stock', './@data-instock')
        yield option_loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per entry of its variations JSON.

    The identifier comes from the ``product_id`` input, then the
    ``add-to-cart`` input. If neither exists the product is marked out of
    stock and the id is read from the schema.org product container.
    Variations, when present, override identifier, SKU, price and
    (optionally) image, and append their attribute values to the name.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    base_loader = ProductLoader(Product(), response=response)

    product_id = response.xpath('//input[@name="product_id"]/@value').extract_first()
    if not product_id:
        product_id = response.xpath(
            '//input[@name="add-to-cart"]/@value').extract_first()
    if not product_id:
        # No purchase form on the page.
        base_loader.add_value('stock', 0)
        product_id = response.xpath(
            '//div[@itemtype="http://schema.org/Product"]/@id').re_first(
                'product-(\d+)')

    base_loader.add_value('identifier', product_id)
    base_loader.add_css('sku', 'span.sku::text')
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    base_loader.add_css('price', '.product-price-exvat span.amount::text')
    base_loader.add_css('price', '.product-price span.amount::text')
    first_category = response.xpath(
        '//span[@class="posted_in"][contains(., "Categories:")]/a/text()'
    ).extract_first()
    base_loader.add_value('category', first_category)
    base_loader.add_css('image_url', 'div.single-product-main-image a::attr(href)')
    first_brand = response.xpath(
        '//span[@class="posted_in"][contains(., "Brands:")]/a/text()'
    ).extract_first()
    base_loader.add_value('brand', first_brand)
    base_item = base_loader.load_item()

    raw_variations = response.xpath('//@data-product_variations').extract_first()
    if raw_variations:
        for variant in json.loads(raw_variations):
            variant_loader = ProductLoader(Product(), response=response)
            variant_loader.add_value(None, base_item)
            variant_loader.replace_value('identifier', variant['variation_id'])
            variant_loader.replace_value('sku', variant['sku'])
            variant_loader.replace_value('price', variant['display_price'])
            if variant['image_link']:
                variant_loader.replace_value('image_url', variant['image_link'])
            variant_loader.add_value('name', variant['attributes'].values())
            yield variant_loader.load_item()
    else:
        yield base_item
def parse_product(self, response):
    """Yield the product, or one item per option of its single attribute.

    Fields are read from the ``.nosto_product`` block. The list price
    becomes the item price; when the sales price differs it is stored under
    ``metadata['SalesPrice']``. Pages whose ``Product.Config`` script
    declares more than one attribute are logged and skipped.

    Changes from the previous version:

    * each option is yielded as its own ``Product`` copy. The loader used to
      hand back one shared item object every time, so yielded items aliased
      each other and a ``metadata`` entry set for one option leaked into
      the following ones;
    * the single attribute is read in a way that works on Python 2 and 3;
    * missing availability, category or sales-price nodes no longer raise
      ``TypeError`` / ``AttributeError``.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    loader = ProductLoader(item=Product(), response=response)
    css = '.nosto_product .%s ::text'
    loader.add_css('identifier', css % 'product_id')
    loader.add_css('sku', css % 'product_id')
    for field in ('url', 'name', 'image_url', 'brand'):
        loader.add_css(field, css % field)
    list_price = response.css(css % 'list_price').extract_first()
    sales_price = response.css(css % 'price').extract_first()
    loader.add_value('price', list_price)

    availability = response.css(css % 'availability').extract_first() or ''
    if 'InStock' not in availability:
        loader.add_value('stock', 0)
    category = response.css(css % 'category').extract_first()
    if category:
        # Only the last path segment is kept.
        loader.add_value('category', category.split('/')[-1])

    options_data = response.xpath('//script/text()').re(
        'Product.Config.({.+})')
    if not options_data:
        item = loader.load_item()
        if sales_price is not None and sales_price != list_price:
            item['metadata'] = {'SalesPrice': Decimal(sales_price)}
        yield item
        return

    options_data = json.loads(options_data[0])
    if len(options_data['attributes']) > 1:
        self.log('More than one options attributes found on %s' % response.url)
        return

    price = loader.get_output_value('price')
    name = loader.get_output_value('name')
    # With no separate sales price, the list price is the sales price.
    sales_price = Decimal(sales_price) if sales_price is not None else price
    attribute = list(options_data['attributes'].values())[0]
    for option in attribute['options']:
        product_id = option['products'][0]
        old_price = price + Decimal(option['oldPrice'])
        new_price = sales_price + Decimal(option['price'])
        loader.replace_value('price', old_price)
        loader.replace_value('name', name + ' ' + option['label'])
        loader.replace_value('identifier', product_id)
        loader.replace_value('sku', product_id)
        loader.replace_xpath(
            'image_url',
            '//li[@id="simple-product-image-%s"]/a/@href' % product_id)
        # Copy: load_item() returns the loader's own item object each time.
        item = Product(loader.load_item())
        if old_price != new_price:
            item['metadata'] = {'SalesPrice': new_price}
        yield item
def parse_product(self, response):
    """Yield one item per combination of the configured option selects.

    Only table rows whose first cell text is in ``self.options_to_extract``
    contribute options. Each combination appends the option values to the
    identifier, the option labels to the name, and any ``(Add ...)``
    surcharge to the price. Shipping is 10 below a price of 100 and 0 from
    100 upwards.

    Fix: the combinations are materialised into a list before the emptiness
    check. The previous ``if not variants`` tested an ``itertools.product``
    iterator, which is always truthy, so the fallback never ran and a page
    with an empty select yielded nothing at all. Such pages now yield the
    base item.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    category = response.css('div.treemenu a::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', 'div#mainimage_holder img::attr(data-zoom-image)')
    identifier = response.xpath('//input[@name="fproduct_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_css('price', 'li.shelfBnormalprice::text')
    if loader.get_output_value('price') < 100:
        loader.add_value('shipping_cost', 10)
    item = loader.load_item()

    rows = [
        row for row in response.css('table.variabletable tr')
        if row.xpath('td[1]/text()').extract_first() in self.options_to_extract
    ]
    options = [
        row.xpath('td/select/option[not(contains(.,"Please Select"))]')
        for row in rows
    ]
    # With no selects at all this is [()], so the loop below still runs once
    # and re-emits the base item, exactly as before.
    variants = list(itertools.product(*options))
    if not variants:
        yield item
        return

    for variant in variants:
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        price = item['price']
        for option in variant:
            identifier += '-' + option.xpath('@value').extract_first()
            name_and_price = option.xpath('text()').extract_first().split('(Add')
            loader.add_value('name', name_and_price[0])
            if len(name_and_price) > 1:
                price += extract_price(name_and_price[1])
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        loader.replace_value('price', price)
        if price >= 100:
            loader.replace_value('shipping_cost', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per size selector with text.

    Page-level fields are loaded once, with a fixed shipping cost of 10.
    For every ``sizeselect`` element that has text, the SKU and quantity
    are read from the ``size:<name>`` span. Sold-out sizes get stock 0; a
    size that is not sold out but reports quantity 0 gets stock 1.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    base_loader = ProductLoader(Product(), response=response)
    base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    base_loader.add_xpath('price', '//h2[@itemprop="price"]/text()')
    breadcrumb = response.xpath('//div[@id="breadcrumbs"]/a/text()').extract()
    base_loader.add_value('category', breadcrumb[1:-1])
    main_image = response.css('img.productimage::attr(src)').extract_first()
    if main_image:
        base_loader.add_value('image_url', response.urljoin(main_image))
    base_loader.add_value('shipping_cost', 10)
    base_loader.add_xpath('identifier', '//link[@rel="canonical"]/@href', re='\d+$')
    base_loader.add_xpath('sku', '//*/text()', re='Product code \#(.+)$')
    unavailable = response.xpath(
        "//*[contains(., 'SOLD OUT') or contains(., 'not available to buy online')]")
    if unavailable:
        base_loader.add_value('stock', 0)
    base_item = base_loader.load_item()

    size_nodes = response.xpath('//*[contains(@class, "sizeselect")]')
    if not size_nodes:
        yield base_item
        return

    for node in size_nodes:
        size = node.xpath('text()').extract_first()
        if size:
            details = response.xpath(
                '//span/text()[contains(., "size:%s")]' % size
            ).extract_first().strip()
            size_sku = re.search('sku:(\d+)', details).group(1)
            if node.css('.sizeselectsoldout'):
                stock = 0
            else:
                stock = re.search('qty:(\d+)', details).group(1)
                # A zero quantity on a size that is not sold out counts as 1.
                if not (stock and int(stock)):
                    stock = 1

            size_loader = ProductLoader(Product(), response=response)
            size_loader.add_value(None, base_item)
            size_loader.add_value('name', size)
            size_loader.replace_value('identifier', size_sku)
            size_loader.replace_value('sku', size_sku)
            size_loader.replace_value('stock', stock)
            size_item = size_loader.load_item()
            size_item['metadata'] = {'size': size}
            yield size_item
def parse_product(self, response):
    """Yield the base product, or one item per entry of ``combinations``.

    The base item comes from ``self.parse_product_base``. Each combination
    overrides identifier (``<base id>-<combination id>``), SKU and stock,
    appends its attribute values to the name, and replaces the price with
    ``price * 1.2`` rounded to two decimals when its own price is non-zero
    (presumably adding 20% tax - confirm).

    Changes from the previous version:

    * the per-combination image URL is now written to the item. It used to
      be computed and then discarded;
    * that computation no longer raises when ``idDefaultImage`` is missing
      from the page or the base item has no image;
    * the unused ``currencyRate`` and ``productPriceTaxExcluded`` locals
      were removed.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    prod = self.parse_product_base(response)

    default_image = re.search(r'var idDefaultImage=(\d+)', response.body)
    default_image_id = default_image.group(1) if default_image else None

    data = response.xpath('//script/text()').re_first('var combinations=({.+?});')
    if not data:
        yield prod
        return

    combinations = json.loads(data)
    base_image = prod.get('image_url')
    for identifier in combinations:
        combination = combinations[identifier]
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, prod)
        loader.replace_value('identifier', '-'.join((prod['identifier'], identifier)))
        loader.replace_value('sku', combination['reference'])
        loader.replace_value('stock', combination['quantity'])
        option_price = Decimal(combination['price'])
        if option_price != 0:
            price = (option_price * Decimal('1.2')).quantize(Decimal('0.01'))
            loader.replace_value('price', price)
        attr_values = combination['attributes_values']
        for attr in sorted(attr_values):
            loader.add_value('name', attr_values[attr])

        # NOTE(review): a non-positive id_image is assumed to mean "no
        # dedicated image" - confirm against the site's data.
        try:
            has_own_image = int(combination.get('id_image')) > 0
        except (TypeError, ValueError):
            has_own_image = False
        if default_image_id and base_image and has_own_image:
            image_url = base_image.replace(
                default_image_id, str(combination['id_image']))
            loader.replace_value('image_url', image_url)
        yield loader.load_item()
def parse_lenses(self, response):
    """Yield lens items: one per bundle, or the page item plus variant links.

    When the page embeds a ``var Bundles = [...]`` script, each bundle
    overrides identifier, SKU, price and image and appends its size to the
    name. Otherwise the page-level item is yielded and every
    ``#AvialableVariants`` link is followed with this same callback.

    Changes from the previous version:

    * a missing image is skipped. ``response.urljoin(None)`` used to store
      the page URL as the image;
    * a missing price no longer raises ``IndexError``;
    * variant links are resolved against the page URL before being
      requested, so relative links work (absolute links are unchanged).

    :param response: lens product page response.
    :returns: generator of loaded items and follow-up requests.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h2[@itemprop="name"]/text()')
    category = response.css('.breadcrumb span::text').extract()
    loader.add_value('category', category[1:-1])
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath('brand', '//div[@id="Brand"]/span[@itemprop="brand"]/text()')
    if response.xpath('//link/@href[contains(., "OutOfStock")]'):
        loader.add_value('stock', 0)
    loader.add_xpath('identifier', '//input[@name="SKU"]/@value')
    loader.add_xpath('sku', '//span[@itemprop="sku"]/text()')
    price = response.css('.itemPrice ::text').extract()
    if price:
        # The last text node is used as the price.
        loader.add_value('price', price[-1])
    item = loader.load_item()

    pattern = re.compile(r'var Bundles =(.+?\]);', re.DOTALL)
    data = response.xpath('//script/text()').re(pattern)
    if data:
        for option in demjson.decode(data[0]):
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', option['strSize'])
            loader.replace_value('identifier', option['productNo'])
            loader.replace_value('sku', option['productNo'])
            loader.replace_value('price', option['price'])
            loader.replace_value('image_url', response.urljoin(option['img']))
            yield loader.load_item()
        return

    yield item
    for url in response.css('#AvialableVariants .variant::attr(href)').extract():
        yield Request(response.urljoin(url), self.parse_lenses)
def parse_product(self, response):
    """Expand base items with every combination of required custom options.

    Base items come from ``self.parse_product_options_config`` when the
    page has a ``Product.Config`` script, otherwise from
    ``self.parse_simple_product``. If the page has no required option
    inputs the simple-product items are yielded unchanged. Otherwise each
    combination appends option values to the identifier, option labels to
    the name, and option ``price`` attributes to the price.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    required_inputs = (response.css('label.required')
                       .xpath('../following-sibling::dd[1]')
                       .css('div.input-box')
                       .xpath('*[1]'))
    has_config = response.xpath('//script/text()').re_first(
        'Product.Config.*?({.+})')

    if not required_inputs:
        for simple_item in self.parse_simple_product(response):
            yield simple_item
        return

    choice_sets = []
    for node in required_inputs:
        if not node.extract().startswith('<select'):
            # Non-select inputs list their choices as <li> children.
            choice_sets.append(node.xpath('li'))
            continue
        select_choices = node.xpath('option[@value!=""]')
        if select_choices:
            choice_sets.append(select_choices)

    if has_config:
        base_items = self.parse_product_options_config(response)
    else:
        base_items = self.parse_simple_product(response)

    for item in base_items:
        if not choice_sets:
            yield item
            continue
        for variant in itertools.product(*choice_sets):
            loader = ProductLoader(Product(), response=response)
            loader.add_value(None, item)
            identifier = item['identifier'] + '-' + '-'.join(
                [choice.xpath('.//@value').extract_first() for choice in variant])
            loader.replace_value('identifier', identifier)
            loader.replace_value('sku', identifier)
            total = item['price']
            for choice in variant:
                label = choice.xpath('text()').extract_first()
                if not label:
                    label = choice.xpath('.//label/text()').extract_first()
                # Drop the surcharge suffix from the label.
                label = label.split(u'+£')[0]
                loader.add_value('name', label)
                total += Decimal(choice.xpath('.//@price').extract_first())
            loader.replace_value('price', total)
            yield loader.load_item()
def parse_product_options_config(self, response):
    """Yield one item per simple product listed in ``Product.Config``.

    Without a config script the page-level item is yielded as-is. With one,
    every product id referenced by any attribute option becomes an item
    whose identifier is ``<base id>-<product id>``. Its name gains the label
    of each option that lists the product, and its price gains those
    options' prices.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    config_json = response.xpath('//script/text()').re_first(
        'Product.Config.*?({.+})')

    base_loader = ProductLoader(Product(), response=response)
    base_loader.add_xpath('identifier', '//input[@name="product"]/@value')
    base_loader.add_value('url', response.url)
    base_loader.add_css('name', 'div.product-name h1::text')
    base_loader.add_css('price', 'li.bigPrice span.price::text')
    base_loader.add_xpath('sku', '//input[@name="product"]/@value')
    breadcrumb = response.css('div.breadcrumbs a::text').extract()
    base_loader.add_value('category', breadcrumb[1:])
    base_loader.add_css('image_url', 'img#image::attr(src)')
    base_item = base_loader.load_item()

    if not config_json:
        yield base_item
        return

    config = json.loads(config_json)
    attributes = sorted(config['attributes'].values())
    # Flatten once; attribute order, then option order, is preserved.
    all_options = [choice for attr in attributes for choice in attr['options']]
    product_ids = set(itertools.chain.from_iterable(
        [choice['products'] for choice in all_options]))

    for product_id in product_ids:
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, base_item)
        identifier = base_item['identifier'] + '-' + product_id
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        total = base_item['price']
        for choice in all_options:
            if product_id not in choice['products']:
                continue
            loader.add_value('name', choice['label'])
            total += Decimal(choice['price'])
        loader.replace_value('price', total)
        yield loader.load_item()
def parse_product(self, response):
    """Yield kit items per size, plus custom-print and badge variants.

    Product data is read from two kinds of inline script objects: the
    "base" product (``product`` followed by six word characters) and any
    "hero" products (``product`` followed by seven digits). For every size
    variation of the hero and the base product, an item is yielded unless
    its identifier is already in ``self.extracted_identifiers``. For each
    variation the method may also yield:

    * a custom-personalisation item (fixed WILLIAMS / 10 print), when the
      base product offers printing type 1 and no hero is selected;
    * a badge item, when the base product offers printing type 3.

    Fix: the method now returns as soon as the base product data is
    missing. The old guard only returned when BOTH data sets were missing,
    yet ``base_data`` is indexed unconditionally afterwards, so a page with
    hero data only raised ``TypeError``.

    NOTE(review): the source arrived with its indentation collapsed. The
    "Custom printings" and "Badges" sections are placed at variation-loop
    level here - confirm against the original layout.

    :param response: product page response.
    :returns: generator of loaded items.
    """
    base_product = True
    add_custom_personalization = False
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', 'Kits')
    base_data = response.xpath('//script/text()').re(
        'product\w{6} =(.+?});var')
    hero_data = response.xpath('//script/text()').re(
        'product\d{7} =(.+?});var')
    if base_data:
        base_data = json.loads(base_data[0])
    if hero_data:
        hero_data = [json.loads(elem) for elem in hero_data]
        selected_hero = response.xpath(
            '//select[contains(@class,"heroShirts")]/option[@selected]/@value'
        ).extract_first()
        if selected_hero:
            hero_data = {elem['ProductID']: elem
                         for elem in hero_data}[int(selected_hero)]
            base_product = False
        else:
            hero_data = hero_data[0]
    else:
        hero_data = {}
    # Everything below reads base_data, so it is mandatory.
    if not base_data:
        return

    # Checking custom personalization
    printings = {
        p['PrintingTypeID']: p
        for p in base_data['printingitems']
    }
    custom_printings = printings.get(1)
    if custom_printings and base_product:
        add_custom_personalization = True

    loader.add_value('name', base_data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if base_data['Brand']:
        loader.add_value('brand', base_data['Brand']['Name'].title())
    loader.add_value('image_url', response.urljoin(base_data['ImageURL']))
    product = loader.load_item()

    # Player names
    player_from_name = re.search('with *([\w\ \.\-]+?) (\d+)',
                                 hero_data.get('Description', ''), re.UNICODE)
    if player_from_name:
        player, number = player_from_name.groups()

    for data in [hero_data, base_data]:
        for variation in data.get('Variations', []):
            size = variation['Description']
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            option_name = data['Description'] + u' ' + size
            loader.replace_value('name', option_name)
            loader.replace_value('price', variation['PriceActual'])
            # NOTE(review): the check is on data's image but the URL is
            # taken from base_data - confirm that is intended.
            if data.get('ImageURL'):
                loader.replace_value(
                    'image_url', response.urljoin(base_data['ImageURL']))
            if Decimal(variation['PriceActual']) < Decimal('75.00'):
                loader.replace_value('shipping_cost', '4.95')
            if not variation['IsInStock']:
                loader.replace_value('stock', 0)
            identifier = str(variation['VariationId'])
            item = loader.load_item()
            if item['identifier'] not in self.extracted_identifiers:
                self.extracted_identifiers.append(item['identifier'])
                if player_from_name and data == hero_data:
                    item['metadata'] = {
                        'player': player,
                        'number': number,
                        'size': size
                    }
                else:
                    item['metadata'] = {'size': size}
                yield item

            # Custom printings
            if add_custom_personalization:
                team_player_name = 'WILLIAMS'
                team_player_number = '10'
                team_player_id = 'WILLIAMS'
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                loader.add_value('name', team_player_name)
                loader.add_value('name', team_player_number)
                price = Decimal(item['price']) + Decimal(
                    str(custom_printings['PriceActual']))
                loader.replace_value('price', price)
                if price >= Decimal('75.00'):
                    loader.replace_value('shipping_cost', 0)
                identifier = '-'.join(
                    (item['identifier'], str(custom_printings['PrintingID']),
                     team_player_id))
                loader.replace_value('identifier', identifier)
                custom_item = loader.load_item()
                custom_item['metadata'] = {
                    'player': team_player_name,
                    'number': team_player_number,
                    'size': size
                }
                yield custom_item

            # Badges
            printings = {
                elem['PrintingTypeID']: elem
                for elem in base_data['printingitems']
                if 'New Premier League Player Badges' not in
                elem['PrintingDescription']
            }
            printing = printings.get(3)
            if printing:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                option_name = loader.get_output_value(
                    'name') + u' ' + printing['PrintingDescription']
                loader.replace_value('name', option_name)
                price = Decimal(str(variation['PriceActual'])) + Decimal(
                    str(printing['PriceActual']))
                loader.replace_value('price', format_price(price))
                if price >= Decimal('75.00'):
                    loader.replace_value('shipping_cost', 0)
                identifier += '-' + str(printing['PrintingID'])
                loader.replace_value('identifier', identifier)
                item = loader.load_item()
                if item['identifier'] not in self.extracted_identifiers:
                    self.extracted_identifiers.append(item['identifier'])
                    if player_from_name and data == hero_data:
                        item['metadata'] = {
                            'player': player,
                            'number': number,
                            'size': size
                        }
                    else:
                        item['metadata'] = {'size': size}
                    yield item
def parse_product(self, response):
    """Parse a kit product page into per-size items and printing add-ons.

    Product data is read from JSON blobs embedded in the page's ``<script>``
    tags: one base product (``product`` followed by 6 word characters) and
    optional "hero" products (``product`` followed by 7 digits).  The product
    that is parsed is the single hero if exactly one exists, the hero selected
    in the ``select.heroShirts`` dropdown if several exist, and the base
    product otherwise.

    For every entry of that product's ``Variations`` list the method yields:

    * the plain size item;
    * if the base product is being parsed and a printing of type 1 (custom
      personalization) exists, an item personalized with the placeholder
      ``WILLIAMS 10``;
    * if a printing of type 3 (badges) exists, an item with that printing.

    Error pages (``aspxerrorpath`` in the URL) and pages that offer a
    ``?cur=GBP`` link are re-requested instead of parsed; pages without base
    product data are delegated to ``self.parse``.

    :param response: product page response.
    :returns: generator of ``Request`` objects and product items.
    """
    if 'aspxerrorpath' in response.url:
        # Retry the first URL of the redirect chain that led to the error page.
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('category', 'Kits')

    scripts = response.xpath('//script/text()')
    # Raw strings: '\d' and '\w' are invalid escapes in normal literals.
    heros_data = scripts.re(r'product\d{7} =(.+?});var')
    base_product_data = scripts.re(r'product\w{6} =(.+?});var')

    if not base_product_data:
        # Not a product page; let the listing parser handle it.
        for p in self.parse(response):
            yield p
        return

    base_product_data = json.loads(base_product_data[0])

    # Decide which product blob describes the page being shown.
    base_product = True
    data = base_product_data
    if len(heros_data) == 1:
        data = json.loads(heros_data[0])
        base_product = False
    elif heros_data:
        heros_by_id = {}
        for raw_hero in heros_data:
            hero_product = json.loads(raw_hero)
            heros_by_id[hero_product['ProductID']] = hero_product
        selected_hero = response.css('select.heroShirts').xpath(
            'option[@selected]')
        if selected_hero:
            hero_id = int(selected_hero.xpath('@value').extract_first())
            data = heros_by_id[hero_id]
            base_product = False

    gbp_url = response.xpath(
        '//a[contains(@href, "?cur=GBP")]/@href').extract_first()
    if gbp_url:
        # Prices must be scraped in GBP; reload the page in that currency.
        yield Request(response.urljoin(gbp_url),
                      self.parse_product,
                      dont_filter=True)
        return

    # Printing options are always taken from the base product.
    printings = {
        p['PrintingTypeID']: p
        for p in base_product_data['printingitems']
    }
    custom_printings = printings.get(1)  # custom personalization
    badge_printing = printings.get(3)  # badges
    add_custom_personalization = bool(custom_printings and base_product)

    loader.add_value('name', data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if data['Brand']:
        loader.add_value('brand', data['Brand']['Name'])
    loader.add_value('image_url', response.urljoin(data['ImageURL']))
    product = loader.load_item()

    player_from_name = re.search(
        r'with *([\w.\- ]+?) *(\d*|TBC) *printing',
        data['Description'], re.UNICODE)
    if player_from_name:
        player_name, number = player_from_name.groups()

    # Sizes
    for variation in data['Variations']:
        size = variation['Description']
        if player_from_name:
            metadata = {
                'player': player_name,
                'number': number,
                'size': size
            }
        else:
            metadata = {'size': size}

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(None, product)
        loader.replace_value('identifier', variation['VariationId'])
        loader.add_value('name', size)
        loader.replace_value('price', variation['PriceActual'])
        if variation['PriceActual'] < 75:
            loader.replace_value('shipping_cost', '4.95')
        loader.replace_value('stock', int(variation['IsInStock']))
        item = loader.load_item()
        item['metadata'] = metadata
        yield item

        # Custom printings
        if add_custom_personalization:
            team_player_name = 'WILLIAMS'
            team_player_number = '10'
            team_player_id = 'WILLIAMS'
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', team_player_name)
            loader.add_value('name', team_player_number)
            price = Decimal(item['price']) + Decimal(
                str(custom_printings['PriceActual']))
            loader.replace_value('price', price)
            if price >= 75:
                loader.replace_value('shipping_cost', 0)
            identifier = '-'.join(
                (item['identifier'],
                 str(custom_printings['PrintingID']),
                 team_player_id))
            loader.replace_value('identifier', identifier)
            custom_item = loader.load_item()
            custom_item['metadata'] = {
                'player': team_player_name,
                'number': team_player_number,
                'size': size
            }
            yield custom_item

        # Badges
        if badge_printing:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', badge_printing['PrintingDescription'])
            # Sum as Decimal (like the custom printing price above) so the
            # total carries no binary floating point rounding artefacts.
            price = Decimal(str(variation['PriceActual'])) + Decimal(
                str(badge_printing['PriceActual']))
            loader.replace_value('price', price)
            if price >= 75:
                loader.replace_value('shipping_cost', 0)
            identifier = str(variation['VariationId']) + '-' + str(
                badge_printing['PrintingID'])
            loader.replace_value('identifier', identifier)
            badge_item = loader.load_item()
            badge_item['metadata'] = dict(metadata)
            yield badge_item
def parse_options(self, response):
    """Build the item for a single SKU from its JSON options response.

    The response body is JSON; the product and SKU ids come from the
    ``productId`` and ``skuId`` query parameters of the response URL.  The
    item passed in ``response.meta['item']`` is used as the starting point
    and its identifier, sku, name, image, price and stock are overwritten
    with the SKU's own values.

    :param response: JSON response for one SKU, carrying the partially
        filled item in ``meta['item']``.
    :returns: generator yielding the single completed item.
    """
    payload = json.loads(response.body)
    product_id = url_query_parameter(response.url, 'productId')
    sku_id = url_query_parameter(response.url, 'skuId')

    loader = ProductLoader(Product(), response=response)
    loader.add_value(None, response.meta['item'])
    loader.replace_value('identifier', '.'.join((product_id, sku_id)))
    loader.replace_value('sku', sku_id)

    sku_name = payload['skuName']
    loader.replace_value('name', sku_name)
    size = payload['size']
    # Append the size only when the SKU name does not already end with it,
    # either verbatim or with its spaces removed.
    size_suffixes = (size, size.replace(' ', ''))
    if not sku_name.endswith(size_suffixes):
        loader.add_value('name', size)

    thumbnail = response.urljoin(payload['thumbnail_url'])
    loader.replace_value('image_url', thumbnail)

    sale_price = payload['unit_sale_price']
    loader.replace_value('price', str(sale_price))
    loader.replace_value('stock', payload['stock'])
    # Orders below 20 carry a delivery charge.
    if Decimal(sale_price) < 20:
        loader.add_value('shipping_cost', '2.99')

    yield loader.load_item()
def parse_product(self, response):
    """Parse a kit product page into per-size items plus badge variants.

    Product data is read from JSON blobs embedded in the page's ``<script>``
    tags: the base product (``product`` followed by 6 word characters) and
    optional "hero" products (``product`` followed by 7 digits).  The hero
    used is the one selected in the ``heroShirts`` dropdown, or the only
    hero when exactly one exists; otherwise no hero is used.

    For every size variation of the hero and then of the base product, the
    method yields the size item and, when the base product offers a printing
    of type 3 (badges), a second item with that printing added.  Items whose
    identifier is already in ``self.extracted_identifiers`` are not yielded
    again.

    Shipping starts at ``self.shipping_cost`` and is set to ``'0.00'`` when
    ``self.free_delivery_over`` is set and the item price reaches it.

    :param response: product page response.
    :returns: generator of product items; empty when the page carries no
        base product data.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', 'Kits')

    scripts = response.xpath('//script/text()')
    # Raw strings: '\w' and '\d' are invalid escapes in normal literals.
    base_data = scripts.re(r'product\w{6} =(.+?});var')
    hero_data = scripts.re(r'product\d{7} =(.+?});var')

    if base_data:
        base_data = json.loads(base_data[0])
    if not base_data:
        # Everything below (name, brand, image, printing items) is read from
        # the base product, so a page without it cannot be parsed.  The
        # previous guard only returned when the hero data was missing too
        # and crashed on ``base_data['Description']`` otherwise.
        return

    if hero_data:
        heroes = [json.loads(raw_hero) for raw_hero in hero_data]
        selected_hero = response.xpath(
            '//select[contains(@class,"heroShirts")]/option[@selected]/@value'
        ).extract_first()
        if selected_hero:
            heroes_by_id = {hero['ProductID']: hero for hero in heroes}
            hero_data = heroes_by_id[int(selected_hero)]
        elif len(heroes) == 1:
            hero_data = heroes[0]
        else:
            # Several heroes but none selected: treat as a plain base page.
            hero_data = {}
    else:
        hero_data = {}

    loader.add_value('name', base_data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if base_data['Brand']:
        loader.add_value('brand', base_data['Brand']['Name'].title())
    loader.add_value('image_url', response.urljoin(base_data['ImageURL']))
    loader.add_value('shipping_cost', self.shipping_cost)
    product = loader.load_item()

    # Player names
    player_from_name = re.search(r'with *([\w\ \.\-]+?) (\d+)',
                                 hero_data.get('Description', ''),
                                 re.UNICODE)
    if player_from_name:
        player, number = player_from_name.groups()

    for data in [hero_data, base_data]:
        for variation in data.get('Variations', []):
            size = variation['Description']
            if player_from_name and data == hero_data:
                metadata = {
                    'player': player,
                    'number': number,
                    'size': size
                }
            else:
                metadata = {'size': size}

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            option_name = data['Description'] + u' ' + size
            loader.replace_value('name', option_name)
            loader.replace_value('price', variation['PriceActual'])
            if data.get('ImageURL'):
                # NOTE(review): the check is on ``data`` but the URL written
                # is the base product's -- confirm whether the hero image
                # was intended here.
                loader.replace_value(
                    'image_url', response.urljoin(base_data['ImageURL']))
            if not variation['IsInStock']:
                loader.replace_value('stock', 0)
            identifier = str(variation['VariationId'])
            item = loader.load_item()

            if (self.free_delivery_over is not None
                    and self.free_delivery_over <= item['price']):
                item['shipping_cost'] = '0.00'

            if item['identifier'] not in self.extracted_identifiers:
                self.extracted_identifiers.append(item['identifier'])
                item['metadata'] = metadata
                yield item

            # Badges
            printings = {
                elem['PrintingTypeID']: elem
                for elem in base_data['printingitems']
            }
            printing = printings.get(3)
            if printing:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                option_name = loader.get_output_value(
                    'name') + u' ' + printing['PrintingDescription']
                loader.replace_value('name', option_name)
                price = Decimal(str(variation['PriceActual'])) + Decimal(
                    str(printing['PriceActual']))
                loader.replace_value('price', format_price(price))
                identifier += '-' + str(printing['PrintingID'])
                loader.replace_value('identifier', identifier)
                item = loader.load_item()

                if (self.free_delivery_over is not None
                        and self.free_delivery_over <= item['price']):
                    item['shipping_cost'] = '0.00'

                if item['identifier'] not in self.extracted_identifiers:
                    self.extracted_identifiers.append(item['identifier'])
                    item['metadata'] = dict(metadata)
                    yield item
def parse_product(self, response):
    """Parse a glasses product page and its size / RX / similar options.

    Identifiers have the form ``'en' + id + suffix`` where the suffix is
    ``'A1'`` for URLs containing ``/rx-sunglasses/`` and ``'GB'`` otherwise.
    The id is the second-to-last URL path segment when it is an integer;
    otherwise it comes from the checked ``size-select`` input or, failing
    that, from the ``sku`` meta tag.

    Yields, in order: the bare product when the page has no size dropdown,
    RX options or size inputs; one item per size dropdown option; one item
    per RX option; one item per size input; and finally, for each product
    in the ``gridMixitup`` list, either a follow-up request (RX pages and
    pages with size inputs) or an item built from its microdata.

    :param response: product page response.
    :returns: generator of product items and ``Request`` objects.
    """
    is_rx = '/rx-sunglasses/' in response.url
    suffix = 'A1' if is_rx else 'GB'

    loader = ProductLoader(item=Product(), response=response)

    base_id = response.url.split('/')[-2]
    try:
        int(base_id)
    except ValueError:
        # The URL segment is not numeric: recover the id from the page.
        checked_size = response.xpath(
            '//input[@checked="checked" and @class="size-select"]/@value')
        if checked_size:
            base_id = checked_size.extract()[0]
        else:
            base_id = response.xpath(
                '//meta[@itemprop="sku"]/@content').extract()[0]
        base_id = base_id.replace('en', '').replace('GB', '')

    sku = 'en' + base_id + suffix
    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//span[@itemprop="name"]/text()')
    loader.add_xpath('name', '//span[@itemprop="color"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    loader.add_value('sku', sku)
    category = response.xpath(
        '//div[@id="addressbar"]//a/span/text()').extract()
    loader.add_value('category', category[:-1])
    loader.add_css('image_url', '.imgModello ::attr(src)')
    loader.add_css('brand', '.marchioProd ::text')

    hidden_stock_badge = response.css('.in-stock-big.hide')
    if hidden_stock_badge:
        availability = hidden_stock_badge.xpath(
            'following-sibling::span[1]/text()').extract_first()
        if availability not in self.in_stock:
            loader.add_value('stock', 0)

    select_options = response.xpath('//select[@name="size"]/option')
    rx_options = response.xpath('//label[@class="rx-type"]')
    size_options = response.xpath('//input[@class="size-select"]')

    if not (select_options or rx_options or size_options):
        # Single-variant product: take the size from the details table.
        loader.add_xpath(
            'name',
            '//td[contains(., "size:")]/following-sibling::td[1]/text()[1]')
        yield loader.load_item()

    item = loader.load_item()

    for option in select_options:
        option_sku = 'en' + option.xpath('@value').extract_first() + suffix
        option_loader = ProductLoader(item=Product(), selector=option)
        option_loader.add_value(None, item)
        option_loader.add_xpath('name', 'text()')
        option_loader.replace_value('identifier', option_sku)
        option_loader.replace_value('sku', option_sku)
        yield option_loader.load_item()

    for option in rx_options:
        rx_type = option.xpath('./@data-idlenterx').extract()[0]
        # The RX type replaces the '1' of the 'A1' suffix.
        rx_sku = 'en' + base_id + suffix.replace('1', rx_type)
        rx_loader = ProductLoader(item=Product(), selector=option)
        rx_loader.add_value(None, item)
        rx_loader.add_xpath('name', './strong[1]/text()')
        rx_loader.replace_value('identifier', rx_sku)
        rx_loader.replace_value('sku', rx_sku)
        rx_price = option.xpath(
            './following-sibling::div//*[@itemprop="price"]/@content'
        ).extract()
        rx_loader.replace_value('price', rx_price)
        yield rx_loader.load_item()

    for option in size_options:
        size_loader = ProductLoader(item=Product(), selector=option)
        size_loader.add_value(None, item)
        size_loader.add_xpath('name', './@data-size-label')
        size_sku = 'en' + option.xpath('@value').extract_first() + suffix
        size_loader.replace_value('identifier', size_sku)
        size_loader.replace_value('sku', size_sku)
        yield size_loader.load_item()

    follow_similar = is_rx or size_options
    other_options = response.xpath(
        '//ul[@class="gridMixitup"]//*[@itemtype="http://schema.org/Product"]')
    for option in other_options:
        if follow_similar:
            # These products need their own page to enumerate the options.
            url = option.xpath('.//*[@itemprop="url"]/@content').extract()[0]
            yield Request(url, callback=self.parse_product)
            continue

        self.log('Parsing similar product')
        similar = ProductLoader(item=Product(), selector=option)
        ident = option.xpath(
            './/*[@itemprop="productID"]/@content').extract()[0]
        ident = 'en' + ident + suffix
        similar.add_value('identifier', ident)
        similar.add_value('sku', ident)
        similar.add_xpath(
            'brand', './/*[@itemprop="brand"]/*[@itemprop="name"]/@content')
        similar.add_value('category', category[:-1])
        similar.add_xpath('url', './/*[@itemprop="url"]/@content')
        similar.add_xpath('image_url', './/*[@itemprop="image"]/@content')
        similar.add_xpath('price', './/*[@itemprop="price"]/@content')
        name_parts = (
            option.xpath('.//*[@itemprop="name"]/@content').extract()[0],
            option.xpath('./a/div[@class="name"]/text()').extract()[0],
        )
        similar.add_value('name', ' '.join(name_parts))
        yield similar.load_item()
def parse_product(self, response):
    """Parse a product page, yielding one item per product option.

    The price is taken from the first of three candidate table rows that
    yields a value ("Price per Pack", then ``mainPrice``, then ``price``).
    The identifier is the second-to-last path segment of the URL.  Prices
    below 175.00 get a shipping cost of 15.00.

    When the page has a ``ddlProductOptions`` dropdown, one item is yielded
    per option, using the option value as identifier and sku and the
    option's own price, shipping cost and stock.  Otherwise the single
    product item is yielded.

    :param response: product page response.
    :returns: generator of product items.
    """
    free_shipping_from = Decimal('175.00')
    paid_shipping = Decimal('15.00')

    price_xpaths = (
        '//tr[th[contains(text(),"Price per Pack")]]//span[@class="incVAT"]/text()',
        '//tr[contains(@class,"mainPrice")]//span[contains(@class,"incVAT")]/em/text()',
        '//tr[contains(@class,"price")]//span[contains(@class,"incVAT")]/em/text()',
    )
    price = []
    for price_xpath in price_xpaths:
        price = response.xpath(price_xpath).extract()
        if price:
            break

    categories = response.xpath(
        '//nav[@class="breadcrumbs"]/a/text()')[1:-1].extract()
    brand = response.xpath('//meta[@itemprop="brand"]/@content').extract()
    sku = response.xpath('//td[@itemprop="identifier"]/text()').extract()
    sku = sku[0].strip() if sku else ''
    identifier = response.url.split('/')[-2]
    image_url = response.xpath(
        '//a[@data-lightbox="product"]/img/@src').extract()
    if image_url:
        image_url = response.urljoin(image_url[0])
    name = response.xpath('//h1[@itemprop="name"]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('url', response.url)

    processed_price = loader.get_output_value('price')
    self.log(repr(processed_price))
    if processed_price and Decimal(processed_price) < free_shipping_from:
        loader.add_value('shipping_cost', paid_shipping)
    if image_url:
        loader.add_value('image_url', image_url)

    stock = response.xpath(
        '//li[@itemprop="availability"]/@content').extract()
    if stock and stock[0] != 'in_stock':
        loader.add_value('stock', 0)

    for category in categories:
        loader.add_value('category', category)
    loader.add_value('brand', brand)

    options = response.xpath('//select[@name="ddlProductOptions"]/option')
    if not options:
        yield loader.load_item()
        return

    # The same loader is reused for every option, so per-option fields must
    # be set with replace_value: add_value would keep accumulating values
    # from the base product and from earlier options.
    for opt in options:
        opt_name = opt.xpath('./text()')[0].extract()
        loader.replace_value('name', '{} {}'.format(name[0], opt_name))

        opt_value = opt.xpath('./@value')[0].extract()
        loader.replace_value('identifier', opt_value)
        loader.replace_value('sku', opt_value)

        opt_price = response.xpath(
            '//tr[(contains(@class,"price") or contains(@class,"mainPrice"))'
            ' and contains(@class,"option_{}")]'
            '//span[contains(@class,"incVAT")]/em/text()'.format(
                opt_value)).extract()
        loader.replace_value('price', opt_price)

        if Decimal(loader.get_output_value('price')) < free_shipping_from:
            loader.replace_value('shipping_cost', paid_shipping)
        else:
            loader.replace_value('shipping_cost', Decimal('0.00'))

        opt_stock = response.xpath(
            '//ul[contains(@class,"option_{}")]'
            '/li[@itemprop="availability"]/@content'.format(
                opt_value)).extract()
        if opt_stock and opt_stock[0] != 'in_stock':
            loader.replace_value('stock', 0)
        else:
            loader.replace_value('stock', 1)

        yield loader.load_item()