def parse_product(self, response):
    """Scrape one product page and yield a single ``Product`` item.

    Collects identifier (the ``pID`` query parameter of the info-button
    link), name, image, price, SKU (the "Art.Nr.:" line), category and
    stock, and derives a shipping cost from the "Gewicht" figure found in
    the description text.

    Raises:
        IndexError: if the identifier link, the name heading, or both price
            spans are absent from the page (the ``[0]`` lookups fail).
    """
    # (upper weight bound, shipping cost), checked in ascending order.
    # The unit is presumably kg, since the weight regex mentions "kg".
    shipping_tiers = (
        (3, 4.90),
        (10, 8.90),
        (19, 13.90),
        (60, 22.90),
        (100, 29.90),
        (150, 39.90),
        (220, 42.90),
    )
    heaviest_tier_cost = 49  # anything above the last bound

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    image_url = hxs.select('//div[@class="firstPic"]/a/img/@src').extract()
    product_identifier = hxs.select(
        '//div[@class="desc"]//img[contains(@src,"button_info")]/../@href'
    ).extract()[0]
    product_identifier = url_query_parameter(product_identifier, 'pID')
    product_name = hxs.select(
        '//div[@class="productInfo1"]/h1/text()').extract()[0].strip()

    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))

    # Prefer the discounted price span, fall back to the regular one.
    price = hxs.select('//span[@class="productNewPrice"]/text()').extract()
    if not price:
        price = hxs.select('//span[@class="price"]/text()').extract()
    price = extract_price(price[0])

    sku = ''
    for txt in hxs.select('//p[@class="basicData"]//text()').extract():
        if 'Art.Nr.:' in txt:
            sku = txt.replace('Art.Nr.:', '').strip()
            break

    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)

    category = hxs.select(
        '//*[@id="box_categories"]//li[@class="activeCat"]/a/text()'
    ).extract()
    product_loader.add_value('category', category)

    search_txt = ''.join(
        hxs.select('//div[@class="desc"]//text()').extract())
    match = re.search(r"Gewicht.*?(?::|kg)*.*?([\d,]+)", search_txt,
                      re.DOTALL | re.IGNORECASE)
    if match:
        try:
            weight = float(match.group(1).replace(',', '.'))
        except ValueError:
            # The capture can be something like "," or "1,2,3"; in that
            # case no shipping cost is recorded.
            weight = None
        if weight is not None:
            shipping_cost = heaviest_tier_cost
            for max_weight, cost in shipping_tiers:
                if weight <= max_weight:
                    shipping_cost = cost
                    break
            product_loader.add_value('shipping_cost', shipping_cost)

    # The quantity input of the cart form is only present when the product
    # can be ordered.
    in_stock = hxs.select(
        '//*[@id="cart_quantity"]//input[@name="products_qty"]')
    if not in_stock:
        product_loader.add_value('stock', 0)

    yield product_loader.load_item()
def parse_product_list(self, response):
    """Yield products from a listing page, retrying incomplete pages.

    Emits one item for the "featured product" box (if it has a link) and
    one per regular product tile. When either the featured box or the tile
    list is missing the page is requested again, up to 3 retries tracked in
    ``response.meta['retries']``.

    Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    cart_id_xpath = (
        u'.//input[@name="/com/castorama/CastShoppingCartFormHandler.productId"]/@value'
    )
    # Tried in order; the first one that yields a number wins.
    price_xpaths = (
        u'.//span[@class="newprice"]/text()',
        u'.//div[@class="price"]/text()',
    )

    def build_loader(node, description_class, illustration_class, name_xpath):
        """Return a filled ProductLoader for ``node``, or None to skip it."""
        loader = ProductLoader(item=Product(), selector=node)
        href = node.select(
            u'.//div[@class="%s"]/a/@href' % description_class).extract()
        if not href:
            return None
        url = urljoin_rfc(base_url, href[0])
        # Drop ";jsessionid"-style suffixes and the query string.
        loader.add_value('url', (url.split(';')[0]).split('?')[0])
        loader.add_xpath('name', name_xpath)

        identifier = node.select(cart_id_xpath).extract()
        if not identifier:
            identifier = node.select(
                './/div[@class="%s"]//img/@productid' % illustration_class
            ).extract()
        if not identifier or not identifier[0].strip():
            # Last resort: the "-<id>.html" tail of the product URL.
            url_match = re.search(r'-([\w]*)\.html', url)
            if url_match is None:
                self.log('>>> WARNING: no identifier found for => %s' % url)
                return None
            identifier = url_match.groups()
        loader.add_value('identifier', identifier[0])

        image_src = node.select(
            './/div[@class="%s"]//img/@src' % illustration_class).extract()
        if image_src:
            loader.add_value('image_url', urljoin_rfc(base_url, image_src[0]))

        for price_xpath in price_xpaths:
            price = node.select(price_xpath).re(u'([0-9\,\.\xa0]+)')
            if price:
                price = (price[0].replace(u'\xa0', '').replace(' ', '')
                         .replace(',', '.'))
                loader.add_value('price', price)
                break

        # if not loader.get_output_value('price'):
        loader.add_value('stock', 1)
        return loader

    featured_product = hxs.select(u'//div[@class="featuredProduct"]')
    featured_loader = build_loader(
        featured_product, 'fDescription', 'fIllustration',
        u'.//div[@class="fDescription"]/a/strong/text()')
    if featured_loader is not None:
        yield featured_loader.load_item()

    products = hxs.select(
        u'//div[contains(@class,"productsRow")]/div[contains(@class,"productItem")]'
    )
    for product in products:
        # "prodDecription" is spelled exactly as in the page selectors.
        loader = build_loader(
            product, 'prodDecription', 'illustration',
            u'.//div[@class="prodDecription"]/a/text()')
        if loader is None:
            continue
        # Load outside of the ``yield`` so the handler can never swallow the
        # GeneratorExit raised when the consumer closes this generator.
        try:
            item = loader.load_item()
        except Exception:
            self.log('>>> WARNING: load item error in => %s' % response.url)
            continue
        yield item

    if not products or not featured_product:
        log.msg('Retrying url: %s' % response.url, level=log.WARNING)
        retries = response.meta.get('retries', 0)
        if retries < 3:
            yield Request(response.url,
                          dont_filter=True,
                          meta={'retries': retries + 1})
def parse_product(self, response):
    """Yield one item per product option, or a single item without options.

    Option items are named ``"<name> <option label>"``, priced as base price
    plus the option's ``price`` attribute, and identified as
    ``"<product id>.<n>"`` where ``n`` counts only the options that were
    actually emitted (options lacking a label or a price are skipped and do
    not consume a number).
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name = hxs.select(
        '//div[@class="product-name"]/h1/text()')[0].extract()

    price_text = hxs.select(
        '//div[@class="product-main-info"]//div[@class="price-box"]/'
        'span[contains(@id, "product-price")]/span[@class="price"]/text()'
    ).extract()
    if not price_text:
        # No regular price node: use the special-price one instead.
        price_text = hxs.select(
            '//div[@class="product-main-info"]//div[@class="price-box"]/'
            'p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
    price = extract_price(price_text[0].strip())

    identifier = hxs.select('//p[@class="product-ids"]/text()').re(
        'Product ID: (.*)')[0]

    image_url = hxs.select('//a[@id="main-image"]/@href').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])

    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()

    def make_item(item_name, item_price, item_identifier):
        # Shared by the option and the no-option paths.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        loader.add_value('sku', identifier)
        loader.add_value('identifier', item_identifier)
        if image_url:
            loader.add_value('image_url', image_url)
        if category:
            loader.add_value('category', category[-1])
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        return loader.load_item()

    options = hxs.select('//ul[contains(@class, "options-list")]/li')
    if not options:
        yield make_item(name, price, identifier)
        return

    emitted = 0
    for option_node in options:
        label = option_node.select(
            './span[@class="label"]/label/text()').extract()
        if not label:
            continue
        surcharge = option_node.select('./input/@price').extract()
        if not surcharge:
            continue
        yield make_item(name + ' ' + label[0].strip(),
                        price + extract_price(surcharge[0]),
                        '%s.%s' % (identifier, emitted))
        emitted += 1
def parse(self, response):
    """Build products from the two XML feeds published on the client SFTP.

    Downloads the newest ``CRC_PRICEFEED_UK`` archive (SKU -> list price,
    taken from the ``UKRP`` price list) and the newest
    ``PriceMonitorHandler`` archive (SKU -> product attributes), joins them
    on SKU and yields one item per priced SKU that has product data.

    Appends a warning to ``self.errors`` when the price feed is 72 hours old
    or older. SKUs present in the price feed but missing from the product
    feed are logged and skipped.
    """
    # NOTE(review): the credentials below are masked in this source; they
    # presumably need to come from configuration - confirm.
    password = "******"
    username = "******"

    price_zip = HERE + '/CRC_PRICEFEED_UK.zip'
    price_xml = HERE + '/CRC_PRICEFEED_UK.xml'
    products_zip = HERE + '/PriceMonitorHandler.zip'
    products_xml = HERE + '/PriceMonitorHandler.xml'

    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        files = sftp.listdir_attr()

        last = get_last_file("CRC_PRICEFEED_UK", files)
        date_file = datetime.fromtimestamp(last.st_mtime)
        hours_diff = (datetime.now() - date_file).total_seconds() / 3600

        # Check file updates
        if hours_diff >= 72:
            self.errors.append('WARNING: No Update for 3 days')

        # A former check that warned when a CRC_PRICEFEED_USA file had been
        # uploaded within the last 32 hours was disabled here.

        sftp.get(last.filename, price_zip)

        last = get_last_file("PriceMonitorHandler", files)
        sftp.get(last.filename, products_zip)
    finally:
        # Closing the transport also closes the SFTP channel opened on it,
        # so the connection is released even if a download fails.
        transport.close()

    unzip(price_zip, price_xml)
    with open(price_xml) as f:
        tree = et.fromstring(f.read())
    sku_prices = {}
    for item in tree.find('priceList[@id="UKRP"]').find('prices').findall(
            'price'):
        sku_prices[item.find('skuId').text] = item.find('listPrice').text

    unzip(products_zip, products_xml)
    with open(products_xml) as f:
        tree = et.fromstring(f.read())
    sku_products = {}
    for item in tree.find('skus').findall('sku'):
        sku_id = item.find('skuID').text
        sku_products[sku_id] = {
            'identifier': sku_id,
            'category': item.find('CategoryDescription').text,
            'brand': item.find('BrandDescription').text,
            'image_url': item.find('ImageURL').text,
            'url': item.find('ProductURL').text,
            'name': item.find('SkuDescription').text,
            'sku': sku_id,
            'stock': item.find('SkuQuantity').text,
        }

    for sku, price in sku_prices.items():
        try:
            product = sku_products[sku]
        except KeyError:
            log.msg('SKU not found:%s' % sku)
            continue
        product['price'] = price
        loader = ProductLoader(response=response, item=Product(product))
        yield loader.load_item()
def load_item(self, item, name, identifier, price, response):
    """Fill and return a ``ProductLoader`` for one listing page.

    Args:
        item: selector for the listing page; its ``response.url`` becomes
            the product URL.
        name: product name to store.
        identifier: product identifier to store.
        price: raw price text, or None to fall back to
            ``self._get_item_price(item)``.
        response: response whose ``meta['item_meta']`` supplies the brand
            and the values for every field in ``self._match_fields``.

    Returns:
        The populated loader (``load_item()`` has not been called on it).
    """
    try:
        category = item.select(
            '//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract().pop()
    except IndexError:
        category = ''

    seller_id = ''.join(
        item.select('.//*[contains(@class, "si-content")]'
                    '//a/*[@class="mbg-nw"]/text()').extract())

    item_meta = response.meta['item_meta']
    brand = item_meta.get('brand')
    if not brand:
        # The value cell after the "Brand" label holds its text either
        # directly or inside an <h2>/<h3>; take the first non-blank hit.
        # A list is built explicitly so the emptiness and ``isinstance``
        # checks below do not depend on ``filter()`` returning a list.
        brand_cell = ('//*[@class="attrLabels" and contains(text(), "Brand")]'
                      '/following-sibling::*[1]')
        for suffix in ('/text()', '/h2/text()', '/h3/text()'):
            brand = [text for text in item.select(brand_cell + suffix).extract()
                     if text.strip() != '']
            if brand:
                break

    product_loader = ProductLoader(item=Product(), selector=item)
    for field in self._match_fields:
        product_loader.add_value(field, item_meta.get(field, None))
    product_loader.add_value('name', name)
    product_loader.add_value('category', category)
    product_loader.add_value('dealer', 'eBay - ' + seller_id)
    product_loader.add_value('identifier', identifier)

    sku = item.select(
        '//tr[td[contains(text(), "Modell")]]/td/span/text()').extract()
    product_loader.add_value('sku', sku[-1] if sku else '')

    if brand:
        product_loader.add_value(
            'brand', brand[0] if isinstance(brand, list) else brand)

    product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
    product_loader.add_value('url', item.response.url)

    price = extract_price(
        price) if price is not None else self._get_item_price(item)
    product_loader.add_value('price', price)

    # stock amount
    if self._extract_stock_amount:
        stock = ''
        try:
            in_stock = ''.join(
                item.select('//*[@id="qtySubTxt"]//text()').extract())
            # The first longest run of digits in the text is taken as the
            # quantity.
            digit_runs = re.findall(r"([\d]+)", in_stock)
            if digit_runs:
                stock = max(digit_runs, key=len)
            if 'More than' in in_stock:
                # presumably "More than 10 available" - TODO confirm
                stock = 11
        except Exception:
            # Stock is best-effort: a failure here must not lose the item.
            pass
        if stock:
            product_loader.add_value('stock', stock)

    # shipping cost
    try:
        shipping_cost = item.select(
            '//*[@id="shippingSection"]//td/div/text()').extract()[0]
    except IndexError:
        shipping_cost = None
    if shipping_cost:
        if 'free' in shipping_cost.lower():
            product_loader.add_value('shipping_cost', 0)
        else:
            product_loader.add_value('shipping_cost',
                                     extract_price(shipping_cost))

    return product_loader
def parse_item(self, response):
    '''Yield one item per ``skuArray.push({...})`` entry in the page source.

    Each entry is expected to span several lines, one ``key: value,`` pair
    per line, for example:

    skuArray.push({
        productexternalid: 72833,
        colour: 'Light Grey/Grey',
        size: '49',
        skuNopId: 91684,
        skuId: 227272,
        price: '£90.00',
        priceAsDecimal: 90.0000,
        stockquantity: 0,
        preorder: true,
        outofstock: true,
        issubscribed: false,
        availableDate: 'Due in 02/07/2015'
    });
    '''
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products_data = []
    current_product = None  # dict while inside a push({...}) block
    for line in response.body.split('\n'):
        if 'skuArray.push({' in line:
            current_product = {}
            continue
        if current_product is None:
            continue
        if '});' in line:
            products_data.append(current_product)
            current_product = None
            continue

        # Split on the FIRST colon only, so values that contain a colon
        # are kept whole; skip lines that are not "key: value".
        key, separator, raw_value = line.partition(':')
        raw_value = raw_value.strip()
        if not separator or not raw_value:
            continue

        # SECURITY: eval() runs on text taken from the downloaded page. The
        # values are expected to be plain literals; ast.literal_eval would
        # be the safe replacement - left as is pending review.
        value = eval(raw_value.replace('false', 'False').replace(
            'true', 'True'))
        # A trailing comma makes the literal evaluate to a 1-tuple.
        if isinstance(value, tuple):
            value = value[0]
        current_product[key.strip()] = value

    main_name = hxs.select(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()
    # The first breadcrumb entry is dropped.
    categories = hxs.select(
        '//div[@id="breadcrumb"]//span[@itemprop="title"]/text()').extract(
        )[1:]

    for p in products_data:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath(
            'image_url', '//img[@itemprop="image"]/@src',
            lambda a: urljoin_rfc(base_url, a[0]) if a else '')
        loader.add_value('identifier', p['skuId'])
        loader.add_value('sku', p['productexternalid'])
        loader.add_value('price', p['priceAsDecimal'])
        loader.add_value('stock', p['stockquantity'])
        loader.add_value('category', categories)
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('url', response.url)
        loader.add_value(
            'name', main_name + ' - ' + p['colour'] + ' - ' + p['size'])
        yield loader.load_item()
def parse_product(self, response):
    """Yield items for a product page in one of three layouts.

    1. A table of child rows (``Multi-Child_Background``): one item per row.
    2. ``<select>`` boxes in the options table: one item per entry of the
       first select, all sharing the page's product code and price.
    3. Otherwise: a single item for the page.

    Stock is set to 0 when the "Call for best price!" label is present.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    image_url = hxs.select(
        '//meta[@property="og:image"]/@content').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])

    category = hxs.select(
        '//td[contains(@class,"breadcrumb")]//a/text()').extract()
    if category:
        category = category[-1].strip()

    brand = hxs.select(
        '//div[contains(@id, "ProductDetail_Tech")]//table//tr/td[contains(text(),"Manufacturer")]/following-sibling::td/text()'
    ).extract()
    if brand:
        brand = brand[0].strip()

    out_of_stock = hxs.select(
        '//td[contains(@id,"productdetail-action-wrapper")]//span[contains(text(),"Call for best price!")]/text()'
    )

    def add_shared_fields(loader):
        # Fields that are identical for every item built from this page.
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        if out_of_stock:
            loader.add_value('stock', 0)

    def add_page_level_fields(loader):
        # Code and price taken from the page itself (layouts 2 and 3).
        loader.add_xpath('sku', '//span[@class="product_code"]/text()')
        loader.add_xpath('identifier', '//span[@class="product_code"]/text()')
        page_price = hxs.select(
            '//span[@itemprop="price"]/text()').extract()
        loader.add_value('price', page_price[0] if page_price else 0)
        add_shared_fields(loader)

    child_rows = hxs.select('//tr[@class="Multi-Child_Background"]')
    if child_rows:
        for row in child_rows:
            loader = ProductLoader(item=Product(), selector=row)
            loader.add_xpath(
                'name',
                'td[@class="productnamecolorSMALL colors_productname"]/text()'
            )
            loader.add_xpath('sku',
                             'td[@class="smalltext colors_text"]/text()')
            loader.add_xpath('identifier',
                             'td[@class="smalltext colors_text"]/text()')
            loader.add_xpath(
                'price',
                'td[@class="smalltext colors_text"]/b/div/div/span/text()')
            add_shared_fields(loader)
            yield loader.load_item()
        return

    option_selects = hxs.select('//table[@id="options_table"]//select')
    if option_selects:
        choices_per_select = [
            select.select('option/text()').extract()
            for select in option_selects
        ]
        name = hxs.select(
            '//span[@itemprop="name"]/text()').extract()[0]
        # NOTE(review): every entry of the first select gets ALL entries of
        # the remaining selects appended; this is not a cartesian product.
        appended = ''.join(' ' + choice
                           for choices in choices_per_select[1:]
                           for choice in choices)
        for first_choice in choices_per_select[0]:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name',
                             name + ' ' + first_choice + appended)
            add_page_level_fields(loader)
            yield loader.load_item()
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//span[@itemprop="name"]/text()')
    add_page_level_fields(loader)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per entry of the "Options" select, else one item.

    The price comes from the sale-price node, then the price table, then
    the hidden ``baseprice`` input. The category is read from
    ``response.meta['cat_name']`` or ``response.meta['_product']['category']``.
    Option items are identified as ``"<sku>-<option value>"``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    sold_out = hxs.select(
        "//form/img[contains(concat('',@src,''), 'soldout')]"
    ).extract()

    # Fill up the Product model fields
    url = response.url
    name = hxs.select("//div[@class='product-order']/h1/text()").extract()[0]
    shipping_cost = ''

    price = ''
    if not sold_out:
        price = (
            hxs.select(
                "//div[@id='pit']//li[@class='rbsalep']/text()").extract()
            or hxs.select(
                "//div[@id='pit']//ul/table//table//tr[2]/td[2]/text()"
            ).extract()
            or ''
        )
    if not price:
        log.msg(' ::::: Base price :::::')
        log.msg(response.url)
        base_price = hxs.select('//input[@id="baseprice"]/@value').extract()
        price = base_price[0] if base_price else ''

    if 'cat_name' in response.meta:
        category = response.meta["cat_name"]
    else:
        category = response.meta['_product']['category']

    image_url = hxs.select("//div[@class='details-left']/table/tr/td/a/img/@src").extract()

    brand = hxs.select('//div[@class="about-item"]/ul/li[contains(b/text(),"Manufacturer:")]/text()').extract()
    if not brand:
        self.log("ERROR brand not found")
        brand = ''

    sku_matches = hxs.select('//div[@class="about-item"]/ul/li[contains(b/text(),"SKU:")]/text()').extract()
    if sku_matches:
        sku = sku_matches[0]
    else:
        self.log("ERROR sku not found")
        sku = ''

    # Stock for the single-item case: 1, 0, or None when the page shows
    # neither the add-to-cart block nor the sold-out image.
    instock = hxs.select('//form[@id="cartForm"]//div[@id="addtocart"]/@id').extract()
    page_stock = None
    if instock:
        page_stock = 1
    else:
        outofstock = hxs.select('//form[@id="cartForm"]/img[contains(@src,"soldout.gif")]/@src').extract()
        if outofstock:
            page_stock = 0
        else:
            self.log("ERROR outofstock not found, instock not found")

    def make_item(item_name, item_identifier, stock):
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', url)
        loader.add_value('name', item_name)
        loader.add_value('price', price)
        loader.add_value('sku', sku)
        loader.add_value("identifier", item_identifier)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('shipping_cost', shipping_cost)
        if stock is not None:
            loader.add_value("stock", int(stock))
        if brand:
            loader.add_value('brand', brand)
        return loader.load_item()

    options = hxs.select('//select[@id="Options"]/option[@value!="Select Options"]')
    if not options:
        yield make_item(name, sku, page_stock)
        return

    # Option items always carry a stock value: 1 when the add-to-cart block
    # is present, 0 otherwise.
    option_stock = 1 if instock else 0
    for option in options:
        option_name = option.select('text()').extract()[0]
        option_id = option.select('@value').extract()[0]
        yield make_item(name + ' - ' +option_name, sku+'-'+option_id,
                        option_stock)
def _start_requests(self):
    """Yield one request for a fixed product page, handled by ``parse_product``.

    NOTE(review): the leading underscore suggests this is a debugging entry
    point that is not active as ``start_requests`` - confirm.
    """
    product_url = 'http://www.advantage-catering-equipment.co.uk/sterling-pro-triple-door-bottle-cooler.html'
    request = Request(product_url,
                      callback=self.parse_product,
                      meta={'product': Product()})
    yield request
def parse_product(self, response):
    """Yield one item per variant listed in the page's inline JavaScript.

    Variant data is read from the ``allVariants`` / ``variantsAray`` script
    variables and per-SKU images from ``mediaJSON``. When no variants are
    found, the base product is yielded once. Variants priced below 50 get a
    shipping cost of 5.99.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    name = ''.join(hxs.select('//h1//text()').extract()).strip()
    product_loader.add_value('name', name)
    product_loader.add_value('brand', 'CamelBak')
    # The first breadcrumb entry is dropped.
    category = hxs.select(
        '//div[@class="breadcrumb"]/ul/li/a/text()').extract()[1:]
    product_loader.add_value('category', category)

    # SECURITY: both eval() calls below run on text taken from the
    # downloaded page. The captured values look like list/dict literals;
    # ast.literal_eval or json.loads would be the safe replacement - left
    # as is pending review.
    options_values = hxs.select(
        '//script[contains(text(), "var allVariants={")]/text()').re(
            r'var variantsAray=(\[.*\]);')
    if options_values:
        options_values = eval(options_values[0])

    options = hxs.select(
        '//script[contains(text(), "var allVariants={")]/text()').re(
            r'allVariants={"variants":(\[.*\,])\}\;')
    if options:
        options = eval(options[0])

    # skuId -> resized swatch image URL ('' when the swatch is missing)
    option_images = {}
    media_json = re.findall("var mediaJSON='(.*)';if", response.body)
    if media_json and media_json[0]:
        images = json.loads(media_json[0])
        for image in images["imageList"]:
            sku = image.get('skuId', None)
            if not sku:
                continue
            option_image = hxs.select('//div[@data-value="' +
                                      image['colour'] +
                                      '"]/img/@src').extract()
            if option_image:
                image_url = add_or_replace_parameter(
                    option_image[0], 'wid', '500')
                image_url = add_or_replace_parameter(
                    image_url, 'hei', '500')
            else:
                image_url = ''
            option_images[sku] = image_url
        initial_image = images['initialImage']['imageURL']
        product_loader.add_value('image_url', initial_image)

    product = product_loader.load_item()

    if options and options_values:
        for option in options:
            prod = Product(product)
            sku = option['skuId']
            prod['identifier'] = sku
            prod['sku'] = sku
            # Compare by value: ``is not 'null'`` tested object identity and
            # only worked when the interpreter happened to intern the string.
            attribute_text = ' '.join(
                option[k] for k in options_values if option[k] != 'null')
            prod['name'] = (prod['name'].strip() + ' ' +
                            attribute_text.decode('utf-8'))
            prod['price'] = extract_price(option['RP'])
            if option['isInStock'] != 'true':
                prod['stock'] = 0
            variant_image = option_images.get(sku, '')
            if variant_image:
                prod['image_url'] = variant_image
            if prod['price'] < 50:
                prod['shipping_cost'] = 5.99
            yield prod
    else:
        yield product
def parse_product_option(self, response):
    """Yield items for every option combination of one product variant page.

    The base name and identifier come from ``response.meta`` (``name`` +
    ``option_name``) and the page's ``ProdID`` + ``option_id``. Each
    ``<select>`` on the page contributes one option per item:

    * several selects: one item per combination (cartesian product);
    * one select: one item per option;
    * no select: a single item.

    Options whose label contains "do not add" extend the identifier but not
    the name or price. Every item gets ``KitBagMeta`` metadata with the size
    from ``response.meta['size']`` and, when the item name matches a player
    in ``self.teams``, that player's name and number.

    Pages containing "The item is not currently available." yield nothing.
    """
    if "The item is not currently available." in response.body:
        return

    option_name = response.meta.get('option_name')
    option_id = response.meta.get('option_id')
    category = response.meta.get('category')
    name = response.meta.get('name')
    url = response.meta.get('url')

    sku = response.xpath(
        '//span[@itemprop="productID"]/text()').extract_first()
    name += ' ' + option_name
    price = response.xpath(
        '//input[@name="ActProdPrice"]/@value').extract_first()
    price = extract_price(price)
    image_url = response.xpath('//*[@id="main_img"]/@src').extract_first()
    brand = response.xpath(
        '//input[@name="ProdMfgName"]/@value').extract_first()
    out_of_stock = response.xpath(
        '//div[@class="outofstockdiv itemgroup-outofstock"]'
    ).extract_first()
    identifier = response.xpath(
        '//input[@name="ProdID"]/@value').extract_first()
    identifier += '_' + option_id

    # One list of (value, label, price) tuples per <select> on the page.
    options_containers = response.xpath(
        '//div[@class="prodpageoptionvalue"]/select')
    combined_options = []
    for options_container in options_containers:
        element_options = []
        for option in options_container.xpath('./option[@value!=""]'):
            value = option.xpath('./@value').extract_first()
            label = option.xpath('./text()').extract_first()
            label, label_price = extract_option_price(label)
            element_options.append((value, label, label_price))
        combined_options.append(element_options)

    def find_player(product_name):
        """Return (name, number) of the first matching player, or None."""
        upper_name = product_name.upper()
        for players in self.teams.values():
            for player in players.values():
                player_name = player['name'].decode('utf')
                if (player_name.upper() in upper_name
                        or upper_name.split()[0] == player_name.upper()):
                    return player_name, player['number']
        return None

    def build_item(o_name, o_price, o_option_id):
        """Load one option item and attach its KitBagMeta metadata."""
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', o_name)
        loader.add_value('identifier', o_option_id)
        loader.add_value('sku', sku)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_value('image_url', response.urljoin(image_url))
        loader.add_value('price', o_price)
        loader.add_value('brand', brand)
        if out_of_stock:
            loader.add_value('stock', 0)
        if o_price < self.free_shipping_over:
            loader.add_value('shipping_cost', self.shipping_cost)
        option_item = loader.load_item()

        metadata = KitBagMeta()
        metadata['size'] = response.meta['size']
        # The first matching player wins in every branch.
        player = find_player(option_item['name'])
        if player is not None:
            metadata['player'], metadata['number'] = player
        option_item['metadata'] = metadata
        return option_item

    if len(options_containers) > 1:
        for combination in itertools.product(*combined_options):
            o_name, o_price, o_option_id = name, price, identifier
            for value, label, label_price in combination:
                o_option_id = o_option_id + '_' + value
                if 'do not add' not in label.lower():
                    o_name = o_name + ' ' + label
                    o_price = o_price + label_price
            yield build_item(o_name, o_price, o_option_id)
    elif combined_options:
        for value, label, label_price in combined_options[0]:
            # Start from the base name/price for EVERY option so that a
            # "do not add" option cannot inherit the previous option's
            # label and surcharge.
            o_name, o_price = name, price
            if 'do not add' not in label.lower():
                o_name = name + ' ' + label
                o_price = price + label_price
            yield build_item(o_name, o_price, identifier + '_' + value)
    else:
        yield build_item(name, price, identifier)
def _start_requests(self):
    """Yield one request for a fixed product page, handled by ``parse_product``.

    NOTE(review): the leading underscore suggests this is a debugging entry
    point that is not active as ``start_requests`` - confirm.
    """
    product_url = 'http://www.notebooksbilliger.de/logitech+k830+illuminated+living+room+keyboard/eqsqid/dc034145-ba5e-417d-b751-99748adbb8b8'
    request = Request(product_url,
                      meta={'product':Product()},
                      callback=self.parse_product)
    yield request
def parse_product(self, response):
    """Scrape a product page; options are completed by a stock request.

    The identifier is the last dash-separated token of the URL's file name.
    Pages whose identifier is already in ``self.seen_ids`` are skipped.

    Without options the product is yielded directly. With options, one copy
    of the product per option is collected and handed to
    ``self.parse_stock`` through the ``items`` meta key of a stock-lookup
    request.

    Shipping cost is '2.00' below a price of 20 and '4.99' below 40; no
    shipping cost is set from 40 upwards.
    """
    log.msg(response.url)
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    identifier = response.url.split('/')[-1].split('-')[-1].split('.')[0]
    log.msg('Identifier: %s' % identifier)
    log.msg(repr(self.seen_ids))
    if identifier in self.seen_ids:
        return
    self.seen_ids.append(identifier)
    loader.add_value('identifier', identifier)

    sku = hxs.select('//p[@class="pmeta"]/text()').re(r'(\d+)')
    loader.add_value('sku', sku)

    # The heading may hold a second text node with extra details that is
    # appended to the name later.
    name = hxs.select('//div[@class="prod-box"]/h1//text()').extract()
    extra_data = name[1].strip() if len(name) > 1 else ''
    loader.add_value('name', name[0])

    # price
    price = re.sub(
        '[\r\n\t]+', ' ',
        hxs.select(
            '//h5[@class="product-price"]//div[contains(@id,"StaticPrice")]/span/text()[normalize-space()]'
        )[0].extract())
    loader.add_value('price', price)

    # image_url
    image_url = hxs.select('//img[@class="product-image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    # brand
    loader.add_value('brand', 'Le Creuset')

    # category (the first two breadcrumb links are dropped)
    category = hxs.select('//ul[@class="breadcrumbs"]')[0].select(
        './/a/text()').extract()
    loader.add_value('category', ' > '.join(category[2:]))

    # shipping_cost
    price = Decimal(loader.get_output_value('price'))
    if price < 20:
        loader.add_value('shipping_cost', '2.00')
    elif price < 40:
        loader.add_value('shipping_cost', '4.99')

    product = loader.load_item()

    options = hxs.select('.//select/option[contains(@class,"%s")]' %
                         identifier)
    if not options:
        product['name'] += ' %s' % extra_data
        yield product
        return

    sid = hxs.select(
        '//input[@type="hidden" and @name="SID"]/@value')[0].extract()
    # "&timestamp" had been mangled into the multiplication sign followed by
    # "tamp" (the HTML entity "&times" got decoded), which broke the query
    # string.
    stock_url = 'http://www.hartsofstur.com/cgi-bin/st000001.pl?ACTION=GETSTOCK&REF=%(identifier)s&SID=%(sid)s&timestamp=%(timestamp)s'

    items = []
    for option in options:
        item = copy.deepcopy(product)
        option_name = option.select('./text()')[0].extract().strip()
        option_identifier = option.select('./@class').re(r'_(\d+)_')[0]
        self.seen_ids.append(option_identifier)
        item['identifier'] = "%s_%s" % (identifier,
                                        option_identifier.strip())
        item['name'] += ' %s %s' % (option_name, extra_data)
        item['name'] = item['name'].strip()
        items.append(item)

    yield Request(stock_url % {
        'identifier': identifier,
        'sid': sid,
        'timestamp': int(time.time())
    },
                  meta={'items': items},
                  callback=self.parse_stock)
def parse_product(self, response):
    """Yield one item: name from ``response.meta['name']``, price from the page."""
    hxs = HtmlXPathSelector(response)
    item_loader = ProductLoader(item=Product(), response=response)
    item_loader.add_value('name', response.meta['name'])
    item_loader.add_xpath('price', '//*[@id="price-text"]/span/text()')
    item = item_loader.load_item()
    yield item
def parse(self, response):
    """Yield the product on the page whose SKU matches the requested one.

    ``response.meta`` supplies ``sku``, ``notes`` (an alternative SKU) and
    ``name``. The page's main product is checked first; failing that, the
    sub-products of the main container, or else those of the secondary
    container, are scanned. At most one item is yielded.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    sku = response.meta['sku']
    sec_sku = response.meta['notes']
    name = response.meta['name'].encode('ascii', 'ignore')
    wanted = (sku, sec_sku)

    main_product = hxs.select("//div[@id='Product-MainProduct']")
    main_products = hxs.select("//div[@id='Product-MainProductContainer']//div[@class='Product-SubProduct']")
    secondary_products = hxs.select("//div[@id='Product-SubProductContainer']//div[@class='Product-SubProduct']")

    def build_item(price_text):
        # All three page layouts produce the same item shape.
        loader = ProductLoader(item=Product(), response=response, selector=hxs)
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('price', price_text)
        loader.add_value('sku', sku)
        return loader.load_item()

    def first_matching_price(rows, sku_xpath, price_xpath, missing_sku_msg):
        # Return the price text of the first row whose SKU is wanted, or None
        # when nothing matches or the matching row has no price.
        for row in rows:
            row_sku = row.select(sku_xpath).re("#(.+)")
            if row_sku:
                row_sku = row_sku[0]
            else:
                logging.error(missing_sku_msg % url)
            if row_sku in wanted:
                found = row.select(price_xpath).re("\$(.*)")
                if not found:
                    logging.error('ERROR!! NO SEC PRICE!! %s "%s" "%s"' % (sku, name, url))
                    return None
                return found[0].strip()
        return None

    main_sku = main_product.select(".//div[@id='Product-lblItem']/span[@id='lblItem']/text()").extract()
    if main_sku:
        main_sku = main_sku[0]
    else:
        logging.error("NO MAIN SKU! %s" % url)

    if main_sku in wanted:
        # The main product itself is the one requested.
        found = main_product.select(".//div[@class='Product-Price']/span[@id='lblClubPrice']/b/font/text()").re("\$(.*)")
        if not found:
            logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, url))
            return
        yield build_item(found[0].strip())
        return

    if main_products:
        price_text = first_matching_price(
            main_products,
            ".//div[@class='Product-SubProductNumber']/font/text()",
            ".//span[contains(@id, 'lblClubPrice')]/b/font/text()",
            "NO MAIN SKU! %s")
        if price_text is not None:
            yield build_item(price_text)
        return

    if secondary_products:
        price_text = first_matching_price(
            secondary_products,
            ".//div[@class='Product-SubProductNumber']/text()",
            ".//span[contains(@id, 'lblClubPrice2')]/b/font/text()",
            "NO SECONDARY SKU! %s")
        if price_text is not None:
            yield build_item(price_text)
        return

    logging.error("No products found!")
def parse_product(self, response):
    """Yield one item per colour option, or a single item for a plain product.

    When the page lists more than one colour input, the ``spConfig`` script
    is parsed and one item per option of attribute ``'76'`` is emitted, named
    "<title> <colour label>" and priced from the matching ``childProducts``
    entry. Otherwise a single item is built from the page's name and regular
    price.

    Changes from the previous version:
    * the JSON payload is everything after the first "(" in the script, so
      parentheses inside the payload (e.g. in labels) are no longer stripped;
    * an option whose child product is missing gets an empty price, not a
      ``TypeError``;
    * the title XPath is evaluated once, not once per option;
    * the unused ``base_url`` lookup is gone.

    :param response: the product page response; ``meta`` must carry
        ``cat_name``.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    category = response.meta["cat_name"]
    image_url = hxs.select(
        "//div[contains(@class, 'product-img-box')]"
        "/p[contains(@class, 'product-image')]/img/@src").extract()

    def build_item(name, price):
        # Both layouts share the same fields; an empty price is stored as "".
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('price', price if price else "")
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        return loader.load_item()

    colours = hxs.select("//div[@class='colours']//input").extract()
    if len(colours) > 1:
        raw_script = hxs.select(
            "//script[contains(text(), 'spConfig')]/text()").extract()[0]
        # Drop the "new Product.Config(" prefix, then cut the object just
        # before "priceFromLabel" and close it again.
        payload = raw_script.partition("(")[2]
        payload = payload.split(',"priceFromLabel"')[0] + '}'
        js = json.loads(payload)

        titles = hxs.select(
            "//div[@class='product-name']/h1/text()").extract()
        for option in js['attributes']['76']['options']:
            code = option['products'][0]
            child = js['childProducts'].get(code)
            price = child['finalPrice'] if child else ""
            yield build_item(titles[0] + " " + option['label'], price)
    else:
        name = hxs.select(
            "//div[@class='product-name']/h1/text()").extract()
        price = hxs.select(
            "//div[@class='price-box']//span[@class='regular-price']"
            "/span/text()").extract()
        yield build_item(name, price)
def parse_product(self, response):
    """Yield the product, or one item per configurable / bundle option.

    A base item is built from the page. If the page embeds a
    ``Product.Config`` JSON, one item per child product is yielded with the
    option labels appended to the name and the option price deltas added to
    the base price. Otherwise, if it embeds a ``Product.Bundle`` JSON, one
    item per combination of bundle selections is yielded. Otherwise the base
    item is yielded.

    Changes from the previous version:
    * option items are deep copies of the base item, so an option priced
      above the threshold used to keep the base item's ``shipping_cost``;
      it is now removed so the shipping rule is applied to each option's
      own price;
    * a bundle with no option groups yields the base identifier and name
      unchanged, where it used to raise ``KeyError``.

    :param response: the product page response; ``meta`` may carry ``brand``.
    """
    shipping_threshold = 49.99
    shipping_cost = 2.95

    hxs = HtmlXPathSelector(response)
    brand = response.meta.get('brand', '')
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@id="productname"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', brand)
    # The brand doubles as the category.
    loader.add_value('category', brand)

    identifier = response.xpath(
        '//input[@name="product"]/@value').extract()
    if not identifier:
        log.msg('PRODUCT WHIOUT IDENTIFIER: ' + response.url)
        return
    loader.add_value('sku', identifier[0])
    loader.add_value('identifier', identifier[0])

    image_url = response.css('.main-image img::attr(src)').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])

    # Try the three known price layouts in order.
    price = response.xpath('//span[@id="product-price-' + identifier[0] +
                           '" and @class="price"]/text()').extract()
    if not price:
        price = response.xpath('//span[@id="product-price-' + identifier[0] +
                               '"]/span[@class="price"]/text()').extract()
    if not price:
        price = hxs.select(
            '//div[@id="product_price"]//span[@class="price"]/text()'
        ).extract()
    loader.add_value('price', price[-1])

    in_stock = response.xpath('//p[@class="availability in-stock"]')
    if not in_stock:
        loader.add_value('stock', '0')
    if loader.get_output_value('price') <= shipping_threshold:
        loader.add_value('shipping_cost', shipping_cost)
    item = loader.load_item()

    def finalize(option_item):
        # Re-apply the stock and shipping rules to the option's own price.
        if not option_item['price']:
            option_item['stock'] = 0
        if option_item['price'] <= shipping_threshold:
            option_item['shipping_cost'] = shipping_cost
        elif 'shipping_cost' in option_item:
            del option_item['shipping_cost']
        return option_item

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        products = {}
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    # Names accumulate as " - label1 - label2 ...".
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
                    prices[product] = prices.get(
                        product, 0) + extract_price(option['price'])

        for option_identifier, option_name in products.iteritems():
            option_item = deepcopy(item)
            option_item['identifier'] += '-' + option_identifier
            option_item['name'] += option_name
            option_item['price'] += prices[option_identifier]
            yield finalize(option_item)
        return

    options_bundle = re.search(r'new Product.Bundle\((.*)\)', response.body)
    if not options_bundle:
        yield item
        return

    log.msg('OPTION BUNDLE: ' + response.url)
    product_data = json.loads(options_bundle.groups()[0])
    selection_groups = [
        [(selection_id, selection['name'])
         for selection_id, selection in group['selections'].iteritems()]
        for group in product_data['options'].itervalues()
    ]
    for combination in itertools.product(*selection_groups):
        option_item = deepcopy(item)
        option_item['identifier'] += ''.join(
            '-' + selection_id for selection_id, _ in combination)
        option_item['name'] += ''.join(
            ' ' + selection_name for _, selection_name in combination)
        yield finalize(option_item)
def parse_product(self, response):
    """Yield one item per stock option found on a model page.

    Three layouts are handled, in order: table rows with class ``item-row``;
    hidden ``HidStockOptionDetails`` inputs carrying a JSON payload; and,
    when neither is present, a single item built from the page itself.
    Shipping cost is only attached when the item price is below 50.
    """
    hxs = HtmlXPathSelector(response)

    category = response.xpath(
        '//table[@class="history-menu-table"]//a/text()').extract()[1:-2]

    image_srcs = response.xpath(
        '//img[@id="ModelsDisplayStyle1_ImgModel"]/@src').extract()
    if image_srcs:
        image_url = response.urljoin(image_srcs[0])
    else:
        image_url = ''

    brand_titles = response.xpath(
        '//a[@class="brand-image-link"]/@title').extract()
    if brand_titles:
        product_brand = brand_titles[0].strip()
    else:
        product_brand = ''

    postage = response.xpath(
        '//span[@id="ModelsDisplayStyle1_LblPostageCostValue"]/text()'
    ).extract()
    if postage:
        shipping_cost = extract_price(postage[0])
    else:
        shipping_cost = 0

    title_parts = response.xpath(
        '//h1/span[not(@class="models-page-title-price")]/text()').extract()
    name = ' '.join(title_parts)

    rows = response.xpath('//tr[contains(@class, "item-row")]')
    for row in rows:
        # The option label is either a link or a plain span.
        label = (
            row.xpath('.//a[contains(@class, "option-text")]/text()').extract()
            or row.xpath(
                './/span[contains(@class, "option-text")]/text()').extract())
        if label:
            option_name = name + ' ' + label[0].strip()
        else:
            option_name = name

        price = row.xpath(
            './/span[contains(@class, "price-label")]/text()').extract()[0]
        sku = row.xpath(
            './/td[contains(@class, "item-part-code")]/text()'
        ).extract()[0].strip()

        # The stock id sits in the basket link or the stock-alert handler.
        stock_ids = (
            row.xpath('.//a[@class="add-to-basket-button"]/@href').re(
                'StockID=(\d+)')
            or row.xpath(
                './/a[@class="request-stock-alert-link"]/@onclick').re(
                    'StockID=(\d+)'))
        identifier = stock_ids[0]

        loader = ProductLoader(item=Product(), selector=row)
        # NOTE(review): the stock id value is passed to add_xpath as if it
        # were an XPath expression; kept as-is, confirm this is intended.
        loader.add_xpath('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('name', option_name)
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', product_brand)

        stock_cell = row.xpath('.//td[contains(@class, "item-in-stock")]')
        if stock_cell:
            digits = stock_cell.re('\d+')
            if digits:
                loader.add_value('stock', int(digits[0]))
        else:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', shipping_cost)
        yield loader.load_item()

    if rows:
        return

    hidden_inputs = response.xpath(
        '//input[contains(@id, "HidStockOptionDetails")]')
    if hidden_inputs:
        for hidden in hidden_inputs:
            option_data = json.loads(hidden.xpath('@value').extract()[0])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name + ' ' + option_data['option'])
            loader.add_value('price',
                             extract_price(str(option_data['price'])))
            loader.add_value('identifier', option_data['stockID'])
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('sku', option_data['partcode'])
            loader.add_value('brand', product_brand)

            digits = re.findall('\d+', option_data['stockLevelText'])
            if digits:
                loader.add_value('stock', int(digits[0]))
            else:
                self.log('POSSIBLE OUT OF STOCK : ' + response.url)

            if loader.get_output_value('price') < 50:
                loader.add_value('shipping_cost', shipping_cost)
            yield loader.load_item()
        return

    # Neither layout matched: fall back to a single item, keyed by URL.
    self.log(' >>> NO OPTIONS FOUND: ' + response.url)
    price_bits = hxs.select(".//span[@class='bigprice']/text()").re(
        r'([0-9\,\. ]+)')
    price = "".join(price_bits).strip()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('identifier', response.url)
    loader.add_value('image_url', image_url)
    loader.add_value('category', category)
    loader.add_xpath('sku', './td[position()=2]/text()')
    loader.add_value('brand', product_brand)
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per configurable option.

    Redirected responses are skipped. When the page embeds a
    ``Product.Config`` JSON, one item per child product is emitted, all
    priced at the configuration's ``basePrice``; otherwise a single item is
    built from the page price, falling back to the "old price" element.
    Shipping is 2.99 below 25 and 0 otherwise.

    Changes from the previous version: the bare ``except:`` around the
    identifier lookup now catches only ``IndexError`` (the missing-input
    case it exists for), and the duplicated loader code is shared.

    :param response: the product page response; ``meta`` may carry
        ``category`` and ``redirect_urls``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    redirected_urls = response.meta.get('redirect_urls', None)
    if redirected_urls:
        log.msg('Skips product, redirected url: ' + str(redirected_urls[0]))
        return

    image_url = hxs.select('//a[@id="cloud_zoom"]/img/@src').extract()
    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except IndexError:
        # No hidden "product" input: take the id from the add-to-cart URL.
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]
    product_name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0].strip()
    category = response.meta.get('category')
    brand = hxs.select(
        '//div[contains(@class, "product-shop")]/a/img/@title').extract()
    brand = brand[0].strip() if brand else ''
    out_of_stock = hxs.select(
        '//p[@class="availability out-of-stock"]').extract()

    def build_product(identifier, name, price, shipping_cost):
        # Identifier doubles as SKU in both layouts.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('shipping_cost', shipping_cost)
        if out_of_stock:
            product_loader.add_value('stock', 0)
        return product_loader.load_item()

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        products = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    # Names accumulate as " - label1 - label2 ...".
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))

        for identifier, option_name in products.iteritems():
            price = float(product_data['basePrice'])
            # The threshold is checked on the unrounded price.
            shipping_cost = 2.99 if price < 25 else 0
            yield build_product(product_identifier + '_' + identifier,
                                product_name + option_name,
                                round(price, 2), shipping_cost)
    else:
        price = hxs.select('//*[@id="product-price-{}"]//text()'.format(
            product_identifier)).extract()
        price = ''.join(price).strip()
        if price == '':
            price = hxs.select('//*[@id="old-price-{}"]//text()'.format(
                product_identifier)).extract()
            price = ''.join(price).strip()
        price = extract_price(price)
        shipping_cost = 2.99 if price < 25 else 0
        yield build_product(product_identifier, product_name, price,
                            shipping_cost)
def parse_product(self, response):
    """Yield products from the embedded ``spConfig`` JSON or from the page.

    Three regex variants are tried to locate the ``Product.Config`` payload.
    When one is found, an item is built from the first ``productsData`` row
    of every attribute option; otherwise a single item is scraped from the
    page markup. Items whose identifier is already in ``self.identifiers``
    are not yielded again.

    Change from the previous version: a bare ``except:`` was used to tell a
    regex match object from a plain string and therefore also hid JSON
    errors. The payload text is now extracted explicitly, so a malformed
    payload raises the real ``ValueError``.

    :param response: the product page response; ``meta`` may carry
        ``category``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = hxs.select('//meta[@property="og:brand"]/@content').extract()
    brand = brand[0] if brand else ''
    category = response.meta.get('category', '')

    # None means "no configuration found"; an empty findall hit counts as
    # not found, as it did before.
    config_text = None
    match = re.search('var spConfig=new Product.Config\((.*)\).*var original_product_name;', response.body, flags=re.DOTALL)
    if not match:
        match = re.search('var spConfig = new Product.Config\((.*)\).*var original_product_name;', response.body, flags=re.DOTALL)
    if match:
        config_text = match.group(1)
    else:
        found = re.findall(re.compile('var spConfig = new Product.Config\((.*)\).*'), response.body)
        if found and found[0]:
            config_text = found[0]

    if config_text is not None:
        products = json.loads(config_text)
        for attr_id, attribute in products[u'attributes'].items():
            for option in attribute['options']:
                # Positional row; field meaning follows the add_value calls.
                option = option['productsData'][0]
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('identifier', option[0])
                loader.add_value('url', response.url)
                loader.add_value('image_url', option[3][0] if option[3] else '')
                loader.add_value('brand', brand)
                loader.add_value('category', category)
                loader.add_value('stock', option[4])
                loader.add_value('name', option[5])
                loader.add_value('shipping_cost', extract_price(option[6]))
                loader.add_value('price', option[2])
                loader.add_value('sku', option[1])
                item = loader.load_item()
                if item['identifier'] not in self.identifiers:
                    self.identifiers.append(item['identifier'])
                    yield item
        return

    stock = hxs.select('//span[@class="stock_value"]/span/text()').re(r'(\d+)')

    # Price: nested price span, then flat price span, then the old price.
    price = hxs.select('//span[contains(@id, "product-price-")]/span[@class="price"]/text()').extract()
    if not price:
        price = hxs.select('//span[contains(@id, "product-price-") and @class="price"]/text()').extract()
    if not price:
        price = hxs.select('//span[contains(@class, "old-price")]/span[@class="price"]/text()').extract()

    shipping_cost = hxs.select('//span[contains(@id, "product-price-")]/span[contains(@class, "price-delivery")]/text()').extract()
    if not shipping_cost:
        shipping_cost = hxs.select('//span[contains(@id, "product-price-") and contains(@class, "price")]'
                                   '/following-sibling::span[contains(@class, "price-delivery")]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//div[@id="product-images"]//img[@class="img-responsive"]/@src')
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    loader.add_value('stock', stock[0] if stock else 0)
    loader.add_xpath('name', '//div[contains(@class, "product-name")]/*[self::h1 or self::h2]/text()')
    loader.add_value('shipping_cost', shipping_cost)
    loader.add_value('price', price)
    loader.add_xpath('sku', '//div[contains(@class, "product-name")]//span[@class="sku_value"]/text()')
    item = loader.load_item()

    if 'identifier' not in item:
        self.log("Warning: no identifier found, skiping product")
        return
    if item['identifier'] not in self.identifiers:
        self.identifiers.append(item['identifier'])
        yield item
def parse_product(self, response):
    """Yield one item per size variation, plus printed and badged variants.

    The product data is read from JSON blobs embedded in the page's
    ``<script>`` text. For every entry of ``data['Variations']`` a base item
    is yielded; when custom personalisation applies, a personalised copy is
    yielded as well; and when a printing of type 3 exists, a badge copy of
    each of those items is yielded too.

    Pages without a base product blob are delegated to ``self.parse``.
    """
    if 'aspxerrorpath' in response.url:
        # Re-request the first URL of the redirect chain.
        # NOTE(review): there is no ``return`` after this yield, so the error
        # page itself is still processed below. Confirm this is intended.
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)
    aud_url = response.xpath(
        '//a[contains(@href, "?cur=AUD")]/@href').extract_first()
    if aud_url:
        # A "?cur=AUD" link is present: follow it and parse that page instead.
        yield Request(response.urljoin(aud_url),
                      self.parse_product,
                      dont_filter=True)
        return
    base_product = True
    add_custom_personalization = False
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('category', 'Kits')
    # Blobs named "product" + 7 digits; called "hero" data by the code below.
    heros_data = response.xpath('//script/text()').re(
        'product\d{7} =(.+?});var')
    # Blobs named "product" + 6 word characters: the base product.
    base_product_data = response.xpath('//script/text()').re(
        'product\w{6} =(.+?});var')
    if not base_product_data:
        for p in self.parse(response):
            yield p
        return
    # Choose which blob describes the product shown on this page.
    if not heros_data:
        data = json.loads(base_product_data[0])
    elif len(heros_data) == 1:
        data = json.loads(heros_data[0])
        base_product = False
    else:
        # Several hero blobs: index them by ProductID and pick the one that
        # is selected in the "heroShirts" dropdown, if any.
        data = [json.loads(x) for x in heros_data]
        data = {x['ProductID']: x for x in data}
        heros = response.css('select.heroShirts')
        hero = heros.xpath('option[@selected]')
        if not hero:
            data = json.loads(base_product_data[0])
        else:
            data = data[int(hero.xpath('@value').extract_first())]
            base_product = False
    base_product_data = json.loads(base_product_data[0])
    # Check for custom personalisation: printings are indexed by
    # PrintingTypeID; type 1 is used for custom printing and type 3 for
    # badges below.
    printings = {
        p['PrintingTypeID']: p
        for p in base_product_data['printingitems']
    }
    custom_printings = printings.get(1)
    if custom_printings and base_product:
        add_custom_personalization = True
    loader.add_value('name', data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if data['Brand']:
        loader.add_value('brand', data['Brand']['Name'])
    loader.add_value('image_url', response.urljoin(data['ImageURL']))
    product = loader.load_item()
    # Try to pull "<player name> <number>" out of a "... with NAME 10"
    # description.
    player_from_name = re.search('(?!Sponsor).*with *([\w\ \.\-]+?) (\d+)',
                                 data.get('Description', ''), re.UNICODE)
    if player_from_name:
        player_name, number = player_from_name.groups()
    # Sizes: one item per variation.
    for variation in data['Variations']:
        size = variation['Description']
        loader = ProductLoader(item=Product(), response=response)
        # Start from the shared product fields, then override per size.
        loader.add_value(None, product)
        loader.replace_value('identifier', variation['VariationId'])
        loader.add_value('name', size)
        loader.replace_value('price', variation['PriceActual'])
        # Shipping is charged only while the price is below the threshold.
        if self.free_delivery_over is not None and self.free_delivery_over > loader.get_output_value(
                'price'):
            loader.replace_value('shipping_cost', self.shipping_cost)
        loader.replace_value('stock', int(variation['IsInStock']))
        item = loader.load_item()
        if player_from_name:
            item['metadata'] = {
                'player': player_name,
                'number': number,
                'size': size
            }
        else:
            item['metadata'] = {'size': size}
        yield item
        # Items of this size that may additionally receive a badge.
        base_size_items = [item]
        # Custom printings: a personalised copy with a fixed sample
        # name and number.
        if add_custom_personalization:
            team_player_name = 'WILLIAMS'
            team_player_number = '10'
            team_player_id = 'WILLIAMS'
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', team_player_name)
            loader.add_value('name', team_player_number)
            price = Decimal(item['price']) + Decimal(
                str(custom_printings['PriceActual']))
            loader.replace_value('price', price)
            if self.free_delivery_over is not None and price >= self.free_delivery_over:
                loader.replace_value('shipping_cost', 0)
            identifier = '-'.join(
                (item['identifier'], str(custom_printings['PrintingID']),
                 team_player_id))
            loader.replace_value('identifier', identifier)
            custom_item = loader.load_item()
            custom_item['metadata'] = {
                'player': team_player_name,
                'number': team_player_number,
                'size': size
            }
            yield custom_item
            base_size_items.append(custom_item)
        # Badges: a badge copy of every item yielded for this size.
        printing = printings.get(3)
        if not printing:
            continue
        for base_item in base_size_items:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, base_item)
            loader.add_value('name', printing['PrintingDescription'])
            price = Decimal(base_item['price']) + Decimal(
                str(printing['PriceActual']))
            loader.replace_value('price', price)
            if self.free_delivery_over is not None and price >= self.free_delivery_over:
                loader.replace_value('shipping_cost', 0)
            identifier = base_item['identifier'] + '-' + str(
                printing['PrintingID'])
            loader.replace_value('identifier', identifier)
            badge_item = loader.load_item()
            badge_item['metadata'] = base_item['metadata'].copy()
            yield badge_item
def parse_product(self, response):
    """Parse a product page, falling back to BeautifulSoup, then to a retry.

    The page is first parsed with XPath. If that produces no name, the same
    fields are extracted again with BeautifulSoup. The item is yielded when
    it has an identifier; if it still has no name, a retry request obtained
    from ``self.retry`` is yielded as well.

    Changes from the previous version:
    * ``item['identifier']`` raised ``KeyError`` when the field was absent
      from the loaded item; it is now read with ``item.get``;
    * in the BeautifulSoup branch a missing name heading, breadcrumb list or
      price element raised ``AttributeError`` before the retry could be
      issued; those lookups are now guarded so the retry path is reached.

    :param response: the product page response.
    """
    hxs = HtmlXPathSelector(text=response.body_as_unicode())
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = ''.join(
        hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                   ).extract()).strip()
    loader.add_value('price', price)
    categories = hxs.select(
        '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
    ).extract()[1:]
    loader.add_value('category', categories)
    image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))
    brand = hxs.select(
        '//li[contains(text(), "BRAND")]/span/text()').extract()
    loader.add_value('brand', brand)
    item = loader.load_item()

    if not item.get('name'):
        log.msg('Using BeautifulSoup: ' + response.url)
        loader = ProductLoader(response=response, item=Product())
        soup = BeautifulSoup(response.body)
        loader.add_value('url', response.url)

        identifier = soup.find('input', attrs={'id': 'catentryId'})
        identifier = _soup_el_get_attr(identifier, 'value')
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)

        name_el = soup.find('h1', attrs={'itemprop': 'name'})
        if name_el is not None:
            loader.add_value('name', name_el.text)

        breadcrumbs = soup.find('ul', attrs={'class': 'breadcrumbs'})
        if breadcrumbs is not None:
            # The first two linked crumbs are dropped.
            categories = [
                li.a.span.text for li in breadcrumbs.findAll('li') if li.a
            ][2:]
            loader.add_value('category', categories)

        price_box = soup.find('div', attrs={'itemprop': 'price'})
        price_el = None
        if price_box is not None:
            price_el = price_box.find('span', attrs={'class': 'price'})
        if price_el is not None:
            loader.add_value('price', price_el.text)

        image_url = soup.find('img', attrs={'id': 'productMainImage'})
        if image_url:
            image_url = _soup_el_get_attr(image_url, 'src')
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), image_url))

        brand = ''
        for li in soup.findAll('li'):
            if 'BRAND' in li.text.upper() and li.span is not None:
                brand = li.span.text
                break
        loader.add_value('brand', brand)
        item = loader.load_item()

    if item.get('identifier'):
        yield item

    if not item.get('name'):
        request = self.retry(response,
                             "No name for product: " + response.url)
        if request:
            yield request
def parse_product(self, response):
    """Yield the product, or one item per entry of the variant selector.

    If the page has no product id, the same URL is re-requested with a fresh
    cookie jar, up to ``self.max_retry_times`` times, and nothing else is
    yielded. Shipping is 0 from a price of 50 and 3.99 below it. Variant
    items take their own price, divided by 1.2 and quantized to two decimal
    places.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name = hxs.select('//h1[@class="product-name"]/text()').extract()

    product_ids = hxs.select(
        '//input[contains(@name, "ProductID")]/@value').extract()
    if not product_ids:
        # NOTE(review): returns even when retries are exhausted; the original
        # nesting of this return was lost in formatting, so confirm.
        retries = int(response.meta.get('retries', 0))
        if retries < self.max_retry_times:
            retry_meta = {
                'retries': retries + 1,
                'dont_merge_cookies': True,
                'cookiejar': int(time.time()),
                'dont_redirect': True
            }
            yield Request(response.url, meta=retry_meta, dont_filter=True,
                          callback=self.parse_product)
        return
    identifier = product_ids[0]

    sku_matches = hxs.select(
        '//div[contains(@class, "list-item-sku-wrap")]/text()').re(
            'SKU: (.*)')
    if sku_matches:
        sku = sku_matches[0].strip()
    else:
        sku = ''

    # Prefer the sale price, fall back to the regular one (both inc. VAT).
    price_texts = (
        hxs.select(
            '//div[@class="price-wrap"]/div[contains(@class, "sale") and contains(@class, "inc-vat")]/span[@itemprop="price"]/text()'
        ).extract()
        or hxs.select(
            '//div[@class="price-wrap"]/div[contains(@class, "regular") and contains(@class, "inc-vat")]/text()'
        ).extract())
    price = extract_price(price_texts[0])

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', price)

    stock_hint = ''.join(
        hxs.select('//span[contains(@class, "stock-hint")]/text()').extract())
    if 'IN STOCK' not in stock_hint.strip().upper():
        loader.add_value('stock', 0)

    loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    loader.add_value(
        'category',
        hxs.select('//span[@class="SectionTitleText"]/a/text()').extract())

    brand_matches = hxs.select(
        '//ul/li[contains(text(), "Brand:")]/text()').re('Brand: (.*)')
    if brand_matches:
        brand = brand_matches[0].strip()
    else:
        brand = ''
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)

    image_srcs = hxs.select(
        '//img[contains(@class, "product-image")]/@src').extract()
    if image_srcs:
        loader.add_value('image_url', urljoin_rfc(base_url, image_srcs[0]))

    if loader.get_output_value('price') >= 50:
        loader.add_value('shipping_cost', 0)
    else:
        loader.add_value('shipping_cost', 3.99)
    item = loader.load_item()

    options = hxs.select('//select[@id="variantSelector"]/option')
    if not options:
        yield item
        return

    for option in options:
        option_item = deepcopy(item)
        option_id = option.select('@value').extract()[0]
        label = option.select('text()').extract()
        if label and option_item['name'].upper() in label[0].upper():
            option_name = label[0]
        else:
            # The label does not contain the product name: use the name plus
            # the promo text instead.
            promo = ''.join(
                hxs.select('//div[@class="misc-text-promo"]/text()').extract())
            option_name = option_item['name'] + ' ' + promo.strip()

        variant_prices = (
            hxs.select(
                '//div[@id="variant-info-' + option_id +
                '"]/div[@class="price-wrap"]/div[contains(@class, "sale") and contains(@class, "inc-vat")]/span[@itemprop="price"]/text()'
            ).extract()
            or hxs.select(
                '//div[@id="variant-info-' + option_id +
                '"]/div[@class="price-wrap"]/div[contains(@class, "regular") and contains(@class, "inc-vat")]/text()'
            ).extract())
        variant_price = extract_price(variant_prices[0])
        variant_price = (variant_price / Decimal('1.2')).quantize(
            Decimal('1.00'))

        option_item['price'] = variant_price
        option_item['name'] = option_name.strip()
        option_item['identifier'] = option_item['identifier'] + '-' + option_id
        yield option_item
def _start_requests(self):
    """Yield a single request for one fixed product page.

    The request carries an empty ``Product`` in its meta and is handled by
    ``self.parse_product``.
    """
    product_url = 'http://www.comtech.de/Computer-und-Zubehoer/Eingabegeraete/Maeuse/Logitech-Performance-Maus-MX'
    request_meta = {'product': Product()}
    yield Request(product_url, meta=request_meta,
                  callback=self.parse_product)
def parse_product(self, response):
    """Yield the product, or one item per bundle combination / config option.

    A base item is built from the page (special price preferred over regular
    price) and tagged with promotion metadata. Then:

    * if the page embeds a ``Product.Bundle`` JSON and exposes selectable
      bundle options, one item per combination of the available selections
      is yielded, with the selection names, ids and tax-inclusive prices
      added on top of the base item;
    * otherwise, if it embeds a ``Product.Config`` JSON, one item per child
      product is yielded, priced from ``childProducts[...]['finalPrice']``;
    * otherwise the base item is yielded.

    Changes from the previous version:
    * the ``prices`` accumulation in the config branch was never read and
      could fail on options without a price; it has been removed;
    * when none of the bundle selections is available the base item is
      yielded, where a ``KeyError`` on an empty combination was raised;
    * the loop variable no longer shadows the ``id`` builtin.

    :param response: the product page response.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response=response)
    name = hxs.select('//h1/span[@itemprop="name"]/text()').extract()[0]

    # Join the text nodes and strip every whitespace character.
    price = ''.join(''.join(
        hxs.select(
            '//form//p[@class="special-price"]//span[@class="price"]/text()'
        ).extract()).split())
    if not price:
        price = ''.join(''.join(
            hxs.select(
                '//span[@class="regular-price"]//span[@class="price"]/text()'
            ).extract()).split())
    price = extract_price(price)

    brand = ''
    categories = hxs.select(
        '//div[@itemprop="breadcrumb"]/a/text()').extract()[1:]

    l = ProductLoader(item=Product(), response=response)
    image_url = hxs.select(
        '//ul[@id="product-img-main"]//img/@src').extract()
    image_url = image_url[0] if image_url else ''
    l.add_value('image_url', image_url)
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', price)
    l.add_value('brand', brand)
    l.add_value('category', categories)
    sku = hxs.select('//span[@itemprop="sku"]/text()').extract()
    sku = sku[0] if sku else ''
    l.add_value('sku', sku)
    identifier = hxs.select('//input[@name="product"]/@value').extract()
    l.add_value('identifier', identifier[0])
    item = l.load_item()

    promotions = hxs.select(
        '//div[@class="bb-price-group" and //span[contains(text(), "Was")]]//span/text()'
    ).extract()
    metadata = MetaData()
    metadata['Promotions'] = ' '.join(promotions) if promotions else ''
    item['metadata'] = metadata

    # Selection ids actually offered on the page, from <select> or <input>.
    available_options = hxs.select(
        '//select[contains(@name, "bundle_option")]/option[not(@value="")]/@value'
    ).extract()
    if not available_options:
        available_options = hxs.select(
            '//input[contains(@id, "bundle-option") and not(@value="0" or @value="1")]/@value'
        ).extract()

    options_bundle = re.search(r'new Product.Bundle\((.*)\)', response.body)
    if options_bundle and available_options:
        log.msg('OPTION BUNDLE: ' + response.url)
        product_data = json.loads(options_bundle.groups()[0])

        option_groups = []
        for group in product_data['options'].itervalues():
            element_options = []
            for option_id, option in group['selections'].iteritems():
                if option_id not in available_options:
                    continue
                option_name = hxs.select('//option[@value="' + option_id +
                                         '"]/text()').extract()
                if not option_name:
                    option_name = hxs.select(
                        '//li[input[@value="' + option_id +
                        '"]]//label/text()').extract()
                # Keep only the text before the first non-breaking space.
                option_name = option_name[0].split(u'\xa0')[0].strip()
                element_options.append(
                    (option_id, option_name, option['priceInclTax']))
            # Groups with nothing available are left out of the product.
            if element_options:
                option_groups.append(element_options)

        if not option_groups:
            yield item
            return

        for combination in itertools.product(*option_groups):
            extra_identifier = ''
            extra_name = ''
            extra_price = 0
            for option_id, option_name, option_price in combination:
                extra_identifier += '-' + option_id
                extra_name += ' ' + option_name
                extra_price += option_price
            option_item = deepcopy(item)
            option_item['identifier'] += extra_identifier
            option_item['name'] += extra_name
            option_item['price'] += extract_price(str(extra_price))
            yield option_item
        return

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not options_config:
        yield item
        return

    product_data = json.loads(options_config.groups()[0])
    products = {}
    for attr in product_data['attributes'].itervalues():
        for option in attr['options']:
            for product in option['products']:
                # Names accumulate as " - label1 - label2 ...".
                products[product] = ' - '.join(
                    (products.get(product, ''), option['label']))

    for option_identifier, option_name in products.iteritems():
        option_item = deepcopy(item)
        option_item['identifier'] += '-' + option_identifier
        option_item['name'] += option_name
        option_item['price'] = extract_price(
            product_data['childProducts'][option_identifier]['finalPrice'])
        yield option_item
def parse_products(self, response):
    """Yield tyre products parsed from a JSON search response.

    The response body is a JSON object whose ``d`` member is itself a
    JSON-encoded list of product dicts.  Products flagged ``IsWinter`` and
    products rejected by ``is_product_correct`` are skipped.

    :param response: response whose body carries the JSON payload.
    :returns: generator of loaded product items with ``metadata`` attached.
    """
    json_data = json.loads(response.body)
    products = json.loads(json_data.get('d'))

    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        # A missing or null manufacturer entry simply yields an empty brand.
        try:
            brand = product_el[u'ProductManufacturer'][u'TyreManufacturerName']
        except (KeyError, TypeError):
            brand = ''

        attributes = product_el[u'ProductAttributes']

        # skip winter tyres
        if attributes[u'IsWinter']:
            continue

        # Prefer the spelling used in self.brands when it matches
        # case-insensitively.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        try:
            full_name = product_el[u'ProductTreadPattern'][u'TreadName']
        except (KeyError, TypeError):
            full_name = ''

        # Fix name changes
        if full_name in self.new_old_names:
            full_name = self.new_old_names[full_name]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        identifier = product_el.get('TyreID')
        loader.add_value('url', 'http://www.tyresonthedrive.com')

        # The tread pattern is already treated as optional for the name, so
        # treat it as optional here too instead of aborting the whole page.
        try:
            image_name = product_el[u'ProductTreadPattern'][u'TreadPatternImage']
        except (KeyError, TypeError):
            image_name = None
        if image_name is not None:
            loader.add_value(
                'image_url',
                'http://www.tyresonthedrive.com/img/treads/' + image_name + '.jpg')

        loader.add_value('identifier', identifier)

        price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = str(attributes[u'Profile'])
        metadata['rim'] = str(attributes[u'Rim'])
        metadata['speed_rating'] = str(attributes[u'Speed'])
        metadata['load_rating'] = str(attributes[u'Load'])
        metadata['width'] = str(attributes[u'Section'])
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
        metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'
        man_mark = attributes[u'OEMFitment']
        metadata['manufacturer_mark'] = find_man_mark(man_mark) if man_mark else ''
        metadata['full_tyre_size'] = '/'.join((
            metadata['width'],
            metadata['aspect_ratio'],
            metadata['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)

        # Keep the scraped rating as the alternative one when the helper
        # replaced it and offered no explicit alternative.
        if new_alt_speed:
            alternative = new_alt_speed
        elif product['metadata']['speed_rating'] != new_speed_rating:
            alternative = product['metadata']['speed_rating']
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse_products(self, response):
    """Parse a tyre listing page and yield one product per listed tyre.

    The tyre size being searched (width, aspect ratio, rim and the two speed
    ratings) is read from ``response.meta['product_data']``.  It is used both
    to split each listed title into brand / load rating / pattern name and to
    fill the product metadata.  Entries without a title link, entries whose
    title cannot be parsed and entries rejected by ``is_product_correct`` are
    skipped.  A request for the next listing page is yielded when a "next"
    link is present.

    :param response: listing page response carrying ``product_data`` in meta.
    :returns: generator of product items and, possibly, one follow-up Request.
    """
    hxs = HtmlXPathSelector(response)
    product_data = response.meta['product_data']

    width = product_data['Width']
    aspect_ratio = product_data['Aspect Ratio']
    rim = product_data['Rim']
    speed_rating = product_data['Speed rating']
    alt_speed = product_data['Alt Speed']

    # Title patterns, tried in order: with the main speed rating, with the
    # alternative speed rating, and finally without any rating at all.
    name_reg = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, speed_rating.upper())
    name_reg2 = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, alt_speed.upper())
    name_reg3 = r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim)

    def log_parse_failure(title):
        # Record the title that could not be split, with the search context.
        self.log('Failed parsing ' + title)
        self.log('URL: ' + response.url)
        self.log('Params: ' + ", ".join(map(str, [width, rim, speed_rating.upper()])))

    products = hxs.select('//div[@id="product-listing"]//div[@class="product"]/..')

    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        # Entries without a title link are not products; skip them.
        try:
            url = product_el.select('.//div[@class="title"]/a/@href')[0].extract()
        except IndexError:
            continue
        loader.add_value('url', url)
        loader.add_value(
            'identifier',
            product_el.select(".//span[@class='addcompare']/input/@id").extract()[0].split(":")[1])
        loader.add_xpath('price', './/span[@class="prodPirce"]/text()')

        try:
            name = product_el.select('.//div[@class="title"]/a/text()')[0].extract()
        except IndexError:
            continue

        if not re.search(r'(\(.*\))', name):
            name_match = (re.search(name_reg, name)
                          or re.search(name_reg2, name)
                          or re.search(name_reg3, name))
            if not name_match:
                log_parse_failure(name)
                continue
            # NOTE(review): when only name_reg3 matches there are two groups,
            # so name_parts[1] (used as load rating below) is the same string
            # as name_parts[-1] - confirm this is intended.
            name_parts = name_match.groups()
        else:
            # Titles containing a parenthesised part: the first word is the
            # brand and the load rating is the number preceding the speed
            # rating.
            load_rating_match = (re.search(r'(\d+)%s' % speed_rating.upper(), name)
                                 or re.search(r'(\d+)%s' % alt_speed.upper(), name))
            if not load_rating_match:
                log_parse_failure(name)
                continue
            words = name.split()
            name_parts = [
                words[0],
                load_rating_match.groups()[0],
                ' '.join(words[1:]).split('(')[0],
            ]

        loader.add_value(
            'name',
            name_parts[-1].replace('XL', '').replace('ROF', '').replace('RFT', ''))
        brand = name_parts[0]
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_xpath('image_url', './/a[contains(@class, "tyre")]/img/@src')

        upper_name = name.upper()
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['width'] = width
        metadata['speed_rating'] = speed_rating.upper()
        metadata['load_rating'] = name_parts[1]
        if 'ROF' in upper_name or 'RFT' in upper_name:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'
        metadata['xl'] = 'Yes' if 'XL' in upper_name else 'No'
        metadata['full_tyre_size'] = '/'.join((
            metadata['width'],
            metadata['aspect_ratio'],
            metadata['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))
        metadata['fitting_method'] = 'Fitted'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(name_parts[-1])

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            self.log('The product is not correct: %r' % product)
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)

        # Keep the scraped rating as the alternative one when the helper
        # replaced it and offered no explicit alternative.
        if new_alt_speed:
            alternative = new_alt_speed
        elif product['metadata']['speed_rating'] != new_speed_rating:
            alternative = product['metadata']['speed_rating']
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product

    next_page = hxs.select('//span[@class="nextlink"]/a/@href')
    if next_page:
        yield Request(next_page.extract()[0], callback=self.parse_products, meta=response.meta)
def parse_product(self, response):
    """Parse a product page and yield one item per purchasable combination.

    Pricing inputs (currency rate, tax rate, reductions, base price) are read
    from the ``var ...`` JavaScript assignments embedded in the page body.
    When the page contains ``addCombination(...)`` calls, one item is yielded
    per call, named after the product plus the labels of the selected
    options; otherwise a single item is yielded from the page's own price and
    reference fields.  Nothing is yielded when the page has no
    ``primary_block`` container.

    :param response: product page response.
    :returns: generator of loaded product items.
    """
    hxs = HtmlXPathSelector(response)
    if not hxs.select('//div[@id="primary_block"]'):
        return

    body = response.body

    def js_number(var_name, default):
        # Value of the first number following ``var <var_name>`` in the page
        # body as a Decimal, or ``default`` when the variable is absent.
        found = re.search(r'var %s\D+([\d\.]+)' % var_name, body)
        return Decimal(found.group(1)) if found else default

    product_id = hxs.select('//input[@name="id_product"]/@value').extract()[0]
    name = hxs.select('//div[@id="dfCenter"]//h1/text()').extract()[0]
    category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()[1:]
    image_url = hxs.select('//img[@id="bigpic"]/@src').extract()
    if image_url:
        image_url = image_url[0]
    product_url = response.url

    # The brand is taken from the first paragraph of the short description,
    # after normalising the " di " / " by " connectors to " da ".
    product_brand = hxs.select(
        '//div[@id="short_description_content"]//p[1]//text()').extract()[0]
    product_brand = product_brand.replace(' di ', ' da ').replace(' by ', ' da ')
    if len(product_brand) > 20:
        brand_match = re.search(' da.+?[,.]', product_brand)
        if brand_match:
            product_brand = brand_match.group(0)
    product_brand = product_brand.split(' da ')[-1].strip().strip('.,')
    if len(product_brand) > 20:
        # Still too long: keep only the longest run shared with the page
        # title (compared in title case).
        title = hxs.select('//title/text()').extract()[0]
        matcher = SequenceMatcher(a=product_brand.title(), b=title.title())
        longest = matcher.find_longest_match(0, len(matcher.a), 1, len(matcher.b))
        product_brand = matcher.a[longest[0]:longest[0] + longest[-1]].strip()
        # NOTE(review): this rejection is assumed to apply only to the
        # title-matched candidate; confirm against the original layout.
        if len(product_brand) < 7 or ' ' not in product_brand:
            product_brand = None

    currency_rate = js_number('currencyRate', 1)
    tax_rate = js_number('taxRate', 0)
    reduction_percent = js_number('reduction_percent', 0)
    reduction_price = js_number('reduction_price', 0)
    price_tax_excluded = js_number('productPriceTaxExcluded', 0)

    default_image_id = re.search(r'var idDefaultImage = (\d+)', body)
    if default_image_id:
        default_image_id = default_image_id.group(1)

    if re.search('addCombination.*?;', body):
        # here we parse option tags for more product options.
        option_values = hxs.select(
            '//div[@id="attributes"]//select/option/@value').extract()
        option_labels = hxs.select(
            '//div[@id="attributes"]//select/option//text()').extract()
        # build the lookup table (option value -> option label).
        options = dict(zip(option_values, option_labels))

        # addCombination(5631, new Array('259'), 11, 109.99, 0, -1, 'GGT3050', 0.00, 1);
        for combination in re.finditer(r'addCombination\((.*?)\);', body):
            args = combination.group(0).split(',')
            offset = Decimal(args[-6])

            # Option keys sit between the combination id (first argument)
            # and the seven trailing fixed arguments.
            combination_labels = []
            for index in range(1, len(args) - 7):
                option_key = re.sub(r'[^\d]+', '', args[index])
                # Unknown keys are skipped rather than reusing the previous
                # label (or failing when there is no previous label).
                option_label = options.get(option_key, '')
                if option_label:
                    combination_labels.append(option_label.strip())

            price = price_tax_excluded + offset * currency_rate
            price = price * ((tax_rate / Decimal('100')) + 1)
            if reduction_price or reduction_percent:
                reduction = price * (reduction_percent / Decimal('100')) + reduction_price
                price = price - reduction
            price = round(price, 2)

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', product_url)
            loader.add_value('name', name + ' ' + ' '.join(combination_labels))

            # Swap the default image id inside the URL for the combination's
            # own image id, when both are known and differ.
            image_id = args[-4].strip(" '")
            if (image_url and default_image_id
                    and image_id != "-1" and image_id != default_image_id):
                loader.add_value(
                    'image_url',
                    image_url.replace('-' + default_image_id + '-', '-' + image_id + '-'))
            else:
                loader.add_value('image_url', image_url)

            loader.add_value('brand', product_brand)
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value(
                'identifier',
                '%s-%s' % (product_id, re.search(r'(\d+)', args[0]).group(1)))
            loader.add_value('sku', args[-3].strip("' ").decode('utf8'))
            yield loader.load_item()
    else:
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name)
        loader.add_value('image_url', image_url)
        loader.add_xpath('price', '//*[@id="our_price_display"]/text()',
                         lambda x: extract_price_eu(x[0]) if x else Decimal('0'))
        loader.add_value('category', category)
        loader.add_value('identifier', product_id)
        loader.add_xpath('sku', '//*[@id="product_reference"]/span/text()')
        loader.add_value('brand', product_brand)
        yield loader.load_item()
def _start_requests(self):
    """Yield a single request for one hard-coded product page.

    The request is handled by ``self.parse_product`` and carries an empty
    ``Product`` under the ``product`` meta key.
    """
    product_url = ('http://www.banneke.com/Whisky/Whiskey/International/'
                   'Amrut_Malt_Whisky_aus_Indien_46_0.70')
    yield Request(product_url,
                  callback=self.parse_product,
                  meta={'product': Product()})
def parse_product(self, response):
    """Parse a product page, or fan out when the page is a product listing.

    If the page contains listing links, one request per link is yielded back
    into this callback and nothing else happens.  Otherwise the price is
    looked up; a page without a price is re-requested (up to three tries are
    counted in ``response.meta['tries']``) and finally given up on with a log
    message.  A page with a price yields one loaded product item.
    """
    hxs = HtmlXPathSelector(response)

    listing_links = hxs.select(
        '//div[contains(@class,"product-listing")]//h3/a/@href').extract()
    if not listing_links:
        listing_links = hxs.select(
            '//div[contains(@class,"listing-product")]//h3/a/@href').extract()
    if listing_links:
        for link in listing_links:
            yield Request(urljoin(get_base_url(response), link),
                          callback=self.parse_product)
        return

    price_nodes = hxs.select(
        '//span[@class="now"]/span[@itemprop="price"]/text()').extract()
    if not price_nodes:
        price_nodes = hxs.select(
            '//div[@class="product-price"]//span[@itemprop="price"]/text()').extract()

    if not price_nodes:
        # No price on the page: retry a limited number of times, then stop.
        if response.meta.get('tries', 0) < 3:
            self.log("Try: %s. Retrying page: %s" %
                     (response.meta.get('tries', 0) + 1, response.url))
            retry_meta = {
                'handle_httpstatus_list': [404],
                'tries': response.meta.get('tries', 0) + 1,
            }
            yield Request(response.url, callback=self.parse_product,
                          dont_filter=True, meta=retry_meta)
        else:
            self.log('Gave up trying: %s' % response.url)
            self.log('No price found on page: %s' % response.url)
        return

    price = price_nodes[0]

    manufacturer_xpath = 'substring(//h2[@id="manu"]/@content, 5)'
    mpn_xpath = '//strong[@itemprop="mpn"]/text()'

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('identifier', manufacturer_xpath)
    loader.add_xpath('identifier', mpn_xpath)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    loader.add_value('price', extract_price(price))
    loader.add_xpath('sku', manufacturer_xpath)
    loader.add_xpath('sku', mpn_xpath)
    loader.add_xpath('category',
                     '//div[contains(@class, "breadcrumb")]//a/span/text()')

    image_sources = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_sources:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), image_sources[0]))

    loader.add_xpath(
        'brand', '//div[@itemprop="brand"]/meta[@itemprop="name"]/@content')

    # Prices below 50 carry a shipping charge; everything else ships free.
    shipping_cost = '2.99' if loader.get_output_value('price') < 50 else 0
    loader.add_value('shipping_cost', shipping_cost)

    loader.add_xpath('stock', '//span[@itemprop="quantity"]/text()')
    yield loader.load_item()
def parse(self, response):
    """Yield the product whose SKU matches the one requested in meta.

    ``response.meta`` supplies ``sku``, an alternative SKU under ``notes``
    and the product ``name``.  The page's main product is checked first; if
    it does not match, the main sub-products are searched, or else (when
    there are none) the secondary sub-products.  At most one item is
    yielded; a matching entry without a price is logged and nothing is
    yielded.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    sku = response.meta['sku']
    sec_sku = response.meta['notes']
    name = response.meta['name'].encode('ascii', 'ignore')

    def is_wanted(candidate):
        # True when the candidate equals either of the requested SKUs.
        return candidate == sku or candidate == sec_sku

    def build_item(price_text):
        # Load an item for the requested SKU with the given price text.
        item_loader = ProductLoader(item=Product(), response=response, selector=hxs)
        item_loader.add_value('url', url)
        item_loader.add_value('name', name)
        item_loader.add_value('price', price_text)
        item_loader.add_value('sku', sku)
        return item_loader.load_item()

    def search_variants(variants, sku_xpath, missing_sku_message, price_xpath):
        # Return the item for the first variant with a wanted SKU, or None
        # when no variant matches or the matching one has no price.
        for variant in variants:
            variant_sku = variant.select(sku_xpath).re("#(.+)")
            if variant_sku:
                variant_sku = variant_sku[0]
            else:
                logging.error(missing_sku_message % url)
            if is_wanted(variant_sku):
                # extract secondary product
                variant_price = variant.select(price_xpath).re("\$(.*)")
                if not variant_price:
                    logging.error('ERROR!! NO SEC PRICE!! %s "%s" "%s"' % (sku, name, url))
                    return None
                return build_item(variant_price[0].strip())
        return None

    main_product = hxs.select("//div[@id='Product-MainProduct']")
    main_products = hxs.select(
        "//div[@id='Product-MainProductContainer']//div[@class='Product-SubProduct']")
    secondary_products = hxs.select(
        "//div[@id='Product-SubProductContainer']//div[@class='Product-SubProduct']")

    main_product_sku = main_product.select(
        "div[@id='Product-lblItem']/span[@id='lblItem']/text()").extract()
    if main_product_sku:
        main_product_sku = main_product_sku[0]
    else:
        logging.error("NO MAIN SKU! %s" % url)

    if is_wanted(main_product_sku):
        # extract main product
        price = main_product.select(
            ".//div[@class='Product-Price']/span[@id='lblClubPrice']/b/font/text()").re("\$(.*)")
        if not price:
            logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, url))
            return
        yield build_item(price[0].strip())
    elif main_products:
        found = search_variants(
            main_products,
            "div[@class='Product-SubProductNumber']/font/text()",
            "NO MAIN SKU! %s",
            ".//span[contains(@id, 'lblClubPrice')]/b/font/text()")
        if found is not None:
            yield found
    elif secondary_products:
        found = search_variants(
            secondary_products,
            "div[@class='Product-SubProductNumber']/text()",
            "NO SECONDARY SKU! %s",
            ".//span[contains(@id, 'lblClubPrice2')]/b/font/text()")
        if found is not None:
            yield found
    else:
        logging.error("No products found!")