def parse_product(self, response):
    """Build one Product item from a product detail page.

    The SKU is read from the ``barcode`` span and falls back to the
    ``gtin13`` meta tag. The identifier is whatever follows the last ``'p'``
    in the URL. Shipping is 6.95 when the loaded price is below 25 and 0
    otherwise; ``stock`` is set to 0 only when neither the in-stock heading
    nor the buy-now button is present.

    Yields:
        The loaded Product item.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    loader.add_value('url', response.url)
    loader.add_xpath('brand', '//div[@id="brandlink"]//img/@alt')

    loader.add_xpath('sku', '//span[@class="barcode"]/text()')
    if not loader.get_output_value('sku'):
        loader.add_xpath('sku', '//meta[@itemprop="gtin13"]/@content')

    loader.add_value('identifier', response.url.split('p')[-1])

    # A page without an og:image tag used to raise IndexError here and lose
    # the whole product; the image is optional, so it is simply skipped.
    image_url = hxs.select('//meta[@property="og:image"]/@content').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    price = loader.get_output_value('price')
    if price < Decimal(25):
        loader.add_value('shipping_cost', '6.95')
    else:
        loader.add_value('shipping_cost', '0')

    categories = hxs.select('//ul[@id="breadcrumb"]//a/text()').extract()
    # Drop the "home" crumb and keep at most three category levels.
    categories = [
        x.strip() for x in categories if x.lower().strip() != 'home'
    ][:3]
    loader.add_value('category', categories)

    in_stock_heading = hxs.select('//h4[@class="product_instock"]')
    buy_button = hxs.select('//button[@class="buynow"]')
    if not in_stock_heading and not buy_button:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Yield the product when its barcode equals the searched SKU.

    When the barcode found on the page differs from ``response.meta['sku']``,
    the first URL in ``response.meta['products']`` (if any) is requested with
    this same callback, passing the remaining candidates along.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@class="CB_box_prodview"]//h2/text()')
    loader.add_value('url', response.url)

    price_text = selector.select(
        '//div[@class="viewprod_price"]//text()').extract()
    loader.add_value('price', ''.join(price_text))
    loader.add_xpath('sku',
                     '//div[@class="viewprod_right"]//div/text()',
                     re='Barcode: (.*)')

    page_sku = loader.get_output_value('sku')
    log.msg(page_sku)
    wanted_sku = response.meta['sku']
    log.msg(wanted_sku)

    if page_sku == wanted_sku:
        yield loader.load_item()
        return

    # Not the product we searched for: try the next candidate, if any.
    candidates = response.meta['products']
    if not candidates:
        return
    next_url = urljoin_rfc(get_base_url(response), candidates[0])
    yield Request(next_url,
                  callback=self.parse_product,
                  meta={
                      'sku': wanted_sku,
                      'products': candidates[1:]
                  })
def parse_product(response):
    """Yield one Product for a detail page, or nothing when no price exists.

    The regular price is tried first and the sale price second. The first
    breadcrumb link is skipped when categories are collected.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('identifier', '//input[@name="sku"]/@value')
    loader.add_xpath(
        'name', '//div[attribute::id="cat-product-detail-info"]/h1[1]/text()')
    loader.add_value('brand', 'Lego')
    loader.add_value('url', response.url)

    loader.add_xpath('price', '//*[@id="cat-prod-det-reg-price"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath(
            'price', '//*[@id="cat-product-details-sale-price"]/span/text()')
        if not loader.get_output_value('price'):
            # Neither price node yielded a value: emit nothing.
            return

    images = hxs.select(
        '//div[attribute::id="cat-product-detail-img"]/div[1]/a[1]/img[1]/@src'
    ).extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

    breadcrumb_links = hxs.select(
        '//div[@id="cat-product-detail"]/div[@id="bc"]/div[@class="fl"]/a/text()'
    )
    for category in breadcrumb_links[1:].extract():
        loader.add_value('category', category)

    yield loader.load_item()
def parse_product(self, response):
    """Yield the product only when the page's item number matches the search.

    The name is the page title followed by the bottle size. The price is
    taken from the first of the card / sale / regular price nodes that
    produces a value. Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('sku', re.search('product/(\d+)', response.url).groups())

    title = hxs.select(u'//h1[@class="pagetitle"]/text()').extract()[0].strip()
    # Two page layouts exist for the bottle size; use whichever matches.
    size_nodes = (
        hxs.select(u'//div[child::strong[contains(text(), "Bottle Size") or contains(text(), "Size of Bottle")]]/span/text()')
        or hxs.select(u'//div[contains(text(),"Size of Bottle")]/span/text()')
    )
    title += ' ' + size_nodes.extract()[0].strip()
    loader.add_value('name', title)

    price_xpaths = (
        u'//div[@class="cardPrice"]/text()',
        u'//div[@class="salePrice"]/text()',
        u'//div[@class="regularPrice"]/text()',
        u'//div[@class="regularprice"]/text()',
    )
    for price_xpath in price_xpaths:
        loader.add_xpath('price', price_xpath)
        if loader.get_output_value('price'):
            break

    site_sku = hxs.select(u'//span[@class="itemnumber"]/text()').re(u'- (.*)')[0].strip()
    search_sku = response.meta['sku'].strip()
    if site_sku != search_sku:
        return
    yield loader.load_item()
def parse(self, response):
    """Yield the cheapest search result whose price passes ``valid_price``.

    Each result block becomes a candidate loader; a candidate replaces the
    current best only when it has a price, is strictly cheaper than the
    best so far, and ``valid_price`` accepts it against
    ``response.meta['price']``.
    """
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        )
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        )
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])

        price = loader.get_output_value('price')
        if not price:
            continue
        if cheapest is not None and not (
                cheapest.get_output_value('price') > price):
            continue
        if valid_price(response.meta['price'], price):
            cheapest = loader

    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Collect progressively cheaper search results and follow the first one.

    A result is recorded when it has a price and is strictly cheaper than
    the previously recorded one. If anything was recorded, the first
    recorded result's URL is requested with ``parse_mfrgids`` as callback;
    the rest travel in ``meta['next_prods']``.
    """
    hxs = HtmlXPathSelector(response)
    result_nodes = hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    recorded = []
    for node in result_nodes:
        loader = ProductLoader(item=Product(), selector=node)

        loader.add_xpath('name', './/h3/a/span/text()')
        if not loader.get_output_value('name'):
            loader.add_xpath('name', './/h3/a/text()')
        loader.add_xpath('url', './/h3/a/@href')

        loader.add_xpath('price', './/ul/li/a/span/text()', re='\$(.*)')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')

        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'].lower())

        price = loader.get_output_value('price')
        if not price:
            continue
        if cheapest is None or cheapest.get_output_value('price') > price:
            cheapest = loader
            recorded.append(cheapest)

    if not recorded:
        return

    cur_prod = recorded[0]
    next_prods = recorded[1:]
    request_meta = {'mfrgid': response.meta['mfrgid'],
                    'name': response.meta['name'],
                    'cur_prod': cur_prod,
                    'next_prods': next_prods}
    yield Request(cur_prod.get_output_value('url'),
                  callback=self.parse_mfrgids,
                  meta=request_meta,
                  dont_filter=True)
def parse_product(self, response):
    """Yield the product, or one item per bundle option when options exist.

    The price comes from the first of special price, regular price or the
    ``og:price:amount`` meta tag that is present, defaulting to 0. When
    bundle options are present and there are no product swatches, each
    option yields a copy of the item with the option value appended to the
    identifier and the option label appended to the name.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_xpath('sku', '//span[@itemprop="mpn"]/text()')
    loader.add_xpath('name', '//h1/span[@itemprop="name"]/text()')

    price_sources = (
        '//form//p[@class="special-price"]/span[@class="price"]/text()',
        '//form//span[@class="regular-price"]/span[@class="price"]/text()',
        '//meta[@property="og:price:amount"]/@content',
    )
    found = []
    for source in price_sources:
        found = hxs.select(source).extract()
        if found:
            break
    loader.add_value('price', found[0] if found else 0)

    loader.add_value(
        'category',
        hxs.select(
            '//div[@class="breadcrumbs"]//li[not(@class="home")]/a/text()'
        ).extract())

    images = hxs.select('//meta[@property="og:image"]/@content').extract()
    if images:
        loader.add_value('image_url', images[0])

    loader.add_value(
        'brand',
        hxs.select('//a[@class="brand-link"]/text()').re('View All (.*) Prod'))

    sold_out = hxs.select('//form//p[@class="availability out-of-stock"]')
    if sold_out or not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', 2.95)

    item = loader.load_item()

    product_swatches = hxs.select('//div[@class="product-swatches"]')
    options = hxs.select('//select[contains(@class, "bundle-option")]/option')
    if not options or product_swatches:
        yield item
        return

    for option in options:
        variant = deepcopy(item)
        option_value = option.select('@value').extract()[0]
        variant['identifier'] += '-' + option_value
        option_label = option.select('text()').extract()[0]
        variant['name'] += ' ' + option_label.split(' - ')[0]
        yield variant
def parse_product(self, response):
    """Emit the product if its on-page item number equals the searched SKU.

    Only HTML responses are processed. The product name is the page title
    plus the bottle size text; the price falls back through four price
    nodes in order.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    url_match = re.search('product/(\d+)', response.url)
    loader.add_value('sku', url_match.groups())

    heading = hxs.select(u'//h1[@class="pagetitle"]/text()').extract()
    name = heading[0].strip()

    bottle_size = hxs.select(
        u'//div[child::strong[contains(text(), "Bottle Size") or contains(text(), "Size of Bottle")]]/span/text()'
    )
    if not bottle_size:
        # Alternative layout: the label is plain text inside the div.
        bottle_size = hxs.select(
            u'//div[contains(text(),"Size of Bottle")]/span/text()')
    size_text = bottle_size.extract()[0].strip()
    name = name + ' ' + size_text
    loader.add_value('name', name)

    loader.add_xpath('price', u'//div[@class="cardPrice"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//div[@class="salePrice"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//div[@class="regularPrice"]/text()')
            if not loader.get_output_value('price'):
                loader.add_xpath('price',
                                 u'//div[@class="regularprice"]/text()')

    item_numbers = hxs.select(u'//span[@class="itemnumber"]/text()').re(
        u'- (.*)')
    site_sku = item_numbers[0].strip()
    search_sku = response.meta['sku'].strip()

    if site_sku == search_sku:
        yield loader.load_item()
def parse(self, response):
    """Pick the lowest-priced acceptable search result and yield it.

    A result becomes the current pick when it has a price, that price is
    strictly lower than the current pick's, and ``valid_price`` accepts it
    against ``response.meta["price"]``.
    """
    hxs = HtmlXPathSelector(response)
    best = None

    for node in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
        candidate = ProductLoader(item=Product(), selector=node)
        candidate.add_xpath("name", './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
        candidate.add_xpath("url", './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
        candidate.add_xpath("price", './/*[@class="newPrice"]//span/text()')
        candidate.add_value("sku", response.meta["sku"])
        candidate.add_value("identifier", response.meta["sku"])

        # Checks are nested so each one runs only when the previous passed.
        if candidate.get_output_value("price"):
            if best is None or best.get_output_value("price") > candidate.get_output_value("price"):
                if valid_price(response.meta["price"], candidate.get_output_value("price")):
                    best = candidate

    if best:
        yield best.load_item()
def parse_node(self, response, node):
    """Turn one XML feed node into a Product.

    The identifier (also used as SKU) is captured from the product URL.
    Commas in the price text are replaced with dots before loading.
    Shipping is '0' when the loaded price exceeds 399 and '25' otherwise.

    Returns:
        The loaded item when it has a price, an empty ``Product()`` when it
        does not, and ``None`` when ``response`` is not an ``XmlResponse``.
    """
    if not isinstance(response, XmlResponse):
        return

    id_matches = node.select(u'./product-url/text()').re(r'product/([^/]+)/')
    identifier = id_matches[0]

    loader = ProductLoader(item=Product(), selector=node)
    loader.add_value('url', node.select(u'./product-url/text()').extract()[0])
    loader.add_xpath('name', u'./title/text()')

    raw_price = node.select(u'./price/text()').extract()[0]
    loader.add_value('price', raw_price.replace(',', '.'))

    loader.add_xpath('category', u'merchant-category/text()')
    loader.add_xpath('image_url', u'image-url/text()')
    loader.add_value('sku', identifier)
    loader.add_value('identifier', identifier)

    free_shipping = loader.get_output_value('price') > 399
    loader.add_value('shipping_cost', '0' if free_shipping else '25')

    if not loader.get_output_value('price'):
        return Product()
    return loader.load_item()
def parse_product(self, response):
    """Yield one Product built from a product detail page.

    The identifier is the ``product_id`` value found in the response body.
    Stock is '1' when a price was loaded and '0' otherwise; a shipping cost
    of 2.95 is added when the loaded price is below 20.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    product_ids = re.findall("product_id = '(\d+)'", response.body)
    loader.add_value('identifier', product_ids[0])
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@class="page-header"]/h1/text()')

    price_parts = hxs.select(
        '//div[@class="price price-large"]/div[@class="price"]/span[@itemprop="price"]/text()'
    ).extract()
    loader.add_value('price', extract_price_eu(''.join(price_parts)))

    loader.add_xpath(
        'sku',
        '//tr/td[contains(strong/text(), "Bestelcode")]/../td[2]/text()')
    loader.add_value('category', 'Lego')

    images = hxs.select(
        '//div[@id="productgallery-image-display"]//img/@src').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    loader.add_value('brand', 'lego')

    has_price = loader.get_output_value('price')
    loader.add_value('stock', '1' if has_price else '0')

    if loader.get_output_value('price') < 20:
        loader.add_value('shipping_cost', 2.95)

    yield loader.load_item()
def parse_product(self, response):
    """Yield one Product per option, or a single Product without options.

    The base price is the special price when present, else the regular
    price. For each option the item price is the base price plus the
    option's surcharge (0.00 when the option shows none), with the pound
    sign stripped before the Decimal arithmetic. Items without a loaded
    price are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0]

    prices = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not prices:
        prices = hxs.select(u'//span[@class="regular-price"]/span[@class="price"]/text()').extract()
    base_price = prices[0]

    product_options = hxs.select(u'//ul[@class="options-list"]/li')
    if not product_options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', base_price)
        if loader.get_output_value('price'):
            yield loader.load_item()
        return

    for option in product_options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        label = option.select(u'./span[@class="label"]/label/text()').extract()[0]
        loader.add_value('name', name + u' %s' % label)

        surcharge = Decimal('0.00')
        extra_price = option.select(u'./span[@class="label"]/label/span/span/text()').extract()
        if extra_price:
            surcharge = Decimal(extra_price[0].replace(u'\xa3', u''))
        base_price = base_price.replace(u'\xa3', u'')
        loader.add_value('price', Decimal(base_price) + surcharge)

        if loader.get_output_value('price'):
            yield loader.load_item()
def parse_product(self, response):
    """Yield one Product built from a product detail page.

    The SKU is the first run of three or more digits in the "Artikelkod"
    cell; when there is none, a message is logged and the item is emitted
    without a SKU. The identifier is derived from the og:image file name
    when an image exists, otherwise from the last URL path segment.
    Shipping is '0' when the loaded price exceeds 1500 and '49' otherwise;
    stock is '1' only when the buy button carries an ``onclick`` attribute.
    """
    import re

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//text()')
    loader.add_xpath('price', '//div[@class="buybutton"]//nobr//text()')

    sku = ''.join(
        hxs.select('//td[contains(text(), "Artikelkod")]/text()').extract())
    # Test the match explicitly instead of relying on a bare ``except:``,
    # which would also have hidden unrelated errors.
    sku_match = re.search(r'(\d{3}\d*)', sku)
    if sku_match:
        loader.add_value('sku', sku_match.group(1))
    else:
        self.log('No SKU for %s' % (response.url))

    loader.add_value('category', response.meta.get('category'))

    img = hxs.select('//meta[@property="og:image"]/@content').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
        image_name = loader.get_output_value('image_url').split('/')[-1]
        loader.add_value('identifier', image_name.split('-')[0])
    else:
        loader.add_value('identifier',
                         loader.get_output_value('url').split('/')[-1])

    loader.add_value('brand', 'lego')

    if loader.get_output_value('price') > 1500:
        loader.add_value('shipping_cost', '0')
    else:
        loader.add_value('shipping_cost', '49')

    if hxs.select('//div[@class="buybutton" and @onclick]'):
        loader.add_value('stock', '1')
    else:
        loader.add_value('stock', '0')

    yield loader.load_item()
def parse_product(self, response):
    """Yield one Product per dropdown option, or a single Product.

    When the option dropdown has an ``onchange`` handler, each option's
    price is looked up in the page body next to ``_EKM_PRODUCTPRICE`` for
    that option value; otherwise every option uses the main page price.
    Items without a loaded price are not yielded. Non-HTML responses are
    ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="product-right"]//div[@class="pp-name"]/h1/text()').extract()[0].strip()
    main_price = hxs.select(u'//div[@class="product-right"]//div[@class="pp-price"]/span/span/text()').extract()[0]
    product_options = hxs.select(u'//select[@class="ekm-productoptions-dropdown-option"]')

    if not product_options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', main_price)
        if loader.get_output_value('price'):
            yield loader.load_item()
        return

    body = response.body.replace('\xc2', ' ')
    # The flag was previously assigned only when the onchange select
    # existed, so dropdowns without it hit an unbound local variable.
    set_option_price = bool(product_options.select(u'../select[@onchange]'))

    for option in product_options.select(u'./option'):
        name_with_option = name + u' %s' % option.select(u'./text()').extract()[0].strip()
        option_value = option.select(u'./@value').extract()[0]
        if set_option_price:
            price = re.search('== \'%s\'.*?_EKM_PRODUCTPRICE.*?= \'([\d\.]+?)\'' % option_value,
                              body, re.DOTALL).groups()[0]
        else:
            price = main_price

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name_with_option)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        if loader.get_output_value('price'):
            yield loader.load_item()
def parse(self, response):
    """Yield products from the local CSV, using the response CSV as fallback.

    The response body is parsed as CSV and indexed by 'Product No.'. Each
    row of ``legodk_products.csv`` becomes a Product; when a row has no
    'RRP price DKK', the price and identifier come from the indexed
    response row when one exists, otherwise the price is '0.0'. An item is
    yielded only the first time its identifier is seen in
    ``self.seen_ids``.
    """
    previous_rows = dict(
        (old_row['Product No.'], old_row)
        for old_row in csv.DictReader(StringIO(response.body)))

    with open(os.path.join(HERE, 'legodk_products.csv')) as products_file:
        for row in csv.DictReader(products_file):
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('sku', row['Product No.'])
            loader.add_value('category', row['Theme'])
            loader.add_value('brand', 'LEGO')
            loader.add_value(
                'name', row['Item Description English'].decode('utf8'))

            price = row.get('RRP price DKK')
            identifier = row['Item no'].lower()
            if not price:
                price = '0.0'
                old_product = previous_rows.get(row['Product No.'])
                if old_product:
                    price = old_product.get('RRP price DKK')
                    identifier = old_product['Item no'].lower()
            loader.add_value('price', price)
            loader.add_value('identifier', identifier)

            loaded_id = loader.get_output_value('identifier')
            if loaded_id in self.seen_ids:
                continue
            self.seen_ids.add(loaded_id)
            yield loader.load_item()
def parse(self, response):
    """Yield the first search result, retrying the page when it is missing.

    The first result is looked up as a ``td`` and then as a ``table`` with
    ``r="1"``. When neither exists the same URL is requested again, up to
    three retries counted in ``meta['_retries']``; after that the callback
    gives up silently. The item is yielded only when its name does not
    contain 'apparelsave' and ``valid_price`` accepts its price against
    ``response.meta['price']``.
    """
    hxs = HtmlXPathSelector(response)
    product = hxs.select('//td[@r="1"]')
    if not product:
        product = hxs.select('//table[@r="1"]')

    if not product:
        retries = response.meta.get('_retries', 0)
        if retries >= 3:
            return
        retry_meta = {'sku': response.meta['sku'], '_retries': retries + 1}
        # This callback reads meta['price'] further down; the retry used to
        # drop it, so a retried page that did contain a product raised
        # KeyError instead of being parsed.
        if 'price' in response.meta:
            retry_meta['price'] = response.meta['price']
        yield Request(response.url, meta=retry_meta, dont_filter=True)
        return

    loader = ProductLoader(item=Product(), selector=product)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    loader.add_xpath('price', './/div[@class="prices"]//span[@class="amt"]/text()')
    loader.add_xpath('price', './/div[@class="prices"]//span[@class="g-b amt"]/text()')
    loader.add_xpath('price', './/td[@class="prc"]//div[@class="g-b"]/text()')
    loader.add_xpath('price', './/*[@itemprop="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])

    if not 'apparelsave' in loader.get_output_value('name').lower() \
            and valid_price(response.meta['price'], loader.get_output_value('price')):
        yield loader.load_item()
def parse_product(self, response):
    """Yield one Product for an HTML product page.

    The image URL is read via XPath first; when that yields nothing the
    body is re-parsed with BeautifulSoup to find an ``img`` inside the
    ``prodImageContainer`` row. The price falls back through three price
    nodes. SKU and identifier come from ``response.meta['sku']``.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)

    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        soup = BeautifulSoup(response.body)
        image_tag = soup.find(
            lambda tag: tag.name == u'img' and tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_tag:
            loader.add_value('image_url', image_tag.get(u'src'))

    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')

    price_xpaths = (
        u'//b[@class="priceLarge"]/text()',
        u'//span[@class="priceLarge"]/text()',
        u'//span[@class="price"]/text()',
    )
    for price_xpath in price_xpaths:
        loader.add_xpath('price', price_xpath)
        if loader.get_output_value('price'):
            break

    sku = response.meta['sku']
    loader.add_value('sku', sku)
    loader.add_value('identifier', response.meta['sku'].lower())
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product and, on the first visit, a request per option.

    When the page has an option dropdown and ``'requested'`` is not yet in
    ``response.meta``, a form post is issued for every option, calling back
    into this method with ``requested`` set. The currently selected
    option's text is appended to the product name. The item is yielded only
    when a price was loaded.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
    multiple_options = hxs.select(u'//select[@class="mpv_itemalst"]//option')

    first_visit = not u'requested' in response.meta
    if multiple_options and first_visit:
        for option in multiple_options:
            option_value = option.select(u'./@value').extract()[0]
            yield FormRequest.from_response(
                response,
                formname=u'aspNetForm',
                formdata={u'ctl00$MainContent$ItemAList': option_value,
                          u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                          u'__EVENTARGUMENT': u''},
                meta={u'requested': True},
                dont_click=True,
                callback=self.parse_product)

    if multiple_options:
        selected = multiple_options.select(u'../option[@selected]/text()').extract()[0]
        name += u' %s' % selected.strip()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')

    if not loader.get_output_value('price'):
        return
    yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield products from a listing page.

    A cell without a price is followed to its own page via
    ``parse_products2``. When a cell's "Our Price" text does not start with
    ``$`` and fewer than three retries were made (``meta['ret']``), the
    listing page is requested again and processing of this response stops.
    """
    cells = hxs.select('//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]')
    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)

        name_parts = cell.select('.//p[@class="para1"]//text()').extract()
        name = ' '.join([part.strip() for part in name_parts])
        # Collapse runs of spaces left behind by empty text nodes.
        name = re.sub(' +', ' ', name)

        loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
        loader.add_value('name', name)
        loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
        loader.add_xpath('price', './/p[@class="para3"]/text()', re='Our Price: (.*)')

        if not loader.get_output_value('price'):
            yield Request(loader.get_output_value('url'), callback=self.parse_products2)
            continue

        price_text = cell.select('.//p[@class="para3"]/text()').re('Our Price: (.*)')[0]
        if not price_text.startswith('$') and response.meta.get('ret', 0) < 3:
            yield Request(response.url,
                          dont_filter=True,
                          meta={'ret': response.meta.get('ret', 0) + 1})
            return

        yield loader.load_item()
def parse_product(self, response):
    """Yield one Product built from a product detail page.

    The identifier is the number in the ``id`` attribute of the name
    heading; pages without it are logged and skipped. The SKU is the last
    run of digits in the loaded name, and is left unset when the name is
    missing or contains no digits.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    identifier = hxs.select('//h1[@itemprop="name"]/@id').re(
        "product_name_([0-9]+)")
    if not identifier:
        log.msg('Product without identifier: ' + response.url)
        return
    identifier = identifier[0]

    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    price = hxs.select(
        '//div[@id="productdetail"]/div/span/meta[@itemprop="price"]/@content'
    ).extract().pop()
    loader.add_value('price', extract_price(price))

    # The bare ``except: pass`` used here hid every error. Only the two
    # expected cases are tolerated now: no name, or no digits in the name.
    name = loader.get_output_value('name')
    numbers = re.findall(r'(\d+)', name) if name else []
    if numbers:
        loader.add_value('sku', numbers[-1])

    loader.add_xpath(
        'category',
        '//div[@id="widget_breadcrumb"]/ul/li[last() - 1]/a/text()')
    loader.add_xpath('image_url', '//a[@id="PD_image_zoom"]/@href')
    loader.add_value('brand', 'lego')

    if loader.get_output_value('identifier'):
        yield loader.load_item()
def parse(self, response):
    """Yield the cheapest search result accepted by ``valid_price``.

    Each result block is re-parsed with BeautifulSoup: name and URL come
    from the ``h3.newaps`` heading, the price from the first span inside
    ``ul.rsltL``. SKU and identifier are not set by this callback.
    """
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())

        heading = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', heading.findAll('span')[0].string)
        loader.add_value('url', heading.findAll('a')[0]['href'])

        price_list = soup.find('ul', attrs={'class': 'rsltL'})
        loader.add_value('price', price_list.findAll('span')[0].string)

        price = loader.get_output_value('price')
        if not price:
            continue
        if cheapest is not None and not (
                cheapest.get_output_value('price') > price):
            continue
        if valid_price(response.meta['price'], price):
            cheapest = loader

    if cheapest:
        yield cheapest.load_item()
def parse_products(self, hxs, response):
    """Yield products from an order-info table located by column headers.

    The 1-based column positions of the "Model", "Description" and "Price"
    header cells are computed with XPath ``count()``. Every row whose model
    cell is not the header becomes a Product: the URL is the model link
    when present (else the page URL), the name is the description cell and
    the price the price cell. Rows without a price or with a blank or
    missing name are skipped.
    """
    # Parenthesised so the line is valid on Python 2 and 3 alike; the
    # printed output is unchanged.
    print(response.encoding)

    position_xpath = ('count(//td[starts-with(@class, "orderinfo")'
                      ' and text()="%s"]/preceding-sibling::*) + 1')
    model_pos = hxs.select(position_xpath % 'Model').extract()
    description_pos = hxs.select(position_xpath % 'Description').extract()
    price_pos = hxs.select(position_xpath % 'Price').extract()
    if not (model_pos and description_pos and price_pos):
        return

    # count() returns a float such as "3.0"; keep only the integer part.
    model_pos = model_pos[0].split('.')[0]
    description_pos = description_pos[0].split('.')[0]
    price_pos = price_pos[0].split('.')[0]

    # Adjacent literals replace the old backslash continuations, which
    # embedded the source indentation inside the XPath expression.
    products = hxs.select(
        '//td[starts-with(@class, "orderinfo") and position()=%s '
        'and not(text()="Model")]/..' % model_pos)

    for product in products:
        loader = ProductLoader(selector=product, item=Product())

        url = response.url
        model_url = product.select(
            './/td[starts-with(@class, "orderinfo") '
            'and position()=%s]//a/@href' % model_pos).extract()
        if model_url:
            url = urljoin_rfc(get_base_url(response), model_url[0])
        loader.add_value('url', url)

        loader.add_xpath(
            'name',
            './/td[starts-with(@class, "orderinfo") and position()=%s]/text()'
            % description_pos)
        loader.add_xpath(
            'price',
            './/td[starts-with(@class, "orderinfo") and position()=%s]//text()'
            % price_pos)

        # A row with no description text has no name at all; treat it like
        # a blank name rather than calling .strip() on None.
        name = loader.get_output_value('name') or ''
        if not loader.get_output_value('price') or not name.strip():
            continue
        yield loader.load_item()
def parse_product(self, response):
    """Emit a Product for every option of the page, or one for the page.

    The special price is preferred over the regular price as the base. For
    option items the pound sign is stripped and the option's surcharge
    (zero when absent) is added to the base price. Only items with a loaded
    price are yielded, and only HTML responses are processed.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0]

    special = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
    regular_xpath = u'//span[@class="regular-price"]/span[@class="price"]/text()'
    base_price = (special or hxs.select(regular_xpath).extract())[0]

    loaders = []
    product_options = hxs.select(u'//ul[@class="options-list"]/li')
    if product_options:
        for option in product_options:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)

            option_label = option.select(u'./span[@class="label"]/label/text()').extract()[0]
            loader.add_value("name", name + u" %s" % option_label)

            extra_price = option.select(u'./span[@class="label"]/label/span/span/text()').extract()
            if extra_price:
                extra_price = extra_price[0].replace(u"\xa3", u"")
            base_price = base_price.replace(u"\xa3", u"")

            addition = Decimal(extra_price) if extra_price else Decimal("0.00")
            loader.add_value("price", Decimal(base_price) + addition)
            loaders.append(loader)
    else:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("name", name)
        loader.add_value("price", base_price)
        loaders.append(loader)

    for loader in loaders:
        if loader.get_output_value("price"):
            yield loader.load_item()
def parse(self, response):
    """Yield a product from a product page or from matching search results.

    On a product page the name is the heading's link text plus the
    heading's own text. On a search page, the first result whose
    brand + product name contains every word of ``response.meta['name']``
    is used and the scan stops there. Items whose name contains
    'apparelsave' are never yielded.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    product = hxs.select('//div[@class="product-page"]')
    if product:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/h1[@class="main-heading standard-header"]/a/text()').extract()
        name2 = product.select(
            './/h1[@class="main-heading standard-header"]/text()').extract()
        if not name:
            return
        price = "".join(
            product.select('.//span[@id="price"]/text()').re(
                r'([0-9\,\. ]+)')).strip()
        loader.add_value('name', name[0].strip() + ' ' + name2[0].strip())
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('sku', response.meta['sku'])
        if 'apparelsave' not in loader.get_output_value('name').lower():
            yield loader.load_item()
        return

    for result in hxs.select('.//div[@id="searchResults"]/a'):
        brand = result.select('./span[@class="brandName"]/text()').extract()
        title = result.select('./span[@class="productName"]/text()').extract()
        if not (brand and title):
            continue

        product_name = brand[0].strip() + ' ' + title[0].strip()
        product_words = product_name.lower().strip().split(' ')
        search_words = response.meta['name'].lower().replace(
            '+', ' ').split(' ')
        # Every searched word must appear in the result's name.
        if not all(word in product_words for word in search_words):
            continue

        price = "".join(
            result.select('./span[@class="price-6pm"]/text()').re(
                r'([0-9\,\. ]+)')).strip()
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_value('name', product_name)
        loader.add_value(
            'url',
            urljoin_rfc(base_url, result.select('.//@href').extract()[0]))
        loader.add_value('price', price)
        loader.add_value('sku', response.meta['sku'])
        if 'apparelsave' not in loader.get_output_value('name').lower():
            yield loader.load_item()
        break
def parse_product(self, response):
    """Yield the product, its price-changing option variants and size pages.

    The identifier joins the two hidden ids with ``':'``. Price adjustments
    are read from ``adjustment(group, value, amount)`` calls in the page
    body; only options with a non-zero adjustment form variants. Variants
    are produced from ``multiply(opt_groups)`` unless the URL is listed in
    ``self.SKIP_OPTIONS`` or no price was loaded. Finally every entry of
    the size dropdown is requested with this same callback.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('sku', '//input[@id="txtEMLNEID"]/@value')
    loader.add_value(
        'identifier',
        ':'.join(hxs.select('//input[@id="txtEMLNEID"]/@value').extract()
                 + hxs.select('//input[@id="txtEMSZEID"]/@value').extract()))
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath('price', '//div[@id="net"]/text()')
    loader.add_xpath('category', '//div[@id="bread_crumb"]/a[3]/text()')

    img = hxs.select('//img[@id="product_image"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))

    # The brand is taken from the brand logo's file name.
    brand = ''.join(
        hxs.select('//img[contains(@src, "/brands/")]/@src').extract())
    loader.add_value(
        'brand', brand.split('/')[-1].split('.')[0].replace('-', ' '))

    if loader.get_output_value('price'):
        loader.add_value('stock', '1')
    else:
        loader.add_value('stock', '0')

    size = ''.join(hxs.select(
        'normalize-space(//select[@onchange="jump(this.value)"]//option[@selected="selected"]/text())'
    ).extract())
    loader.add_value('name', '-'.join(size.split('-')[:-1]))

    price_adj = {}
    for cfg in re.findall('adjustment\((\d+),(\d+),([\d.,]+)\)',
                          response.body):
        price_adj[(cfg[0], cfg[1])] = float(cfg[2])

    # Include only options that change the price
    opt_groups = []
    for sel in hxs.select(
            '//select[@onchange!="jump(this.value)" and @id!="quantity"]'):
        # Selects without a numeric id are skipped. This is now an explicit
        # check, not a bare ``except:``, and the builtin ``id`` is no
        # longer shadowed.
        group_ids = sel.select('./@id').re('\d+')
        if not group_ids:
            continue
        group_id = group_ids[0]

        opts = []
        for opt in sel.select('.//option'):
            value = opt.select('./@value').extract()[0]
            text = opt.select('normalize-space(./text())').extract()[0]
            key = (group_id, value)
            if key in price_adj and float(price_adj[key]) != 0.0:
                opts.append((price_adj[key], text, value))
        if opts:
            opt_groups.append(opts)

    prod = loader.load_item()
    if prod.get('identifier'):
        yield prod
        skip_variants = (response.url in self.SKIP_OPTIONS
                         or not loader.get_output_value('price'))
        if not skip_variants:
            for opt_price, opt_name, opt_id in multiply(opt_groups):
                p = Product(prod)
                p['name'] = p['name'] + ' ' + opt_name
                p['price'] = p['price'] + Decimal(opt_price).quantize(
                    Decimal('1.00'))
                if opt_id:
                    p['identifier'] = p['identifier'] + ':' + opt_id
                else:
                    p['identifier'] = p['identifier'] + '-'
                yield p

    for url in hxs.select(
            '//select[@onchange="jump(this.value)"]//option/@value').extract():
        yield Request(urljoin_rfc(get_base_url(response), url),
                      callback=self.parse_product)
def parse_product(self, response):
    """Yield one item per selectable option, followed by the base item.

    Categories are the breadcrumb links after the first two. Shipping
    starts at '0.00' and '2.99' is added when the loaded price is below
    30.00. ``stock`` is set to 0 when no "In Stock" label is found.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@class="pro-name"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand'))

    breadcrumbs = hxs.select(
        '//div[contains(@class,"breadcrumbs")]/a/text()').extract()
    for category in breadcrumbs[2:]:
        loader.add_value('category', category)

    identifier = hxs.select(
        './/input[@type="hidden" and @name="product_id"]/@value'
    )[0].extract()
    loader.add_value('identifier', identifier)

    images = hxs.select('//div[@class="image"]/a/@href').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    loader.add_value('shipping_cost', '0.00')
    loader.add_value(
        'price',
        hxs.select(
            '//div[@class="total-price"]/span[@class="price-total"]/text()'
        ).extract())
    self.log(loader.get_output_value('price'))
    if Decimal(loader.get_output_value('price')) < Decimal('30.00'):
        loader.add_value('shipping_cost', '2.99')

    in_stock = hxs.select(
        './/div[@class="stock-level"]/span[contains(text(),"In Stock")]')
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()

    options = hxs.select(
        '//div[@class="options"]/div/select/option[not(contains(text(),"Select"))]'
    )
    for option in options:
        label = option.select('./text()')[0].extract().strip()
        variant = deepcopy(item)
        variant['identifier'] = '{}-{}'.format(
            identifier, option.select('./@value')[0].extract())
        variant['name'] += ' ' + label
        yield variant

    # NOTE(review): the original used ``for ... else`` with no ``break``, so
    # the base item is always yielded, even after option items. Kept as-is;
    # confirm whether "only when there are no options" was intended.
    yield item
def parse(self, response):
    """Follow product-list links and scrape the page if it shows a product.

    Every link under ``div.product_list`` is requested.  When the page has
    no product heading nothing more is produced; otherwise one item with a
    ``LeCreusetMeta`` promotion attached is yielded.
    """
    selector = HtmlXPathSelector(response)
    for href in selector.select(
            '//div[@class="product_list"]//a/@href').extract():
        yield Request(urljoin_rfc(get_base_url(response), href))

    if not selector.select('//span[@class="product"]/h1/text()'):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//span[@class="product"]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', 'Le Creuset')
    loader.add_xpath(
        'category',
        '//div[@class="text_breadcrumbs"]/a[position()>1]//text()')

    # The same "Ref:" text feeds both the sku and the identifier.
    for field in ('sku', 'identifier'):
        loader.add_xpath(
            field,
            'substring-after(//font[@size="1" and contains(text(), "Ref:")]/text(), ": ")'
        )

    images = selector.select('//img[@class="fullimage1"]/@src').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    loader.add_xpath('price',
                     '//h3[@class="product_price"]/prices/span[2]/text()')
    if not loader.get_output_value('price'):
        # Fall back to any text inside the price heading.
        loader.add_xpath('price', '//h3[@class="product_price"]//text()')

    below_threshold = loader.get_output_value('price') < 50
    loader.add_value('shipping_cost', '4.95' if below_threshold else '0')

    in_stock = selector.select(
        '//div[@class="stock-message"]/span[contains(.//text(), "In stock") or contains(.//text(), "plenty of stock in")]'
    )
    loader.add_value('stock', '1' if in_stock else '0')

    item = loader.load_item()
    metadata = LeCreusetMeta()
    promotion_parts = selector.select(
        '//div[@class="special-offer-message"]/span/text()').extract()
    metadata['promotion'] = ''.join(promotion_parts)
    item['metadata'] = metadata
    yield item
def parse(self, response):
    """Yield the cheapest search result whose price passes ``valid_price``.

    Results lacking a name or a price are skipped.  ``response.meta['sku']``
    is used as both sku and identifier, and ``response.meta['price']`` is the
    reference price handed to ``valid_price``.
    """
    selector = HtmlXPathSelector(response)
    results = selector.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for position, result in enumerate(results, 1):
        candidate = ProductLoader(item=Product(), selector=result)

        titles = result.select(
            './/h3[@class="newaps"]/a/span/text()').extract()
        if not titles:
            # Only the first result's missing name is logged.
            if position == 1:
                self.log("ERROR name not found")
            continue
        candidate.add_value('name', titles[0])

        prices = result.select(
            './/ul[@class="rsltL"]//span[1]/text()').extract()
        if not prices:
            prices = result.select(
                './/ul[contains(@class,"rsltGridList grey")]//span[1]/text()'
            ).extract()
        if not prices:
            self.log("ERROR price not found2")
            continue
        candidate.add_value('price', prices[0])

        links = result.select('.//h3[@class="newaps"]/a/@href').extract()
        if links:
            candidate.add_value('url', links[0])
        else:
            self.log("ERROR url not found")

        candidate.add_value('sku', response.meta['sku'])
        candidate.add_value('identifier', response.meta['sku'])
        #self.log("price: " + str(product_loader.get_output_value('price')) + ", price_meta: " + str(response.meta['price']) + ", url: " + response.url)

        if not candidate.get_output_value('price'):
            continue
        is_cheaper = (
            best is None
            or best.get_output_value('price') > candidate.get_output_value('price')
        )
        if is_cheaper and valid_price(response.meta['price'],
                                      candidate.get_output_value('price')):
            best = candidate

    if best:
        yield best.load_item()
def parse_node(self, response, node):
    """Turn one XML feed node into a product item.

    Returns ``None`` for non-XML responses, the loaded item when a price was
    parsed, and an empty ``Product`` otherwise.  The name and the raw price
    are logged as JSON.
    """
    if not isinstance(response, XmlResponse):
        return

    loader = ProductLoader(item=Product(), selector=node)
    loader.add_xpath('url', u'./product-url/text()')
    loader.add_xpath('name', u'./title/text()')

    raw_price = node.select(u'./price/text()').extract()[0]
    # The feed uses a decimal comma.
    price = raw_price.replace(',', '.')
    loader.add_value('price', price)

    payload = {'name': loader.get_output_value('name'), 'price': price}
    log.msg(json.dumps(payload))

    if not loader.get_output_value('price'):
        return Product()
    return loader.load_item()
def parse_product(self, response):
    """Yield the product when both a name and a price were scraped.

    The price is assembled from the text of the ``priceXL`` element and the
    digits in its ``<sup>`` child, joined with a dot.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name', u'//h1[contains(@class,"fpProdutTitle")]/text()')

    whole_part = selector.select(
        u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceXL")]/text()'
    ).extract()
    if whole_part:
        fraction_part = selector.select(
            u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceXL")]/sup/text()'
        ).re(u'(\d+)')[0]
        product_loader.add_value('price', whole_part[0] + '.' + fraction_part)

    has_name = product_loader.get_output_value('name')
    if has_name and product_loader.get_output_value('price'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a Lego product page and yield a single item.

    The special price is preferred over the regular price; with neither, the
    value ``0`` is passed to ``extract_price``.  Pages without a ``product``
    input are logged and skipped.
    """
    base_url = get_base_url(response)
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(selector=selector, item=Product())

    title = selector.select(
        '//div[@class="product-name"]/span/text()').extract()[0].strip()
    loader.add_value('name', title)
    loader.add_value('url', response.url)

    special = selector.select(
        '//div[@class="buy-container"]//p[@class="special-price"]/span[@class="price"]/text()'
    ).extract()
    prices = special or selector.select(
        '//div[@class="buy-container"]//span[@class="regular-price"]/span[@class="price"]/text()'
    ).extract()
    raw_price = prices[0] if prices else 0
    loader.add_value('price', extract_price(raw_price))

    images = selector.select('//img[@id="image-0"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin(base_url, images[0]))

    loader.add_xpath(
        'category', '//li[span/text()="Thema"]/span[@class="data"]/text()')
    loader.add_value('brand', 'Lego')

    identifiers = selector.select('//input[@name="product"]/@value').extract()
    if not identifiers:
        log.msg('ERROR >>> Product without identifier: ' + response.url)
        return
    loader.add_value('identifier', identifiers[0])
    loader.add_xpath(
        'sku', '//li[span/text()="Artikelnummer"]/span[@class="data"]/text()')

    sold_out = selector.select('//span[@class="out-of-stock-msg"]')
    if sold_out or loader.get_output_value('price') <= 0:
        loader.add_value('stock', 0)
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', 2.99)

    yield loader.load_item()
def parse(self, response):
    """Yield the first entry of the finder list as a product.

    Only the first ``<li>`` is considered.  Nothing is yielded when the list
    is empty, when the entry has no name, or when the name contains
    "apparelsave".
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    entries = selector.select('//ul[@id="finder-data"]/li')
    if not entries:
        return

    first = entries[0]
    loader = ProductLoader(item=Product(), selector=first)
    brand_part = "".join(first.select('./a/div/h5/span/text()').extract())
    if not brand_part:
        return

    title_part = "".join(first.select('./a/div/h5/text()').extract())
    href = first.select('./a/@href').extract()[0]
    price = "".join(
        first.select('./a/div[@class="p-price"]/text()').re(r'([0-9\,\. ]+)')
    ).strip()
    if not price:
        # Discounted entries keep the amount in a sale-price span.
        price = "".join(
            first.select(
                './a/div[@class="p-price"]/span[@class="sale-price"]/text()'
            ).re(r'([0-9\,\. ]+)')
        ).strip()

    loader.add_value('name', brand_part.strip() + ' ' + title_part.strip())
    loader.add_value('url', urljoin_rfc(base_url, href))
    loader.add_value('price', price)
    loader.add_value('sku', response.meta['sku'])
    if 'apparelsave' not in loader.get_output_value('name').lower():
        yield loader.load_item()
def parse(self, response):
    """Yield the buy-box product when it contains every searched word.

    The search term comes from ``response.meta['name']`` with ``+`` treated
    as a space; the comparison is lower-cased and word based.  Names
    containing "apparelsave" are not yielded.
    """
    selector = HtmlXPathSelector(response)
    buybox = selector.select('//table[@class="buybox"]')
    if not buybox:
        return

    loader = ProductLoader(item=Product(), selector=buybox)
    titles = buybox.select('.//h1[@class="stylename"]/text()').extract()
    if not titles:
        return

    searched = response.meta['name'].lower().replace('+', ' ')
    log.msg(titles[0].lower() + ' - ' + searched)
    title_words = titles[0].lower().strip().split(' ')
    missing = [word for word in searched.split(' ')
               if word not in title_words]
    #if name[0].lower() == response.meta['name'].lower().replace('+', ' '):
    if missing:
        return

    price = "".join(
        buybox.select('.//span[@class="price"]/span/text()').re(
            r'([0-9\,\. ]+)')
    ).strip()
    loader.add_value('name', titles[0])
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    loader.add_value('sku', response.meta['sku'])
    if 'apparelsave' not in loader.get_output_value('name').lower():
        yield loader.load_item()
def parse(self, response):
    """Yield the first listed product whose name contains every search word.

    The product name is built as "<brand> <name> <style>" from the title
    block of each ``productCellWrapper``.  The search words come from
    ``response.meta['name']``; matching is lower-cased and word based.  The
    sale price is preferred over the plain price text.  Names containing
    "apparelsave" are not yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//div[@class="productCellWrapper"]')
    if not products:
        return

    search_words = response.meta['name'].lower().split()
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        brand = "".join(product.select(
            './/div[@class="productBrandTitleColor"]/a/span[@class="brand"]/text()'
        ).extract()).strip()
        style = "".join(product.select(
            './/div[@class="productBrandTitleColor"]/a/span[@class="styleName color"]/text()'
        ).extract()).strip()
        name = "".join(product.select(
            './/div[@class="productBrandTitleColor"]/a/span[@class="styleName name"]/text()'
        ).extract()).strip()
        name = brand + ' ' + name + ' ' + style

        product_words = name.lower().split(' ')
        diff = [w for w in search_words if w not in product_words]
        if diff:
            continue

        url = product.select(
            './/div[@class="productBrandTitleColor"]/a/@href').extract()[0]
        price = "".join(product.select(
            './/div[@class="price"]/span[@class="salePrice"]/text()'
        ).re(r'([0-9\,\. ]+)')).strip()
        if not price:
            price = "".join(product.select(
                './/div[@class="price"]/text()'
            ).re(r'([0-9\,\. ]+)')).strip()

        loader.add_value('name', name)
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('price', price)
        loader.add_value('sku', response.meta['sku'])
        if 'apparelsave' not in loader.get_output_value('name').lower():
            yield loader.load_item()
            # NOTE(review): the source layout was flattened; the break is
            # assumed to sit right after the yield (stop at the first item
            # emitted) - confirm against the upstream spider.
            break
def parse_product(self, response):
    """Yield the sub-products of a grouped product that match the searched SKU.

    Rows are read from the ``super-product-table`` (first row skipped) and
    from any ``inner-table``.  For each row the non-empty cell texts, minus
    the last text node, are collected: the first is treated as the row SKU,
    the last as the price when it starts with ``$``, and the rest extend the
    product name.  A row is yielded when its SKU occurs in
    ``response.meta['sku']`` and a price was loaded.

    Pages without a product name and rows without any cell text are skipped
    instead of raising ``IndexError``.
    """
    search_sku = response.meta['sku']
    hxs = HtmlXPathSelector(response)

    main_name = hxs.select(u'//h3[@class="product-name"]/text()').extract()
    if not main_name:
        # Without a name there is nothing to build items from.
        return
    main_name = main_name[0].strip()

    subproducts = hxs.select(u'//table[@id="super-product-table"]//tr')[1:]
    subproducts += hxs.select(u'//table[@class="inner-table"]//tr')
    for row in subproducts:
        product_data = [
            text.strip()
            for text in row.select(u'.//td//text()').extract()[:-1]
            if text.strip() != ''
        ]
        if not product_data:
            # Header or spacer row: no SKU and no price to read.
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value(
            'name', main_name + ' ' + ' '.join(product_data[0:-1]).strip())
        if product_data[-1].startswith('$'):
            loader.add_value('price', product_data[-1])
        loader.add_value('sku', search_sku)

        sku = product_data[0]
        if sku in search_sku and loader.get_output_value('price'):
            yield loader.load_item()
def parse_product(self, response):
    """Scrape a product detail page and yield one item.

    The brand is resolved in three steps: a link whose href contains
    "BrandName", then the first sidebar brand the product name starts with,
    then the middle " - "-separated section of the page title.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=selector)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name', u'//h1[@class="productDetailHeader"]/text()')

    if selector.select(u'//span[@class="productDetailSelling"]/text()'):
        product_loader.add_xpath(
            'price', u'//span[@class="productDetailSelling"]/text()')
    else:
        product_loader.add_value('price', '')

    product_loader.add_xpath(
        'sku',
        u'//input[@type="hidden" and (@name="hidProductId" or @name="inv")]/@value')
    product_loader.add_xpath(
        'category',
        u'//td[@class="smallPrint"]/a[position()=2 and contains(text(),"Products")]/../a[3]/text()')

    image_links = selector.select(
        u'//a[@class="smallPrint" and @rel="lightbox"]/@href').extract()
    if image_links:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_links[0]))

    if selector.select(u'//a[contains(@href,"BrandName")]/@href'):
        product_loader.add_xpath(
            'brand',
            u'substring-after(//a[contains(@href,"BrandName")]/@href,"=")')
    else:
        sidebar_brands = [
            text.strip() for text in selector.select(
                u'//strong[@class="sideBarText"]/text()').extract()
        ]
        matched = next(
            (candidate for candidate in sidebar_brands
             if product_loader.get_output_value('name').startswith(candidate)),
            None,
        )
        if matched is not None:
            product_loader.add_value('brand', matched)
        else:
            product_loader.add_xpath(
                'brand',
                u'normalize-space(substring-before(substring-after(//title/text(), " - "), " - "))')

    # product_loader.add_xpath('shipping_cost', u'//div[@class="DetailRow"]/div[contains(text(),"Shipping")]/../div[2]/text()')
    yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product page with BeautifulSoup and yield one item.

    Pages without a ``span.price.ours`` element are logged and skipped.
    Shipping is '9.98' when the loaded price is at most 59 and '5.98'
    otherwise.
    """
    loader = ProductLoader(item=Product(), response=response)
    soup = BeautifulSoup(response.body)

    price_tag = soup.find('span', {'class': 'price ours'})
    try:
        price = price_tag.text
    except AttributeError:
        self.log('price not found {}'.format(response.url))
        return

    image_url = soup.find('img', itemprop='image')['src']
    cart_form = soup.find('form', id='product_addtocart_form')
    # The product id is the path segment following "product/" in the action.
    identifier = cart_form['action'].split('product/')[-1].split('/')[0]

    loader.add_value('image_url', image_url)
    loader.add_value('price', price)
    loader.add_value('name', soup.find('h1', itemprop='name').text.strip())
    loader.add_value('category', response.meta.get('category', ''))

    manufacturer = soup.find('span', itemprop='manufacturer').text
    brand = manufacturer.replace(' ', '').split('by', 1)[1].strip()
    loader.add_value('brand', brand)
    loader.add_value('url', response.url)

    sku_input = soup.find('input', id='eye')
    loader.add_value('identifier', identifier)
    if sku_input:
        loader.add_value('sku', sku_input['value'])

    if loader.get_output_value('price') <= Decimal(59):
        shipping_cost = '9.98'
    else:
        shipping_cost = '5.98'
    loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a Lego product page; record an error when no price is found.

    The price is read from the bold price span, falling back to the
    "prodPrcNowCatgLister" span, and parsed with ``extract_price_eu``.
    Shipping cost '0' is set only when the price exceeds 20.
    """
    import re
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)

    price_texts = selector.select(
        '//div[@class="price_bottom_bg"]/span[@class="fontBold125emR"]/text()'
    ).extract()
    if not price_texts:
        price_texts = selector.select(
            '//div[@class="price_bottom_bg"]//span[contains(@class, "prodPrcNowCatgLister")]/text()'
        ).extract()
    if not price_texts:
        self.errors.append('WARNING: No price in %s' % response.url)
        return
    price = extract_price_eu(price_texts[0])

    loader.add_xpath('identifier', '//b[contains(text(), "SKU:")]/../text()')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@class="product-name"]/text()')
    loader.add_value('price', price)
    loader.add_xpath('sku',
                     '//b[contains(text(), "Artikelnummer:")]/../text()')
    loader.add_value('category', response.meta.get('category'))

    images = selector.select(
        '//div[@id="product-view-media-main-image"]//img/@src').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    loader.add_value('brand', 'lego')
    if loader.get_output_value('price') > 20:
        loader.add_value('shipping_cost', '0')
    # loader.add_xpath('stock', '1')
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a Lego product page and yield one item.

    The identifier is the text after the first space in ``div.code``, with a
    ``data-code`` attribute as fallback.  The SKU is the first run of three
    or more digits in the ``<h1>`` text; when there is none this is logged
    and the item is yielded without a SKU.  Stock is '1' when a ``num``
    select box is present and '0' otherwise.
    """
    import re
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('identifier',
                     'substring-after(//div[@class="code"]/text(), " ")')
    if not loader.get_output_value('identifier'):
        loader.add_xpath('identifier',
                         'substring-after(//*/@data-code, " ")')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')

    title = ''.join(hxs.select('//h1/text()').extract())
    # Check the match explicitly rather than swallowing every exception.
    sku_match = re.search(r'(\d{3}\d*)', title)
    if sku_match:
        loader.add_value('sku', sku_match.group(1))
    else:
        self.log('No SKU for %s' % (response.url))

    loader.add_xpath('price', '//span[@itemprop="price"]/text()')
    loader.add_xpath(
        'category', '//div[@class="paths"]/ul/li[1]/span[last()]//a/text()')
    img = hxs.select('//div[@class="images"]//img/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_value('brand', 'lego')
    loader.add_value('shipping_cost', '49')
    if hxs.select('//select[@name="num"]'):
        loader.add_value('stock', '1')
    else:
        loader.add_value('stock', '0')
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a Lego product page and yield one item.

    The price is the concatenated text of the price cell parsed with
    ``extract_price_eu``.  The SKU is the first run of three or more digits
    in the ``<h1>`` text; when there is none this is logged and the item is
    yielded without a SKU.  Shipping is '0' above a price of 75 and '4.95'
    otherwise.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath(
        'identifier',
        '//td/span[contains(text(), "Artikelnr")]/../../td[2]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')

    price_text = ''.join(
        hxs.select('//td[@class="myshp_info_price_value"]//text()').extract())
    loader.add_value('price', extract_price_eu(price_text))

    title = ''.join(hxs.select('//h1/text()').extract())
    # Check the match explicitly rather than swallowing every exception.
    sku_match = re.search(r'(\d{3}\d*)', title)
    if sku_match:
        loader.add_value('sku', sku_match.group(1))
    else:
        self.log('No SKU for %s' % (response.url))

    loader.add_xpath(
        'category',
        '//td/span[contains(text(), "Categorie")]/../../td[2]/text()')
    img = hxs.select('//div[@id="myshp_info_image_large"]//a/@href').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_value('brand', 'lego')
    if loader.get_output_value('price') > 75:
        loader.add_value('shipping_cost', '0')
    else:
        loader.add_value('shipping_cost', '4.95')
    # loader.add_xpath('stock', '1')
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a Le Creuset product page and yield one item.

    Brand and category are fixed to 'Le Creuset', stock is always '1', and
    an empty ``LeCreusetMeta`` is attached as metadata.  Shipping is '4.75'
    below a price of 50 and '0' otherwise.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', 'Le Creuset')
    loader.add_value('category', 'Le Creuset')
    # The hidden "product" input supplies both sku and identifier.
    loader.add_xpath('sku', '//input[@name="product"]/@value')
    loader.add_xpath('identifier', '//input[@name="product"]/@value')

    image_links = selector.select(
        '//div[@class="product-img-box"]/a/@href').extract()
    if image_links:
        loader.add_value('image_url', image_links[0])

    loader.add_xpath('price', '//div[@class="prodPriceWrap"]/h2/text()')
    below_threshold = loader.get_output_value('price') < 50
    loader.add_value('shipping_cost', '4.75' if below_threshold else '0')
    loader.add_value('stock', '1')

    item = loader.load_item()
    item['metadata'] = LeCreusetMeta()
    yield item
def parse_product(self, response):
    """Scrape a product page and yield one item.

    The campaign price is preferred over the ``itemprop="price"`` value and
    its decimal comma is converted to a dot.  The category is the
    second-to-last breadcrumb link, or '' when fewer than two links exist.
    A shipping cost of 2.90 is added when the loaded price is below 100.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
    product_loader.add_value('name', product_name)
    image_url = hxs.select('//*[@id="zoom1"]/@src').extract()[0]
    product_loader.add_value('image_url', urljoin_rfc(base_url, image_url))
    product_loader.add_value('url', response.url)
    identifier = hxs.select('//input[@name="id"]/@value').extract()[0]
    product_loader.add_value('identifier', identifier)

    sku = hxs.select('//div[@class="product_band"]/p/span/text()').re(r'(\d+)')
    sku = sku[0] if sku else ''
    product_loader.add_value('sku', sku)

    price = hxs.select('//span[@class="campaignprice-value"]/text()').extract()
    if not price:
        price = hxs.select('//span[@itemprop="price"]/text()').extract()
    if price:
        price = price[0].strip().replace(',', '.')
    product_loader.add_value('price', price)

    category = hxs.select('//ul[@class="breadcrumbs"]/li/a/text()').extract()
    # A single breadcrumb has no second-to-last entry; fall back to ''.
    category = category[-2] if len(category) >= 2 else ''
    product_loader.add_value('category', category)

    if product_loader.get_output_value('price') < 100:
        product_loader.add_value('shipping_cost', 2.90)
    yield product_loader.load_item()
def parse(self, response):
    """Walk a category listing and request every product page.

    The "Next" pagination link is followed first.  For each listed product a
    partially filled loader (name, url, price) is passed to
    ``parse_product`` through ``meta['loader']``.  Products without a link
    are skipped, and a price is only stored when a number can be read from
    the price text.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    next_page = hxs.select(
        '//div[@class="page-navigation"]/a[contains(text(),"Next")]/@href'
    ).extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]))

    products = hxs.select(
        '//div[@id="list-product-list"]//div[contains(@class,"list-product-item")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = ''.join(product.select('.//div[@class="name"]/a/text()').extract())
        if name:
            loader.add_value('name', name)
        # identifier = product.select('').extract()
        # if identifier:
        #     identifier = identifier[0]
        #     loader.add_value('identifier', identifier)

        url = ''.join(product.select('.//div[@class="name"]/a/@href').extract())
        if not url:
            # Without a link there is no product page to request.
            continue
        # Everything from the first ';' on is dropped from the link.
        url = urljoin_rfc(base_url, url.split(';')[0])
        loader.add_value('url', url)

        price = product.select(
            './/div[@class="price-info"]//span[@class="current-price"]/text()'
        ).extract()
        if price:
            # Escaped dot and optional fraction, so "99" and "9.5" parse too.
            amounts = re.findall(r"\d+(?:\.\d+)?", price[0].replace(',', ''))
            if amounts:
                # NOTE(review): dividing by 1.2 presumably strips 20% VAT - confirm.
                loader.add_value('price', round(float(amounts[0]) / 1.2, 2))

        yield Request(loader.get_output_value('url'),
                      meta={'loader': loader},
                      callback=self.parse_product)
def parse_product(self, response):
    """Scrape a product page and yield one item.

    The price is the number preceding " kr." in the price block; dots are
    removed and commas become dots before ``extract_price`` is applied.  The
    same "...mer:" detail value is used for sku and identifier.  Stock is
    set to 0 when the loaded price is empty or zero.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_name = hxs.select(
        '//h1/span[@class="title"]/text()')[0].extract()
    product_price = hxs.select(
        '//div[@class="price"]/span/p/strong/text()').re(r'([\d\.]+) kr.')[0]
    product_code = sku = hxs.select(
        '//div[@class="moreItem"]/span[@class="title" and contains(text(),"mer:")]/following-sibling::span/text()'
    ).extract()
    image_url = hxs.select('//a[@class="jqzoom"]/img/@src').extract()
    category = hxs.select(
        '//div[@class="moreItem"]/span[@class="title" and text()="Form:"]/following-sibling::span/text()'
    ).extract()
    category = category[0] if category else ''

    # Built once, after all values are extracted.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', sku)
    loader.add_value('identifier', product_code)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('category', category)

    product_price = extract_price(
        product_price.replace('.', '').replace(',', '.'))
    loader.add_value('price', product_price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield bundle variations (when present) followed by the base product.

    When a ``Product.Bundle(...)`` JSON blob is found in a script tag, the
    options fixed by hidden inputs are folded into a common name and price.
    One item is then yielded for every selection of every remaining option.
    The base product is scraped afterwards in all cases and yielded when it
    has a price.

    Fixed options are excluded by their option key, so they are not listed
    again as selectable variations on top of the already accumulated price.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    options = hxs.select(u'//script').re(r'Product\.Bundle\((.*)\)')
    if options:
        options = json.loads(options[0])
        mandatory_options = hxs.select(
            u'//div[@class="input-box"]//input[@type="hidden"]')
        name = hxs.select(
            u'//div[@class="product-name"]/h1/text()').extract()[0].strip()
        price = Decimal(0.0)
        exclude = set()
        for mandatory_option in mandatory_options:
            option_id = mandatory_option.select(u'./@name').re(
                r'bundle_option\[(.*)\]')[0]
            selection_id = mandatory_option.select(u'./@value').extract()[0]
            chosen = options['options'][option_id]['selections'][selection_id]
            name += u' %s' % chosen['name'].strip()
            price += Decimal(chosen['price']).quantize(Decimal('0.01'))
            # Record the option key, which is what the difference() below
            # compares against.
            exclude.add(option_id)

        option_keys = set(options['options'].keys()).difference(exclude)
        for option in option_keys:
            selections = options['options'][option]['selections']
            for selection in selections.keys():
                selection_name = selections[selection]['name']
                selection_price = Decimal(
                    selections[selection]['price']).quantize(Decimal('0.01'))
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('url', response.url)
                loader.add_value('name',
                                 name + u' %s' % selection_name.strip())
                loader.add_value('price', price + selection_price)
                if loader.get_output_value('price'):
                    yield loader.load_item()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="product-name"]/h1/text()')
    loader.add_xpath(
        'price', u'//span[@class="regular-price"]/span[@class="price"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath(
            'price',
            u'//div[@class="price-box"]//p[@class="minimal-price" or @class="price-from"]/span[@class="price"]/text()')
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse(self, response):
    """Yield the cheapest search result whose price passes ``valid_price``.

    Each result block is re-parsed with BeautifulSoup to read its name, link
    and price.  ``response.meta['sku']`` is used as sku and identifier.
    """
    selector = HtmlXPathSelector(response)
    results = selector.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())

        title_spans = soup.find('h3', attrs={'class': 'newaps'}).findAll('span')
        loader.add_value('name', title_spans[0].string)
        title_links = soup.find('h3', attrs={'class': 'newaps'}).findAll('a')
        loader.add_value('url', title_links[0]['href'])
        price_spans = soup.find('ul', attrs={'class': 'rsltL'}).findAll('span')
        loader.add_value('price', price_spans[0].string)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])

        if not loader.get_output_value('price'):
            continue
        is_cheaper = (
            best is None
            or best.get_output_value('price') > loader.get_output_value('price')
        )
        if is_cheaper and valid_price(response.meta['price'],
                                      loader.get_output_value('price')):
            best = loader

    if best:
        yield best.load_item()
def parse_product(self, response):
    """Yield url, name and price of the product when a price was scraped."""
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@id="product"]/h1/text()')
    loader.add_xpath(
        'price', u'//p[@class="price"]/span[@class="our_price"]/text()')

    if not loader.get_output_value('price'):
        # Pages without a price produce no item.
        return
    yield loader.load_item()
def parse(self, response):
    """Yield the cheapest priced search result, tagged with the searched sku."""
    selector = HtmlXPathSelector(response)
    results = selector.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath("name", './/h3[@class="title"]/a/text()')
        loader.add_xpath("url", './/h3[@class="title"]/a/@href')
        loader.add_xpath("price", './/td[@class="toeOurPrice"]/a/text()')
        loader.add_value("sku", response.meta["sku"])

        if not loader.get_output_value("price"):
            continue
        # Keep the first priced result, then only strictly cheaper ones.
        if (cheapest is None
                or cheapest.get_output_value("price") > loader.get_output_value("price")):
            cheapest = loader

    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Yield the cheapest priced search result.

    The price is read from the "new" offer link's sibling price first and
    from the ``newPrice`` span second.  ``response.meta['sku']`` is used as
    sku and identifier.
    """
    selector = HtmlXPathSelector(response)
    results = selector.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
        loader.add_xpath(
            'price',
            './/*[@class="subPrice"]/a[contains(text(), "new")]'
            + '/following-sibling::*[@class="price"]/text()')
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])

        if not loader.get_output_value('price'):
            continue
        if (cheapest is None
                or cheapest.get_output_value('price') > loader.get_output_value('price')):
            cheapest = loader

    if cheapest:
        yield cheapest.load_item()
def parse_product(self, response):
    """Return the product item when a price could be read, else ``None``.

    Name and sku come from ``response.meta``; the price is the joined text
    of the "goods_price" block with surrounding whitespace and spaces
    removed.
    """
    selector = HtmlXPathSelector(response)
    price_parts = selector.select(
        u'//div[contains(@class, "goods_price")]/text()').extract()
    price = join(price_parts).strip().replace(" ", "")

    product_loader = ProductLoader(item=Product(), selector=selector)
    product_loader.add_value('name', response.meta["name"])
    product_loader.add_value('url', response.url)
    product_loader.add_value('price', price)
    product_loader.add_value('sku', response.meta["sku"])

    if not product_loader.get_output_value('price'):
        return
    return product_loader.load_item()
def parse_product(self, response):
    """Yield the product if its barcode matches, else try the next candidate.

    ``response.meta['sku']`` is the barcode being searched and
    ``response.meta['products']`` the remaining candidate URLs.  On a
    mismatch the first remaining URL is requested with the rest passed
    along.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@class="CB_box_prodview"]//h2/text()')
    loader.add_value('url', response.url)
    price_text = ''.join(
        selector.select('//div[@class="viewprod_price"]//text()').extract())
    loader.add_value('price', price_text)
    loader.add_xpath('sku', '//div[@class="viewprod_right"]//div/text()',
                     re='Barcode: (.*)')

    log.msg(loader.get_output_value('sku'))
    log.msg(response.meta['sku'])

    if loader.get_output_value('sku') == response.meta['sku']:
        yield loader.load_item()
        return

    remaining = response.meta['products']
    if remaining:
        next_meta = {'sku': response.meta['sku'], 'products': remaining[1:]}
        yield Request(urljoin_rfc(get_base_url(response), remaining[0]),
                      callback=self.parse_product,
                      meta=next_meta)
def parse_product(self, response):
    """Scrape name and price and yield the item with the searched sku.

    The name comes from the short-info heading link, falling back to the
    ``itemprop="name"`` heading.  The price's decimal comma is converted to
    a dot.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="productShortInfo"]/h3/a/text()')
    if not loader.get_output_value('name'):
        loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')

    raw_price = selector.select(
        u'//div[@class="price"]/strong/text()').extract()[0]
    loader.add_value('price', raw_price.replace(',', '.'))
    loader.add_value('sku', response.meta['sku'])
    yield loader.load_item()
def parse_node(self, response, node):
    """Turn one XML feed node into a product item.

    The sku is the second-to-last ``/``-separated segment of the product
    URL.  Returns ``None`` for non-XML responses, the loaded item when a
    price was parsed, and an empty ``Product`` otherwise.
    """
    if not isinstance(response, XmlResponse):
        return

    loader = ProductLoader(item=Product(), selector=node)
    product_url = node.select(u'./product-url/text()').extract()[0]
    loader.add_value('sku', product_url.split('/')[-2])
    loader.add_value('url', product_url)
    loader.add_xpath('name', u'./title/text()')

    raw_price = node.select(u'./price/text()').extract()[0]
    # The feed uses a decimal comma.
    loader.add_value('price', raw_price.replace(',', '.'))

    if not loader.get_output_value('price'):
        return Product()
    return loader.load_item()
def parse(self, response):
    """Scrape either a product page or the first matching search result.

    On a product page the item is built from the main heading and the
    ``#price`` span.  Otherwise the search results are scanned for a result
    whose "<brand> <product>" name contains every word of
    ``response.meta['name']`` (``+`` treated as a space).  Names containing
    "apparelsave" are never yielded.
    """
    base_url = get_base_url(response)
    selector = HtmlXPathSelector(response)
    page = selector.select('//div[@class="product-page"]')

    if page:
        loader = ProductLoader(item=Product(), selector=page)
        brand = page.select(
            './/h1[@class="main-heading standard-header"]/a/text()').extract()
        title = page.select(
            './/h1[@class="main-heading standard-header"]/text()').extract()
        if brand:
            price = "".join(
                page.select('.//span[@id="price"]/text()').re(r'([0-9\,\. ]+)')
            ).strip()
            loader.add_value('name', brand[0].strip() + ' ' + title[0].strip())
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            loader.add_value('sku', response.meta['sku'])
            if 'apparelsave' not in loader.get_output_value('name').lower():
                yield loader.load_item()
        return

    for result in selector.select('.//div[@id="searchResults"]/a'):
        brand = result.select('./span[@class="brandName"]/text()').extract()
        title = result.select('./span[@class="productName"]/text()').extract()
        if not (brand and title):
            continue

        product_name = brand[0].strip() + ' ' + title[0].strip()
        name_words = product_name.lower().strip().split(' ')
        wanted = response.meta['name'].lower().replace('+', ' ').split(' ')
        missing = [word for word in wanted if word not in name_words]
        if missing:
            continue

        price = "".join(
            result.select('./span[@class="price-6pm"]/text()').re(
                r'([0-9\,\. ]+)')
        ).strip()
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_value('name', product_name)
        loader.add_value(
            'url', urljoin_rfc(base_url, result.select('.//@href').extract()[0]))
        loader.add_value('price', price)
        loader.add_value('sku', response.meta['sku'])
        if 'apparelsave' not in loader.get_output_value('name').lower():
            yield loader.load_item()
            # NOTE(review): the source layout was flattened; the break is
            # assumed to sit right after the yield - confirm upstream.
            break
def parse_product(self, response):
    """Yield the product when its manufacturer part number matches the search.

    Nothing is yielded for "no results" pages, for pages where no part
    number could be scraped, or when the part number differs from
    ``response.meta['sku']`` (case-insensitive comparison).
    """
    hxs = HtmlXPathSelector(response)
    if hxs.select('//span[@id="totalNoResultsSlotAtTop"]'):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@id="headerContainer"]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'price',
        '//span[contains(@class, "mfProductDescriptionAndPrice")]/text()')
    loader.add_xpath(
        'sku',
        '//dt[text()="Manufacturer Part No:"]/following-sibling::dd/text()')

    sku = loader.get_output_value('sku')
    # A missing part number is treated as a mismatch instead of failing on
    # ``None.lower()``.
    if not sku or sku.lower() != response.meta['sku'].lower():
        return
    yield loader.load_item()
def parse_product(self, response):
    """Yield url, name and price of the product when a price was scraped."""
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    # name = hxs.select(u'//div[@id="productDetail"]//h1[@class="productDetailTitle"]/text()').extract()[0].strip()
    # options = hxs.select(u'//td[@id="optionProductList"]')
    # if options:
    #     name += u' %s' % hxs.select(u'//ul[@id="active"]/li/a/text()').extract()[0].strip()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//h1[@class="product-name"]/text()')
    loader.add_xpath(
        'price',
        u'//div[@class="p-prod-price"]/span/span[@class="price-alt"]/span/text()')

    if not loader.get_output_value('price'):
        # Pages without a price produce no item.
        return
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product scraped from the page heading and price span.

    Nothing is yielded when the heading is missing or when the name contains
    "apparelsave".  The sku is taken from ``response.meta['sku']``.
    """
    selector = HtmlXPathSelector(response)
    headings = selector.select(
        '//td[@style="padding-left:10px;"]/h1/text()').extract()
    loader = ProductLoader(item=Product(), response=response)
    if not headings:
        return

    price = "".join(
        selector.select(
            './/p[@class="productDesc"]/span[@class="price"]/text()'
        ).re(r'([0-9\,\. ]+)')
    ).strip()
    loader.add_value('name', headings[0].strip())
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    loader.add_value('sku', response.meta['sku'])
    if 'apparelsave' not in loader.get_output_value('name').lower():
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, using price 0 when no decimal price is found.

    The price is the first "digits.digits" match in the ``ourpricefeat``
    span.  The sku is taken from ``response.meta['sku']``.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//div[@class="headingbox"]/h1/text()')

    price_nodes = selector.select('//span[@class="ourpricefeat"]/text()')
    matches = price_nodes.re('(\d+(?:\.\d+))') if price_nodes else []
    if matches:
        product_loader.add_value('price', matches[0])
    if not product_loader.get_output_value('price'):
        # Fall back to zero so the item still carries a price.
        product_loader.add_value('price', 0)

    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()