def parse(self, response):
    """Collect priced search results and follow the first kept one.

    Every node matching ``div#atfResults`` / ``div[id starts with "result_"]``
    is loaded into a ``ProductLoader``.  A loader is kept only when it has a
    price and that price is lower than the price of the previously kept
    loader.

    Yields a single ``Request`` for the URL of the first kept loader
    (callback ``self.parse_mfrgids``, ``dont_filter=True``).  The loader
    itself and the remaining kept loaders are passed on through
    ``meta['cur_prod']`` and ``meta['next_prods']``.  Nothing is yielded when
    no result carries a price.

    Reads ``sku``, ``mfrgid`` and ``name`` from ``response.meta``.
    """
    selector = HtmlXPathSelector(response)
    result_nodes = selector.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    candidates = []
    for node in result_nodes:
        loader = ProductLoader(item=Product(), selector=node)

        # The title is either wrapped in a <span> or sits directly in the link.
        loader.add_xpath('name', './/h3/a/span/text()')
        if not loader.get_output_value('name'):
            loader.add_xpath('name', './/h3/a/text()')
        loader.add_xpath('url', './/h3/a/@href')

        # Try the "$..." text in the list first, then the "newPrice" block.
        loader.add_xpath('price', './/ul/li/a/span/text()', re='\$(.*)')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')

        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'].lower())

        price = loader.get_output_value('price')
        if not price:
            continue
        if cheapest is not None and not (cheapest.get_output_value('price') > price):
            continue
        cheapest = loader
        candidates.append(loader)

    if not candidates:
        return

    first = candidates[0]
    remaining = candidates[1:]
    target_url = first.get_output_value('url')
    request_meta = {'mfrgid': response.meta['mfrgid'],
                    'name': response.meta['name'],
                    'cur_prod': first,
                    'next_prods': remaining}
    yield Request(target_url, callback=self.parse_mfrgids, meta=request_meta, dont_filter=True)
def parse(self, response):
    """Scrape a product-list page and descend one menu level.

    Yields one loaded ``Product`` per ``td.Description_ProductList`` cell,
    followed by a ``Request`` for every link found ``level`` levels deep
    below the ``ProductMenu_Table`` navigation.  ``level`` comes from
    ``response.meta`` (default 1) and is passed on incremented by one.
    """
    selector = HtmlXPathSelector(response)

    for cell in selector.select(u'//td[@class="Description_ProductList"]'):
        loader = ProductLoader(item=Product(), selector=cell)
        loader.add_xpath('name', u'.//a/@title')

        raw_price = cell.select(u'../..//span[@class="Price_Productlist"]/text()').extract()[0]
        # rstrip() removes any trailing ' ', 'D' and 'K' characters; the
        # replaces then turn "1.234,50" into "1234.50".
        price = raw_price.strip().rstrip(' DKK').replace('.', '').replace(',', '.')
        # The placeholder text shown instead of a number is stored as 0.
        loader.add_value('price', 0 if price == u'Ring for pris!' else price)

        href = cell.select(u'.//a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    level = response.meta.get('level', 1)
    menu_xpath = (u'//table[@id="ProductMenu_Table"]/../'
                  + u'/'.join([u'table/tr/td'] * level)
                  + '/a/@href')
    for href in selector.select(menu_xpath).extract():
        yield Request(urljoin_rfc(get_base_url(response), href), meta={'level': level+1})
def parse(self, response):
    """Load the first search result, retrying when the page has none.

    The result node is ``td[@r="1"]`` or, failing that, ``table[@r="1"]``.
    When neither exists the same URL is requested again with
    ``dont_filter=True`` and ``meta['_retries']`` incremented; once
    ``_retries`` has reached 3 the method gives up and yields nothing.

    The item is yielded only when its name does not contain "apparelsave"
    (case-insensitive) and ``valid_price`` accepts the scraped price against
    ``response.meta['price']``.

    Reads ``sku`` and ``price`` from ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    product = hxs.select('//td[@r="1"]') or hxs.select('//table[@r="1"]')

    if not product:
        retries = response.meta.get('_retries', 0)
        if retries >= 3:
            return
        retry_meta = {'sku': response.meta['sku'], '_retries': retries + 1}
        # Carry the reference price over as well: the success path below
        # reads response.meta['price'], so a retry without it would end in a
        # KeyError as soon as the retried page contains a product.
        if 'price' in response.meta:
            retry_meta['price'] = response.meta['price']
        yield Request(response.url, meta=retry_meta, dont_filter=True)
        return

    loader = ProductLoader(item=Product(), selector=product)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    # Several page layouts are supported; each XPath adds a price candidate.
    loader.add_xpath('price', './/div[@class="prices"]//span[@class="amt"]/text()')
    loader.add_xpath('price', './/div[@class="prices"]//span[@class="g-b amt"]/text()')
    loader.add_xpath('price', './/td[@class="prc"]//div[@class="g-b"]/text()')
    loader.add_xpath('price', './/*[@itemprop="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])

    name = loader.get_output_value('name')
    if 'apparelsave' not in name.lower() \
            and valid_price(response.meta['price'], loader.get_output_value('price')):
        yield loader.load_item()
def parse(self, response):
    """Scrape an item listing and descend one navigation level.

    Responses whose URL is listed in ``self.junk_urls`` are ignored.
    Otherwise yields one loaded ``Product`` per ``div.item_wrapper`` and then
    a ``Request`` for every link found ``level`` ``ul/li`` levels below
    ``div#shopnav``, passing ``level + 1`` in ``meta``.
    """
    if response.url in self.junk_urls:
        return

    selector = HtmlXPathSelector(response)
    for wrapper in selector.select(u'//div[@class="item_wrapper"]'):
        loader = ProductLoader(item=Product(), selector=wrapper)
        loader.add_xpath('name', u'.//div[@class="name"]/a/text()')

        # Only the last text node of the price box is used.
        raw_price = wrapper.select(u'.//div[@class="price"]/text()[last()]').extract()[0]
        # lstrip() removes any leading 'K', 'r', '.' and ' ' characters; the
        # replaces then turn "1.234,50" into "1234.50".
        loader.add_value('price', raw_price.strip().lstrip('Kr. ').replace('.', '').replace(',', '.'))

        href = wrapper.select(u'.//div[@class="name"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    level = response.meta.get('level', 1)
    nav_xpath = u'//div[@id="shopnav"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for href in selector.select(nav_xpath).extract():
        yield Request(urljoin_rfc(get_base_url(response), href), meta={'level': level+1})
def parse_pagination(self, response):
    """Scrape one listing page and queue the other pages of the listing.

    Yields a loaded ``Product`` per ``div.listItem.clearfix`` and then
    requests (callback: this method) for every pager link that is neither
    the selected page nor a "Next 10" / "Previous 10" link, and finally for
    the "Next 10" link when present.
    """
    site_root = 'http://www.dv247.com/'
    selector = HtmlXPathSelector(response)

    for item in selector.select('//div[@class="listItem clearfix"]'):
        loader = ProductLoader(item=Product(), selector=item)
        # The name is the concatenation of all text inside the item's links.
        loader.add_value('name', ''.join(item.select('.//a//text()').extract()))
        href = item.select('.//a/@href')[0].extract()
        loader.add_value('url', urljoin_rfc(site_root, href))
        loader.add_xpath('price', './/li[@class="price"]/text()')
        yield loader.load_item()

    # Pagination: only the first pager block on the page is consulted.
    page_links = []
    next_ten = []
    pagers = selector.select('//div[@class="listPaging"]')
    if pagers:
        pager = pagers[0]
        next_ten = pager.select('.//a[text()="Next 10"]/@href').extract()
        page_links = pager.select('.//a[not(@class="selectedpage") and not(text()="Next 10") and not(text()="Previous 10")]/@href').extract()

    for href in page_links:
        yield Request(urljoin_rfc(site_root, href), callback=self.parse_pagination)

    if next_ten:
        yield Request(urljoin_rfc(site_root, next_ten[0]), callback=self.parse_pagination)
def parse_page(self, response):
    """Crawl category links, the next results page and the products shown.

    Yields, in this order:

    * a ``Request`` for every link in ``ul#nav``;
    * a ``Request`` for the pager's "next" link (taken from its
      ``data-query`` attribute), when present;
    * a loaded ``Product`` for every ``li`` inside ``div.products``.

    All requests call back into this method.
    """
    base_url = get_base_url(response)
    # Output processor that resolves a product URL against the page base.
    base_url_func = functools.partial(urljoin_rfc, base_url)
    # One selector serves the whole method; the document is parsed once.
    hxs = HtmlXPathSelector(response)

    for url in hxs.select("//ul[@id='nav']//a/@href").extract():
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

    # next page
    next_query = hxs.select("//div[@class='pagerLine']//a[@class='next']/@data-query").extract()
    if next_query:
        yield Request(urljoin_rfc(base_url, next_query[0]), callback=self.parse_page)

    # products
    for z in hxs.select("//div[@class='products']//li"):
        loader = ProductLoader(selector=z, item=Product())
        # Identifier and SKU are both the article number found in the URL.
        loader.add_xpath('identifier', "@data-product-url", first, re=r"articleNumber=(\d+)")
        loader.add_xpath('sku', "@data-product-url", first, re=r"articleNumber=(\d+)")
        loader.add_xpath('url', "@data-product-url", first, base_url_func)
        loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/span[@class='brand']/text()")
        loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/a[starts-with(@class, 'name')]/text()")

        # Prefer the text inside <ins>, else any text of the price paragraph.
        # A further fallback to the <del> text would be dead code: those
        # nodes are already part of "any text of the price paragraph".
        price = z.select(".//p[@class='price']/ins//text()") \
            or z.select(".//p[@class='price']//text()")
        price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
        loader.add_value('price', price)
        yield loader.load_item()
def parse_products(self, response):
    """Scrape the ``#products-list`` entries and follow the next-page link.

    For the price, three locations inside ``div.product-shop.left`` are
    tried in order; the matching selector node itself (not extracted text)
    is handed to the loader.  Afterwards a ``Request`` for the first link in
    ``div.right-nav.right`` is yielded, when there is one.
    """
    hxs = HtmlXPathSelector(response)

    for entry in hxs.select('//*[@id="products-list"]/li'):
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', 'div[@class="product-details left"]/h2/a/text()')

        matches = entry.select('div[@class="product-shop left"]/div/div/p/span/span/text()')
        if not matches:
            matches = entry.select('div[@class="product-shop left"]/div/div/span/text()')
        if matches:
            price = matches[0]
        else:
            # Last layout: a single match is the price, otherwise the second
            # match is used.
            matches = entry.select('div[@class="product-shop left"]/div/div/p/span/text()')
            price = matches[0] if len(matches) == 1 else matches[1]
        loader.add_value('price', price)

        loader.add_xpath('url', 'div[@class="product-details left"]/h2/a/@href')
        yield loader.load_item()

    next_links = hxs.select('//div[@class="right-nav right"]/a/@href').extract()
    if next_links:
        yield Request(next_links[0], callback=self.parse_products)
def parse_product(self, response):
    """Scrape a product page, fanning out into one request per option.

    Without option inputs the loaded item is yielded directly.  With
    options, a copy of the base item is created for each option, the option
    label is appended to its name, and a ``remote.php``
    ``GetVariationOptions`` request is yielded that carries the copy in
    ``meta['product']`` (callback ``self.parse_price``).
    """
    selector = HtmlXPathSelector(response)

    loader = ProductLoader(item=Product(), selector=selector)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//h2/text()')
    loader.add_xpath('price', u'//em[contains(@class,"ProductPrice")]/text()')
    loader.add_xpath('sku', u'//span[@class="VariationProductSKU"]/text()')
    loader.add_xpath('category', u'//div[@id="ProductBreadcrumb"]/ul/ul/li[2]/a/text()')
    loader.add_xpath('image_url', u'//div[@class="ProductThumbImage"]/a/img/@src')
    loader.add_xpath('brand', u'//div[@class="Value"]/a/text()')
    loader.add_value('shipping_cost', '')

    option_rows = selector.select(u'//div[@class="DetailRow"]//ul/li/label/input/../..')
    if not option_rows:
        yield loader.load_item()
        return

    product_id = selector.select(u'//input[@name="product_id"]/@value').extract()[0]
    base_item = loader.load_item()
    for row in option_rows:
        # The label is either the second text node next to the input or the
        # concatenation of the first two <span> elements.
        label = row.select(u'.//input/../text()[2]').extract()
        if not label:
            label = row.select(u'concat(.//input/../span[1]/text(),.//input/../span[2]/text())').extract()
        value = row.select(u'.//input/@value').extract()

        variant = Product(base_item)
        variant['name'] = (variant['name'] + ' ' + label[0].strip()).strip()

        endpoint = ('http://www.midwestunlimited.com/remote.php'
                    + '?w=GetVariationOptions&productId=' + product_id
                    + '&options=' + value[0])
        yield Request(endpoint, meta={'product': variant}, callback=self.parse_price)
def parse_products(self, hxs, response):
    """Scrape rows of an "orderinfo" table whose column order is not fixed.

    The positions of the "Model", "Description" and "Price" columns are
    derived from the header cells; product rows are the parents of every
    non-header cell in the Model column.  An item is yielded per row that
    has both a price and a non-blank name.  Its URL is the link in the Model
    cell when present, otherwise the page URL.

    :param hxs: selector for the page held by ``response``.
    :param response: the response being parsed.
    """
    # Debug output kept from the original; the call form is valid on both
    # Python 2 and Python 3.
    print(response.encoding)

    def column_position(header):
        # 1-based position of the header cell with the given text.
        return hxs.select('count(//td[starts-with(@class, "orderinfo")'
                          ' and text()="%s"]/preceding-sibling::*) + 1' % header).extract()

    model_pos = column_position('Model')
    description_pos = column_position('Description')
    price_pos = column_position('Price')
    if not (model_pos and description_pos and price_pos):
        return

    # Keep only the part before the dot of the extracted count (presumably
    # rendered like "3.0"), so it can be used in position() tests.
    model_pos = model_pos[0].split('.')[0]
    description_pos = description_pos[0].split('.')[0]
    price_pos = price_pos[0].split('.')[0]

    # Implicit concatenation instead of a backslash continuation inside the
    # literal, which leaked source indentation into the XPath expression.
    products = hxs.select('//td[starts-with(@class, "orderinfo") and position()=%s'
                          ' and not(text()="Model")]/..' % model_pos)
    for product in products:
        loader = ProductLoader(selector=product, item=Product())

        url = response.url
        model_url = product.select('.//td[starts-with(@class, "orderinfo")'
                                   ' and position()=%s]//a/@href' % model_pos).extract()
        if model_url:
            url = urljoin_rfc(get_base_url(response), model_url[0])
        loader.add_value('url', url)
        loader.add_xpath('name', './/td[starts-with(@class, "orderinfo") and position()=%s]/text()' % description_pos)
        loader.add_xpath('price', './/td[starts-with(@class, "orderinfo") and position()=%s]//text()' % price_pos)

        # A row without description text would otherwise fail on
        # None.strip(); treat it like a blank name and skip the row.
        name = loader.get_output_value('name')
        if not loader.get_output_value('price') or not (name and name.strip()):
            continue
        yield loader.load_item()
def parse_product(self, response):
    """Yield one product per ``div.Item`` found on a product page.

    Non-HTML responses are ignored.  Each item's name is the page heading
    (``td.ProductDetails/h1``) followed, when the item ``div`` has direct
    text, by a space and that text stripped.  The price comes from
    ``span.price`` or ``div.price/span`` inside the item.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    heading = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    # Fall back to an empty string when the heading is missing.  Keeping the
    # empty list here made the ``+=`` below extend that list character by
    # character instead of building a string.
    name = heading[0].strip() if heading else ''

    url = urljoin_rfc(get_base_url(response), response.url)

    for item in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)

        sku = ''.join(item.select('./text()').extract())
        n = name
        if sku:
            n += ' ' + sku.strip()
        loader.add_value('name', n)

        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse(self, response):
    """Scrape ``div.prelement`` products and descend one menu level.

    Yields one loaded ``Product`` per element, then a ``Request`` for every
    link found ``level`` ``ul/li`` levels below the parent of
    ``ul#pMenul0``, passing ``level + 1`` in ``meta`` (``level`` defaults
    to 1).
    """
    selector = HtmlXPathSelector(response)

    for element in selector.select(u'//div[@class="prelement"]'):
        loader = ProductLoader(item=Product(), selector=element)
        loader.add_xpath('name', u'.//a/text()')

        raw_price = element.select(u'.//p[@class="prpri"]/text()').extract()[0]
        # lstrip() removes any leading characters out of "Pris: DKK "; the
        # replaces then turn "1.234,50" into "1234.50".
        loader.add_value('price', raw_price.strip().lstrip('Pris: DKK ').replace('.', '').replace(',', '.'))

        href = element.select(u'.//a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    level = response.meta.get('level', 1)
    menu_xpath = u'//ul[@id="pMenul0"]/../' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for href in selector.select(menu_xpath).extract():
        yield Request(urljoin_rfc(get_base_url(response), href), meta={'level': level+1})
def parse_products(self, hxs, response):
    """Scrape the ``promoCell`` entries of a product list.

    Per cell:

    * without an "Our Price" value, a ``Request`` for the product URL is
      yielded (callback ``self.parse_products2``) and the cell is skipped;
    * when the "Our Price" text does not start with ``$`` and fewer than
      three re-fetches were made (``meta['ret']``), the page is requested
      again and parsing stops;
    * otherwise the loaded item is yielded.

    :param hxs: selector for the page held by ``response``.
    :param response: the response being parsed.
    """
    cells = hxs.select('//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]')
    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)

        # Join all text fragments of the title and collapse runs of spaces.
        fragments = [text.strip() for text in cell.select('.//p[@class="para1"]//text()').extract()]
        name = re.sub(' +', ' ', ' '.join(fragments))

        loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
        loader.add_value('name', name)
        loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
        loader.add_xpath('price', './/p[@class="para3"]/text()', re='Our Price: (.*)')

        if not loader.get_output_value('price'):
            yield Request(loader.get_output_value('url'), callback=self.parse_products2)
            continue

        shown_price = cell.select('.//p[@class="para3"]/text()').re('Our Price: (.*)')[0]
        attempts = response.meta.get('ret', 0)
        if not shown_price.startswith('$') and attempts < 3:
            yield Request(response.url, dont_filter=True, meta={'ret': attempts + 1})
            return

        yield loader.load_item()
def parse(self, response):
    """Scrape product rows, then follow sub-categories and the next page.

    A row with a ``products_qty`` input is yielded as an item; a row without
    one is followed via ``self.parse_sub``.  Sub-category links are taken
    ``level`` ``ul/li`` levels below ``div.box-content`` and requested with
    ``level + 1``; the "page-next" link is requested with the current
    ``level``.
    """
    selector = HtmlXPathSelector(response)

    for row in selector.select(u'//tr[contains(@class,"product-item")]'):
        loader = ProductLoader(item=Product(), selector=row)
        loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')

        raw_price = row.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
        # Turn "1.234,50" into "1234.50".
        loader.add_value('price', raw_price.strip().replace('.', '').replace(',', '.'))

        href = row.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
        url = urljoin_rfc(get_base_url(response), href)
        loader.add_value('url', url)

        # If quantity field is not present on page, there are subproducts
        if row.select(u'.//input[@name="products_qty"]').extract():
            yield loader.load_item()
        else:
            yield Request(url, callback=self.parse_sub)

    level = response.meta.get('level', 1)
    menu_xpath = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for href in selector.select(menu_xpath).extract():
        yield Request(urljoin_rfc(get_base_url(response), href), meta={'level': level+1})

    next_links = selector.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_links:
        yield Request(urljoin_rfc(get_base_url(response), next_links[0]), meta={'level': level})
def parse_product(self, response):
    """Yield the single product described inside ``div#ProductDetails``.

    Name, price and SKU are read from the details block; the URL is the
    response URL.
    """
    details = '//div[@id="ProductDetails"]'
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath("name", details + '//h2/text()')
    loader.add_value("url", response.url)
    loader.add_xpath("price", details + '//em[contains(@class,"ProductPrice")]/text()')
    loader.add_xpath("sku", details + '//span[contains(@class,"VariationProductSKU")]/text()')
    item = loader.load_item()
    yield item
def parse(self, response):
    """Yield one item per ``div.webapp_shophome_3col_spotlight`` block.

    Two layouts that differ by one nesting level are supported.  The first
    layout whose name XPath matches supplies name, price and, when a link is
    found, the URL.  An item is yielded for every block, even when neither
    layout matches.
    """
    site_root = 'http://www.virginmobile.com/vm/'
    # (name XPath, price XPath, link XPath) per layout, tried in order.
    layouts = (
        ('div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
         'div/div/div/div/div/div/p/span/text()',
         'div/div/div/div/div/p/a/@href'),
        ('div/div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
         'div/div/div/div/div/div/div/p/span/text()',
         'div/div/div/div/div/div/p/a/@href'),
    )

    selector = HtmlXPathSelector(response)
    for block in selector.select('//div[@class="webapp_shophome_3col_spotlight"]'):
        loader = ProductLoader(item=Product(), selector=block)
        for name_xpath, price_xpath, link_xpath in layouts:
            if not block.select(name_xpath):
                continue
            loader.add_xpath('name', name_xpath)
            loader.add_xpath('price', price_xpath)
            link = block.select(link_xpath)
            if link:
                loader.add_value('url', urljoin_rfc(site_root, link.extract()[0], response.encoding))
            break
        yield loader.load_item()
def parse(self, response):
    """Yield the cheapest acceptable search result, if any.

    Every ``div#atfResults`` result is loaded; a result replaces the current
    best when it has a price, that price is lower than the best one so far,
    and ``valid_price`` accepts it against ``response.meta['price']``.
    ``response.meta['sku']`` is used as both SKU and identifier.
    """
    selector = HtmlXPathSelector(response)
    results = selector.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath("name", './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
        loader.add_xpath("url", './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
        loader.add_xpath("price", './/*[@class="newPrice"]//span/text()')
        loader.add_value("sku", response.meta["sku"])
        loader.add_value("identifier", response.meta["sku"])

        price = loader.get_output_value("price")
        if not price:
            continue
        if best is not None and not (best.get_output_value("price") > price):
            continue
        if not valid_price(response.meta["price"], price):
            continue
        best = loader

    if best:
        yield best.load_item()
def parse_product(self, response):
    """Scrape a product page and re-submit the form once per option.

    Non-HTML responses are ignored.  When the page has an option
    ``<select>`` and this response is not itself the result of such a
    re-submission (``'requested'`` absent from ``meta``), one
    ``FormRequest`` per option is yielded.  The product of the current page
    is yielded too, named after the heading plus the selected option's text,
    provided a price is found.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    name = selector.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
    option_nodes = selector.select(u'//select[@class="mpv_itemalst"]//option')

    if option_nodes and u'requested' not in response.meta:
        for node in option_nodes:
            # Post the form back with the option chosen as event target.
            payload = {u'ctl00$MainContent$ItemAList': node.select(u'./@value').extract()[0],
                       u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                       u'__EVENTARGUMENT': u''}
            yield FormRequest.from_response(response,
                                            formname=u'aspNetForm',
                                            formdata=payload,
                                            meta={u'requested': True},
                                            dont_click=True,
                                            callback=self.parse_product)

    if option_nodes:
        selected_text = option_nodes.select(u'../option[@selected]/text()').extract()[0]
        name += u' %s' % selected_text.strip()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    # Offer price first, regular price as fallback.
    loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')

    if not loader.get_output_value('price'):
        return
    yield loader.load_item()
def parse_products(self, response):
    """Queue the next listing page and scrape the products of this one.

    The "right-arrow" link, when present, is requested first (its URL is
    built by ``self._get_products_url``).  Each ``div.details`` inside
    ``#center-main`` then yields an item; a product without a price node
    gets the price ``"0.0"``.
    """
    selector = HtmlXPathSelector(response)

    arrow = selector.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
    if arrow:
        next_url = self._get_products_url(response, arrow[0].extract())
        yield Request(next_url, callback=self.parse_products)

    for details in selector.select('//div[@id="center-main"]//div[@class="details"]'):
        loader = ProductLoader(item=Product(), selector=details)
        loader.add_xpath("name", "a/text()")
        loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

        price_nodes = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
        loader.add_value("price", price_nodes[0].extract() if price_nodes else "0.0")

        href = details.select("a/@href")[0].extract()
        loader.add_value("url", urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()
def parse_page(self, response):
    """Scrape the items of a page and continue crawling.

    After yielding one item per ``div.item``: when the page has pagination
    links, the last one is followed if ``self._is_next`` accepts it;
    when it has none, every ``#refineCat`` sidebar link is followed instead.
    """
    site_root = 'http://www.dolphinmusic.co.uk/'
    selector = HtmlXPathSelector(response)

    for item in selector.select('//div[@class="item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_xpath('name', 'h2/a/text()')
        href = item.select('h2/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(site_root, href, response.encoding))
        loader.add_xpath('price', 'div[@class="pricing"]/p[@class="price"]/text()')
        yield loader.load_item()

    page_links = selector.select('//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
    if page_links:
        last_link = page_links[-1]
        if self._is_next(last_link):
            yield Request(urljoin_rfc(site_root, last_link, response.encoding), callback=self.parse_page)
    else:
        for href in selector.select('//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract():
            yield Request(urljoin_rfc(site_root, href, response.encoding), callback=self.parse_page)
def parse_product(self, response):
    """Yield one product per combination of the page's option selects.

    Every ``<select>`` inside ``div.input-box`` contributes a list of
    ``(label, surcharge)`` pairs, parsed from option texts of the form
    ``"label +$1,234.00"``; the first option of each select is skipped.
    ``multiply`` supplies the ``(name suffix, surcharge)`` pairs that are
    iterated.  For each pair the base product is loaded afresh, the suffix
    is appended to its name and the surcharge is added to its price.
    """
    hxs = HtmlXPathSelector(response)
    opt_groups = []

    def fix_options(o):
        # ``o`` is an option text split on '$': [label] or [label, amount].
        # Only the missing amount is an expected failure, so catch exactly
        # that instead of using a bare ``except``.
        try:
            return (o[0], o[1].replace(',', ''))
        except IndexError:
            return (o[0], '0')

    for option in hxs.select(u'//div[@class="input-box"]//select'):
        opt_list = option.select(u'./option[position() != 1]/text()').extract()
        # Normalise "+$" to "$" so that a single split separates label and amount.
        opt_list = [o.replace('+$', '$').split('$') for o in opt_list]
        opt_groups.append([fix_options(o) for o in opt_list])

    for opt_name, opt_price in multiply(opt_groups):
        # A fresh loader per combination: the loaded item is modified below.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h2[@class="title"]/text()')
        product_loader.add_xpath('price', u'//span[contains(@class,"sale-price")]/text()')
        product_loader.add_xpath('sku', u'substring-after(//span[contains(@class,"meta-sku")]/text(),":")')
        product_loader.add_xpath('category', u'//ul[@class="breadcrumb"]/li[2]/a/@title')
        product_loader.add_xpath('image_url', u'//div[@class="teaser-large"]/img/@src')
        product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
        product_loader.add_value('shipping_cost', '')

        product = product_loader.load_item()
        product['name'] = (product['name'] + ' ' + opt_name).strip()
        product['price'] = product['price'] + Decimal(opt_price)
        yield product
def parse_products(self, response):
    """Scrape a product listing rendered either as a grid or as a table.

    Grid cells (``div.grid-25`` inside ``#area-2``) are used when present;
    otherwise the ``tr.prd.first`` table rows are scraped.  Prices are
    passed through ``self._encode_price``.  An entry without a price is
    skipped.
    """
    hxs = HtmlXPathSelector(response)

    products = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
    if products:
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('url', 'div/h3/a/@href')
            # Prefer the full title from <abbr title="...">, else the link text.
            if product.select('div/h3/a/abbr/@title'):
                loader.add_xpath('name', 'div/h3/a/abbr/@title')
            else:
                loader.add_xpath('name', 'div/h3/a/text()')
            price = product.select('div/div/p[@class="prd-amount"]/strong/text()').extract()
            if not price:
                continue
            loader.add_value('price', self._encode_price(price[0]))
            yield loader.load_item()
        return

    for product in hxs.select('//*[@id="area-2"]//tr[@class="prd first"]'):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'td/h3/a/@href')
        loader.add_xpath('name', 'td/h3/a/text()')
        price = (product.select('td/p/strong/text()').extract()
                 or product.select('td/div/p/strong/text()').extract())
        if not price:
            # Previously ``price`` kept the value of the preceding row here,
            # or was unbound on the first row; skip the row instead.
            continue
        loader.add_value('price', self._encode_price(price[0]))
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product described by the page's ``itemprop`` markup.

    The name is read from ``h1[itemprop="name"]``, the price from
    ``span[itemprop="price"]``; the URL is the response URL.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=selector)
    product_loader.add_value('url', response.url)
    for field, xpath in (('name', '//h1[@itemprop="name"]/text()'),
                         ('price', '//span[@itemprop="price"]/text()')):
        product_loader.add_xpath(field, xpath)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    The SKU is the text of the cell that follows the "Manufacturer ID" cell
    inside the specification block.
    """
    sku_xpath = ('//div[starts-with(@class, "specificationContent")]'
                 + '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')

    product_loader = ProductLoader(response=response, item=Product())
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
    product_loader.add_xpath('price', '//li[@class="price"]//text()')
    product_loader.add_xpath('sku', sku_xpath)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page with a "label: price" text.

    The price text is taken from ``#productdesc/font/font``, dots are
    removed and commas become dots, and only the part after the last colon
    is kept.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//*[@id="header"]/text()')
    product_loader.add_value('url', response.url)

    fragments = selector.select('//*[@id="productdesc"]/font/font/text()').extract()
    price_text = ''.join(fragments)
    # Turn "1.234,50" into "1234.50".
    price_text = price_text.replace('.','').replace(',','.')
    if price_text:
        price_text = price_text.split(':')[-1]
    product_loader.add_value('price', price_text)

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    The SKU is the stripped text of the element that contains the
    "Model #:" label, excluding the label's own ``<b>`` element.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(response=response, item=Product())
    product_loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')

    model_texts = selector.select('//b[contains(text(), "Model #:")]/../text()').extract()
    product_loader.add_value('sku', ''.join(model_texts).strip())

    yield product_loader.load_item()
def parse_products(self, hxs, response):
    """Yield one item per product found through its ``h3.product_name``.

    The container of a product is the grandparent of the heading; name and
    link come from the heading, the price from ``p.price``.

    :param hxs: selector for the page held by ``response``.
    :param response: the response being parsed.
    """
    for container in hxs.select('//h3[@class="product_name"]/../..'):
        product_loader = ProductLoader(item=Product(), selector=container)
        product_loader.add_xpath('name', './/h3[@class="product_name"]/a/text()')

        href = container.select('.//h3[@class="product_name"]/a/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(get_base_url(response), href))

        product_loader.add_xpath('price', './/p[@class="price"]/text()')
        yield product_loader.load_item()
def parse(self, response):
    """Load the product for a known page URL.

    ``self.products`` maps page URLs to SKUs.  Returns the loaded
    ``Product`` when ``response.url`` is one of its keys, otherwise
    ``None``.  This method returns a value rather than yielding one.
    """
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if response.url not in self.products:
        return None

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', self.products[response.url])
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
    loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
    return loader.load_item()
def parse(self, response):
    """Yield the product described by the page's ``itemprop`` markup.

    The name is read from ``h1[itemprop="name"]``, the price from any
    element with ``itemprop="price"``; the URL is the response URL.
    """
    # The loader works on the response directly, so no separate selector or
    # base URL is computed (both were unused).
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//*[@itemprop="price"]/text()')
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    Name and price are read from the ``prodTITLE`` / ``prodDETAILS``
    blocks; the SKU comes from ``response.meta['sku']``.
    """
    # The loader works on the response directly, so no separate selector or
    # base URL is computed (both were unused).
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//div[@id="prodTITLE"]//h1/text()')
    product_loader.add_xpath('price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product described inside ``div#productDetail``.

    The SKU is the number that follows "Ref. Code: " in the first
    paragraph of the detail block.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(response=response, item=Product())
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', '//div[@id="productDetail"]//h1[@class="productDetailTitle"]/text()')
    product_loader.add_xpath('price', '//div[@id="productDetail"]//span[contains(@class,"price")]/text()')

    first_paragraph = selector.select('//div[@id="productDetail"]//p[1]')[0]
    product_loader.add_value('sku', first_paragraph.re('Ref\. Code: (\d+)'))

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, identified by its feed row.

    ``response.meta['row']['PRODUCT_NUMBER']`` supplies both identifier and
    SKU.  Categories are the breadcrumb texts without the first and the last
    entry.  The price is the ``<span>`` text inside the aside heading or,
    failing that, the heading's own text.
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    row = response.meta['row']

    product_loader = ProductLoader(response=response, item=Product())
    product_loader.add_value('identifier', row['PRODUCT_NUMBER'])
    product_loader.add_value('sku', row['PRODUCT_NUMBER'])
    product_loader.add_xpath('brand', '//div[@class="product-detail-logo"]/a/img/@alt')

    breadcrumbs = selector.select('//ul[@id="breadcrumbs"]/li/a/text()').extract()
    product_loader.add_value('category', breadcrumbs[1:-1])

    product_loader.add_xpath('name', '//h1[@class="content-title"]/text()')

    price = selector.select('//article[@class="aside-content"]/h2/span/text()').extract()
    if not price:
        price = selector.select('//article[@class="aside-content"]/h2/text()').extract()
    product_loader.add_value('price', price)

    product_loader.add_value('url', response.url)

    images = selector.select('//div[@class="product-detail-feature-img"]/a/img/@src').extract()
    if images:
        image_url = urljoin_rfc(base_url, images[0])
    else:
        image_url = ''
    product_loader.add_value('image_url', image_url)

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield a product and one extra product per priced custom option.

    The base product (name, price, identifier/SKU, image, categories, URL)
    is always yielded.  When the page has ``product-custom-option`` selects,
    every option -- or, with several selects, every combination of one
    option per select -- that has a non-zero surcharge is yielded as a copy
    of the base product with the option appended to identifier and name and
    the surcharge added to the price.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    name = hxs.select(
        '//div[@class="product-name"]/span/text()').extract()[0].strip()
    identifier = hxs.select('//input[@name="product"]/@value').extract()[0]
    price = hxs.select(
        '//form[@id="product_addtocart_form"]//span[@class="price"]/text()'
    ).extract()
    price = extract_price(price[0])
    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    image_url = hxs.select('//img[@id="image-main"]/@src').extract()
    image_url = image_url[0] if image_url else ''
    loader.add_value('image_url', image_url)
    # The first breadcrumb entry is not used as a category.
    categories = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[1:]
    loader.add_value('category', categories)
    loader.add_value('url', response.url)
    product = loader.load_item()
    options_containers = hxs.select(
        '//select[contains(@class, "product-custom-option")]')
    if options_containers:
        # Each entry: {'desc': name suffix, 'identifier': id suffix,
        # 'price': surcharge}.
        options = []
        if len(options_containers) > 1:
            # Several selects: collect (id, name, price) tuples per select,
            # then build every combination of one option per select.
            combined_options = []
            for options_container in options_containers:
                element_options = []
                for option in options_container.select(
                        'option[@value!=""]'):
                    option_id = option.select('@value').extract()[0]
                    # The option name is the text before the "+<pound sign>".
                    option_name = option.select(
                        'text()').extract()[0].split(u'+\xa3')[0].strip()
                    option_price = option.select('text()').re('(\d+.\d+)')
                    option_price = extract_price(
                        option_price[0]) if option_price else 0
                    option_attr = (option_id, option_name, option_price)
                    element_options.append(option_attr)
                combined_options.append(element_options)
            combined_options = list(itertools.product(*combined_options))
            for combined_option in combined_options:
                # Accumulate name suffix, identifier suffix and surcharge
                # over all options of the combination.
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' ' + option[1]
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option[0]
                    # NOTE(review): option[2] is already the result of
                    # extract_price (or 0) from the loop above -- confirm
                    # that extract_price accepts its own output.
                    final_option['price'] = final_option.get(
                        'price', 0) + extract_price(option[2])
                options.append(final_option)
        else:
            # Single select: one entry per option; the surcharge is the text
            # inside "(+...)".
            for option in options_containers.select('option[@value!=""]'):
                final_option = {}
                final_option['desc'] = ' ' + option.select(
                    'text()').extract()[0].split('(+')[0].strip()
                final_option['identifier'] = '-' + option.select(
                    '@value').extract()[0]
                option_price = option.select('text()').re('\(\+(.*)\)')
                final_option['price'] = extract_price(
                    option_price[0]) if option_price else 0
                options.append(final_option)
        # The base product first, then one copy per option with a surcharge.
        yield product
        for option in options:
            if not option['price']:
                continue
            option_product = deepcopy(product)
            option_product['identifier'] = option_product[
                'identifier'] + option['identifier']
            option_product[
                'name'] = option_product['name'] + option['desc']
            option_product[
                'price'] = option_product['price'] + option['price']
            option_product['sku'] = option_product['identifier']
            yield option_product
    else:
        yield product
def parse_product_list(self, response):
    """Scrape a product listing and follow its paging links.

    Per ``div.product-wrapper`` an item is yielded with:

    * ``sku``: the longest run of digits, commas and dots in the name;
    * ``shipping_cost``: ``'49'`` when the first collected price is below
      1500;
    * ``stock``: ``0`` when there is no "buy-button" link.

    Every ``a.paging-link-box`` link is then requested with this method as
    callback.
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for wrapper in selector.select('//div[@class="product-wrapper"]'):
        loader = ProductLoader(item=Product(), selector=wrapper)

        name = wrapper.select('.//h3//text()').extract()[0]
        loader.add_value('name', name)

        # Keep the first longest number-like token of the name.
        sku = ''
        for token in re.findall(r"([\d,\.]+)", name):
            if len(token) > len(sku):
                sku = token
        loader.add_value('sku', sku)

        images = wrapper.select('./div[@class="product-image"]//img/@data-original').extract()
        if images:
            loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

        raw_price = wrapper.select('./div[@class="product-price"]//span[@class="price-amount"]/text()').extract()[0]
        # strip(' Kr') removes any ' ', 'K' and 'r' characters at both ends.
        price_text = raw_price.strip().strip(' Kr').replace('.', '')
        loader.add_value('price', extract_price(price_text))

        collected_prices = loader.get_collected_values('price')
        if collected_prices and collected_prices[0] < 1500:
            loader.add_value('shipping_cost', '49')

        if not wrapper.select('./div[@class="product-buttons"]/a[@class="buy-button"]'):
            loader.add_value('stock', 0)

        href = wrapper.select('./div[@class="product-buttons"]/a[@class="button-info"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, href))

        identifier = wrapper.select('./div[@class="product-name"]//@data-productid').extract()[0]
        loader.add_value('identifier', identifier)

        yield loader.load_item()

    for href in selector.select('//a[@class="paging-link-box"]/@href').extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse_product_list)
def parse_product(self, response):
    """Build a single ``Product`` from a product detail page.

    No shipping cost found on the page, so none is set.

    The category is taken from ``response.meta['category']`` (empty string
    when absent).  When no price can be extracted the item gets price
    ``'0.0'`` and stock ``'0'``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Pages without a "Manufacturer" row previously raised IndexError here
    # and the whole product was lost; fall back to an empty brand instead.
    brand = hxs.select(
        '//table[@id="product-attribute-specs-table"]//th[@class="label" and contains(text(), "Manufacturer")]/following-sibling::*/text()'
    ).extract()
    brand = brand[0] if brand else ''

    loader = ProductLoader(response=response, item=Product())

    # Prefer the "price-to" element; otherwise take any price-including-tax
    # element inside the product box.
    price = hxs.select(
        '//div[@class="productBox"]//div[@class="price-box"]/p[@class="price-to"]/span[@class="price-including-tax"]/span[@class="price"]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            '//div[@class="productBox"]//div[@class="price-box"]//span[@class="price-including-tax"]/span[@class="price"]/text()'
        ).extract()
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('price', '0.0')
        loader.add_value('stock', '0')

    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('brand', brand)
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

    image_url = hxs.select('//img[@id="zoom"]/@src').extract()
    if not image_url:
        image_url = hxs.select('//a[@id="ma-zoom1"]/@href').extract()
    loader.add_value('image_url', image_url)

    loader.add_value('category', response.meta.get('category', ''))
    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, or delegate to ``parse`` for listing pages.

    If the response contains product-title links inside a result list it is
    treated as a listing and everything ``self.parse(response)`` yields is
    re-yielded.  Otherwise a single ``Product`` is built; the identifier
    (also used as SKU) is the part of the last URL path segment before the
    first ``-``.
    """
    hxs = HtmlXPathSelector(response)

    products = hxs.select(
        '//form//div[contains(@class, "resultList")]/article'
        '//*[contains(@class, "productTitle")]/a/@href').extract()
    if products:
        for x in self.parse(response):
            yield x
        return

    price = hxs.select(
        '//div[@class="row"]//span[@class="currentPrice"]/ins[@itemprop="price"]/text()'
    ).extract()
    # Last matching price node wins; a missing price becomes "0.0".
    price = price[-1] if price else "0.0"

    identifier = response.url.split('/')[-1].split('-')[0]

    # Only a missing node (IndexError on [0]) is an expected failure here,
    # so catch exactly that instead of a bare ``except``.
    try:
        main_name = hxs.select(
            '//span[@itemprop="name"]/text()').extract()[0].strip()
    except IndexError:
        main_name = ''
    try:
        brand = hxs.select(
            '//span[@itemprop="brand"]/text()').extract()[0].strip()
    except IndexError:
        brand = ''
    product_name = brand + ' ' + main_name

    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    stock = hxs.select(
        '//div[contains(@class, "availability")]/div/strong[contains(@class, "available")]/i[@class="icon-ok"]'
    )

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('brand', brand)
    loader.add_value('price', extract_price(price))
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('image_url', image_url)

    # The first breadcrumb entry is skipped.
    categories = hxs.select(
        '//div[@class="breadcrumb"]/ul/li/a/span/text()').extract()[1:]
    for category in categories:
        loader.add_value('category', category.encode(response.encoding))

    # No "available" check-mark icon is recorded as zero stock.
    if not stock:
        loader.add_value('stock', 0)

    shipping_cost = hxs.select(
        '//div/strong[@class="weee"]/text()').extract()
    if shipping_cost:
        shipping_cost = extract_price(shipping_cost[0])
        loader.add_value('shipping_cost', shipping_cost)

    yield loader.load_item()
def parse_product(self, response):
    """Yield one ``Product`` per ``<option>`` of the product option selector.

    Each item's identifier is ``<product_id>_<option value>``.  Options whose
    ``data-quantity`` is ``'0'`` get stock 0, and a shipping cost of 19 is
    added when the parsed price is below 300.
    """
    hxs = HtmlXPathSelector(response)

    # The brand is the text following ' par' in the inspiration line.
    # Previously a line without ' par' raised IndexError; now it yields ''.
    brand = ''
    brand_text = hxs.select(
        "//div[@id='fiche-produit-description-inspiration']/text()").extract()
    if brand_text:
        pieces = brand_text[0].split(' par')
        if len(pieces) > 1:
            brand = pieces[1].strip()

    product_name = ''.join(hxs.select(
        '//*[@id="fiche-produit-description-titre1"]/text()').extract()).strip()
    img = hxs.select('//*[@id="product-main-image"]/@src').extract()
    # Resolve the image URL once; it is identical for every option.
    image_url = urljoin_rfc(get_base_url(response), img[0]) if img else None
    # The last breadcrumb entry is dropped.
    category = hxs.select(
        '//ul[@class="breadcrumb"]//span[@itemprop="title"]/text()').extract()[:-1]
    product_identifier = hxs.select(
        '//input[@name="product_id"]/@value').extract()[0]

    for option in hxs.select('//*[@id="product-option-selector"]//option'):
        loader = ProductLoader(item=Product(), selector=hxs)

        # Collapse multi-line option labels and drop the out-of-stock tag.
        name = option.select('./text()').extract()[0].strip()
        name = ' '.join(s.strip() for s in name.split('\n'))
        name = name.replace('(Hors stock)', '').strip()
        if name != '':
            name = product_name + ' - ' + name

        price = option.select('./@data-price').extract()[0].replace(
            u'\u20ac', '').strip()
        price = extract_price_eu(price)
        identifier = option.select('./@value').extract()[0]

        loader.add_value('identifier', product_identifier + '_' + identifier)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('brand', brand)
        loader.add_value('price', price)

        stock = option.select('./@data-quantity').extract()[0]
        if stock == '0':
            loader.add_value('stock', 0)
        if image_url is not None:
            loader.add_value('image_url', image_url)
        if price < 300:
            loader.add_value('shipping_cost', 19)
        loader.add_value('category', category)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per variant when variant data exists.

    Common fields (name, url, image, brand, categories, sku) are loaded once
    into a base item.  If a script on the page contains
    ``ekmProductVariantData`` each variant with a non-empty ``selector``
    becomes its own item, identified by ``<PRODUCT_NUMBER>-<item1 value>``.
    Otherwise a single item is yielded whose price is the ``itemprop=price``
    content multiplied by 1.2 (or 0.00 when absent).

    ``response.meta['row']['PRODUCT_NUMBER']`` must be present.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name_xpath = '//div[contains(@class,"product-info")]//h1[@id="product-name"]/span[@itemprop="name"]/text()'
    product_number = response.meta.get('row').get('PRODUCT_NUMBER')

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', name_xpath)
    loader.add_value('url', response.url)
    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])
        loader.add_value('image_url', image_url)
    loader.add_value('brand', 'Stickers & Gadgets')
    # The first breadcrumb entry is skipped.
    for category in hxs.select(
            '//span[contains(@itemtype,"Breadcrumb")]/a/span/text()'
    )[1:].extract():
        loader.add_value('category', category)
    loader.add_value('sku', product_number)
    item = loader.load_item()

    reg = re.compile('ekmProductVariantData.+?(\{.+\})', re.DOTALL)
    options = hxs.select('//script/text()').re(reg)
    if options:
        options = options[0].replace('\r\n', '')
        # Cut the trailing 'item8...' portion before evaluating the literal.
        options = re.sub(".'item8.+?}}}}", "}}", options)
        # SECURITY(review): eval() executes text scraped from a remote page.
        # Kept as-is because the payload format is not visible here; it
        # should be replaced by a safe parser (json / ast.literal_eval) once
        # the exact format is confirmed.
        options = eval(options)
        for option in options['items']:
            if not option['selector']:
                continue
            # Load into a copy of the base item: passing the same ``item``
            # to every loader made all yielded variants share (and
            # overwrite) one object.
            loader = ProductLoader(item=Product(item), selector=hxs)
            loader.add_xpath('name', name_xpath)
            for attr in option['selector']:
                loader.add_value('name', attr['value'])
            identifier = product_number + '-' + option['properties']['item1']['value']
            loader.add_value('identifier', identifier)
            loader.add_value('price',
                             option['properties']['item3']['innerHTML'])
            yield loader.load_item()
    else:
        loader.add_value('identifier', product_number)
        price = hxs.select(
            '//div[contains(@class,"product-info")]//span[@itemprop="price"]/@content'
        ).extract()
        if price:
            price = format_price(Decimal(price[0]) * Decimal('1.2'))
        else:
            price = Decimal('0.00')
        loader.add_value('price', price)
        yield loader.load_item()
def parse_operator(self, response):
    """Yield one product per tariff row of a handset's price-plan table.

    Every product carries a ``TelecomsMeta`` built from the row and from
    ``response.meta`` (``device_name``, ``operator``).  If a "next page"
    arrow link is present it is followed with the same meta.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    base_url = get_base_url(response)

    tariff_rows = hxs.select(
        '//table[contains(@class, "price-plans")]/tr[td[contains(@class, "col")]]'
    )
    # Whitespace-normalised handset title; its first word is the brand.
    handset_name = ' '.join(
        hxs.select('//h3[@class="handset-name"]/text()').extract()[0].split())

    # These values do not depend on the individual tariff row.
    contract_months = u'24'
    network = '4G' if 'generation=4G' in response.url else '3G'
    image_srcs = hxs.select(
        '//span[@class="handset-image"]/img/@src').extract()

    for row in tariff_rows:
        loader = ProductLoader(selector=row, item=Product())

        cell_texts = row.select(
            'td[@class="col1" or @class="col2" or @class="col3" or @class="col4"]/child::*/text()'
        ).extract()
        tariff_name = ' '.join(' '.join(cell_texts).split())
        monthly_cost = row.select(
            'td[contains(@class, "col7")]/h4/text()').extract()[0]
        tariff_code = row.select(
            'td[contains(@class, "col7")]/div/form/input[@name="packageCode"]/@value'
        ).extract()[0]

        loader.add_value('identifier', tariff_code)
        loader.add_value('name',
                         response.meta['device_name'] + ' - ' + tariff_name)
        loader.add_value('url', response.url)
        loader.add_value('brand', handset_name.split()[0])
        loader.add_value(
            'price',
            row.select('td[contains(@class, "col6")]/h4/text()').extract())
        if image_srcs:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_srcs[0]))
        product = loader.load_item()

        metadata = TelecomsMeta()
        metadata['device_name'] = meta['device_name']
        # The pound sign is removed from the monthly cost.
        metadata['monthly_cost'] = monthly_cost.replace(u'\xa3', '')
        metadata['tariff_name'] = tariff_name
        metadata['contract_duration'] = contract_months
        metadata['operator'] = meta['operator']
        # NOTE(review): ``channel`` is not defined in this function; it is
        # presumably a module-level name - confirm.
        metadata['channel'] = channel
        metadata['network_generation'] = network
        product['metadata'] = metadata
        yield product

    next_page = hxs.select(
        '//a[i[contains(@class, "i-right-arrow-white")] and contains(@href, "page")]/@href'
    ).extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_operator, meta=meta)
def parse_product(self, response):
    """Yield the product, or one item per price option, at most once each.

    Pages without a brand/name, marked "currently not available", or without
    a price are logged (where applicable) and skipped.  Every yielded
    identifier is recorded in ``self.identifiers`` and identifiers already
    recorded are not yielded again.
    """
    # Only a missing node (IndexError on [0]) is expected here, so catch
    # exactly that rather than using a bare ``except``.
    try:
        brand_name = response.xpath(
            '//span[@class="manufacturer"]/text()').extract()[0]
        name = response.xpath(
            '//div[@id="product-box"]//div[@class="title"]/text()'
        ).extract()[0].strip()
    except IndexError:
        self.log('No brand or name found: %s' % response.url)
        return

    if response.xpath(
            '//div[@class="no-valid-variants" and contains(text(), "this item is currently not available")]'
    ):
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', brand_name + ' ' + name)

    sku = response.xpath(
        '////div[@class="additional-product-no"]/@data-xencoded').extract()
    if sku:
        sku = sku[0]
        h = HTMLParser.HTMLParser()
        # The attribute has the form "<int key>:<html-escaped payload>".
        key, data = sku.split(':', 1)
        key = int(key)
        data = h.unescape(data)
        # XOR decoding: every character is XOR-ed with the integer key.
        data = [ord(c) ^ key for c in data]
        data = ''.join([chr(c) for c in data])
        sku = re.search('Manufacturer Item no\. (.*)', data)
        if sku:
            sku = sku.group(1)
            product_loader.add_value('sku', sku)

    identifier = response.xpath(
        '//input[@name="vw_id"]/@value').extract()[0]
    product_loader.add_value('identifier', identifier)

    price = response.xpath(
        '//div[@class="current-price"]/span[@class="price"]/text()'
    ).extract()
    if not price:
        price = response.xpath(
            '//table[@class="product-price"]//tr[@class="price"]/td/text()'
        ).extract()
    if price:
        price = price[0]
        product_loader.add_value('price', extract_price_eu(price))
    else:
        self.log('No product price found: %s' % response.url)
        return

    # The last breadcrumb link is used as the category.
    category = response.css('.uk-breadcrumb a::text').extract()[-1]
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand_name.strip())

    # The image is optional: a missing <img> simply leaves image_url unset.
    try:
        image_url = response.urljoin(
            response.xpath('//img[@itemprop="image"]/@src').extract()[0])
        product_loader.add_value('image_url', image_url)
    except IndexError:
        pass

    product = product_loader.load_item()

    options = response.xpath(
        '//div[contains(@id,"artikel_element_prices")]')
    if options:
        for opt in options:
            # Each option starts from a copy of the base product.
            p = Product(product)
            optname = opt.xpath(
                './/meta[@itemprop="name"]/@content').extract()[0]
            p['name'] = optname
            p['price'] = extract_price(
                opt.xpath('.//meta[@itemprop="price"]/@content').extract()[0])
            p['identifier'] = p['identifier'] + '-' + opt.xpath('@id').re(
                'artikel_element_prices(.*)')[0]
            if p['identifier'] not in self.identifiers:
                self.identifiers.append(p['identifier'])
                yield p
    else:
        if product['identifier'] not in self.identifiers:
            self.identifiers.append(product['identifier'])
            yield product
def parse_product(self, response):
    """Yield one item for a simple product, or one per "Options" entry.

    The brand comes from ``response.meta['brand']``.  ``self._products``
    maps an identifier to the name and URL it was first seen with; later
    items with the same identifier reuse that name and URL.
    """

    def reuse_first_seen(item):
        # Try to solve the "same product" issue (same identifier but a
        # different name and url): keep the first name/url seen.  The
        # original author notes the lower price is then collected.
        key = item['identifier']
        if key in self._products:
            item['name'] = self._products[key]['name']
            item['url'] = self._products[key]['url']
        else:
            self._products[key] = {
                'name': item['name'],
                'url': item['url'],
            }
        return item

    image_url = response.xpath(
        '//div[@class="main-image"]/img/@src').extract()
    if image_url:
        image_url = response.urljoin(image_url[-1])
    category = response.xpath(
        u'//ol[@id="breadcrumbs"]/li/a/span[@itemprop="title"]/text()'
    ).extract()
    category = category[0] if category else ''
    brand = response.meta['brand']

    multiple_prices = response.xpath(
        '//label[text()="Options"]/../select/option')

    if not multiple_prices:
        identifier = response.xpath('//input[@name="sku"]/@value').extract()
        if not identifier:
            return
        identifier = identifier[0]

        price = response.xpath(
            '//div[@class="price"]/span[@class="text"]/text()').re(r'[\d\.,]+')
        if not price:
            price = response.xpath(
                '//div[@class="price"]/span[@class="text"]//text()'
            ).re(r'[\d\.,]+')

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//div[@class="name"]/h1/text()')
        if image_url:
            product_loader.add_value('image_url', image_url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('price', price)
        yield reuse_first_seen(product_loader.load_item())
        return

    # The page title is the same for every option; read it once.
    base_name = response.xpath('//div[@class="name"]/h1/text()').extract()[0]
    for name_and_price in multiple_prices:
        product_loader = ProductLoader(item=Product(),
                                       selector=name_and_price)
        name = base_name + ' ' + name_and_price.xpath(
            'text()').extract()[0].strip()

        # Options without a data-sku attribute are skipped (this was a bare
        # ``except`` around the [0] lookup).
        opt_id = name_and_price.xpath('@data-sku').extract()
        if not opt_id:
            continue
        opt_id = opt_id[0]

        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value('image_url', image_url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', opt_id)

        price = name_and_price.xpath('@data-price').extract()
        if not price:
            price = name_and_price.xpath(
                u'./td[position()=2]/p[@class="now-table"]/text()').extract()
        if not price:
            price = name_and_price.xpath(
                u'.//*[@itemprop="price"]/text()').extract()
        product_loader.add_value('price', price)

        # Options flagged with data-stock="0" are not yielded.
        if name_and_price.xpath('@data-stock').extract() == ['0']:
            continue

        yield reuse_first_seen(product_loader.load_item())
def parse_product(response):
    """Yield the product, or one item per row of the grouped-product table.

    Base fields are read from the hidden ``nosto_product`` block.  When the
    page has a "basket-below" cart button, every row of
    ``#super-product-table`` with an ``<input>`` becomes its own item;
    otherwise a single item is yielded.
    """

    def build_item(name, identifier, sku, price):
        # Populate the fields shared by the variant and single-product paths.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('category', category)
        loader.add_value('url', response.url)
        loader.add_value('image_url', response.urljoin(image_url))
        loader.add_value('price', price)
        return loader.load_item()

    nosto = '//div[@class="nosto_product"]/span[@class="%s"]/text()'
    identifier = response.xpath(nosto % 'product_id').extract_first()
    name = response.xpath(nosto % 'name').extract_first()
    price = response.xpath(nosto % 'price').extract_first()
    category = response.xpath(nosto % 'category').extract_first()
    # extract_first() returns None when the span is missing; previously that
    # raised AttributeError on .split().  The first path segment is dropped.
    category = category.split('/')[1:] if category else []
    image_url = response.xpath(nosto % 'image_url').extract_first()

    variations = response.xpath('//a[@class="button btn-cart basket-below"]')
    if variations:
        variations = response.xpath('//*[@id="super-product-table"]/tbody/tr')
        for variant in variations:
            # Append every option cell's text except the literal 'Yes'.
            o_name = name
            for option in variant.xpath(
                    './td[@fil-id!=""]/span/text()').extract():
                if option != 'Yes':
                    o_name += ' ' + option

            o_id = variant.xpath('.//input/@name').extract_first()
            if not o_id:
                continue
            # Drop the 'super_group[' prefix and the final character.
            o_id = o_id.replace('super_group[', '')[:-1]
            o_sku = variant.xpath(
                './/span[@class="sku"]/text()').extract_first()
            o_price = variant.xpath(
                './/span[@class="break-price"]/text()').extract_first()

            yield build_item(o_name, o_id, o_sku, o_price)
    else:
        sku = response.xpath(
            '//span[@class="product-ids"]/text()').extract_first()
        if sku:
            sku = sku.replace('Item code: ', '')
        yield build_item(name, identifier, sku, price)
def parse_product(self, response):
    """Yield every product on the page whose description mentions "lego".

    Nothing is yielded when the page contains ``div#ProductContainer9``.
    Products without an identifier or without "lego" in their name
    (case-insensitive) are skipped.
    """
    hxs = HtmlXPathSelector(response)
    breadcrumb = u' > '.join(hxs.select(
        u'//div[contains(@class,"Breadcrumbs")]/nobr//span/text()').extract())

    if hxs.select('//div[@id="ProductContainer9"]'):
        return

    base_url = get_base_url(response)
    for node in hxs.select(
            u'//form[@name="productForm"]//div[@itemscope="itemscope"]'):
        loader = ProductLoader(item=Product(), selector=node)

        # Identifier: numeric part of the title element id, falling back to
        # the productID meta tag.
        id_candidates = node.select(
            u'.//*[@itemprop="name"]/@id').re(u'ProductTitle-P(\d+)')
        if not id_candidates:
            id_candidates = node.select(
                u'.//meta[@itemprop="productID"]/@content').extract()
        if not id_candidates:
            continue
        loader.add_value('identifier', id_candidates[0])

        # SKU: first run of digits in the productID meta tag, if any.
        product_id_text = node.select(
            u'.//meta[@itemprop="productID"]/@content')[0].extract()
        digits = re.search(u'(\d+)', product_id_text)
        sku = digits.group(1) if digits else digits
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)

        title = ''.join(node.select(
            u'.//div[contains(@id,"ProductIntroduction-P")]//text()'
        ).extract()).strip()
        if not title or 'lego' not in title.lower():
            continue
        loader.add_value('name', title)

        # Convert "1.234,56" style prices to "1234.56".
        raw_price = node.select(
            u'.//meta[@itemprop="price"]/@content').extract()
        if raw_price:
            price = raw_price[0].strip().replace('.', '').replace(',', '.')
        else:
            price = '0.00'
        loader.add_value('price', price)
        loader.add_value('category', breadcrumb)

        thumbs = node.select(
            'div/div//a[contains(@id, "ProductThumbnailImage")]/img/@src'
        ).extract()
        if not thumbs:
            thumbs = node.select(
                u'.//a[contains(@id,"ProductThumbnail")]/img/@src').extract()
        if thumbs:
            loader.add_value('image_url', urljoin_rfc(base_url, thumbs[0]))

        loader.add_value('brand', 'lego')
        yield loader.load_item()
def parse_product(self, response):
    """Build one ``Product`` with ``StaplesMeta`` from a product page.

    The SKU is the text after ':' in the "Fornecedor" list item, falling
    back to the ``lblRefereniaMBS`` label.  Prices use the "1.234,56"
    format and are normalised to "1234.56".  A shipping cost of 3.00 is
    added for non-zero prices up to 48.99.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    identifier = hxs.select(
        '//div[contains(@class,"skuinfo")]//input[@id="hdfProduto"]/@value'
    ).extract()
    sku = hxs.select(
        '//li[contains(text(),"Fornecedor")]//text()').extract()
    if sku:
        sku = re.search(':(.*)', re.sub('[\r\n\t]', '', sku[0])).group(1)
    else:
        sku = hxs.select(
            '//label[@id="lblRefereniaMBS"]/text()')[0].extract()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)

    name = hxs.select(
        '//label[@id="lblTituloProduto"]/text()').extract()[0].strip()
    # If adding the raw name fails it is retried after decoding as UTF-8.
    # ``except Exception`` (instead of a bare ``except``) lets
    # KeyboardInterrupt/SystemExit propagate.
    try:
        loader.add_value('name', name)
    except Exception:
        loader.add_value('name', name.decode('utf-8', 'replace'))

    category = hxs.select('//div[@class="n03"]//a/text()').extract()
    loader.add_value('category', ' > '.join(category[:3]))

    image_url = hxs.select('//img[@id="productimage-0"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    brand = hxs.select('//li[contains(text(),"Marca")]//text()').extract()
    if brand:
        brand = re.search(':(.*)',
                          re.sub('[\r\n\t]', '', brand[0])).group(1)
        loader.add_value('brand', brand)

    loader.add_value('url', response.url)

    price = hxs.select(
        '//label[@id="MainContent_ucPreco_lblProdutoPrecoFinal"]/text()'
    ).extract()
    price = price[0].replace('.', '').replace(
        ',', '.').strip() if price else '0.00'
    loader.add_value('price', price)

    out_of_stock = hxs.select('//span[@class="stock-red"]')
    if out_of_stock:
        loader.add_value('stock', 0)

    price = loader.get_output_value('price')
    if price:
        price = Decimal(price)
        # Compare Decimal with Decimal: mixing Decimal and float relies on
        # interpreter-specific comparison rules.
        if price <= Decimal('48.99'):
            loader.add_value('shipping_cost', '3.00')

    metadata = StaplesMeta()
    metadata['exclusive_online'] = 'Yes' if hxs.select(
        '//label[@id="lblTituloProduto"]/font[contains(text(),"Exclusivo Internet")]'
    ) else ''
    delivery_time = hxs.select(
        '//label[@id="lblEntregaPrevista"]/text()').extract()
    metadata['delivery_time'] = delivery_time[0] if delivery_time else ''
    promotion_price = hxs.select(
        '//label[@id="MainContent_ucPreco_lblPrecoProdutoAntes"]/text()'
    ).extract()
    # Same price normalisation as above, with the euro sign removed.
    metadata['promotion_price'] = promotion_price[0].replace(
        '.', '').replace(',', '.').replace(u'\u20ac',
                                           '') if promotion_price else ''

    product = loader.load_item()
    product['metadata'] = metadata
    yield product
def parse_searchanise(self, response):
    """Turn one page of Searchanise JSON results into products.

    A response without an ``items`` key is logged and re-requested up to
    five times (tracked in ``response.meta['retries']``).  Otherwise each
    entry with a ``product_code`` becomes a product, and when the page was
    non-empty the next page is requested at ``offset + 99``.
    """
    payload = json.loads(response.body)

    try:
        entries = payload['items']
    except KeyError:
        self.log('Wrong response: {}'.format(str(payload)))
        attempt = response.meta.get('retries', 0)
        if attempt < 5:
            # NOTE(review): time.sleep blocks the calling thread for 60
            # seconds; if this callback runs on the crawler's main thread
            # all other work pauses as well - confirm this is intended.
            time.sleep(60)
            yield Request(response.url,
                          dont_filter=True,
                          callback=self.parse_searchanise,
                          meta={
                              'offset': response.meta['offset'],
                              'retries': attempt + 1
                          })
        return

    for entry in entries:
        code = entry['product_code']
        if not code:
            continue

        loader = ProductLoader(item=Product(), selector=HtmlXPathSelector())
        loader.add_value('identifier', code)
        loader.add_value('sku', code)

        # Keep at most two digits after the first decimal point.
        price = entry['price']
        if '.' in price:
            pieces = price.split('.')
            price = '.'.join((pieces[0], pieces[1][:2]))
        loader.add_value('price', price)

        loader.add_value('name', entry['title'])
        loader.add_value('url', entry['link'])
        loader.add_value('stock', '1')
        yield loader.load_item()

    if entries:
        next_meta = {'offset': response.meta['offset'] + 99}
        next_url = self.searchanise_url.format(self.searchanise_api,
                                               next_meta['offset'])
        yield Request(next_url, meta=next_meta,
                      callback=self.parse_searchanise)
def parse_product(response):
    """Yield one ``Product`` per entry of the size/colour ``<select>``.

    All variants share the page's name, price, image, brand and the last
    three breadcrumb entries; identifier and SKU are
    ``<page sku>_<option value>``.
    """
    name = response.xpath('//h1[@class="name"]/text()').extract()[0]
    identifier = response.xpath(
        '//meta[@itemprop="sku"]/@content').extract()[0]
    image_url = response.xpath('//*[@id="zoom_01"]/@src').extract()
    category = response.xpath(
        '//*[@id="wayProd"]//a/span/text()').extract()[-3:]
    price = response.xpath(
        '//*[@id="total_dinamic"]/span/text()').extract()[0]
    price = extract_price(price)

    options = response.xpath(
        '//*[@id="datesBuy"]//select[@name="talla_color"]/option')
    for option in options:
        product_loader = ProductLoader(item=Product(), selector=option)

        # Use .xpath() like the rest of this function; .select() is the
        # deprecated spelling of the same call.
        p_name = option.xpath('./text()').extract()[0]
        # An option labelled '- ' keeps the bare product name.
        p_name = name if p_name == '- ' else name + ' ' + p_name
        p_identifier = option.xpath('./@value').extract()[0]

        product_loader.add_value('identifier',
                                 identifier + '_' + p_identifier)
        product_loader.add_value('name', p_name)
        product_loader.add_value('sku', identifier + '_' + p_identifier)
        if image_url:
            product_loader.add_value('image_url',
                                     response.urljoin(image_url[0]))
        product_loader.add_value('price', price)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', 'CamelBak')
        product_loader.add_value('url', response.url)

        # Do not rebind the loop variable to the loaded item.
        yield product_loader.load_item()
def parse_category(self, response):
    """Process one search/category page and schedule the next one.

    For every product row:

    * with a textual price: yield the item directly when its SKU is cached
      in ``self.product_info`` (keyed by product id or URL), otherwise
      request the product page (``self.parse_product``);
    * with an image price: POST the image URL to ``self.ocr_url``
      (``self.parse_price``).

    Rows missing a required node (IndexError) are skipped.  Afterwards the
    next page is requested when a "next" link exists or more than 190 rows
    were found; otherwise the current page is re-requested up to three
    times.
    """
    base_url = get_base_url(response)
    selector = HtmlXPathSelector(response)
    rows = selector.select(self.products_xpath)
    self.log('{} products found'.format(len(rows)))

    for row in rows:
        loader = ProductLoader(selector=row, item=Product())
        title = row.select(
            './/td[@class="product-title-wrap"]/a/text()').extract()
        if not title:
            continue
        loader.add_value('name', title)
        loader.add_value('stock', 1)

        try:
            href = row.select(
                './/td[@class="product-title-wrap"]/a/@href').extract()[0]
            url = urljoin_rfc(base_url, href)
            loader.add_value('url', url)
            product_id = row.select(
                './/input[contains(@name, "[product_id]")]/@value'
            ).extract()[0]

            price_nodes = row.select('.//span[@class="price-num"]/text()')
            if not price_nodes:
                # The price is only available as an image: send it to OCR.
                price_image = row.select(
                    './/span[@class="price"]//img/@src').extract()[0]
                ocr_params = {
                    'url': price_image,
                    'resize': 200,
                    'blur': 1,
                    'mode': '7',
                    'format': 'float'
                }
                prev_price = self.product_info.get(product_id, {}).get('price') \
                    or self.product_info.get(url, {}).get('price')
                yield Request(self.ocr_url,
                              method="POST",
                              body=urlencode(ocr_params),
                              meta={
                                  'loader': loader,
                                  'product_id': product_id,
                                  'price_image': price_image,
                                  'prev_price': prev_price
                              },
                              callback=self.parse_price,
                              dont_filter=True)
                continue

            loader.add_value('price', ''.join(price_nodes.extract()))

            if product_id in self.product_info or url in self.product_info:
                cached = self.product_info.get(
                    product_id) or self.product_info.get(url)
                loader.add_value('identifier', cached['sku'].upper())
                loader.add_value('sku', cached['sku'])
                self.products += 1
                yield loader.load_item()
            else:
                # get_url is evaluated before get_proxy, as before.
                target = self.get_url(loader.get_output_value('url'))
                yield Request(target,
                              callback=self.parse_product,
                              cookies={},
                              meta={
                                  'proxy': self.get_proxy(),
                                  'loader': loader,
                                  'product_id': product_id,
                                  'dont_merge_cookies': True
                              })
        except IndexError:
            continue

    next_links = selector.select(
        '//div[@id="pagination_contents"]//a[@name="pagination"][contains('
        '@class, "next")]/@href').extract()
    retries = response.meta.get('retries', 0)

    if len(next_links) > 0 or len(rows) > 190:
        # Advance to the following page.
        page = int(response.meta.get('page', 1)) + 1
        extra_meta = {}
    elif retries < 3 and not (response.status == 200 and next_links
                              and len(rows)):
        # Re-request the same page, counting the attempt.
        page = int(response.meta.get('page', 1))
        extra_meta = {'retries': retries + 1}
    else:
        return

    next_url = add_or_replace_parameter(self.search_url, 'page', str(page))
    target = self.get_url(next_url)
    request_meta = {
        'proxy': self.get_proxy(),
        'dont_merge_cookies': True,
        'page': page
    }
    request_meta.update(extra_meta)
    yield Request(target,
                  callback=self.parse_category,
                  cookies={},
                  dont_filter=True,
                  meta=request_meta)
def parse_product(self, response):
    """Build one ``Product`` from a product page's microdata.

    The shipping cost is 29 when the ``itemprop=price`` value is below 295,
    else 0.  When a "price before" element exists, that value becomes the
    item price and the microdata price is stored as
    ``metadata['SalesPrice']``.
    """
    loader = ProductLoader(item=Product(), response=response)

    sku = response.xpath('//span[@itemprop="sku"]/text()').extract()[0]
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)
    loader.add_xpath('brand', '//span[@itemprop="manufacturer"]/text()')

    # The name is the heading plus the (optional) description.
    title = response.xpath(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()
    description = ''.join(response.xpath(
        '//h2[@itemprop="description"]/text()').extract()).strip()
    full_name = title + ' ' + description if description else title
    loader.add_value('name', full_name)
    loader.add_value('url', response.url)

    current_price = extract_price(
        response.xpath('//*[@itemprop="price"]/@content').extract_first())
    # Shipping is decided on the current (possibly discounted) price.
    shipping_cost = 29 if current_price < 295 else 0

    before_text = response.css(
        '.product-main-info .product-price-before::text').extract_first()
    if before_text:
        sales_price = current_price
        item_price = extract_price_eu(before_text)
    else:
        sales_price = None
        item_price = current_price
    loader.add_value('price', item_price)

    image_srcs = response.xpath(
        '//div[@class="swiper-slide"]/img/@data-src').extract()
    if image_srcs:
        image_url = response.urljoin(image_srcs[0])
    else:
        image_url = ''
    loader.add_value('image_url', image_url)

    # Categories: titles of the last three breadcrumb links, minus the
    # first 'Forsiden' entry if present.
    crumbs_json = response.css(
        'nav.breadcrumbs::attr(data-initobject)').extract_first()
    links = json.loads(crumbs_json)['model']['links'][-3:]
    categories = []
    for link in links:
        categories.append(link['title'])
    if 'Forsiden' in categories:
        categories.remove('Forsiden')
    loader.add_value('category', categories)

    loader.add_value('shipping_cost', shipping_cost)

    item = loader.load_item()
    if sales_price:
        item['metadata'] = {'SalesPrice': sales_price}
    yield item
def parse_product(self, response):
    """Yield the base product, then one extra item per size option.

    Sizes are read from the ``Product.Config(...)`` script inside the
    options wrapper.  Each size item reuses the base price and gets
    ``<sku>-<size>`` as SKU and the same with spaces removed as identifier.
    """
    hxs = HtmlXPathSelector(response)
    page_url = response.url

    title = hxs.select(
        '//div[@class="product-essential"]//div[@class="product-name"]/h1/text()'
    ).extract()[0]
    sku_text = ''.join(hxs.select(
        u'//div[@class="product-essential"]//div[@class="product-name"]//span[@class="sku"]/text()'
    ).extract())
    base_sku = sku_text.replace('Vare:', '').strip()

    # The regular price is preferred; the special price is the fallback.
    price_texts = hxs.select(
        "//div[@class='product-essential']//span[@class='regular-price']/span[@class='price']//text()"
    ).extract()
    price_texts += hxs.select(
        "//div[@class='product-essential']//p[@class='special-price']/span[@class='price']//text()"
    ).extract()
    # Convert "1.234,56" to "1234.56".
    price = price_texts[0].strip().replace('.', '').replace(',', '.')

    def build(name, sku, identifier):
        # Every item shares the page URL and price.
        item_loader = ProductLoader(item=Product(), selector=hxs)
        item_loader.add_value('name', name)
        item_loader.add_value('sku', sku)
        item_loader.add_value('identifier', identifier)
        item_loader.add_value('url', page_url)
        item_loader.add_value('price', price)
        return item_loader.load_item()

    yield build(title, base_sku, base_sku)

    options_wrapper = hxs.select(
        '//div[@class="product-essential"]//div[@id="product-options-wrapper"]'
    )
    if not options_wrapper:
        return
    config = re.search('Product.Config\((.+?)\);',
                       options_wrapper.extract()[0])
    if not config:
        return
    size_block = re.search('rrelse","options":(.+?)]}],', config.group(1))
    if not size_block:
        return

    for size in re.findall('label":"([^"]+)"', size_block.group(1)):
        yield build(title + ' - ' + size,
                    base_sku + '-' + size,
                    base_sku + '-' + size.replace(' ', ''))
def parse(self, response):
    """Yield one ``Product`` per row of the CSV feed in ``response.body``.

    Note the column mapping: ``identifier`` comes from the ``sku`` column
    and ``sku`` from ``Unique product code``.
    """
    # (item field, CSV column) pairs whose values are UTF-8 decoded as-is.
    leading_columns = (
        ('url', 'Product page URL'),
        ('name', 'Product name'),
        ('image_url', 'Image URL'),
        ('identifier', 'sku'),
        ('sku', 'Unique product code'),
    )
    trailing_columns = (
        ('category', 'Category'),
        ('brand', 'Brand'),
    )

    for record in csv.DictReader(StringIO(response.body)):
        loader = ProductLoader(item=Product(), response=response)

        for field, column in leading_columns:
            loader.add_value(field, record[column].decode('utf-8'))

        # The price is round-tripped through float to normalise its text.
        price_text = record['Price'].decode('utf-8')
        loader.add_value('price', str(float(price_text)))

        for field, column in trailing_columns:
            loader.add_value(field, record[column].decode('utf-8'))

        # The shipping cost is passed through without decoding.
        loader.add_value('shipping_cost', record['Shipping cost'])

        yield loader.load_item()
def parse(self, response):
    """Follow category and pager links and yield the products listed here.

    Every loaded item's URL is appended to ``self._urls``.  An item is only
    yielded when its identifier is new to ``self.ids`` or its price differs
    from the one stored there; ``self.ids`` is updated accordingly.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # Navigation links first (categories), then pagination links.
    nav_links = hxs.select('//ul[@id="nav"]//li/a/@href').extract()
    pager_links = hxs.select('//div[@class="pager"]//a/@href').extract()
    for link in nav_links + pager_links:
        yield Request(urljoin_rfc(base_url, link))

    # Products list.
    tiles = hxs.select("//li[contains(@class, 'item')]")
    if not tiles:
        logging.error("ERROR!! NO PRODUCTS!! %s " % response.url)

    for tile in tiles:
        names = tile.select('.//h2[@class="product-name"]/a/text()').extract()
        if not names:
            continue
        # Tiles carrying a "can't buy online" notice are skipped.
        if tile.select(
                'div/div[@class="cant_buy_online"]/p/text()').extract():
            continue
        name = names[0]

        hrefs = tile.select('.//h2[@class="product-name"]/a/@href').extract()
        if not hrefs:
            logging.error("ERROR!! NO URL!! %s %s" % (response.url, name))
            continue
        url = urljoin_rfc(base_url, hrefs[0])

        price_texts = tile.select('.//span[@class="price"]/text()').extract()
        if not price_texts:
            logging.error("ERROR!! NO PRICE!! %s %s" % (response.url, name))
            continue
        # The last price node on the tile is used.
        price = extract_price(price_texts[-1])

        # Identifier: id in the buy-now link, else in the price span id.
        ids = tile.select(u'.//div[@class="buy-now"]/a').re(
            r'/product/(\d+)/form_key')
        if not ids:
            ids = tile.select(
                u'.//span[contains(@id, "product-price")]/@id').re(
                    r'product-price-(\d+)')
        if not ids:
            continue
        identifier = ids[-1]

        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_value('identifier', identifier)
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_xpath(
            'image_url', u'.//a[contains(@class, "product-image")]//img/@src')
        item = loader.load_item()

        self._urls.append(item['url'])

        already_seen_at_price = (identifier in self.ids
                                 and price == self.ids[identifier])
        if not already_seen_at_price:
            self.ids[identifier] = price
            yield item
def parse_product(self, response):
    """Build one item per attribute combination and request stock data.

    Scrapes identifier, name, price, image, breadcrumb categories and brand
    from the product page, expands the attribute fieldset into every
    combination of option values and builds a ``Product`` for each
    combination. A single ``Request`` to the inventory endpoint is yielded,
    carrying all built items in ``meta['items']`` with ``self.parse_stock``
    as callback.
    """
    identifier = response.xpath(
        "//div[@class='item-number']/text()").extract_first()
    # The SKU keeps the raw item number; the identifier has every 'a'/'A'
    # removed from it.
    sku = identifier
    identifier = re.sub(u'a', u'', identifier, flags=re.IGNORECASE)
    name = response.xpath(
        "//div[@class='product-title']/h1/text()").extract_first().strip()
    # Prefer the discounted price, fall back to the regular one, and use
    # '0.00' when neither is present.
    price = response.xpath(
        "//div[@class='price']//span[@class='disc-price']/text()").extract(
        )
    if not price:
        price = response.xpath(
            "//div[@class='price']/div[@class='regular-price']/span[@class]/text()"
        ).extract()
    if price:
        price = price[0].strip('$').replace(",", "")
    else:
        price = '0.00'
    price = Decimal(price)
    # convert using xe.com
    image_url = response.xpath(
        "//a[@id='mainImage']/img/@src").extract_first()
    # First and last breadcrumb entries are dropped.
    categories = response.xpath(
        '//div[@id="breadcrumbs-"]/ul/li/a//text()')[1:-1].extract()
    try:
        brand = response.xpath(
            '//b[contains(., "BRAND:")]/following-sibling::text()[1]'
        ).extract_first().title()
    except AttributeError:
        # extract_first() returned None: no brand text on the page.
        brand = ''
    attributes = response.xpath('//fieldset[@class="attributes"]//li')
    options = []
    # Maps an option's @value to its displayed label.
    option_names = {}
    for option in response.xpath(
            '//select[@name="attrValue_1"]/option[@value!=""]'):
        opt_val = option.xpath('./@value').extract()
        opt_name = option.xpath('./span/text()').extract()
        if opt_val and opt_name:
            option_names[opt_val[0]] = opt_name[0]
    # For each attribute collect its (name, value) choices: the <select>
    # options when present, otherwise the single input value.
    for attr in attributes:
        attr_name = attr.xpath(
            './/input[@name="attrName_1"]/@value').extract()
        if attr_name:
            attr_name = attr_name[0]
        else:
            continue
        attr_options = []
        attr_values = attr.xpath(
            './/select/option[@value!=""]/@value').extract()
        for attr_value in attr_values:
            attr_options.append((attr_name, attr_value))
        if not attr_values:
            attr_value = attr.xpath(
                './/input[@name="attrValue_1"]/@value')[0].extract()
            attr_options.append((attr_name, attr_value))
        if attr_options:
            options.append(attr_options)
    # Cartesian product: one tuple of (name, value) pairs per combination.
    options = itertools.product(*options)
    items = []
    for option in options:
        opt = [option_names.get(v, '') for _, v in option]
        opt = [o for o in opt if o]
        option_name = ' '.join(opt).strip()
        # Labels are mapped through SIZES_DICT before building the id suffix.
        opt = [SIZES_DICT.get(o.lower(), o) for o in opt if o]
        option_id = ':'.join(opt).strip()
        option_name = re.sub('size', '', option_name,
                             flags=re.IGNORECASE).strip()
        # The size label is taken from the value of the last pair.
        size = option_names.get(option[-1][-1],
                                '') if option and option[-1] else ''
        size = re.sub('size', '', size, flags=re.IGNORECASE).strip()
        if option_name:
            product_name = name + ' (' + option_name + ')'
        else:
            product_name = name
        if option_id:
            product_identifier = identifier + u':' + option_id.strip(
            ).lower()
        else:
            product_identifier = identifier
        loader = ProductLoader(Product(), option)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', sku)
        loader.add_value('price', price)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        for category in categories:
            loader.add_value('category', category)
        product = loader.load_item()
        product['metadata'] = {
            'size': SIZES_DICT.get(size.lower(), size).title()
        }
        # Match entries of self.players whose element [1] occurs in the
        # product name; only the first match is used.
        player = [
            p for p in self.players if p[1].lower() in product_name.lower()
        ]
        if player:
            product['metadata']['player'] = player[0][1].title()
            product['metadata']['number'] = player[0][2]
        item = {'item': product}
        item['attributes'] = ()
        for k, v in option:
            item['attributes'] += ((k, v), )
        items.append(item)
    # NOTE(review): `options` is an itertools.product object at this point,
    # which is always truthy, so this branch looks unreachable. With no
    # attributes the loop above still runs once with an empty tuple.
    # Confirm before relying on this fallback.
    if not options:
        loader = ProductLoader(Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('price', price)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        for category in categories:
            loader.add_value('category', category)
        product = loader.load_item()
        product['metadata'] = {}
        player = [p for p in self.players if p[1].lower() in name.lower()]
        if player:
            product['metadata']['player'] = player[0][1].title()
            product['metadata']['number'] = player[0][2]
        item = {'item': product}
        item['attributes'] = ()
        # The first two attrName_1/attrValue_1 inputs on the page are used.
        item['attributes'] += ((
            response.xpath(
                '//input[@name="attrName_1"]/@value')[0].extract(),
            response.xpath(
                '//input[@name="attrValue_1"]/@value')[0].extract()), )
        item['attributes'] += ((
            response.xpath(
                '//input[@name="attrName_1"]/@value')[1].extract(),
            response.xpath(
                '//input[@name="attrValue_1"]/@value')[1].extract()), )
        items.append(item)
    product_id = response.xpath(
        '//input[@name="productId"]/@value')[0].extract()
    yield Request(
        'http://www.worldsoccershop.com/InventoryCheck.json?productId={}'.
        format(product_id),
        meta={'items': items},
        callback=self.parse_stock)
def get_products(self, hxs, url):
    """Collect products from a listing page, or from a single product page.

    Every ``<li>`` of the result list becomes one item, enriched from
    ``self.prod_data`` and ``self.sold_as`` when its identifier is known
    there. If the listing produced nothing and the page carries a product
    title, the page is parsed as a single product instead, with ``url`` as
    its URL. Returns the list of loaded items.
    """
    root_url = 'https://www.instawares.com'
    res = []

    listing = hxs.select('//ol[starts-with(@class, "productListResultOL")]/li')
    for entry in listing:
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', './/div[@class="listResultsDescriptionDiv"]/a/text()')
        loader.add_xpath('identifier', './/div[@class="listResultsDescriptionDiv"]/dl/dd[1]/text()')
        loader.add_xpath('price', './/div[@class="listResultPrice"]/text()')
        loader.add_xpath('brand', './/div[@class="listResultsDescriptionDiv"]/dl/dt[contains(text(), "By")]/following-sibling::dd/text()')

        href = entry.select('.//div[@class="listResultsDescriptionDiv"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(root_url, href))

        # Extra fields come from the pre-loaded product data when available.
        loaded_identifier = loader.get_output_value('identifier')
        if loaded_identifier in self.prod_data:
            row = self.prod_data[loaded_identifier]
            for field in ('brand', 'category', 'sku'):
                loader.add_value(field, row[field].decode('utf8'))

        images = entry.select('.//img[@class="productimagelarge"]/@src').extract()
        if images:
            loader.add_value('image_url', urljoin_rfc(root_url, images[0]))

        listed = loader.load_item()
        listed_id = listed['identifier']
        if listed_id in self.sold_as:
            packaging = self.sold_as[listed_id]
            listed_meta = TigerChefMeta()
            listed_meta['sold_as'] = packaging
            listed['metadata'] = listed_meta
        res.append(loader.load_item())

    # The single-product fallback applies only when the listing was empty.
    if res or not hxs.select('//h1[@class="productName fn"]/text()'):
        return res

    loader = ProductLoader(selector=hxs, item=Product(), spider_name=self.name)
    loader.add_value('url', url)
    loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
    loader.add_xpath('price', '//li[@class="price"]//text()')
    loader.add_xpath('sku', '//div[starts-with(@class, "specificationContent")]' +
                     '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')
    loader.add_xpath('identifier', '//td[@itemprop="productID"]/text()')

    brands = hxs.select('//td[@class="brand"]/text()').extract()
    if brands:
        loader.add_value("brand", brands[0].strip())
    else:
        self.log("ERROR brand not found")

    images = hxs.select('//div[@class="productImageDiv"]/a/img/@src').extract()
    if images:
        loader.add_value("image_url", urljoin_rfc(root_url, images[0]))
    else:
        self.log("ERROR image_url not found")

    crumbs = hxs.select('(//ol[@class="breadcrumbOL"]/a)[last()]/text()').extract()
    if crumbs:
        loader.add_value("category", crumbs[0].strip())
    else:
        self.log("ERROR category not found")

    packaging = hxs.select('//dl[@class="soldAsPackedAsDL"]/dd[1]/text()').extract()
    single = loader.load_item()
    single_meta = TigerChefMeta()
    if packaging:
        single_meta['sold_as'] = packaging[0].strip()
    else:
        single_meta['sold_as'] = '1 ea'
    single['metadata'] = single_meta

    if single.get('identifier'):
        res.append(loader.load_item())
    return res
def parse_product(self, response):
    """Load a single product page and yield it with shipping cost applied.

    The item passed in ``response.meta['product']`` is reused when present;
    otherwise a fresh ``Product`` is filled. Stock is set to 0 only when the
    in-stock marker is not found on the page.
    """
    base_item = response.meta.get('product', Product())
    loader = ProductLoader(item=base_item, response=response)
    loader.add_xpath('identifier', '//input[@name="id"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//text()')

    sku_cells = response.xpath(
        '//div[@class="basic-content-body"]//dt[contains(text(), "Artikelnummer")]'
        '/following-sibling::dd/text()')
    article_numbers = sku_cells.re(r'(\d{3}\d*)')
    if not article_numbers:
        self.log('No SKU for %s' % (response.url))
    else:
        loader.add_value('sku', article_numbers)

    loader.add_xpath(
        'category',
        '//ul[contains(@class, "breadcrumbs")]/li[position()=last()-1]/a/text()')

    image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_sources:
        loader.add_value('image_url', response.urljoin(image_sources[0]))

    price_parts = response.xpath(
        'normalize-space(//*[@itemprop="price"]/text())').re(r'([\d.,]+)')
    loader.add_value('price', extract_price_eu(''.join(price_parts)))
    loader.add_value('brand', 'Lego')

    stock_marker = response.xpath(
        '//div[@class="product-info"]//em[@class="mod-success"]//text()'
    ).re(r'lager')
    if not bool(stock_marker):
        loader.add_value('stock', 0)

    yield self.add_shipping_cost(loader.load_item())
def parse_product(self, response):
    """Load one product from a page marked up with ``itemprop`` attributes.

    Identifier and SKU are both the last URL path segment up to its first
    dot; the category is taken from ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    title_nodes = hxs.select(u'//span[@itemprop="name"]/text()').extract()
    name = title_nodes[0].strip()

    # e.g. ".../1234.html" -> "1234"
    last_segment = response.url.rsplit('/', 1)[-1]
    code = last_segment.partition('.')[0]

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', code)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', name)
    product_loader.add_xpath('brand', u'//meta[@itemprop="brand"]/@content')
    product_loader.add_xpath('price', u'//span[@itemprop="price"]/text()')
    product_loader.add_value('sku', code)
    product_loader.add_value('category', response.meta.get('category'))

    image_sources = hxs.select(u'//a/img[@class="product-img"]/@src').extract()
    if image_sources:
        absolute_image = urljoin_rfc(get_base_url(response), image_sources[0])
        product_loader.add_value('image_url', absolute_image)

    yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a product page and request the first page of its reviews.

    The loaded product (with a ``KeterMeta`` holding an empty review list
    and the brand from ``response.meta``) is not yielded directly. It is
    passed in the ``meta`` of a ``Request`` to the reviews endpoint, whose
    callback is ``self.parse_review``.

    Raises:
        IndexError: if the product id, the name, or both breadcrumb
            variants are missing from the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = response.meta.get('brand') or ''

    product_id = hxs.select('//aside/span/span/text()')[0].extract()
    product_loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select('//article/header/h1/text()').extract()
    product_loader.add_value('name', u'{}'.format(name[0].strip()))
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', brand)
    product_loader.add_value('identifier', '{}'.format(product_id))
    product_loader.add_value('sku', product_id)

    # The last breadcrumb is the category. Some pages wrap the label in an
    # <i>; when that variant is absent the plain link text is used. Only the
    # empty-result IndexError triggers the fallback, so unrelated errors are
    # no longer swallowed by a bare except.
    try:
        category = hxs.select(
            '//ul[@class="breadcrumb"]//a/i/text()')[-1].extract()
    except IndexError:
        category = hxs.select(
            '//ul[@class="breadcrumb"]//a/text()')[-1].extract()
    product_loader.add_value('category', category)

    image_url = hxs.select(
        '//img[@id="img-01"]/@data-zoom-image').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])
        product_loader.add_value('image_url', image_url)

    price = hxs.select(
        '//aside[contains(@class, "price-container")]/div/p[@class="price"]//text()'
    ).extract()
    product_loader.add_value('price',
                             extract_price(price[0]) if price else '0.00')

    # No green call-to-action in the checkout box means out of stock.
    if not hxs.select(
            '//div[@class="infos-checkout"]/a[contains(@class,"cta green")]'
    ):
        product_loader.add_value('stock', 0)

    # Shipping cost is derived from the "Poids" row of the technical
    # description, when present.
    weight = hxs.select(
        '//section[@id="description-technique"]//th[@scope="row" and contains(text(),"Poids")]/following-sibling::td/text()'
    ).extract()
    if weight:
        product_loader.add_value('shipping_cost',
                                 self._get_shipping_cost(weight[-1]))

    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['reviews'] = []
    metadata['brand'] = brand
    product['metadata'] = metadata

    reviews_url = 'http://www.leroymerlin.fr/v3/bazaarvoice/viewReviews.do?reflm={}&page={}&maxItems=4'
    yield Request(reviews_url.format(product_id, '1'),
                  meta={
                      'product': product,
                      'page': 1,
                      'product_url': response.url,
                      'product_id': product_id,
                      'reviews_url': reviews_url
                  },
                  callback=self.parse_review,
                  dont_filter=True)
def parse(self, response):
    """Download the latest feed file over SFTP and request each product page.

    The newest file whose name matches ``self.file_start_with`` (as chosen
    by ``get_last_file``) is saved to ``self.csv_file_path`` and read row by
    row. Rows whose lower-cased 'Item Code' was already seen are skipped.
    For each remaining row a ``Product`` is loaded and a ``Request`` to its
    URL is yielded, with ``self.parse_details`` as callback and the product
    plus the row's 'Option Value' in ``meta``.
    """
    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        password = "******"
        username = "******"
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)

        files = sftp.listdir_attr()
        last = get_last_file(self.file_start_with, files)
        sftp.get(last.filename, self.csv_file_path)
    finally:
        # The connection is only needed for the download; closing the
        # transport also closes the SFTP channel opened on it, so the
        # socket is not leaked while the generator is consumed.
        transport.close()

    # Convert XLXS file to CSV
    # excel_to_csv(self.xls_file_path, self.csv_file_path)

    with open(self.csv_file_path) as f:
        reader = UnicodeDictReader(f)
        for row in reader:
            item_code = row['Item Code']
            code_key = item_code.lower()
            if code_key in self.identifiers:
                continue
            self.identifiers.append(code_key)

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', item_code)
            loader.add_value('sku', item_code)
            loader.add_value('name', row['Product Description'])
            loader.add_value('price', row['RRP'])
            loader.add_value('brand', 'USN')
            loader.add_value('category', row['Category'])
            image_url = self.images.get(item_code)
            if image_url:
                loader.add_value('image_url', image_url)
            loader.add_value('url', row['USN Url:'])
            product = loader.load_item()

            metadata = USNFeedMeta()
            # 'n/a' (any case) in the feed means there is no ASIN.
            metadata['ASIN'] = row['ASIN'] if row['ASIN'].lower() != 'n/a' else ''
            product['metadata'] = metadata

            yield Request(product['url'],
                          callback=self.parse_details,
                          meta={'product': product,
                                'option_id': row['Option Value']},
                          dont_filter=True)
def parse_product(self, response):
    """Yield the product described by ``response.meta`` plus a page code.

    Name and price come from ``response.meta``. Identifier and SKU are both
    set to the second text cell of the "Act Ref" attribute row, or of the
    "Manufacturers Part No" row when there is no "Act Ref" row. When neither
    row exists the item is yielded without them.
    """
    hxs = HtmlXPathSelector(response)

    loader = ProductLoader(item=Product(), response=response)
    for field in ('name', 'price'):
        loader.add_value(field, response.meta[field])
    loader.add_value('url', response.url)

    part_no_cells = hxs.select(
        '//*[@id="tab-attribute"]/table/tbody/tr[td/text()="Manufacturers Part No"]/td/text()'
    ).extract()
    act_ref_cells = hxs.select(
        '//*[@id="tab-attribute"]/table/tbody/tr[td/text()="Act Ref"]/td/text()'
    ).extract()

    # "Act Ref" wins over the manufacturer part number; index 1 is the cell
    # that follows the row label.
    for cells in (act_ref_cells, part_no_cells):
        if cells:
            loader.add_value('identifier', cells[1])
            loader.add_value('sku', cells[1])
            break

    yield loader.load_item()
def parse_product(self, response):
    """Load the product for the feed row in ``response.meta['row']``.

    When the page embeds a ``spConfig`` options blob, the item is yielded
    once for each option whose SKU equals the row's part number (compared
    case-insensitively), with the option's final price and label applied.
    Without the blob the base item is yielded once, with stock set to 0 if
    the out-of-stock marker is present.
    """
    hxs = HtmlXPathSelector(response)
    # NOTE(review): base_url is not read again in this method.
    base_url = get_base_url(response)
    row = response.meta['row']
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', row['our part no.'].lower())
    loader.add_value('sku', row['our part no.'])
    loader.add_value('url', response.url)
    brand = hxs.select(
        '//img[@class="manufacturer_image"]/@title').extract()
    brand = brand[0].strip() if brand else ''
    loader.add_value('brand', brand)
    # The brand is also used as the category.
    loader.add_value('category', brand)
    loader.add_xpath('image_url', '//div[@class="product-image"]//img/@src')
    loader.add_xpath('name', '//h1/text()')
    # Special price first (all whitespace removed), regular price otherwise.
    price = extract_price(''.join(''.join(
        hxs.select(
            '//form//p[@class="special-price"]//span[@class="price"]/text()'
        ).extract()).split()))
    if not price:
        price = extract_price(''.join(''.join(
            hxs.select(
                '//span[@class="regular-price"]//span[@class="price"]/text()'
            ).extract()).split()))
    loader.add_value('price', price)
    item = loader.load_item()
    metadata = HargrovesCyclesMeta()
    metadata['mpn'] = row['mpn']
    item['metadata'] = metadata
    # NOTE(review): option_found is never read in this method.
    option_found = False
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        # product id -> ' - '-joined option labels across all attributes
        products = {}
        # product id -> summed option 'oldPrice' values.
        # NOTE(review): prices is filled but not read afterwards.
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
                    prices[product] = prices.get(
                        product, 0) + extract_price(option['oldPrice'])
        for option_id, option_name in products.iteritems():
            # Check for the correct options according to the google doc spreadsheet
            if product_data['products'][option_id]['sku'].upper(
            ) == item['sku'].upper():
                item['price'] = product_data['childProducts'][option_id][
                    'finalPrice']
                item['name'] += ' ' + option_name
                stock = product_data['products'][option_id].get(
                    'saleable', False)
                if not stock:
                    item['stock'] = 0
                yield item
    else:
        out_of_stock = hxs.select(
            '//div[contains(@class, "product-info")]//span[@class="stock"]/span[@class="outstock"]'
        )
        if out_of_stock:
            item['stock'] = 0
        yield item
def parse(self, response):
    """Follow pagination links and yield every product box on the page.

    The SKU is the longest run of digits in the description text (the first
    one on ties, empty when there is none). Shipping costs 49 for prices
    below 1000 and 0 otherwise.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # pagination
    for page_href in hxs.select('//*[@id="paging"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, page_href), callback=self.parse)

    for box in hxs.select('//div[contains(@class,"produkt_boks")]'):
        product_loader = ProductLoader(item=Product(), selector=hxs)

        basket_class = box.select(
            './/div[@class="laegikurv"]/a/@class').extract()[0]
        if basket_class != 'laegivogn':
            product_loader.add_value('stock', 0)

        # The product link carries the identifier after '&vn='.
        href = box.select('.//div[@class="desc"]/a/@href').extract()[0]
        product_loader.add_value('identifier', href.partition('&vn=')[2])

        image_src = box.select(
            './/div[@class="produkt_img"]//img/@src').extract()[0]
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_src))

        title = box.select('.//div[@class="desc"]/a/text()').extract()[0]
        product_loader.add_value('name', title)

        description = ''.join(
            box.select('.//div[@class="desc"]/text()').extract())
        digit_runs = re.findall(r"([\d]+)", description)
        if digit_runs:
            sku = max(digit_runs, key=len)
        else:
            sku = ''
        product_loader.add_value('sku', sku)

        product_loader.add_value('url', urljoin_rfc(base_url, href))

        price_text = box.select('.//span[@class="pris"]/text()').extract()[0]
        price = extract_price(price_text.strip().strip('DKK '))
        product_loader.add_value('price', price)
        product_loader.add_value('shipping_cost', 49 if price < 1000 else 0)

        yield product_loader.load_item()
def parse_product(self, response):
    """Yield either requests for every variation URL or the product itself.

    On the first visit (no 'url' in ``response.meta``) to a page that lists
    variations, one request per variation, or per combination of
    variations, is yielded back to this method with the parent page URL in
    ``meta['url']``. Otherwise the page is loaded as a single product,
    using the parent URL from ``meta`` when present.
    """
    identifier = response.xpath('//*[@id="pid"]/@value').extract_first()
    # Product data is a JSON blob stored in the @value of an element whose
    # id embeds the product id.
    p_data = json.loads(
        response.xpath('//*[@id="product-data-{}"]/@value'.format(
            identifier)).extract_first())
    name = p_data['variant']
    stock = response.xpath('//*[@id="add-to-cart"]')
    price = p_data['price']
    brand = p_data['brand']
    # The first breadcrumb entry is dropped.
    category = response.xpath(
        '//div[@class="breadcrumb"]//a/span/text()').extract()[1:]
    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()
    variations = response.xpath('//div[@class="product-variations"]/ul/li')
    url = response.meta.get('url', '')
    if variations and url == '':
        # One list of option URLs per variation element.
        combined_options = []
        for variant in variations:
            element_options = []
            vtype = variant.xpath('./@class').extract_first()
            if vtype == 'attribute':  # colour
                vtitle = variant.xpath('./span/text()').extract_first()
                if 'Select Colour' != vtitle:
                    # Unknown swatch attribute: log it and stop processing
                    # this page.
                    self.log('Unknown vtitle: {} URL: {}'.format(
                        vtitle, response.url))
                    return
                for option in variant.xpath(
                        './div/ul/li[@class="available"]'):
                    option_url = option.xpath('./a/@href').extract_first()
                    element_options.append(option_url)
            elif vtype == 'attribute variant-dropdown':
                # The first <option> is skipped.
                for option in variant.xpath(
                        './/select[@class="variation-select"]/option')[1:]:
                    option_url = option.xpath('./@value').extract_first()
                    element_options.append(option_url)
            else:
                self.log('Unknown vtype: {} URL: {}'.format(
                    vtype, response.url))
                return
            combined_options.append(element_options)
        if len(variations) > 1:
            combined_options = list(itertools.product(*combined_options))
            for combined_option in combined_options:
                # Start from the first option's URL and merge in the query
                # parameters of every further option.
                url = ''
                for option in combined_option:
                    if url == '':
                        url = option
                    else:
                        params = dict(
                            urlparse.parse_qsl(
                                urlparse.urlsplit(option).query))
                        # NOTE: this loop rebinds `name`; the product name
                        # is not used again on this branch.
                        for name, value in params.iteritems():
                            url = add_or_replace_parameter(
                                url, name, value)
                yield scrapy.Request(url,
                                     callback=self.parse_product,
                                     meta={'url': response.url})
        else:
            for option in combined_options[0]:
                yield scrapy.Request(option,
                                     callback=self.parse_product,
                                     meta={'url': response.url})
    else:
        if name == '':
            return
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('category', category)
        if brand != 'Unbranded':
            loader.add_value('brand', brand)
        # Variation pages report the parent page URL passed in meta.
        url = response.meta.get('url', response.url)
        loader.add_value('url', url)
        loader.add_value('image_url', response.urljoin(image_url))
        loader.add_value('price', price)
        # Shipping: 1.50 for prices up to 10, 5.95 up to 200, none set above.
        if loader.get_output_value('price') <= 10:
            loader.add_value('shipping_cost', '1.50')
        elif loader.get_output_value('price') <= 200:
            loader.add_value('shipping_cost', '5.95')
        if not stock:
            loader.add_value('stock', 0)
        option_item = loader.load_item()
        yield option_item