def parse_product(self, response):
    """Yield one item per priced product on a product page.

    When the page has ``//form/div[@id="price"]`` blocks, each block is
    loaded as its own product (name from its ``h4``).  Otherwise the page is
    loaded as a single product (name from the ``h1``).  Entries for which no
    price follows a pound sign are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    variants = selector.select(u'//form/div[@id="price"]')

    if variants:
        for variant in variants:
            loader = ProductLoader(item=Product(), selector=variant)
            loader.add_xpath('name', u'./h4/text()')
            loader.add_value('url', response.url)
            amount = variant.select(u'.//p[@class="price1"]/text()').re('\xa3(.*[0-9])')
            if amount:
                loader.add_value('price', amount)
                yield loader.load_item()
        return

    # No per-variant blocks: treat the whole page as one product.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="product"]/h1/text()')
    amount = selector.select(u'//div[@class="product"]//p[@class="price1"]/text()').re(u'\xa3(.*)')
    if amount:
        loader.add_value('price', amount)
        yield loader.load_item()
def browse_and_parse(self, response):
    """Crawl the category navigation and yield the products listed on a page.

    Yields, in order:
      * a request for every navigation-column link not yet recorded in
        ``self.navig_url_set`` (the set is updated as links are queued);
      * a request for the next listing page, located through the link whose
        text contains 'Neste';
      * one item per row of the product listing table;
      * one item for the page itself when it has a ``productPrices`` heading
        (a listing page showing a single product).

    Every href is resolved against the page base URL before use.
    """
    def normalise_price(text):
        # Keep the text before the first '-', take its second
        # space-separated token, drop '.' and turn ',' into '.'
        # (e.g. '1.234,50' -> '1234.50').
        amount = text.split("-")[0].split(" ")[1]
        return amount.replace('.', '').replace(',', '.')

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url not in self.navig_url_set:
            self.navig_url_set.add(subsubcat_url)
            yield Request(subsubcat_url, callback=self.browse_and_parse)

    next_page = hxs.select(
        "//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href"
    ).extract()
    if next_page:
        # Resolve the href the same way as the navigation links above; an
        # absolute href passes through urljoin unchanged.
        yield Request(urlparse.urljoin(base_url, next_page[0]),
                      callback=self.browse_and_parse)

    # parse product listing in this page, if any
    rows = hxs.select(
        '//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'
    )
    for tr in rows:
        product_loader = ProductLoader(item=Product(), response=response)
        href = tr.select(".//td[2]//a/@href").extract()[0]
        product_loader.add_value('url', urlparse.urljoin(base_url, href))
        product_loader.add_value('name', tr.select(".//td[2]//a/text()").extract()[0])
        product_loader.add_value(
            'price', normalise_price(tr.select(".//td[3]/text()").extract()[0]))
        yield product_loader.load_item()

    # edge case: product listing page with a single product
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('price', normalise_price(product_price[0]))
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product when the page's part number equals the expected one.

    ``response.meta`` must provide ``sku`` and ``mfrgid``.  The item is only
    yielded when the text captured after 'Part No.' equals ``mfrgid`` and a
    price was found.
    """
    if not isinstance(response, HtmlResponse):
        return

    page = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    # The second price location is consulted only if the first gave nothing.
    loader.add_xpath('price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')

    loader.add_value('url', response.url)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'].lower())

    titles = page.select(u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()').extract()
    loader.add_value('name', titles[0].strip())

    part_numbers = page.select(u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
    found_part = part_numbers[0]
    expected_part = response.meta['mfrgid']
    if found_part != expected_part:
        return
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Yield one item per entry of the ``product-list`` on the page.

    If the list is empty and the request was not already a retry, the same
    URL is requested once more with an empty cookie set and
    ``dont_merge_cookies`` enabled.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    entries = selector.select('//ul[@class="product-list"]/li')

    for entry in entries:
        loader = ProductLoader(item=Product(), selector=entry)

        # Name is the <strong> text, optionally followed by the link's own text.
        name_parts = [entry.select('.//h2/a/strong/text()').extract()[0]]
        name_parts.extend(entry.select('.//h2/a/text()').extract()[:1])
        loader.add_value('name', ' '.join(name_parts))

        hrefs = entry.select('.//h2/a/@href').extract()
        loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
        loader.add_xpath('price', u'.//p/strong/text()', re='\xa3(.*)')
        yield loader.load_item()

    if entries or response.meta.get('retry'):
        return
    retry_meta = {'dont_merge_cookies': True, 'retry': True}
    yield Request(response.url, callback=self.parse_product, dont_filter=True,
                  cookies={}, meta=retry_meta)
def parse_product(self, response):
    """Yield the product, or one item per option row when options exist.

    Option rows are the rows of the ``spec-with-options`` table after the
    first one; each option item is named '<title> <second cell text>'.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    option_rows = selector.select(u'//div[@id="spec-with-options"]//table//tr')[1:]
    title = selector.select('//div[@id="product-title"]/text()').extract()[0]

    if option_rows:
        for row in option_rows:
            loader = ProductLoader(item=Product(), selector=row)
            label = row.select('./td[position()=2]/text()').extract()[0]
            loader.add_value('name', title + ' ' + label)
            loader.add_value('url', response.url)
            loader.add_xpath('price', './/div[@class="price-now"]/span/text()', re='\xa3(.*)')
            yield loader.load_item()
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', title)
    loader.add_value('url', response.url)
    loader.add_xpath('price', u'//div[@class="price-now"]/span[contains(@id,"product-price")]/text()', re='\xa3(.*)')
    yield loader.load_item()
def parse_product(self, response):
    """Yield an item for every priced table cell in the right column.

    A cell qualifies when its own text contains a pound sign or when it has
    an ``h1`` child.  The URL is the cell's link if present, otherwise the
    page URL.  Cells without any price are skipped.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    cells = selector.select(u'//div[@class="rightcol"]//td[contains(child::text(),"\xa3")] | //div[@class="rightcol"]//td[child::h1]')

    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)
        loader.add_xpath('name', './a/text()')
        loader.add_xpath('name', './h1/text()')

        hrefs = cell.select('./a/@href').extract()
        target = urljoin_rfc(get_base_url(response), hrefs[0]) if hrefs else response.url
        loader.add_value('url', target)

        # Prefer the price in the cell text; fall back to the price span.
        amount = cell.select('./text()').re('\xa3(.*)')
        if not amount:
            amount = cell.select('.//span[@id="_EKM_PRODUCTPRICE"]/text()').extract()
        if amount:
            loader.add_value('price', amount)
            yield loader.load_item()
def parse_product(self, response):
    """Yield one item per product cell that has both a SKU and a link.

    The SKU has '.', '-' and spaces removed; its lower-cased form is used as
    the item name.  If nothing is left after the clean-up, the name is taken
    from the ``cellheader`` link text instead.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    cells = selector.select('//div[@id="mtbody"]//table//table//a/img/../..')

    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)
        loader.add_xpath('price', './/span[@class="pricetext"]/text()', re='.*\$(.*[0-9])')

        raw_sku = cell.select('.//span[@class="sku"]/text()').extract()
        if not raw_sku:
            continue
        clean_sku = re.sub('[.\- ]', '', raw_sku[0])
        loader.add_value('sku', clean_sku)
        if not clean_sku:
            loader.add_xpath('name', './/span[@class="cellheader"]/a/text()')
        else:
            loader.add_value('name', clean_sku.lower())

        hrefs = cell.select('.//span[@class="cellheader"]/a/@href').extract()
        if hrefs:
            loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
            yield loader.load_item()
def parse_product(self, response):
    """Yield one item per ``product_listing`` block on the page.

    The SKU is the parenthesised part of the ``prod_number`` text with '-'
    removed; its lower-cased form is used as the item name.  When no SKU can
    be extracted the name falls back to the title attribute of the product
    link.  Blocks without a product link are skipped.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    for product in hxs.select('//div[@class="product_listing"]'):
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath('price', './/span[@class="prod_our_price"]/strong/text()', re='.*\$(.*[0-9])')

        matches = product.select('.//span[@class="prod_number"]/text()').re('\((.*)\)')
        # Indexing an empty match list used to raise IndexError and abort the
        # whole page; an empty SKU now takes the name fallback below instead.
        sku = re.sub('[\-]', '', matches[0]) if matches else ''
        product_loader.add_value('sku', sku)
        if sku:
            product_loader.add_value('name', sku.lower())
        else:
            product_loader.add_xpath('name', './/span[@class="prod_name"]/a/@title')

        url = product.select('.//span[@class="prod_name"]/a/@href').extract()
        if not url:
            # No link to report for this block; move on to the next one.
            continue
        product_loader.add_value('url', urljoin_rfc(get_base_url(response), url[0]))
        yield product_loader.load_item()
def parse_product(self, response):
    """Follow the next-page link and yield the grid items of this page.

    An item is only yielded when both its name and its price resolved to a
    non-empty value.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)

    # pages
    pager_links = selector.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
    if pager_links:
        following = urljoin_rfc(get_base_url(response), pager_links[0])
        yield Request(following, callback=self.parse_product)

    for tile in selector.select(u'//div[contains(@class,"itemGrid")]'):
        loader = ProductLoader(item=Product(), selector=tile)

        href = tile.select(u'.//a[@class="oesLink"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))

        # Name is the link's <span> text followed by the link's own text.
        lead = tile.select(u'.//a[@class="oesLink"]/span/text()').extract()[0]
        tail = tile.select(u'.//a[@class="oesLink"]/text()').extract()[0]
        loader.add_value('name', lead + ' ' + tail)

        loader.add_xpath('price', u'.//span[@class="PlistOfferPrice"]/text()', re=u'\$(.*)')
        loader.add_xpath('price', u'.//div[@class="pricing"]/span/div/span/text()', re=u'\$(.*)')

        if loader.get_output_value('name') and loader.get_output_value('price'):
            yield loader.load_item()
def parse_product(self, response):
    """Yield the product(s) of a page, handling multi-product tables.

    A page containing a ``td`` whose text is exactly 'Item#' is treated as a
    multi-product page: every ``Multi-Child_Background`` row becomes an item.
    Otherwise the single product is yielded, but only if it shows a price.
    """
    selector = HtmlXPathSelector(response)

    # detect multiple product page
    header_cell = selector.select("//td[text()='Item#']")
    if header_cell:
        for row in header_cell.select("../../tr[@class='Multi-Child_Background']"):
            row_loader = ProductLoader(Product(), row)
            cells = row.select("td")
            row_loader.add_value('sku', cells[0].select("text()").extract())
            row_loader.add_value('name', cells[1].select("text()").extract())
            row_loader.add_value('price', cells.select("b/text()").extract())
            row_loader.add_value('url', response.url)
            yield row_loader.load_item()
        return

    product_table = selector.select('//table[@id="v65-product-parent"]')[0]
    price_text = product_table.select(".//font[@class='pricecolor colors_productprice']/text()")
    # Unavailable products are still online but have no price
    if not price_text:
        return

    single_loader = ProductLoader(selector=product_table, item=Product())
    single_loader.add_xpath('name', './/font[@class="productnamecolorLARGE colors_productname"]/text()')
    single_loader.add_value('url', response.url)
    single_loader.add_value('price', price_text.extract())
    code_parts = selector.select('.//span[@class="product_code"]/text()').extract()
    single_loader.add_value('sku', ''.join(code_parts).strip())
    yield single_loader.load_item()
def parse_option_price(self, response):
    """Yield an item whose name and URL come from ``response.meta``.

    Only the price is read from the page: the text after the pound sign in
    the ``webPriceLabel`` element.
    """
    loader = ProductLoader(item=Product(), response=response)
    for field in ('name', 'url'):
        loader.add_value(field, response.meta[field])
    loader.add_xpath('price', u'//div[@class="webPriceLabel"]/text()', re=u'\xa3(.*)')
    yield loader.load_item()
def parse_product(self, response):
    """Yield the single product described by the page.

    Name, SKU and price are read from the ``top_product_info_block_*``
    elements; the URL is the response URL.
    """
    # The loader builds its own selector from the response, so the separate
    # HtmlXPathSelector that used to be created here (and never used) was a
    # redundant second parse of the document.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="top_product_info_block_product_title_text"]/text()')
    loader.add_xpath('sku', '//ul[@id="top_product_info_block_product_data_list"]/li/strong/text()')
    loader.add_xpath('price', '//p[@id="top_product_info_block_product_data_new_low_price"]/text()')
    yield loader.load_item()
def parse_product(self, response):
    """Return the product of a product page, or None for non-HTML responses.

    The price is the part of the ``club`` price text after the dollar sign,
    up to its last digit.
    """
    if not isinstance(response, HtmlResponse):
        return None

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//div[@class="club"]/span[@itemprop="Price"]/text()', re='.*\$(.*[0-9])')
    loader.add_value('url', response.url)
    item = loader.load_item()
    return item
def parse_products(self, hxs, base_url):
    """Yield an item for each ``productResultInfo`` block under ``hxs``.

    Product links are resolved against ``base_url``.  The price value is all
    text nodes of the ``variantprice`` span joined with single spaces.
    """
    for block in hxs.select('//div[@class="productResultInfo"]'):
        loader = ProductLoader(Product(), block)
        loader.add_xpath('name', './/a[@class="ProductNameText"]/text()')

        href = block.select('.//a[@class="ProductNameText"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, href))

        price_fragments = block.select('.//span[@class="variantprice"]//text()').extract()
        loader.add_value('price', ' '.join(price_fragments))

        loader.add_xpath('sku', './/p[contains(@class, "productSKU")]/text()')
        yield loader.load_item()
def parse_product(self, response):
    """Return the product of a page, named after the SKU in ``response.meta``.

    Both the sale price and the item price locations are fed to the loader,
    sale price first.  Non-HTML responses return None.
    """
    if not isinstance(response, HtmlResponse):
        return None

    expected_sku = response.meta['sku']
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', expected_sku)
    for css_class in ('salePriceContent', 'itemPriceContent'):
        loader.add_xpath(
            'price',
            '//div[@class="yourPrice"]/span[@class="%s"]/text()' % css_class,
            re='.*\$(.*)')
    loader.add_value('url', response.url)
    loader.add_value('sku', response.meta['sku'])
    return loader.load_item()
def parse_product(self, response):
    """Load name, price and URL of a product page into an item.

    Returns the loaded item for HTML responses and None for anything else.
    """
    if isinstance(response, HtmlResponse):
        name_xpath = '//h1[@itemprop="name"]/text()'
        price_xpath = '//div[@class="club"]/span[@itemprop="Price"]/text()'

        page_loader = ProductLoader(item=Product(), response=response)
        page_loader.add_xpath('name', name_xpath)
        # Keep what follows the dollar sign, up to the last digit.
        page_loader.add_xpath('price', price_xpath, re='.*\$(.*[0-9])')
        page_loader.add_value('url', response.url)
        return page_loader.load_item()
def parse_item(self, response):
    """Yield items, or per-option price requests, for a product page.

    For every product details block on the page:
      * when its option ``<select>`` produces option combinations, one
        ``FormRequest`` to ``json_api_url`` is yielded per combination
        (handled by ``self._parse_item_json``), carrying the composed name,
        ids, image, brand and category in ``request.meta``;
      * otherwise the product is loaded directly from the page and yielded.

    The image URL and brand default to '' when not present on the page.
    """
    hxs = HtmlXPathSelector(response)

    image_url = hxs.select('//div[@class="ProductThumbImage"]/a/@href').extract()
    image_url = image_url[0] if image_url else ''
    brand = hxs.select('//h4[@class="BrandName"]/a/text()').extract()
    brand = brand[0] if brand else ''
    # Same for every product/option on the page, so look them up once.
    url = response.url
    category = response.meta.get('category')

    items = hxs.select("//div[@id='ProductDetails']/div[@class='BlockContent']")
    for item in items:
        title = item.select('.//div[@class="ProductDetailsGrid"]//h1/text()').extract()[0]
        product_id = item.select(
            ".//input[@type='hidden' and @name='product_id']/@value").extract()[0]
        select_el = item.select(".//div[@class='productOptionViewSelect']/select")
        options = list(Options(select_el).gen())

        if not options:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', product_id)
            loader.add_value('name', title)
            loader.add_value('url', url)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            loader.add_xpath('price', '//div[contains(@class, "PriceRow")]/div/span/text()')
            yield loader.load_item()
            continue

        # NOTE: the former `field_name = select_el.select("@name").extract()[0]`
        # lookup was never used and could raise IndexError; it is dropped.
        for option in options:
            # Each entry of `option` is indexed as x[0], x[1][0], x[1][1];
            # presumably (form field, (value id, label)) -- confirm in Options.
            item_options = json_api_request_args.copy()
            item_options.update({x[0]: x[1][0] for x in option})
            item_options['product_id'] = product_id

            request = FormRequest(url=json_api_url,
                                  formdata=item_options,
                                  callback=self._parse_item_json)
            request.meta['item_name'] = title + " " + " ".join([x[1][1] for x in option])
            request.meta['item_url'] = url
            request.meta['subtype_id'] = "-".join([x[1][0] for x in option])
            request.meta['product_id'] = product_id
            request.meta['image_url'] = image_url
            request.meta['brand'] = brand
            request.meta['category'] = category
            yield request
def parse_product(self, response):
    """Yield the product of a page, expanded into one item per option.

    A base item is loaded from the page (special price preferred over the
    regular price, 0 when neither is found).  When the response body holds a
    ``var spConfig = new Product.Config(...)`` JSON blob, one copy of the base
    item is yielded for every product id listed under the option attributes;
    otherwise the base item itself is yielded.

    ``response.meta['row']`` must provide the 'SKU' key.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']
    name = hxs.select('//span[@itemprop="name"]/text()').extract()[0].strip()
    url = response.url  # NOTE(review): unused; response.url is read directly below
    # Special price wins; the regular price is only looked up without one.
    price = hxs.select('//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price:
        price = hxs.select('//span[@class="regular-price"]/span[@class="price"]/text()').extract()
    price = price[0] if price else 0
    l = ProductLoader(item=Product(), response=response)
    l.add_value('name', name)
    l.add_value('url', response.url)
    l.add_value('sku', row['SKU'])
    l.add_value('price', price)
    # Two alternative hidden inputs can carry the product id.
    identifier = hxs.select('//input[@name="productId"]/@value').extract()
    if not identifier:
        identifier = hxs.select('//input[@name="product"]/@value').extract()
    l.add_value('identifier', identifier)
    l.add_xpath('brand', '//tr[th/text()="Brand"]/td/text()')
    l.add_xpath('image_url', '//a[@id="shoe-spin"]/img/@src')
    categories = hxs.select('//li[@typeof="v:Breadcrumb"]/a/text()').extract()
    l.add_value('category', categories)
    # Stock is only set (to 0) when the in-stock marker is absent.
    in_stock = hxs.select('//div[@class="offer"]//p[@class="availability in-stock"]')
    if not in_stock:
        l.add_value('stock', 0)
    item = l.load_item()
    # The option configuration is embedded in the page as a JS constructor call.
    options_config = re.search(r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        option_item = deepcopy(item)  # NOTE(review): overwritten in the loop below before use
        product_data = json.loads(options_config.groups()[0])
        products = {}  # product id -> ' - '-joined option labels (starts with ' - ')
        prices = {}  # product id -> sum of extract_price(option price) over its options
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join((products.get(product, ''), option['label']))
                    prices[product] = prices.get(product, 0) + extract_price(option['price'])
        for option_id, option_name in products.iteritems():
            option_item = deepcopy(item)
            option_item['identifier'] = option_item['identifier'] + '-' + option_id
            # Only the label text before the last ' (' is appended to the name;
            # raises IndexError if the joined labels contain no ' ('.
            option_item['name'] = option_item['name'] + re.findall('(.*) \(', option_name)[0]
            option_item['price'] = option_item['price'] + prices[option_id]
            # presumably the label carries an availability note -- TODO confirm
            if 'IN STOCK' not in option_name.upper():
                option_item['stock'] = 0
            yield option_item
    else:
        yield item
def parse_product(self, response):
    """Yield the product, or a follow-up request when the page is not usable.

    * If the page links to a special-price page, that page is requested with
      the same callback and nothing else is yielded.
    * If name or price cannot be read, the same URL is requested again
      (unfiltered); attempts are counted per URL in ``self.retry_urls`` and
      abandoned once the count exceeds 100.
    * Otherwise the loaded item is yielded.

    Follow-up requests carry ``sku``, ``mfrgid`` and ``search_q`` over from
    ``response.meta``.
    """
    if not isinstance(response, HtmlResponse):
        return

    def follow(target, **request_kwargs):
        # Request back into this callback, copying the search context along.
        request = Request(url=target, callback=self.parse_product, **request_kwargs)
        for key in ('sku', 'mfrgid', 'search_q'):
            request.meta[key] = response.meta[key]
        return request

    selector = HtmlXPathSelector(response)

    special_links = selector.select("//td[@class='tdcf10bk']/a/@href").extract()
    if special_links:
        yield follow(urljoin_rfc(get_base_url(response), special_links[0]))
        return

    names = selector.select("//h1/font/b/text()").extract()
    prices = selector.select("//font[@color='#990000']/b/text()").extract()

    if names and prices:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', names[0])
        loader.add_value('price', prices[0])
        loader.add_value('url', response.url)
        loader.add_value('sku', response.meta['sku'].lower())
        loader.add_xpath('identifier', '//form/input[@name="PID"]/@value')
        yield loader.load_item()
        return

    attempts = self.retry_urls.get(response.url, 0) + 1
    if attempts > 100:
        self.log("ERROR MAX retry count reached (100), giving up...")
        return

    self.log("ERROR parsing HTML, adding to retry queue (#{})".format(attempts))
    self.retry_urls[response.url] = attempts
    yield follow(response.url, dont_filter=True)
def parse_product(self, response):
    """Yield compound sub-products if any, otherwise the page's own product.

    ``self._parse_compound_product(response)`` is consumed completely first;
    the single-product path only runs when it produced nothing.
    """
    sub_products = list(self._parse_compound_product(response))

    if not sub_products:
        page_loader = ProductLoader(response=response, item=Product())
        page_loader.add_xpath('name', '//h1[@itemprop="Name"]//text()')
        page_loader.add_xpath('price', '//input[@name="price"]/@value')
        page_loader.add_value('url', response.url)
        page_loader.add_xpath('sku', '//span[@itemprop="model"]/text()')
        yield page_loader.load_item()
        return

    for sub_product in sub_products:
        yield sub_product
def browse_and_parse(self, response):
    """Walk the category tree and yield every product listed on the page.

    Output order: requests for navigation-column links not yet present in
    ``self.navig_url_set`` (which is updated as they are queued), a request
    for the next listing page (link text containing 'Neste'), one item per
    listing-table row, and one item for the page itself when a
    ``productPrices`` heading marks it as a single-product listing.

    All hrefs are resolved against the page base URL.
    """
    def clean_price(raw):
        # Text before the first '-', second space-separated token, with '.'
        # removed and ',' replaced by '.' (e.g. '1.234,50' -> '1234.50').
        return raw.split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url in self.navig_url_set:
            continue
        self.navig_url_set.add(subsubcat_url)
        yield Request(subsubcat_url, callback=self.browse_and_parse)

    next_page = hxs.select(
        "//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href"
    ).extract()
    if next_page:
        # Previously passed to Request as found; joining makes a relative
        # href usable and leaves an absolute one untouched.
        next_url = urlparse.urljoin(base_url, next_page[0])
        yield Request(next_url, callback=self.browse_and_parse)

    # parse product listing in this page, if any
    for tr in hxs.select(
            '//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
        link = tr.select(".//td[2]//a/@href").extract()[0]
        title = tr.select(".//td[2]//a/text()").extract()[0]
        price_text = tr.select(".//td[3]/text()").extract()[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', urlparse.urljoin(base_url, link))
        product_loader.add_value('name', title)
        product_loader.add_value('price', clean_price(price_text))
        yield product_loader.load_item()

    # edge case: product listing page with a single product
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('price', clean_price(product_price[0]))
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per row of the grouped-children table.

    For grouped products the name is the part of the first cell after the
    first '-' (and any non-breaking spaces), and the price is read from the
    node immediately following the row.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    child_rows = selector.select(u'//table[@class="grpChld"]//tr[@class="r1"]')

    if child_rows:
        for child in child_rows:
            child_loader = ProductLoader(item=Product(), selector=child)
            child_loader.add_xpath('name', u'./td[@class="c1"]/text()', re=u'.*?-[\xa0]*(.*)')
            child_loader.add_value('url', response.url)
            child_loader.add_xpath(
                'price',
                u'./following-sibling::node()[1]/td[@class="c3"]/span/text()',
                re=u'\xa3(.*)')
            yield child_loader.load_item()
        return

    page_loader = ProductLoader(item=Product(), response=response)
    page_loader.add_xpath('name', u'//div[@class="det"]/h1/text()')
    page_loader.add_value('url', response.url)
    page_loader.add_xpath(
        'price',
        u'//div[@class="addBsk"]/div[@class="pri"]/b/text()',
        re=u'\xa3(.*)')
    yield page_loader.load_item()
def load_products(self, response):
    """Follow the pager's next link and yield every product in the list.

    For each list entry the special price is used when a ``special-price``
    paragraph exists, otherwise the regular price.
    """
    special_price_xpath = './/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()'
    regular_price_xpath = './/div[@class="price-box"]//span[@class="regular-price"]/span[@class="price"]/text()'

    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    next_link = selector.select('//div[@class="pager"]//a[@class="next i-next"]')
    if next_link:
        next_href = next_link.select("./@href").extract()[0]
        yield Request(urlparse.urljoin(base_url, next_href), callback=self.load_products)

    for entry in selector.select('//ol[@id="products-list"]/li'):
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', './/h2[@class="product-name"]/a/text()')
        loader.add_xpath('url', './/h2[@class="product-name"]/a/@href')
        on_offer = entry.select('.//p[@class="special-price"]')
        loader.add_xpath('price', special_price_xpath if on_offer else regular_price_xpath)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, or record an error if unpriced.

    The value of the ``detail`` input serves as both SKU and identifier.
    When ``self.parse_price`` returns a falsy price nothing is yielded and a
    message is appended to ``self.errors``.  Stock is set to 0 when the
    'skladem.png' image is absent; brand and shipping cost are fixed values.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    page_url = urljoin(base_url, response.url)

    # .pop() takes the LAST match of each selection.
    name = hxs.select('//td[@id="workspace"]/h1/a/text()').pop().extract().strip()
    category = hxs.select('//div[@class="odkazy_cesta"]/a/text()').pop().extract().strip()
    # The former `pid = sku; if not sku: sku = pid` could never change
    # anything, so the one value is simply used for both fields.
    sku = hxs.select('//input[@name="detail"]/@value').extract().pop()
    pid = sku

    price = self.parse_price(
        hxs.select(
            '//table[@id="detail_tabulka2"]/tr/th[contains(text(), "Cena s DPH")]/following-sibling::td/descendant-or-self::text()'
        ).pop().extract())
    stock = hxs.select(
        '//table[@id="detail_tabulka2"]/tr/td//img[contains(@src, "skladem.png")]')

    if not price:
        self.errors.append("No price set for url: '%s'" % page_url)
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', page_url)
    loader.add_value('name', name)
    try:
        loader.add_xpath('image_url',
                         '//td[@id="detail_foto"]/div/a/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
    except IndexError:
        # v[0] on an empty match list: the page has no image.
        self.errors.append("No image set for url: '%s'" % page_url)
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')
    loader.add_value('shipping_cost', 69)
    if not stock:
        loader.add_value('stock', 0)
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield the searched product once the page confirms it is the right one.

    ``response.meta`` must provide ``sku`` and ``mfrgid``.  Two layouts are
    handled: a search-result page (only the first ``summaryboxsearch`` entry
    is considered) and, when there are no such entries, a product page.

    In both cases the item is yielded only if the expected manufacturer id
    occurs in the id shown on the site and at least one space-separated word
    of the SKU occurs in the scraped name (all compared lower-cased).
    """
    if not isinstance(response, HtmlResponse):
        return

    def is_confirmed(loader, site_mfrgid):
        # Decide whether the loaded product matches the searched one.
        if not site_mfrgid:
            return False
        # A page without a name (e.g. an empty result page) used to crash
        # on None.lower(); it now simply fails the match.
        name = (loader.get_output_value('name') or '').lower()
        sku_words = (loader.get_output_value('sku') or '').lower().split(' ')
        # A list, not filter(): a filter object is always truthy on Python 3.
        matching = [word for word in sku_words if word != '' and word in name]
        mfrgid = response.meta['mfrgid'].lower()
        return bool(mfrgid in site_mfrgid[0].strip().lower() and matching)

    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="summaryboxsearch"]')

    for product in products[0:1]:  # extract only the first product
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath('price', u'.//span[@class="floatl sli_price"]/text()')
        product_loader.add_xpath('url', u'.//p[@class="mtext nobreak"]/a/@title')
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath('name', u'.//p[@class="mtext nobreak"]/a/text()')
        site_mfrgid = product.select(
            './/span[@class="floatl sli_grid_code"]/text()').extract()
        if is_confirmed(product_loader, site_mfrgid):
            yield product_loader.load_item()

    if not products:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('price', u'//p[@class="strong"]/span/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath('name', u'//div[@class="indentl orderbox"]//h1/text()')
        site_mfrgid = hxs.select(
            '//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()'
        ).extract()
        if is_confirmed(product_loader, site_mfrgid):
            yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per row of the super-product table.

    Row items are named '<product name> <first cell text>'.  In both layouts
    the regular and the special price locations are both fed to the loader,
    regular first.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    title = selector.select(u'//div[@class="product-name fn"]/h1/text()').extract()[0]
    table_rows = selector.select(u'//table[@id="super-product-table"]//tr')

    if table_rows:
        for row in table_rows:
            row_loader = ProductLoader(item=Product(), selector=row)
            row_label = row.select(u'./td[position()=1]/text()').extract()[0]
            row_loader.add_value('name', title + ' ' + row_label)
            row_loader.add_value('url', response.url)
            row_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/span[@class="regular-price"]/span[@class="price"]/text()', re=u'\xa3(.*)')
            row_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()', re=u'\xa3(.*)')
            yield row_loader.load_item()
        return

    page_loader = ProductLoader(item=Product(), response=response)
    page_loader.add_value('name', title)
    page_loader.add_value('url', response.url)
    page_loader.add_xpath('price', u'//div[@class="price-box"]/span[contains(@id,"product-price")]/span[@class="price"]/text()', re='\xa3(.*[0-9])')
    page_loader.add_xpath('price', u'//div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()', re='\xa3(.*[0-9])')
    yield page_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, or record an error if unpriced.

    The product code (with a leading 'lego' removed) is used as both SKU and
    identifier.  A shipping cost of 89 is added only when ``int(price)`` is
    below 2500, and stock is set to 0 when the availability marker is
    missing.  Without a price, a message is appended to ``self.errors``.
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    title = selector.select(
        '//div[@id="product-detail"]/h2/text()').extract().pop().strip()
    category = selector.select(
        '//div[@id="product-detail"]/p[@id="zarazeni"]/a/text()').pop().extract().strip()

    code = selector.select(
        '//div[@class="content"]/p/strong[contains(text(), "d produktu")]/following-sibling::text()'
    )[0].extract().strip()
    if code.startswith('lego'):
        code = code[4:]
    sku = pid = code

    price_text = selector.select(
        '//div[@class="content"]/p/strong[@class="price"]/big/text()').pop().extract()
    price = self.parse_price(price_text)
    in_stock = selector.select(
        '//div[@class="content"]/p/strong[@class="price"][contains(text(), "Dostupnost: Skladem")]')

    if not price:
        self.errors.append("No price set for url: '%s'" % urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', title)
    loader.add_xpath('image_url',
                     '//table[@id="pictures"]/tr/td[1]/a/img/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')
    if int(price) < 2500:
        loader.add_value('shipping_cost', 89)
    if not in_stock:
        loader.add_value('stock', 0)
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield one item per row of the ``list`` table, skipping its first row.

    The price is the first run of digits and dots in the ``prixPromo`` text.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    all_rows = selector.select(u'//table[@class="list"]//tr')

    for row in all_rows[1:]:
        loader = ProductLoader(item=Product(), selector=row)
        hrefs = row.select(u'.//h3/a/@href').extract()
        absolute_url = urljoin_rfc(get_base_url(response), hrefs[0])
        loader.add_value('url', absolute_url)
        loader.add_xpath('name', u'.//h3/a/text()')
        loader.add_xpath('price', u'.//p[@class="prixPromo"]/text()', re=u'([\d\.]+)')
        yield loader.load_item()
def parse_category(self, response):
    """Queue the pager links and yield the products of a category page.

    Processing of the page stops (with an error logged) at the first product
    that lacks a name, a URL or a price.  Prices are normalised by dropping
    '.' and turning ',' into '.'.
    """
    selector = HtmlXPathSelector(response)

    for page_url in selector.select("//div[@class='pages']/ol/li/a/@href").extract():
        yield Request(page_url, callback=self.parse_category)

    for box in selector.select('//li[contains(@class, "item")]/div'):
        names = box.select("h2[@class='product-name']/a/text()").extract()
        if not names:
            logging.error("NO NAME! %s" % response.url)
            return

        links = box.select("h2[@class='product-name']/a/@href").extract()
        if not links:
            logging.error("NO URL! %s" % response.url)
            return

        # Special price and regular price are alternatives of one XPath union.
        amounts = box.select(
            "div[@class='price-box']/p[@class='special-price']/span[@class='price']/text() |\
            div[@class='price-box']/span[@class='regular-price']/span[@class='price']/text()"
        ).extract()
        if not amounts:
            logging.error("NO PRICE! %s" % response.url)
            return
        normalised_price = amounts[0].replace(".", "").replace(",", ".")

        product_ids = box.select(
            './/*[contains(@id, "product-price-")]/@id').re(r'product-price-(\d+)')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', product_ids)
        loader.add_value('name', names[0])
        loader.add_value('url', links[0])
        loader.add_xpath('image_url', 'a[@class="product-image"]/img/@src')
        loader.add_xpath('category', '//div[contains(@class, "category-title")]/h1/text()')
        loader.add_value('price', normalised_price)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, or a retry request.

    When the ``form1`` action (used as identifier) is missing, the result of
    ``self.retry_request(response)`` is yielded instead and parsing stops.
    The item is not yielded when ``self.exclude_word`` occurs in its name.
    """
    loader = ProductLoader(item=Product(), selector=response)
    loader.add_value('url', response.url)
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))
    identifier = response.xpath('//form[@name="form1"]/@action').extract()
    if not identifier:
        yield self.retry_request(response)
        return
    identifier = identifier[0]
    loader.add_value('identifier', identifier)
    # Raises IndexError when the page shows no main price.
    price = response.xpath(
        '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
    loader.add_value('price', price)
    # First run of digits in the stock-level text, if any.
    stock = response.xpath('//div[@class="stockLevel"]//text()').re(
        r'(\d+)')
    if stock:
        loader.add_value('stock', stock[0])
    # Brand: itemprop metadata first, hidden producerName input as fallback.
    brand = response.xpath('//*[@itemprop="brand"]/@content').extract()
    if not brand:
        brand = response.xpath(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
    if brand:
        brand = brand[0].strip()
        loader.add_value('brand', brand)
    # NOTE(review): the nesting of the two `else` branches below was
    # reconstructed from a whitespace-collapsed source -- confirm against the
    # original file.  As written: a meta category other than 'Car tyres' is
    # used directly; for 'Car tyres' the page's "Type:" value is used, and
    # the brand when that is missing too.
    if 'category' in response.meta:
        if response.meta['category'] != 'Car tyres':
            loader.add_value('category', response.meta['category'])
        else:
            category = response.xpath(
                '//dt[contains(text(), "Type:")]/following-sibling::dd/text()'
            ).extract()
            if category:
                loader.add_value('category', category[0].strip())
            else:
                loader.add_value('category', loader.get_output_value('brand'))
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('sku', '//*[@itemprop="sku"]/@content')
    if self.exclude_word not in loader.get_output_value('name'):
        yield loader.load_item()
def parse_product(self, response):
    """Yield one product per entry of a ``product-listing-2`` page.

    Non-HTML responses produce nothing.  An entry without a product link
    is skipped rather than raising ``IndexError``, which used to abort the
    generator and drop every remaining entry on the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    entries = hxs.select(
        u'//div[@class="product-listing-2"]/div[contains(@class,"rec")]')
    for product in entries:
        href = product.select(
            u'.//div[@class="description"]/h2/a/@href').extract()
        if not href:
            # Without a link there is no usable product URL.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_xpath(
            'name', u'.//div[@class="description"]/h2/a/text()')
        product_loader.add_xpath(
            'price', u'.//span[@class="prod-price"]/text()', re=u'\$(.*)')
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield one product per centred table row that links with ``_top``.

    Non-HTML responses produce nothing.  A row without a product link is
    skipped rather than raising ``IndexError``, which used to abort the
    generator and drop every remaining row on the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    rows = hxs.select(
        u'//tr[@align="center" and child::td[child::a[@target="_top"]]]')
    for product in rows:
        href = product.select(
            u'.//a[@target="_top" and child::span]/@href').extract()
        if not href:
            # Without a link there is no usable product URL.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_xpath('name', u'.//a[@target="_top"]/span/text()')
        product_loader.add_xpath(
            'price', u'.//span[contains(@class,"price")]/text()',
            re=u'\xa3([\d\.,]+)')
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield one product per ``shopprods`` block on the page.

    A block that has no link or no price text is skipped rather than
    raising ``IndexError``, which used to abort the generator and drop
    every remaining block on the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select('//div[@class="shopprods"]'):
        href = product.select('.//p/strong/a/@href').extract()
        price_text = product.select('.//span[@class="price"]/text()').extract()
        if not href or not price_text:
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath('name', './/p/strong/a/text()')
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))

        # NOTE(review): a flat 5 is added to every listed price --
        # presumably a delivery surcharge; confirm with the spider owner.
        price = Decimal(price_text[0]) + Decimal(5)
        product_loader.add_value('price', str(price))
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield one product per ``<li>`` of the ``frmCompare`` product list.

    Non-HTML responses produce nothing.  A list item without a product
    link is skipped rather than raising ``IndexError``, which used to
    abort the generator and drop every remaining item on the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    entries = hxs.select(
        u'//form[@name="frmCompare"]//ul[@class="ProductList "]//li')
    for product in entries:
        href = product.select(
            u'.//div[@class="ProductDetails"]/strong/a/@href').extract()
        if not href:
            # Without a link there is no usable product URL.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_xpath(
            'name', u'.//div[@class="ProductDetails"]/strong/a/text()')
        product_loader.add_xpath(
            'price', u'.//div[@class="ProductPriceRating"]/em/text()',
            re=u'\$(.*)')
        yield product_loader.load_item()
def parse_products(self, response):
    """Parse a product listing page.

    Yields a ``Request`` for every non-selected pager link (handled by
    this same callback), then one item per product block, each passed
    through ``self.load_item_with_metadata``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Pagination.
    pager_links = hxs.select(
        '//ul[@class="pager"]/li[@class!="selected"]/a/@href').extract()
    for link in pager_links:
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_products)

    # Products.
    category = hxs.select('//*[@id="listing_h1"]/h1/text()').extract()
    for product in hxs.select(
            '//*[@id="listing_products2"]/div[@class="product"]'):
        loader = ProductLoader(item=Product(), selector=product)

        title = product.select(
            './/div[@class="product_title"]/h2/a/text()').extract()[0]
        href = product.select(
            './/div[@class="product_title"]/h2/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, href))
        loader.add_value('name', title.strip())
        loader.add_xpath('image_url',
                         './/div[@class="product_image"]/a/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))

        # Only the part before the first non-breaking space is parsed.
        raw_price = product.select(
            './/span[@class="price"]/text()').extract()[0]
        loader.add_value('price',
                         extract_price_eu(raw_price.split(u'\xa0')[0]))

        sku = product.select('.//table/tr[1]/td[2]/strong/text()').extract()
        if sku:
            loader.add_value('sku', sku[0])

        # The identifier is the number after "-p" at the end of the href.
        identifier = product.select(
            './/div[@class="product_title"]/h2/a/@href').re(
                r"-p([\d]+)$")[0]
        loader.add_value('identifier', identifier)
        loader.add_value('brand', 'LEGO')

        stock = product.select(
            './/table//span[@class="skladom"]/text()').extract()
        if stock:
            quantity = re.search(r"\b([\d]+)\b", stock[0])
            if quantity:
                loader.add_value('stock', quantity.group(1))

        if category:
            loader.add_value('category', category[0])

        yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Parse a product page and yield the product or one item per option.

    Pages without a product code yield nothing.  When the ``buysSelect``
    drop-down has options, each option becomes a copy of the base item
    with the option appended to its name and identifier.  Options that
    carry an ``onclick`` attribute are resolved through a form request
    handled by ``self.parse_options``.
    """
    base_url = get_base_url(response)
    main_image = response.xpath('//img[@id="mainImg"]/@src').extract()
    product_loader = ProductLoader(item=Product(), response=response)

    product_code = response.xpath(
        '//div[@class="productCode"]/span/text()').extract()
    if not product_code:
        return
    product_code = product_code[0]

    product_loader.add_value('identifier', product_code)
    product_loader.add_xpath('name', '//h1/text()')
    if main_image:
        product_loader.add_value('image_url',
                                 response.urljoin(main_image[0]))
    product_loader.add_value('sku', product_code)

    price = response.xpath(
        '//div[@class="prodRightWrapper"]//div[@class="price"]/text()'
    ).extract()[0].strip()
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)

    # The first and last breadcrumb entries are not used as categories.
    breadcrumb = response.xpath(
        '//div[@id="breadCrumbWrapper"]//div[@itemprop="title"]/text()'
    ).extract()
    product_loader.add_value('category', breadcrumb[1:-1])
    product_loader.add_value('brand', '')
    item = product_loader.load_item()

    options_url = "http://www.careco.co.uk/ajaxTwoDimSelect/"
    options = response.xpath(
        '//select[@class="buysSelect"]/option[@value!=""]')
    if not options:
        yield item
        return

    for option in options:
        option_item = deepcopy(item)

        # The option label is the text before the pound sign.
        label = option.xpath('text()').extract()[0]
        option_item['name'] += ' ' + label.split(u'\xa3')[0].strip()

        option_value = option.xpath('@value').extract()[0]
        option_item['identifier'] += '-' + option_value

        option_price = option.xpath('text()').re(u'\xa3\d+\.\d+')
        if option_price:
            option_item['price'] = extract_price(option_price[0])

        if option.xpath('@onclick'):
            formdata = {
                'FS': item['identifier'],
                'CODE': option.xpath('@value').extract()[0],
            }
            yield FormRequest(options_url,
                              dont_filter=True,
                              formdata=formdata,
                              callback=self.parse_options,
                              meta={'item': option_item})
        else:
            yield option_item
def parse_product(self, response):
    """Build a single product item from a product detail page.

    The price is the text of the last ``span.price`` on the page.  The
    SKU is whatever follows the closing tag in the serialised
    "Manufacturers..." label.  A page without that label still yields
    the item, just without a SKU.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(response=response, item=Product())

    # Hand the loader the extracted text, not the selector object.
    prices = hxs.select('//span[@class="price"]/text()').extract()
    if prices:
        loader.add_value('price', prices[-1])

    loader.add_xpath('name', '//div[@class="product_l"]/h2/text()')
    loader.add_value('url', response.url)

    labels = hxs.select(
        "//label[starts-with(text(), 'Manufacturers')]").extract()
    if labels:
        txt = labels[0]
        # Keep what follows "/label>" (7 characters) in the serialisation.
        sku = txt[txt.find('/label>') + 7:]
        loader.add_value('sku', sku.strip())

    yield loader.load_item()
def parse_product(self, response):
    """Yield one product per ``shopprods`` block on the page.

    A block that has no link or no price text is skipped rather than
    raising ``IndexError``, which used to abort the generator and drop
    every remaining block on the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select('//div[@class="shopprods"]'):
        link = product.select('.//p/strong/a/@href').extract()
        listed_price = product.select(
            './/span[@class="price"]/text()').extract()
        if not (link and listed_price):
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath('name', './/p/strong/a/text()')
        product_loader.add_value('url', urljoin_rfc(base_url, link[0]))

        # NOTE(review): a flat 5 is added to every listed price --
        # presumably a delivery surcharge; confirm with the spider owner.
        total = Decimal(listed_price[0]) + Decimal(5)
        product_loader.add_value('price', str(total))
        yield product_loader.load_item()
def parse_product(self, response):
    """Return a product item for the SKU carried in ``response.meta``.

    The SKU from the request meta doubles as the product name.  The price
    is read from the sale-price span and the item-price span of the
    ``yourPrice`` box.  Returns ``None`` for non-HTML responses.
    """
    if not isinstance(response, HtmlResponse):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', response.meta['sku'])

    # Sale price first, then the regular item price.
    for span_class in ('salePriceContent', 'itemPriceContent'):
        loader.add_xpath(
            'price',
            '//div[@class="yourPrice"]/span[@class="%s"]/text()' % span_class,
            re='.*\$(.*)')

    loader.add_value('url', response.url)
    loader.add_value('sku', response.meta['sku'])
    return loader.load_item()
def parse_product(self, response):
    """Yield one product per ``listitem`` block on the page.

    Non-HTML responses produce nothing.  A block without a heading link
    or heading text is skipped rather than raising ``IndexError``, which
    used to abort the generator and drop every remaining block.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select(u'//div[@class="listitem"]'):
        href = product.select(
            u'.//div[@class="heading"]'
            u'/a[child::span[@class="ProductListHead"]]/@href').extract()
        heading = product.select(
            u'.//div[@class="heading"]'
            u'/a/span[@class="ProductListHead"]/text()').extract()
        if not href or not heading:
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_value('name', heading[0].strip())
        product_loader.add_xpath(
            'price',
            u'.//span[@class="price"]/span[@class="ProductListItem"]/text()',
            re=u'\xa3(.*)')
        yield product_loader.load_item()
def parse_product(self, response):
    """Build a LEGO product item from a product detail page.

    The item is yielded through ``self.load_item_with_metadata`` when
    ``self.parse_price`` returns a truthy price.  Otherwise a message is
    appended to ``self.errors`` and nothing is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    title = hxs.select('//div[@class="product-name"]/h1/text()').extract()
    name = title[-1].strip()

    # The last breadcrumb link is the category, unless it is the home
    # link (text starting with "Dom\u016f").
    crumbs = hxs.select('//div[@class="breadcrumbs"]/ul/li/a/text()')
    category = crumbs[-1].extract().strip()
    if category.startswith(u'Dom\u016f'):
        category = ""

    sku = self.get_sku_from_text(name)
    pid = hxs.select('//input[@name="product"]/@value')[-1].extract()
    if not sku:
        sku = ""

    price_text = "".join(
        hxs.select(
            '//span[contains(@id, "product-price")]/descendant-or-self::text()'
        ).extract())
    price = self.parse_price(price_text)

    if not price:
        self.errors.append("No price set for url: '%s'" %
                           urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url',
                     '//div[@class="product-img-box"]/p/img/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')
    loader.add_value('shipping_cost', 59)
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield the page's product only if its model number matches.

    The second text node of the "Model#" form field is compared with
    ``response.meta['mfrgid']``; the item is yielded only when they are
    equal.  Non-HTML responses produce nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath(
        'price',
        u'//span[@class="price"]/span[@class="price" and contains(@id, "sec_discounted_price")]/text()')
    loader.add_value('url', response.url)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'].lower())
    loader.add_xpath(
        'name',
        u'//div[@class="product-info"]/h1[@class="mainbox-title"]/text()')

    field_texts = hxs.select(
        u'//div[@class="form-field" and child::label[contains(text(),"Model#")]]/text()'
    ).extract()
    if len(field_texts) > 1:
        site_model = field_texts[1].strip()
        if site_model == response.meta['mfrgid']:
            yield loader.load_item()
def parse_products(self, hxs):
    """Yield one product per element whose id starts with ``product_``.

    ``hxs`` is the selector to search.  URL, name, price and SKU are all
    read relative to each product element.
    """
    # (field, XPath) pairs that need no extra processing.
    plain_fields = (
        ('url', './/span[@class="description"]/a/@href'),
        ('name', './/span[@class="description"]/a/b/text()'),
        ('price', './/div[@class="our_price"]/text()'),
    )
    for node in hxs.select('//div[starts-with(@id, "product_")]'):
        loader = ProductLoader(Product(), node)
        for field, xpath in plain_fields:
            loader.add_xpath(field, xpath)
        # The SKU is the token following "Model #:" in the description.
        loader.add_xpath('sku', './/span[@class="description"]',
                         re='Model #:[\s(]*([\S^)]*)')
        yield loader.load_item()
def parse_products(self, response):
    """Parse a product listing page.

    Yields a ``Request`` for every non-selected pager link (handled by
    this same callback), then one item per ``productBody`` block, each
    passed through ``self.load_item_with_metadata``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Pagination.
    for link in hxs.select(
            '//ul[@class="pager"]/li[@class!="selected"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_products)

    # Products.
    category = hxs.select('//*[@id="wherei"]/p//a/text()').extract()
    for product in hxs.select('//div[@class="productBody"]'):
        loader = ProductLoader(item=Product(), selector=product)

        name = product.select(
            './/div[@class="productTitleContent"]/a/text()'
        ).extract()[0].strip()
        href = product.select(
            './/div[@class="productTitleContent"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, href))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         './/div[@class="img_box"]/a/img[1]/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))

        # Only the part before the first non-breaking space is parsed.
        raw_price = product.select(
            './/div[@class="productPrice"]/span[contains(@itemprop, "price")]/text()'
        ).extract()[0]
        price = extract_price_eu(raw_price.split(u'\xa0')[0])
        loader.add_value('price', price)

        # The first standalone number in the name is used as the SKU.
        number = re.search(r"\b([\d]+)\b", name)
        if number:
            loader.add_value('sku', number.group(1))

        identifier = product.select(
            './/div[@class="img_box"]/a/img[1]/@rel').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('brand', 'LEGO')

        # A "stock_no" block marks the product as out of stock.
        if product.select('.//div[@class="stock_no"]').extract():
            loader.add_value('stock', 0)
        if category:
            loader.add_value('category', category[-1])
        if price < 15:
            loader.add_value('shipping_cost', 2.69)

        yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield one product per container of a bold pound-sign price.

    Containers are found two levels above any ``<b>`` whose text contains
    a pound sign.  Containers whose price text does not match the price
    pattern are skipped.  Non-HTML responses produce nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    for container in hxs.select(u'//b[contains(text(), "\xa3")]/../..'):
        loader = ProductLoader(item=Product(), selector=container)
        loader.add_xpath('name', './b/font/text()')
        loader.add_value('url', response.url)

        amount = container.select(
            u'.//b[contains(text(), "\xa3")]/text()').re('\xa3(.*[0-9])')
        if amount:
            loader.add_value('price', amount)
            yield loader.load_item()
def parse_product(self, response):
    """Build a single product item from a product detail page.

    The SKU is the joined, stripped text of the ``product_code`` span
    with its first three characters removed.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(response=response, item=Product())

    loader.add_xpath(
        'name',
        '//font[@class="productnamecolorLARGE colors_productname"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'price', '//font[@class="pricecolor colors_productprice"]/text()')

    code_parts = hxs.select('//span[@class="product_code"]/text()').extract()
    code_text = ''.join(code_parts).strip()
    # NOTE(review): the first three characters are dropped -- presumably
    # a fixed prefix in front of the code; confirm against the site.
    loader.add_value('sku', code_text[3:])

    yield loader.load_item()
def parse_product(self, response):
    """Yield matching products from a result table or a single product page.

    Table rows are accepted when their first cell equals
    ``response.meta['search_q']`` or their manufacturer id equals
    ``response.meta['mfrgid']`` (case-insensitive), and they have a price.
    When the table yields no rows at all and the page has a product title,
    the page is treated as a single product and yielded only if its
    manufacturer id matches.  Non-HTML responses produce nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    def drop_regular_price(text):
        # Remove a trailing "...Regularly ... $..." note from a name.
        if '...Regularly' in text:
            return re.sub('\.{3}Regularly.*?\$.*$', '', text)
        return text

    hxs = HtmlXPathSelector(response)
    products = hxs.select('//table[@width="86%"]/tr')

    for row in products:
        row_sku = row.select('./form/td[1]/b/text()').extract()
        if not row_sku:
            continue

        row_maker = row.select(
            './form/td[2]/font[contains(text(),"Manufacturer")]/b/text()'
        ).extract()
        if row_maker:
            maker_matches = (row_maker[0].lower() ==
                             response.meta['mfrgid'].lower())
        else:
            maker_matches = False

        if not (row_sku[0] == response.meta['search_q'] or maker_matches):
            continue

        price = "".join(
            row.select("./form/td[3]/font/b/text()").re(r'([0-9\,\. ]+)')
        ).strip()
        if not price:
            continue

        name = row.select('./form/td[2]/text()').extract()[0]
        product_loader = ProductLoader(item=Product(), response=response)
        name = drop_regular_price(name)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_value('name', response.meta['sku'] + ' ' + name)
        yield product_loader.load_item()

    # Fallback: a single product page instead of a result table.
    name = hxs.select(u'//h1[@class="big product_title"]/text()').extract()
    if not products and name:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', drop_regular_price(name[0]))
        product_loader.add_xpath(
            'price',
            u'//dt[@id="prod_price"]//span[@class="small"]/strong[@class="big"]/text()',
            re='\$(.*)')
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_value('url', response.url)

        page_maker = hxs.select(
            u'//span[@class="small" and contains(text(),"Manufacturer")]/following-sibling::strong[1]/text()'
        ).extract()
        if page_maker:
            page_maker = page_maker[0].lower().strip()
            if page_maker == response.meta['mfrgid'].strip().lower():
                yield product_loader.load_item()
def parse_product(self, response):
    """Yield the featured product and every product tile of a listing page.

    Non-HTML responses produce nothing.  The featured product is emitted
    only when it has a link, so no URL-less item is produced when the
    featured block is missing.  If either the featured block or the
    product tiles are missing, a warning is logged and the page is
    requested again, up to three retries tracked in
    ``response.meta['retries']``.
    """
    if not isinstance(response, HtmlResponse):
        return

    def add_first_price(loader, node):
        # Use the first of the two known price elements that yields a
        # number, with "," replaced by ".".
        for tag, css_class in (('span', 'newprice'), ('div', 'price')):
            price = node.select(
                u'.//' + tag + '[@class="' + css_class + '"]/text()'
            ).re(u'([0-9\,\.]+)')
            if price:
                loader.add_value('price', re.sub(',', '.', price[0]))
                break

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    featured_product = hxs.select(u'//div[@class="featuredProduct"]')
    featured_href = featured_product.select(
        u'.//div[@class="fDescription"]/a/@href').extract()
    if featured_href:
        product_loader = ProductLoader(item=Product(),
                                       selector=featured_product)
        product_loader.add_value('url',
                                 urljoin_rfc(base_url, featured_href[0]))
        product_loader.add_xpath(
            'name', u'.//div[@class="fDescription"]/a/strong/text()')
        add_first_price(product_loader, featured_product)
        yield product_loader.load_item()

    products = hxs.select(
        u'//div[contains(@class,"productsRow")]/div[contains(@class,"productItem")]')
    for product in products:
        href = product.select(
            u'.//div[@class="prodDecription"]/a/@href').extract()
        if not href:
            continue
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_xpath(
            'name', u'.//div[@class="prodDecription"]/a/text()')
        add_first_price(product_loader, product)
        yield product_loader.load_item()

    if not products or not featured_product:
        log.msg('Retrying url: %s' % response.url, level=log.WARNING)
        retries = response.meta.get('retries', 0)
        if retries < 3:
            # NOTE(review): no callback is set, so the retry goes to the
            # spider's default callback -- confirm this is intended.
            yield Request(response.url,
                          dont_filter=True,
                          meta={'retries': retries + 1})
def parse_product(self, response):
    """Build a LEGO product item from a product detail page.

    When the page has no ``itemprop="name"`` text, a request for a
    rewritten ``detail.jsp?pName=`` URL is yielded instead (handled by
    this same callback) and nothing else is produced.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    try:
        name = hxs.select(
            '//*[@itemprop="name"]/text()').extract()[-1].strip()
    except IndexError:
        fallback_url = response.url.replace(
            'hamleys.com/', 'hamleys.com/detail.jsp?pName=')
        fallback_url = fallback_url.replace('.ir', '')
        yield Request(fallback_url, callback=self.parse_product)
        return

    stock_text = ''.join(
        hxs.select('//li[@class="stockStatus"]/span/text()').extract())
    out_of_stock = 'OUT OF STOCK' in stock_text.upper()

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@class="productMain"]/@src',
                     TakeFirst())
    loader.add_xpath('price', '//div[@class="productprice "]/text()',
                     Join(''), re="([.0-9]+)")

    # The second-to-last breadcrumb link is the category.
    crumbs = hxs.select(
        '//div[@class="pagetopnav"]/ul[contains(@class, "crumb")]/li/a/text()'
    ).extract()
    loader.add_value('category', crumbs[-2])

    # The SKU is a trailing run of three or more digits in the name.
    loader.add_value('sku', name, re=' (\d\d\d+)\s*$')
    loader.add_value('brand', 'LEGO')

    product_id = hxs.select('//*[@itemprop="productID"]/text()').extract()[0]
    loader.add_value('identifier', product_id.replace('Code: ', ''))

    if out_of_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Build a product item from schema data and follow grouped items.

    Identifier, SKU and name come from ``SpiderSchema``.  The price is
    the first found among the minimal, regular and special price
    elements.  Shipping is '0.0' when the loaded price is at least 45.0
    and '4.95' otherwise.  After the item, a request for every linked
    grouped product is yielded (handled by this same callback).
    """
    product_data = SpiderSchema(response).get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_data['productID'])
    loader.add_value('sku', product_data['productID'])
    loader.add_value('name', product_data['name'])

    # In stock only if the page shows no out-of-stock marker AND the
    # schema offer says "InStock".
    marked_out = bool(response.css('.product-shop .out-of-stock'))
    in_stock = (not marked_out) and (
        'InStock' in product_data['offers']['properties']['availability'])
    loader.add_value('stock', 1 if in_stock else 0)

    breadcrumb = response.css('.breadcrumbs').xpath(
        './/li/a/text()').extract()
    loader.add_value('category', breadcrumb[1:])
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand', '//th[@class="label" and contains(text(), '
        '"Brand")]/following-sibling::td/text()')

    price = None
    for price_class in ('minimal-price', 'regular-price', 'special-price'):
        price = response.css(
            '.product-shop .price-box .%s .price' % price_class
        ).xpath('text()').re_first(r'[\d\.,]+')
        if price:
            break
    loader.add_value('price', price)

    if loader.get_output_value('price') >= Decimal('45.0'):
        shipping_cost = '0.0'
    else:
        shipping_cost = '4.95'
    loader.add_value('shipping_cost', shipping_cost)

    yield loader.load_item()

    grouped_links = response.css(
        '.grouped-items-table-wrapper .name-wrapper'
    ).xpath('a/@href').extract()
    for link in grouped_links:
        yield Request(link, callback=self.parse_product)
def parse_product(self, response):
    """Yield a product, or one price request per colour/size option.

    Pages without a ``variantdiv`` block yield a single item.  Otherwise
    one request per colour/size combination is yielded (handled by
    ``self.parse_option_price``), with the option-specific name and the
    page URL in the request meta.  The first entry of each option list is
    skipped.  Non-HTML responses produce nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_name = hxs.select(
        u'//div[@class="ProductTopTitle"]/h1/text()').extract()
    multiple_options = hxs.select('//div[@class="variantdiv"]')

    if not multiple_options:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', base_name)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price',
                                 u'//div[@class="webPriceLabel"]/text()',
                                 re=u'\xa3(.*)')
        yield product_loader.load_item()
        return

    def option_request(params, label):
        # Request the same page with the option in the query string.
        url = response.url + '?' + urlencode(params)
        request = Request(url, callback=self.parse_option_price,
                          dont_filter=True)
        request.meta['name'] = base_name[0] + ' ' + label
        request.meta['url'] = response.url
        return request

    color_options = multiple_options.select(
        u'.//select[contains(@id,"Color")]/option/@value').extract()
    size_options = multiple_options.select(
        u'.//select[contains(@id,"Size")]/option/@value').extract()

    if color_options:
        for color in color_options[1:]:
            if size_options:
                for size in size_options[1:]:
                    yield option_request({'Colour': color, 'Size': size},
                                         size + ' ' + color)
            else:
                yield option_request({'Colour': color}, color)
    elif size_options:
        for size in size_options[1:]:
            yield option_request({'Size': size}, size)
def parse_product(self, response):
    """Yield one product per container of a bold pound-sign price.

    Each container is the grandparent of a ``<b>`` element whose text
    contains a pound sign.  A container is skipped when no price can be
    extracted from it.  Non-HTML responses produce nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    price_holders = selector.select(u'//b[contains(text(), "\xa3")]/../..')

    for holder in price_holders:
        item_loader = ProductLoader(item=Product(), selector=holder)
        item_loader.add_xpath('name', './b/font/text()')
        item_loader.add_value('url', response.url)

        pounds = holder.select(
            u'.//b[contains(text(), "\xa3")]/text()').re('\xa3(.*[0-9])')
        if not pounds:
            continue

        item_loader.add_value('price', pounds)
        yield item_loader.load_item()
def parse_product(self, response):
    """Yield one product per ``<li>`` of the ``product-list``.

    The name is "<brand> <link text> <list data>".  Non-HTML responses
    produce nothing.  An entry missing its link or any name part is
    skipped rather than raising ``IndexError``, which used to abort the
    generator and drop every remaining entry on the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select(u'//ul[@class="product-list"]/li'):
        href = product.select(
            u'.//div[@class="listItemLink"]/a/@href').extract()
        brand = product.select(
            u'.//div[@class="listBrand"]/text()').extract()
        title = product.select(
            u'.//div[@class="listItemLink"]/a/text()').extract()
        details = product.select(
            u'.//div[@class="listData"]/text()').extract()
        if not (href and brand and title and details):
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
        product_loader.add_value(
            'name', u' '.join((brand[0], title[0], details[0])))
        product_loader.add_xpath(
            'price', u'.//span[@class="salePrice"]/span/text()',
            re=u'\$(.*)')
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield one product per ``product_listing`` block on the page.

    The price comes from the "our price" element, falling back to the
    sale price.  Blocks with neither are logged and skipped.  The
    identifier is the ``id=<digits>`` part of the product link and the
    category is taken from ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    id_regex = re.compile(r'id=(\d+)')

    for product in hxs.select('//div[@class="product_listing"]'):
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath(
            'price', './/span[@class="prod_our_price"]/strong/text()',
            re='.*\$(.*[0-9])')

        price = product.select(
            './/span[@class="prod_our_price"]/strong/text()').re(
                r'.*\$(.*[0-9])')
        if not price:
            price = product.select(
                './/span[@class="prod_sale_price"]/span/text()').re(
                    r'.*\$(.*[0-9])')
        if not price:
            self.log('NO PRICE => %s' % response.url)
            continue
        product_loader.add_value('price', price[0])

        # The SKU is the parenthesised product number without dashes.
        sku_match = product.select(
            './/span[@class="prod_number"]/text()').re('\((.*)\)')
        product_loader.add_value('sku', re.sub('[\-]', '', sku_match[0]))

        product_loader.add_xpath('name',
                                 './/span[@class="prod_name"]/a/@title')

        href = product.select('.//span[@class="prod_name"]/a/@href').extract()
        product_loader.add_value('identifier',
                                 re.search(id_regex, href[0]).groups()[0])
        product_loader.add_value(
            'url', urljoin_rfc(get_base_url(response), href[0]))

        image = product.select('.//img/@src').extract()
        if image:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), image[0]))

        product_loader.add_value('category', response.meta.get('category'))
        yield product_loader.load_item()
def parse_product(self, response):
    """Build a LEGO product item from a product detail page.

    The item is yielded through ``self.load_item_with_metadata`` when the
    page carries a price meta tag.  Otherwise a message is appended to
    ``self.errors`` and nothing is yielded.  The missing-price case is
    now actually reported: the price lookup no longer raises
    ``IndexError`` before the check is reached.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('//*[@itemprop="name"]/text()').extract().pop().strip()

    # The last breadcrumb link is the category; a leading "\xbb" marker
    # and the character after it are dropped.
    categories = map(unicode.strip,
                     hxs.select('//ul[@id="breadcrumbs"]/li/a/text()').extract())
    if categories:
        category = categories[-1]
        if category.startswith(u"\xbb"):
            category = category[2:]
    else:
        category = ''

    pid = hxs.select('//*[@itemprop="identifier"]/text()').pop().extract().strip()
    sku = hxs.select(
        u'//th[contains(text(), "K\xf3d produktu")]/following-sibling::td[1]/text()'
    ).extract().pop().strip()

    # Use the last matching price meta tag; an absent tag gives an empty
    # price so that the error branch below is reachable.
    prices = hxs.select('//meta[@itemprop="price"]/@content').extract()
    price = prices[-1] if prices else ''

    stock = hxs.select(
        '//meta[@itemprop="availability" and @content="in_stock"]')

    if not price:
        self.errors.append("No price set for url: '%s'" %
                           urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')
    # Shipping is charged only below a price of 3000.
    if Decimal(price) < Decimal('3000'):
        loader.add_value('shipping_cost', 95)
    if not stock:
        loader.add_value('stock', 0)
    yield self.load_item_with_metadata(loader.load_item())