def parse_product(self, response):
    """Build a product item from a product detail page.

    The SKU is looked up in ``self.skus_dict`` under the MD5 hex digest of
    the stripped product name; a missing entry propagates the lookup error.
    An item is yielded only when both a name and a price were extracted.
    """
    if not isinstance(response, HtmlResponse):
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//td[@class="page_headers"]/text()')
    product_loader.add_xpath(
        'price',
        u'//td[@class="price-info"]//div[@id="price" and @class="price"]/text()',
        re=u'\$(.*)')

    name = product_loader.get_output_value('name')
    if not name:
        # Without a name there is nothing to hash and nothing worth
        # yielding; bail out instead of failing on ``None.strip()``.
        return

    hashed_name = hashlib.md5(name.strip()).hexdigest()
    product_loader.add_value('sku', self.skus_dict[hashed_name])
    product_loader.add_xpath('sku', u'//span[@id="product_id"]/text()')
    product_loader.add_value(
        'identifier', product_loader.get_output_value('sku').lower())

    if product_loader.get_output_value('price'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product when the page's part number matches the request.

    ``response.meta`` must carry ``sku`` and ``mfrgid``. The item is yielded
    only if the "Part No." shown in the title equals ``mfrgid`` and a price
    was extracted. Pages without a title or part number yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)

    product_loader.add_xpath(
        'price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
    if not product_loader.get_output_value('price'):
        # Fall back to the alternative price markup.
        product_loader.add_xpath(
            'price',
            u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')

    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('identifier', response.meta['sku'].lower())

    names = hxs.select(
        u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()'
    ).extract()
    part_numbers = hxs.select(
        u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
    if not names or not part_numbers:
        # Not a recognisable product page; indexing [0] here used to raise.
        return

    product_loader.add_value('name', names[0].strip())

    mfrgid = response.meta['mfrgid']
    if part_numbers[0] == mfrgid and product_loader.get_output_value('price'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product listing page and follow its pagination link.

    Yields a request for the next page (handled by this same callback) and
    one item per product tile that has both a name and a price. Tiles that
    lack the product link or either name fragment are skipped.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # pages
    next_page = hxs.select(
        u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href'
    ).extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_product)

    for product in hxs.select(u'//div[contains(@class,"itemGrid")]'):
        link = product.select(u'.//a[@class="oesLink"]/@href').extract()
        name_head = product.select(
            u'.//a[@class="oesLink"]/span/text()').extract()
        name_tail = product.select(
            u'.//a[@class="oesLink"]/text()').extract()
        if not (link and name_head and name_tail):
            # A malformed tile must not abort the remaining products.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, link[0]))
        product_loader.add_value('name', name_head[0] + ' ' + name_tail[0])
        product_loader.add_xpath(
            'price', u'.//span[@class="PlistOfferPrice"]/text()',
            re=u'\$(.*)')
        product_loader.add_xpath(
            'price', u'.//div[@class="pricing"]/span/div/span/text()',
            re=u'\$(.*)')

        if (product_loader.get_output_value('name')
                and product_loader.get_output_value('price')):
            yield product_loader.load_item()
def parse_product(self, response):
    """Return the product item when the page's "Mfg Part" value matches.

    ``response.meta`` must carry ``sku``, ``mfrgid`` and ``name``. Returns
    ``None`` for non-HTML responses, for pages without a price or name, and
    when the manufacturer part shown on the page equals neither ``mfrgid``
    nor one of the space-separated words of ``meta['name']``.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    product_loader.add_xpath(
        'price', '//div[@class="club"]/span[@itemprop="Price"]/text()',
        re='.*\$(.*[0-9])')
    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('identifier', response.meta['sku'].lower())

    if not product_loader.get_output_value('price'):
        return

    mfrgid = response.meta['mfrgid']
    if not product_loader.get_output_value('name'):
        return

    spec_texts = hxs.select(
        u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()'
    ).extract()
    if len(spec_texts) < 2:
        return

    # Strip once so both comparisons see the same value; the raw text node
    # may carry surrounding whitespace and would then never equal a word
    # produced by ``split(' ')``.
    site_mfrgid = spec_texts[1].strip()
    name_words = response.meta['name'].split(' ')
    if site_mfrgid and (mfrgid == site_mfrgid or site_mfrgid in name_words):
        return product_loader.load_item()
def parse_product(self, response):
    """Yield the product when the page's part number matches the request.

    ``response.meta`` must carry ``sku`` and ``mfrgid``. The item is yielded
    only if the "Part No." shown in the title equals ``mfrgid`` and a price
    was extracted. Pages without a title or part number yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)

    product_loader.add_xpath(
        'price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
    if not product_loader.get_output_value('price'):
        # Fall back to the alternative price markup.
        product_loader.add_xpath(
            'price',
            u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')

    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('identifier', response.meta['sku'].lower())

    names = hxs.select(
        u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()'
    ).extract()
    part_numbers = hxs.select(
        u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
    if not names or not part_numbers:
        # Not a recognisable product page; indexing [0] here used to raise.
        return

    product_loader.add_value('name', names[0].strip())

    mfrgid = response.meta['mfrgid']
    if part_numbers[0] == mfrgid and product_loader.get_output_value('price'):
        yield product_loader.load_item()
def parse_brand_list(self, response):
    """Parse a brand listing page into product items.

    For every product tile an item is loaded with name, url, category,
    image, price, shipping cost, sku/identifier, brand and stock. When the
    loader ends up without a brand, a request to the product page is yielded
    (callback ``self.parse_brand``, item passed in ``meta``); otherwise the
    item itself is yielded.
    """
    hxs = HtmlXPathSelector(response)

    # products
    product_items = hxs.select(
        '//div[@class="productGrid"]/ul/li/div[@class="item"]')
    category_items = hxs.select(
        '//h1[@class="categoryLandingPageTitle_heading"]/a/text()').extract()
    category = category_items[0] if category_items else ''
    brand_name = get_brand_from_url(response.url)

    def get_full_image_url(url):
        # Resolve a possibly relative URL against this response.
        return get_full_url(response, url)

    for product_item in product_items:
        image_url = product_item.select(
            u'div[@class="prodimg"]/a/img/@src').extract()
        if image_url:
            image_url = get_full_url(response, image_url[0])

        ploadr = ProductLoader(item=Product(), selector=product_item,
                               response=response)
        ploadr.add_xpath('name', 'div[@class="prodname"]/a/text()',
                         TakeFirst(), Compose(unicode.strip))
        ploadr.add_xpath('url', 'div[@class="prodname"]/a/@href',
                         TakeFirst(), Compose(unicode.strip),
                         Compose(get_full_image_url))
        ploadr.add_value('category', category)
        ploadr.add_value('image_url', image_url)

        price = ploadr.get_xpath(
            'div[@class="proddetails"]//div[@class="prodnowprice"]/span/text()',
            TakeFirst(), Compose(extract_price))
        price_excl_vat = Decimal(price)
        ploadr.add_value('price', price_excl_vat)
        ploadr.add_value(
            'shipping_cost',
            Decimal('5.00') if price_excl_vat < 50 else Decimal('0.0'))

        ploadr.add_xpath(
            'sku',
            'div[@class="proddetails"]//div[@class="proditemcode"]/a/span/following-sibling::text()',
            TakeFirst(), Compose(unicode.strip))
        ploadr.add_value('identifier', ploadr.get_output_value('sku'))

        stock_info = product_item.select(
            u'div[@class="proddetails"]/div/div/span[contains(@class, "instock")]/@class'
        ).extract()
        buy_button = product_item.select(
            u'div[@class="proddetails"]/div[@class="prodquickbuy"]/a[@class="primaryBtn"]'
        ).extract()
        ploadr.add_value('brand', brand_name)
        ploadr.add_value('stock', 1 if stock_info or buy_button else 0)

        item = ploadr.load_item()

        # The XPath must be relative (".//"): an absolute "//" is evaluated
        # from the document root and would collect the item-code text of
        # every product on the page instead of this tile's own code.
        tmp = ''.join(product_item.select(
            ".//div[@class='proditemcode']//text()").extract())
        item['metadata'] = {'product_code': tmp.split(':')[-1].strip()}

        if not ploadr.get_output_value('brand'):
            yield Request(item['url'], meta={'item': item},
                          callback=self.parse_brand)
        else:
            yield item
def parse_product(self, response):
    """Yield the searched product from a results page or a detail page.

    ``response.meta`` must carry ``sku`` and ``mfrgid``. When the page shows
    search-result boxes only the first one is examined; when it shows none
    the page is treated as a single product page. In both cases the item is
    yielded only if ``mfrgid`` occurs in the code shown on the site and at
    least one space-separated word of the SKU occurs in the product name
    (both compared in lower case).
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="summaryboxsearch"]')

    for product in products[0:1]:  # extract only the first product
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath(
            'price', u'.//span[@class="floatl sli_price"]/text()')
        product_loader.add_xpath(
            'url', u'.//p[@class="mtext nobreak"]/a/@title')
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath(
            'name', u'.//p[@class="mtext nobreak"]/a/text()')

        # A missing name is treated as empty so no SKU word can match,
        # instead of failing on ``None.lower()``.
        name = (product_loader.get_output_value('name') or u'').lower()
        sku_words = product_loader.get_output_value('sku').lower().split(' ')
        # A real list keeps the truthiness test below meaningful on any
        # Python version (a lazy ``filter`` object is always truthy).
        sku = [word for word in sku_words if word != '' and word in name]

        site_mfrgid = product.select(
            './/span[@class="floatl sli_grid_code"]/text()').extract()
        if site_mfrgid:
            mfrgid = response.meta['mfrgid'].lower()
            site_mfrgid = site_mfrgid[0].strip().lower()
            if mfrgid in site_mfrgid and sku:
                yield product_loader.load_item()

    if not products:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('price', u'//p[@class="strong"]/span/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath(
            'name', u'//div[@class="indentl orderbox"]//h1/text()')

        name = (product_loader.get_output_value('name') or u'').lower()
        sku_words = product_loader.get_output_value('sku').lower().split(' ')
        sku = [word for word in sku_words if word != '' and word in name]

        site_mfrgid = hxs.select(
            '//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()'
        ).extract()
        if site_mfrgid:
            site_mfrgid = site_mfrgid[0].strip().lower()
            mfrgid = response.meta['mfrgid'].lower()
            if mfrgid in site_mfrgid and sku:
                yield product_loader.load_item()
def parse_product(self, response):
    """Load a product item from a product detail page.

    When the page has no ``form1`` action (used as identifier) the result of
    ``self.retry_request(response)`` is yielded instead. Pages without a
    price or a name are logged and skipped. The item is yielded only when
    ``self.exclude_word`` does not occur in the product name.
    """
    loader = ProductLoader(item=Product(), selector=response)
    loader.add_value('url', response.url)

    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))

    identifier = response.xpath('//form[@name="form1"]/@action').extract()
    if not identifier:
        yield self.retry_request(response)
        return
    loader.add_value('identifier', identifier[0])

    price = response.xpath(
        '//*[@class="price"]/*[@class="mainPrice"]/text()').extract()
    if not price:
        # Indexing the empty result used to raise IndexError.
        self.log('No price found on %s' % response.url)
        return
    loader.add_value('price', price[0])

    stock = response.xpath('//div[@class="stockLevel"]//text()').re(r'(\d+)')
    if stock:
        loader.add_value('stock', stock[0])

    brand = response.xpath('//*[@itemprop="brand"]/@content').extract()
    if not brand:
        brand = response.xpath(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
    if brand:
        loader.add_value('brand', brand[0].strip())

    # NOTE(review): the nesting of these branches was reconstructed from a
    # whitespace-collapsed source -- confirm against the original layout.
    if 'category' in response.meta:
        if response.meta['category'] != 'Car tyres':
            loader.add_value('category', response.meta['category'])
        else:
            category = response.xpath(
                '//dt[contains(text(), "Type:")]/following-sibling::dd/text()'
            ).extract()
            if category:
                loader.add_value('category', category[0].strip())
    else:
        loader.add_value('category', loader.get_output_value('brand'))

    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('sku', '//*[@itemprop="sku"]/@content')

    name = loader.get_output_value('name')
    if not name:
        # ``exclude_word not in None`` would raise TypeError.
        self.log('No name found on %s' % response.url)
        return
    if self.exclude_word not in name:
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page that may also link to product variants.

    Every link in the variant table is yielded as a new ``Request``. The
    page's own item (name built from brand, title and SKU; price with the
    comma replaced by a dot) is yielded only when a name was loaded and the
    page has no variant links.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)

    multiple_products = selector.select(
        u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
    for variant_url in multiple_products:
        yield Request(variant_url)

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)

    title = selector.select(u'//h1[@id="sku_Title"]/text()').extract()
    if not title:
        return

    # Name layout: "<brand> <title> (<sku>)" with brand and sku optional.
    name = title[0].strip()
    brand = selector.select(
        u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
    if brand:
        name = brand[0] + ' ' + name
    sku = selector.select(
        u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()'
    ).extract()
    if sku:
        name += ' (' + sku[0].strip() + ')'
    product_loader.add_value('name', name)

    price = selector.select(
        u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()'
    ).re(u'([\d\.,]+)')
    if price:
        # Decimal comma -> decimal point.
        price = price[0].replace(',', '.')
    product_loader.add_value('price', price)

    if multiple_products:
        return
    if product_loader.get_output_value('name'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Parse highlighted products from a listing page.

    Yields one item per product block that has a link. When the page shows
    no product blocks at all, the same URL is requested once more (the
    attempt count is tracked in ``meta['retries']``).
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select(
        u'//form/div[contains(@class,"highlightProduits hproduct")]')

    for product in products:
        link = product.select(u'.//h3/a[@class="item url"]/@href').extract()
        if not link:
            # A block without a link must not abort the remaining products.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, link[0]))
        product_loader.add_xpath('name', u'.//h3/a[@class="item url"]/text()')
        product_loader.add_xpath('price', u'.//p[@class="price"]/text()',
                                 re=u'([0-9\.]+)')
        if not product_loader.get_output_value('price'):
            product_loader.add_xpath(
                'price', u'.//p[contains(@class,"price")]/text()',
                re=u'([0-9\.]+)')
        yield product_loader.load_item()

    if not products:
        retries = response.meta.get('retries', 0)
        if retries < 1:
            # Log only when a retry is actually issued.
            log.msg('Retrying url: %s' % response.url, level=log.WARNING)
            yield Request(response.url, dont_filter=True,
                          meta={'retries': retries + 1})
def parse_subproduct(self, subprod):
    """Build an item from one sub-product table row.

    ``subprod`` must be an ``HtmlXPathSelector``; anything else yields
    nothing. URL and name come from the link in the first cell, the price
    from the third cell (or its ``<ins>`` child when the cell text is
    empty). The item is yielded only when a name was loaded.
    """
    if not isinstance(subprod, HtmlXPathSelector):
        return

    def tidy_price(raw):
        # Decimal comma -> point, and drop u'\xe2'. Per the original comment
        # this removes the euro sign -- presumably the first byte of a
        # mis-decoded symbol; TODO confirm.
        return raw.replace(u',', u'.').replace(u'\xe2', u"").strip()

    url = join(subprod.select(u'td[1]/a/@href').extract())
    name = join(subprod.select(u'td[1]/a/@title').extract())

    price = tidy_price(join(subprod.select(u'td[3]/text()').extract()))
    if not price:
        # if there is a discount the price is in another element
        price = tidy_price(
            join(subprod.select(u'td[3]/ins/text()').extract()))

    # strip html tags from name
    name = re.sub('<[^<]+?>', '', name)

    product_loader = ProductLoader(item=Product(), selector=subprod)
    for field, value in (('name', name), ('url', url), ('price', price)):
        product_loader.add_value(field, value)

    if product_loader.get_output_value('name'):
        yield product_loader.load_item()
def parse_category(self, response):
    """Parse a category page: sub-categories, products and retry handling.

    Yields a request per sub-category link (same callback). For every listed
    product an item is loaded; if its identifier is in
    ``self.products_cache`` the cached ``sku`` and ``image_url`` are copied
    and the item is yielded, otherwise the product page is requested with
    ``self.parse_product``. A page without products is re-requested up to
    three times, counted in ``meta['retry']``.
    """
    # more categories
    categories = response.xpath(
        u'//div[@id="subcategories"]/ul/li//a[1]/@href').extract()
    for category in categories:
        yield Request(response.urljoin(category),
                      callback=self.parse_category)

    # products
    products = response.xpath(u'//ul[@id="product_list"]/li')

    crumb_links = response.xpath(
        u'//div[@class="breadcrumb" and position()=1]/a[not(position()=1)]/text()'
    ).extract()
    crumb_texts = response.xpath(
        u'//div[@class="breadcrumb" and position()=1]//text()').extract()
    # De-duplicate while keeping breadcrumb order (a set would scramble it).
    # The [-1:] slice tolerates a page without breadcrumb, which used to
    # raise before the retry branch below could run.
    products_category = []
    for crumb in crumb_links + crumb_texts[-1:]:
        if crumb not in products_category:
            products_category.append(crumb)

    for product_xs in products:
        pack_price = product_xs.xpath(
            './/span[@class="price-pack"]//text()').re(r'[\d\,.]+')
        price = product_xs.xpath(
            './/span[@class="price"]/text()').re(r'[\d\,.]+')

        loader = ProductLoader(item=Product(), selector=product_xs)
        loader.add_xpath('identifier', './/h3/a/@href', re=r'/(\d+)-')
        loader.add_xpath('name', './/h3/a/text()')
        loader.add_xpath('url', './/h3/a/@href')
        loader.add_value('category', products_category)
        # The pack price wins over the regular price when both are shown.
        loader.add_value('price',
                         pack_price[-1] if pack_price else price[-1])

        price = loader.get_output_value('price')
        if price:
            loader.add_value(
                'shipping_cost',
                Decimal('4.95') if price < Decimal('50') else Decimal('0.0'))

        in_stock = bool(product_xs.xpath(
            './/img[@class="ticky-tick" and (@alt="product in stock" or contains(@alt, "days"))]'
        ))
        loader.add_value('stock', 1 if in_stock else 0)

        item = loader.load_item()
        item['metadata'] = {'product_code': item['identifier']}

        if item['identifier'] in self.products_cache:
            self.log('Product found in cache => %s' % item['identifier'])
            cached = self.products_cache[item['identifier']]
            item['sku'] = cached['sku']
            item['image_url'] = cached['image_url']
            yield item
        else:
            self.log('Product NOT found in cache => %s' % item['identifier'])
            yield Request(item['url'], callback=self.parse_product)

    if not products:
        meta = response.meta.copy()
        meta['retry'] = meta.get('retry', 0)
        if meta['retry'] < 3:
            meta['retry'] += 1
            self.log('>>> RETRY %d => %s'
                     % (meta['retry'], response.request.url))
            # The URL has already been seen, so the duplicate filter would
            # drop the retry without dont_filter; the explicit callback
            # keeps the retry in this handler.
            yield Request(response.request.url, meta=meta,
                          callback=self.parse_category, dont_filter=True)
def parse_product(self, response):
    """Load a product item from a product detail page.

    The price falls back to ``'0.00'`` when no price text is found.
    Shipping is 5.00 below a price of 50 and free otherwise. The item code
    text after the last ``:`` is stored as ``metadata['product_code']``
    (empty when the page shows none). Pages without a name or product code
    input are logged and skipped.
    """
    hxs = HtmlXPathSelector(response)

    names = hxs.select(
        '//div[@class="productDetail_name_and_description"]/h1/text()'
    ).extract()
    skus = hxs.select('//input[@name="productCode"]/@value').extract()
    if not names or not skus:
        # Indexing the empty results used to raise IndexError.
        self.log('Product name or code not found => %s' % response.url)
        return
    name = names[0].strip()
    sku = skus[0]

    image_url = hxs.select('//img[@id="zoom"]/@src').extract()
    category = hxs.select(
        '//div[@id="breadcrumbs"]/a[not(@class)]/text()').extract()
    brand = hxs.select(
        '//div[@class="productDetail_tab_content"]//p/text()'
    ).re('Brand: (.*)')

    price = hxs.select(
        '//div[@class="productDetail_main_pricelist"]/span[@id="now_price"]/text()')
    if not price:
        price = hxs.select(
            '//div[@class="productDetail_main_pricelist"]/div[@id="now_price"]/text()')
    # Also covers a price node whose text contains no digits.
    price_match = price.re('[\.\d,]+')
    price = price_match[0].strip().replace(',', '') if price_match else '0.00'
    price_excl_vat = Decimal(price)

    stock = hxs.select('//input[@class="primaryBasket"]').extract()

    ploadr = ProductLoader(item=Product(), response=response)
    ploadr.add_value('name', name)
    ploadr.add_value('url', response.url)
    if image_url:
        ploadr.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_url[0]))
    ploadr.add_value('sku', sku)
    ploadr.add_value('identifier', ploadr.get_output_value('sku'))
    ploadr.add_value('price', price_excl_vat)
    if category:
        ploadr.add_value('category', category[-1])
    if brand:
        ploadr.add_value('brand', brand[0].strip())
    ploadr.add_value(
        'shipping_cost',
        Decimal('5.00') if price_excl_vat < 50 else Decimal('0.0'))
    ploadr.add_value('stock', 1 if stock else 0)

    item = ploadr.load_item()
    tmp = hxs.select(
        "//div[@class='productDetail_item_code']/text()").extract()
    product_code = tmp[0].split(':')[-1].strip() if tmp else ''
    item['metadata'] = {'product_code': product_code}
    yield item
def parse_product(self, response):
    """Parse product blocks from a listing page.

    A block with a visible price yields its item directly. Otherwise the
    block's cart link is requested (callback ``self.parse_cart``, cookie
    merging disabled) with the partially filled loader passed along in
    ``meta['product_loader']``.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    for block in selector.select(u'//div[@class="pt9P cf clear"]'):
        loader = ProductLoader(item=Product(), selector=block)

        href = block.select(u'.//a[@class="pNameM cf"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        loader.add_xpath('name', u'.//a[@class="pNameM cf"]/text()')
        loader.add_xpath(
            'price', u'.//div[contains(@class,"pOurPrice")]/text()',
            re=u'\$(.*)')

        if loader.get_output_value('price'):
            yield loader.load_item()
            continue

        # No price on the listing: follow the cart link to obtain it.
        cart_href = block.select(
            u'.//div[@class="pt0PBtns"]/a[child::img]/@href').extract()[0]
        cart_request = Request(
            urljoin_rfc(get_base_url(response), cart_href),
            callback=self.parse_cart,
            cookies={},
            meta={'dont_merge_cookies': True})
        cart_request.meta['product_loader'] = loader
        yield cart_request
def parse_product(self, response):
    """Load a product item from a product detail page.

    Pages without a product title or without any identifier are logged and
    skipped. The price defaults to 0 when none is found, the SKU is taken
    from three or more trailing digits of the name, and ``stock`` is set to
    0 unless the last stock-status text contains "In Stock".
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    names = hxs.select('//h1[@id="product_title"]/text()').extract()
    if not names:
        # product not found. Just continue
        self.log('WARNING: Product not found => %s' % response.url)
        return
    name = names[0].strip()

    stock_text = hxs.select('//p[@id="stock_status"]/text()').extract()
    in_stock = bool(stock_text) and "In Stock" in stock_text[-1]

    category = hxs.select(
        '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()
    brand = hxs.select(
        '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
    ).extract()

    identifier = hxs.select('//input[@name="ProductID"]/@value').extract()
    if not identifier:
        identifier = hxs.select('//li[@itemprop="id"]/text()').extract()
    if not identifier:
        # Indexing the empty result used to raise IndexError.
        self.log('WARNING: Product identifier not found => %s'
                 % response.url)
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                     TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
    loader.add_xpath(
        'price',
        '//div[@class="product_price"]/span[@class="price"]/text()',
        TakeFirst(), re="([.0-9]+)")
    if not loader.get_output_value('price'):
        loader.add_value('price', 0)
    if category:
        loader.add_value('category', category[0].strip())
    loader.add_value('sku', name, TakeFirst(), re='(\d\d\d+)\s*$')
    if brand:
        loader.add_value('brand', brand[0].strip())
    loader.add_value('identifier', identifier[0])
    if not in_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Load a product item from a product detail page.

    A loader passed in ``response.meta['loader']`` is reused; otherwise a
    fresh one is created. The identifier comes from the hidden
    ``Product_Code`` input, falling back to the ``product/<id>.html`` part
    of the URL. The image URL is read two lines below the line of the
    response body that contains ``"image_data":``. Shipping is free from a
    price of 75.00 and 5.00 otherwise. Nothing is yielded when no
    identifier can be determined or the price is not numeric.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # NOTE(review): the extent of the else-branch was reconstructed from a
    # whitespace-collapsed source -- confirm only the loader creation
    # belongs to it.
    if 'loader' in response.meta:
        loader = response.meta['loader']
    else:
        loader = ProductLoader(item=Product(), response=response)

    loader.add_value('url', response.url)
    loader.add_xpath('name', '//*[@itemprop="name"]/text()')
    loader.add_xpath('price', '//*[@itemprop="price"]/@content')
    loader.add_value('category', response.meta.get('category', ''))

    codes = hxs.select(
        '//input[@type="hidden" and @name="Product_Code"]/@value').extract()
    identifier = codes[0] if codes else ''
    if not identifier:
        url_match = re.search(r'product/(.*).html$', response.url)
        if url_match is None:
            self.log('No identifier found on %s' % response.url)
            return
        identifier = url_match.group(1)
    loader.add_value('identifier', identifier)

    # Split the body once and reuse the lines for the lookup below.
    body_lines = response.body.split('\n')
    image_url = ''
    for line_no, line in enumerate(body_lines):
        if '"image_data":' in line:
            # Drop backslashes, then the first and the last two characters
            # of the line (presumably a quote and a quote+comma -- confirm).
            image_url = body_lines[line_no + 2].replace('\\', '')[1:-2]
            break
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url))

    out_of_stock = hxs.select(
        '//p[@class="notifications"]//strong[contains(text(),"On backorder")]'
    )
    if out_of_stock:
        loader.add_value('stock', 0)

    try:
        price = float(loader.get_output_value('price'))
    except (TypeError, ValueError):
        # Missing or non-numeric price: skip the product.
        return
    loader.add_value('shipping_cost', '0.00' if price >= 75.00 else '5.00')

    yield loader.load_item()
def parse_product(self, response):
    """Return the product item when the page's "Mfg Part" value matches.

    ``response.meta`` must carry ``sku``, ``mfrgid`` and ``name``. Returns
    ``None`` for non-HTML responses, for pages without a price or name, and
    when the manufacturer part shown on the page equals neither ``mfrgid``
    nor one of the space-separated words of ``meta['name']``.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    product_loader.add_xpath(
        'price', '//div[@class="club"]/span[@itemprop="Price"]/text()',
        re='.*\$(.*[0-9])')
    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('identifier', response.meta['sku'].lower())

    if not product_loader.get_output_value('price'):
        return

    mfrgid = response.meta['mfrgid']
    if not product_loader.get_output_value('name'):
        return

    spec_texts = hxs.select(
        u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()'
    ).extract()
    if len(spec_texts) < 2:
        return

    # Strip once so both comparisons see the same value; the raw text node
    # may carry surrounding whitespace and would then never equal a word
    # produced by ``split(' ')``.
    site_mfrgid = spec_texts[1].strip()
    name_words = response.meta['name'].split(' ')
    if site_mfrgid and (mfrgid == site_mfrgid or site_mfrgid in name_words):
        return product_loader.load_item()
def parse_product(self, response):
    """Load one item per option row, or a single item without options.

    When the page has an options table (rows after the header row), each
    row yields an item whose name is the page title followed by the row's
    identifier and fourth-cell text, and whose price is the second-to-last
    text cell. Rows with fewer than two text cells are skipped. Without
    options a single item is yielded, using the response URL as identifier.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    category = hxs.select('//div[@id="crumblinks"]//a/text()').extract()
    category = category[-1] if category else ''

    image_url = hxs.select('//img[@id="product-big"]/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

    product_brand = ''
    brand_url = hxs.select(
        '//div[@class="description"]//img[@alt="Brand Image"]/parent::a/@href'
    ).extract()
    if brand_url:
        brand_url = urljoin_rfc(base_url, brand_url[0])
        product_brand = url_query_parameter(brand_url, 'search')

    name = hxs.select("//h1[@class='coarse']/text()")[0].extract().strip()

    # [1:] skips the table's first row.
    options = hxs.select('//div[@class="generated"]/table/tr')[1:]

    if options:
        # options
        for option in options:
            cells = option.select('.//td/text()').extract()
            if len(cells) < 2:
                # The price is read from cells[-2]; skip rows without it
                # instead of aborting the remaining options.
                continue
            price = cells[-2].strip()

            name2 = option.select('./td[position()=4]/text()')
            name2 = name2[0].extract().strip() if name2 else ''

            loader = ProductLoader(item=Product(), selector=option)
            loader.add_xpath('identifier', './td[position()=2]/text()')
            loader.add_xpath('sku', './td[position()=3]/text()')
            loader.add_value('url', response.url)
            loader.add_value(
                'name',
                name + ' %s %s' % (loader.get_output_value('identifier'),
                                   name2))
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', product_brand)
            yield loader.load_item()
    else:
        price = "".join(
            hxs.select(".//span[@class='bigprice']/text()").re(
                r'([0-9\,\. ]+)')).strip()
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('identifier', response.url)
        loader.add_value('image_url', image_url)
        loader.add_value('category', category)
        loader.add_xpath('sku', './td[position()=2]/text()')
        loader.add_value('brand', product_brand)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the searched product from a results page or a detail page.

    ``response.meta`` must carry ``sku`` and ``mfrgid``. When the page shows
    search-result boxes only the first one is examined; when it shows none
    the page is treated as a single product page. In both cases the item is
    yielded only if ``mfrgid`` occurs in the code shown on the site and at
    least one space-separated word of the SKU occurs in the product name
    (both compared in lower case).
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="summaryboxsearch"]')

    for product in products[0:1]:  # extract only the first product
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath(
            'price', u'.//span[@class="floatl sli_price"]/text()')
        product_loader.add_xpath(
            'url', u'.//p[@class="mtext nobreak"]/a/@title')
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath(
            'name', u'.//p[@class="mtext nobreak"]/a/text()')

        # A missing name is treated as empty so no SKU word can match,
        # instead of failing on ``None.lower()``.
        name = (product_loader.get_output_value('name') or u'').lower()
        sku_words = product_loader.get_output_value('sku').lower().split(' ')
        # A real list keeps the truthiness test below meaningful on any
        # Python version (a lazy ``filter`` object is always truthy).
        sku = [word for word in sku_words if word != '' and word in name]

        site_mfrgid = product.select(
            './/span[@class="floatl sli_grid_code"]/text()').extract()
        if site_mfrgid:
            mfrgid = response.meta['mfrgid'].lower()
            site_mfrgid = site_mfrgid[0].strip().lower()
            if mfrgid in site_mfrgid and sku:
                yield product_loader.load_item()

    if not products:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('price', u'//p[@class="strong"]/span/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())
        product_loader.add_xpath(
            'name', u'//div[@class="indentl orderbox"]//h1/text()')

        name = (product_loader.get_output_value('name') or u'').lower()
        sku_words = product_loader.get_output_value('sku').lower().split(' ')
        sku = [word for word in sku_words if word != '' and word in name]

        site_mfrgid = hxs.select(
            '//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()'
        ).extract()
        if site_mfrgid:
            site_mfrgid = site_mfrgid[0].strip().lower()
            mfrgid = response.meta['mfrgid'].lower()
            if mfrgid in site_mfrgid and sku:
                yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product listing page and follow its pagination link.

    Yields a request for the next page (handled by this same callback) and
    one item per product tile that has both a name and a price. Tiles that
    lack the product link or either name fragment are skipped.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # pages
    next_page = hxs.select(
        u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href'
    ).extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_product)

    for product in hxs.select(u'//div[contains(@class,"itemGrid")]'):
        link = product.select(u'.//a[@class="oesLink"]/@href').extract()
        name_head = product.select(
            u'.//a[@class="oesLink"]/span/text()').extract()
        name_tail = product.select(
            u'.//a[@class="oesLink"]/text()').extract()
        if not (link and name_head and name_tail):
            # A malformed tile must not abort the remaining products.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, link[0]))
        product_loader.add_value('name', name_head[0] + ' ' + name_tail[0])
        product_loader.add_xpath(
            'price', u'.//span[@class="PlistOfferPrice"]/text()',
            re=u'\$(.*)')
        product_loader.add_xpath(
            'price', u'.//div[@class="pricing"]/span/div/span/text()',
            re=u'\$(.*)')

        if (product_loader.get_output_value('name')
                and product_loader.get_output_value('price')):
            yield product_loader.load_item()
def parse_product(self, response):
    """Build a product item from a product detail page.

    The SKU is looked up in ``self.skus_dict`` under the MD5 hex digest of
    the stripped product name; a missing entry propagates the lookup error.
    An item is yielded only when both a name and a price were extracted.
    """
    if not isinstance(response, HtmlResponse):
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//td[@class="page_headers"]/text()')
    product_loader.add_xpath(
        'price',
        u'//td[@class="price-info"]//div[@id="price" and @class="price"]/text()',
        re=u'\$(.*)')

    name = product_loader.get_output_value('name')
    if not name:
        # Without a name there is nothing to hash and nothing worth
        # yielding; bail out instead of failing on ``None.strip()``.
        return

    hashed_name = hashlib.md5(name.strip()).hexdigest()
    product_loader.add_value('sku', self.skus_dict[hashed_name])
    product_loader.add_xpath('sku', u'//span[@id="product_id"]/text()')
    product_loader.add_value(
        'identifier', product_loader.get_output_value('sku').lower())

    if product_loader.get_output_value('price'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Parse product containers from a search/listing page.

    The name is the link text plus the first info-box paragraph, unless
    ``self.names`` holds an override for the product id found in the URL
    (``ID=prod<digits>``). Price candidates are added in priority order:
    employee price, the "or 1/$" part of the sale price, then the regular
    price. Containers without link, title or price are skipped.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select(u'//div[contains(@class,"product-container")]'):
        link = product.select(
            u'.//a[@class="SearchLinkBold"]/@href').extract()
        title = product.select(
            u'.//a[@class="SearchLinkBold"]/text()').extract()
        if not link or not title:
            # A malformed container must not abort the remaining products.
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        url = urljoin_rfc(base_url, link[0])
        product_loader.add_value('url', url)

        name = title[0]
        extra_name = product.select(
            u'.//div[contains(@class,"prod-info-box")]/p/text()').extract()
        if extra_name:
            name += ' ' + extra_name[0]

        id_match = re.search('ID=prod(\d+)', url)
        if id_match:
            product_id = id_match.group(1)
            log.msg('Found ' + product_id)
            name = self.names.get(product_id, name)
        product_loader.add_value('name', name)

        # Each candidate is added once; the original repeated the last two
        # calls verbatim, which only re-added the same values.
        product_loader.add_xpath(
            'price', './/p[@class="empPrc"]/span[@class="FSprice"]/text()')
        product_loader.add_xpath('price', './/p[@class="FSprice"]/text()',
                                 re=u'.*?or 1/\$(.*)')
        product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')

        if not product_loader.get_output_value('price'):
            continue
        yield product_loader.load_item()
def parse_product(self, response):
    """Return a product item whose name is its product code when available.

    Loads price, url and sku from the page. The extracted product code
    doubles as the name; only when no code is found is the page's product
    name used. Returns ``None`` for non-HTML responses.
    """
    if not isinstance(response, HtmlResponse):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath(
        'price',
        '//font[@class="pricecolor colors_productprice"]/text()',
        re='.*\$(.*[0-9])')
    loader.add_value('url', response.url)
    loader.add_xpath('sku', '//span[@class="product_code"]/text()')

    product_code = loader.get_output_value('sku')
    if not product_code:
        loader.add_xpath(
            'name',
            '//font[@class="productnamecolorLARGE colors_productname"]/text()')
    else:
        loader.add_value('name', product_code)

    return loader.load_item()
def parse_product(self, response):
    """Load a product item from microdata plus request metadata.

    ``sku``, ``brand`` and ``category`` are taken from ``response.meta``;
    the stock value is captured from the ``"stock": ...,`` fragment of the
    raw response body. Shipping costs 2.99 for prices below 10 and is free
    otherwise.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)

    loader.add_xpath('identifier', '//*[@itemprop="sku"]/text()')
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//*[@itemprop="price"]/@content')
    loader.add_xpath('image_url', '//*[@itemprop="image"]/@src')

    for field in ('sku', 'brand', 'category'):
        loader.add_value(field, response.meta[field])

    stock_match = re.search('"stock": (.+),', response.body)
    loader.add_value('stock', stock_match.group(1))

    shipping_cost = '2.99' if loader.get_output_value('price') < 10 else '0'
    loader.add_value('shipping_cost', shipping_cost)

    yield loader.load_item()
def parse_product(self, response):
    """Load the main product and one item per associated product option.

    The product id comes from the add-to-cart / notification form action;
    pages without it are logged and skipped. After the main item, the form's
    ``data-model`` JSON is read and every entry of ``associatedProducts``
    yields a copy of the main item with its own identifier, name (main name
    plus the option's field values) and price rounded to two decimals.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_ids = hxs.select(
        '//form[@class="add-to-cart" or @id="add-notification"]/@action'
    ).re('productId=(.*)')
    if not product_ids:
        # An explicit emptiness check replaces the former bare ``except:``,
        # which would also have hidden unrelated errors.
        self.log('No product_id found on %s' % response.url)
        return
    product_id = product_ids[0]

    image_url = hxs.select(
        '//section[@id="product-image-viewer"]/div[@id="slider"]/ul[@class="slides"]//a[@class="fancy-box"]/@href'
    ).extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_id)
    loader.add_xpath('sku', '//p[contains(text(), "Brand Code")]/text()',
                     re=r': (.*)$')
    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath('price', '//span[@class="product-price"]/span/text()',
                     re=r'[\d,.]+')
    if not loader.get_output_value('price'):
        loader.add_xpath(
            'price',
            '//span[@class="product-price"]/span[@class="text-red"]/text()',
            re=r'[\d,.]+')
    loader.add_value('url', response.url)
    loader.add_xpath('category',
                     '//nav[@id="breadcrumb"]/span/a/span/text()',
                     lambda elms: elms[-1])
    loader.add_xpath(
        'brand',
        '//a[@class="pull-right" and contains(@href, "Brands")]/img/@alt')

    out_of_stock = hxs.select(
        '//div[@class="stock-status"]/span[@class="stock-status-circle out-of-stock"]'
    )
    if out_of_stock:
        loader.add_value('stock', 0)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    item = loader.load_item()
    yield item

    data_model = hxs.select(
        '//form[@class="add-to-cart"]/@data-model').extract()
    if not data_model:
        return

    data = json.loads(data_model[0])
    for option in data.get('associatedProducts', []):
        # Start each option from a copy of the main item.
        loader = ProductLoader(item=Product(item), response=response)
        loader.replace_value('identifier', option['id'])
        option_labels = [o['value'].split(u'(\xa3')[0].strip()
                         for o in option['fieldValues']]
        loader.replace_value(
            'name', item['name'] + ' ' + ' '.join(option_labels))
        loader.replace_value('price', round(option['price'], 2))
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product described by the page schema, then follow grouped items.

    Identifier, sku and name come from the schema product data.  Price is
    taken from the first of the minimal / regular / special price boxes
    that yields a match.  Shipping is '0.0' for prices of at least 45.0
    and '4.95' otherwise.
    """
    product_data = SpiderSchema(response).get_product()

    loader = ProductLoader(item=Product(), response=response)
    for field in ('identifier', 'sku'):
        loader.add_value(field, product_data['productID'])
    loader.add_value('name', product_data['name'])

    # Short-circuit kept: availability is only inspected when the page
    # does not show an out-of-stock marker.
    marked_out = bool(response.css('.product-shop .out-of-stock'))
    available = (not marked_out) and (
        'InStock' in product_data['offers']['properties']['availability'])
    loader.add_value('stock', 1 if available else 0)

    crumbs = response.css('.breadcrumbs').xpath('.//li/a/text()').extract()
    loader.add_value('category', crumbs[1:])
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//th[@class="label" and contains(text(), '
        '"Brand")]/following-sibling::td/text()')

    price = None
    price_boxes = (
        '.product-shop .price-box .minimal-price .price',
        '.product-shop .price-box .regular-price .price',
        '.product-shop .price-box .special-price .price',
    )
    for css_path in price_boxes:
        price = response.css(css_path).xpath('text()').re_first(r'[\d\.,]+')
        if price:
            break
    loader.add_value('price', price)

    free_shipping = loader.get_output_value('price') >= Decimal('45.0')
    loader.add_value('shipping_cost', '0.0' if free_shipping else '4.95')
    yield loader.load_item()

    grouped_links = response.css(
        '.grouped-items-table-wrapper .name-wrapper').xpath('a/@href').extract()
    for link in grouped_links:
        yield Request(link, callback=self.parse_product)
def parse_product(self, response):
    """Return a single loaded Product for an HTML product page.

    The sku, when present, is used as the product name; otherwise the
    name is read from the page heading.  Non-HTML responses yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    # Special price first, then the plain price text.
    loader.add_xpath('price',
                     '//div[@class="h3"]/span[@class="productSpecialPrice"]/text()',
                     re='.*\$(.*)')
    loader.add_xpath('price', '//div[@class="h3"]/text()', re='.*\$(.*[0-9])')

    loader.add_xpath('sku',
                     '//div[@id="content"]/div[@id="right-column"]/span[@class="right"]/text()',
                     re='-(.*)\]')
    sku_value = loader.get_output_value('sku')
    if not sku_value:
        loader.add_xpath('name',
                         '//div[@id="content"]/div[@id="right-column"]/h1[@class="bottom-border"]/text()')
    else:
        loader.add_value('name', sku_value)

    return loader.load_item()
def parse_product(self, response):
    """Follow grouped-SKU links and yield the page's product when it stands alone.

    Links in the grouped-SKU table are requested first.  The product
    itself is only yielded when a name was found and the page lists no
    grouped SKUs.  The name is built as "<brand> <title> (<sku>)" from
    whichever of those parts are present.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    multiple_products = hxs.select(u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
    for url in multiple_products:
        yield Request(url)

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)

    name = hxs.select(u'//h1[@id="sku_Title"]/text()').extract()
    if not name:
        return

    brand = hxs.select(u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
    if brand:
        name = brand[0] + ' ' + name[0].strip()
        product_loader.add_value('brand', brand[0].strip())
    else:
        name = name[0].strip()

    sku = hxs.select(u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()').extract()
    if sku:
        name += ' (' + sku[0].strip() + ')'
        product_loader.add_value('sku', sku[0].strip())
    product_loader.add_value('name', name)
    product_loader.add_value('identifier', sku)

    # The category is the second-to-last breadcrumb.  Indexing [-2]
    # directly raised IndexError on pages with fewer than two breadcrumb
    # links, which aborted the whole callback.
    breadcrumbs = hxs.select('//a[@class="BreadCrumbLink"]/text()').extract()
    if len(breadcrumbs) >= 2:
        product_loader.add_value('category', breadcrumbs[-2])

    image = hxs.select('//img[@id="ChangePhoto"]/@src').extract()
    if image:
        product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), image[0]))

    shipping = hxs.select('//div[@id="sku_ZoneFLtxtFix"]//strong/text()').extract()
    if shipping:
        # Decimal comma -> decimal point.
        product_loader.add_value('shipping_cost', shipping[0].replace(',', '.'))

    price = hxs.select(u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()').re(u'([\d\.,]+)')
    if price:
        price = re.sub(',', '.', price[0])
    product_loader.add_value('price', price)

    if product_loader.get_output_value('name') and not multiple_products:
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield one Product for an HTML product page.

    The identifier is the part of the request URL's basename before the
    first '-'.  Price is parsed from an inline ``productPrice`` script
    variable; shipping is 25 for prices below 500 and 0 otherwise.  Stock
    is 0 when the "Dostawa" shipping block has a feature value, else 1.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    breadcrumb = hxs.select(u'//div[@class="breadcrumb"]/a/text()').extract()
    if breadcrumb:
        category = breadcrumb[-1]
    else:
        category = ''

    image_url = hxs.select(
        u'//ul[@id="product_images"]/li/a//img[@class="big_photo"]/@src'
    ).extract()
    if image_url:
        image_url = urljoin_rfc(get_base_url(response), image_url[0])

    title = hxs.select(u'//h1/text()').extract()[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', title.strip())
    loader.add_value('url', response.url)
    loader.add_value('category', category)
    loader.add_value('image_url', image_url)

    basename = os.path.basename(response.request.url)
    loader.add_value('identifier', basename.partition('-')[0])

    loader.add_xpath('price',
                     '//div[@id="center_column"]/script[2]/text()',
                     re="var productPrice='([0-9.]+)'")
    if loader.get_output_value('price') < 500:
        shipping = 25
    else:
        shipping = 0
    loader.add_value('shipping_cost', shipping)

    delivery_note = hxs.select(
        u'//div[@class="shipping" and ./h2/text()="Dostawa"]/div[@class="feature_value"]/text()'
    )
    loader.add_value('stock', 0 if delivery_note else 1)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one Product per product container on a listing page.

    The displayed name (plus optional info-box text) is replaced by an
    entry from ``self.names`` when the product URL carries an
    ``ID=prod<digits>`` id known there.  Containers without a price are
    skipped.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    # NOTE(review): each price source is applied twice, exactly as in the
    # original; confirm whether the duplication is intentional.
    price_sources = (
        ('.//p[@class="FSprice"]/text()', {'re': u'.*?or 1/\$(.*)'}),
        ('.//p[@class="FSprice"]/text()', {'re': u'.*?or 1/\$(.*)'}),
        ('.//p[@class="Rprice"]/text()', {}),
        ('.//p[@class="Rprice"]/text()', {}),
    )

    for container in hxs.select(u'//div[contains(@class,"product-container")]'):
        loader = ProductLoader(item=Product(), selector=container)

        href = container.select(u'.//a[@class="SearchLinkBold"]/@href').extract()[0]
        href = urljoin_rfc(get_base_url(response), href)
        loader.add_value('url', href)

        label = container.select(u'.//a[@class="SearchLinkBold"]/text()').extract()[0]
        info_text = container.select(u'.//div[contains(@class,"prod-info-box")]/p/text()').extract()
        if info_text:
            label += ' ' + info_text[0]

        id_match = re.search('ID=prod(\d+)', href)
        if id_match:
            log.msg('Found ' + id_match.groups()[0])
            label = self.names.get(id_match.groups()[0], label)
        loader.add_value('name', label)

        for xpath, options in price_sources:
            loader.add_xpath('price', xpath, **options)

        if loader.get_output_value('price'):
            yield loader.load_item()
def parse_product(self, response):
    """Yield one LEGO Product with metadata, or record an error if unpriced.

    Name, category, sku, product id and price are all required nodes; a
    missing one raises IndexError from ``pop()``.  Shipping cost 99 is
    added for prices below 1990, and stock is set to 0 when no "Skladem"
    marker is found.  Problems are appended to ``self.errors``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('//div[@id="produktDET"]/div/div/h1[@class="or"]/text()').pop().extract().strip()
    category = hxs.select('//div[@id="link"]/a/@title').pop().extract().strip()
    sku = hxs.select('//span[@class="code"]/text()').extract().pop().strip()
    buy_link = hxs.select('//div[@class="buy"]/a/@href').pop().extract()
    pid = self.get_id(buy_link)
    price = hxs.select('//div[@class="pricebox"]/div/div/p[@class="prodCena"]/span/span[@class="actual_price"]/text()').pop().extract()
    in_stock_marker = hxs.select('//div[@class="prodRight"]/div/div/p[@class="makeGreen"][contains(text(), "Skladem")]')

    if not price:
        self.errors.append("No price set for url: '%s'" % urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    try:
        # The processor indexes the first match, so an absent image
        # surfaces as IndexError.
        loader.add_xpath('image_url', '//div[@class="mainImgCont"]/a/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
    except IndexError:
        self.errors.append("No image set for url: '%s'" % urljoin(base_url, response.url))
    loader.add_value('price', price.replace(' ', ''))
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')

    loaded_price = loader.get_output_value('price')
    if int(loaded_price) < 1990:
        loader.add_value('shipping_cost', 99)
    if not in_stock_marker:
        loader.add_value('stock', 0)

    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield a Product (name, url, price) for an HTML product page.

    The price is read from ``#productPrice``; when that text is empty
    (discounted products) it is read from the nested ``<ins>`` element.
    Nothing is yielded when no name could be loaded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    url = response.url

    name = join(hxs.select(u'//h1[@id="titre_produit"]/text()').extract())

    price = join(hxs.select(u'//div[@id="productPrice"]/text()').extract())
    # Use '.' as the decimal separator and drop the stray u'\xe2' character.
    # NOTE(review): u'\xe2' is not the euro sign itself (u'\u20ac');
    # confirm which character the site actually emits.
    price = price.replace(u',', u'.').replace(u'\xe2', u"").strip()
    if not price:
        # With a discount the price lives in a nested <ins> element.
        price = join(hxs.select(u'//div[@id="productPrice"]/ins/text()').extract())
        price = price.replace(u',', u'.').replace(u'\xe2', u"").strip()

    # Strip any HTML tags from the name.
    name = re.sub('<[^<]+?>', '', name)

    # The loader used to receive the name *string* as its selector; pass
    # the page selector instead.
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('name', name)
    product_loader.add_value('url', url)
    product_loader.add_value('price', price)
    if product_loader.get_output_value('name'):
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield listed products, deferring to the cart page when no price shows.

    Products whose listing shows a price are yielded directly.  For the
    others a cookie-isolated request to the add-to-cart link is issued
    with the half-filled loader in ``meta['product_loader']`` for
    ``self.parse_cart`` to complete.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    for row in hxs.select(u'//div[@class="pt9P cf clear"]'):
        loader = ProductLoader(item=Product(), selector=row)

        link = row.select(u'.//a[@class="pNameM cf"]/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), link))
        loader.add_xpath('name', u'.//a[@class="pNameM cf"]/text()')
        loader.add_xpath('price', u'.//div[contains(@class,"pOurPrice")]/text()', re=u'\$(.*)')

        if loader.get_output_value('price'):
            yield loader.load_item()
            continue

        # No price on the listing: fetch it through the cart page.
        cart_link = row.select(u'.//div[@class="pt0PBtns"]/a[child::img]/@href').extract()[0]
        cart_link = urljoin_rfc(get_base_url(response), cart_link)
        cart_request = Request(cart_link,
                               callback=self.parse_cart,
                               cookies={},
                               meta={'dont_merge_cookies': True})
        cart_request.meta['product_loader'] = loader
        yield cart_request
def parse_product(self, response):
    """Yield every highlighted product on the page; retry once if none found.

    When the page contains no product blocks a warning is logged and, if
    ``meta['retries']`` is still below 1, the same URL is requested again
    with the counter incremented.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    blocks = hxs.select(u'//form/div[contains(@class,"highlightProduits hproduct")]')

    for block in blocks:
        loader = ProductLoader(item=Product(), selector=block)
        href = block.select(u'.//h3/a[@class="item url"]/@href').extract()
        loader.add_value('url', urljoin_rfc(get_base_url(response), href[0]))
        loader.add_xpath('name', u'.//h3/a[@class="item url"]/text()')
        loader.add_xpath('price', u'.//p[@class="price"]/text()', re=u'([0-9\.]+)')
        price_found = loader.get_output_value('price')
        if not price_found:
            # Fall back to any <p> whose class merely contains "price".
            loader.add_xpath('price', u'.//p[contains(@class,"price")]/text()', re=u'([0-9\.]+)')
        yield loader.load_item()

    if blocks:
        return

    log.msg('Retrying url: %s' % response.url, level=log.WARNING)
    attempts = response.meta.get('retries', 0)
    if attempts < 1:
        # NOTE(review): no callback is given, so the retry goes to the
        # spider's default callback; confirm that is intended.
        yield Request(response.url, dont_filter=True, meta={'retries': attempts + 1})
def parse_product(self, response):
    """Follow grouped-SKU links and yield the page's product when it stands alone.

    The name is assembled as "<brand> <title> (<sku>)" from the parts
    that exist.  The item is yielded only when a name was loaded and the
    page lists no grouped SKUs.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    multiple_products = hxs.select(
        u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
    for grouped_url in multiple_products:
        yield Request(grouped_url)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    title_parts = hxs.select(u'//h1[@id="sku_Title"]/text()').extract()
    if not title_parts:
        return
    full_name = title_parts[0].strip()

    brand_parts = hxs.select(
        u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
    if brand_parts:
        full_name = brand_parts[0] + ' ' + full_name

    sku_parts = hxs.select(
        u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()'
    ).extract()
    if sku_parts:
        full_name += ' (' + sku_parts[0].strip() + ')'
    loader.add_value('name', full_name)

    price = hxs.select(
        u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()'
    ).re(u'([\d\.,]+)')
    if price:
        # Decimal comma -> decimal point.
        price = re.sub(',', '.', price[0])
    loader.add_value('price', price)

    if loader.get_output_value('name') and not multiple_products:
        yield loader.load_item()
def parse_product(self, response):
    """Yield one Product for a djkit.com product page.

    B-STOCK products, pages without a product id and DISCONTINUED
    products are skipped.  Missing name, price, sku or image are logged
    and the item is still produced.  Shipping is '5.50' for prices below
    50.00 and '0.00' otherwise; stock is set to 0 unless an in-stock
    label is found.
    """
    URL_BASE = 'http://www.djkit.com'
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//*[@itemprop="name"]/text()').extract()
    if not name:
        self.log("ERROR name not found")
        name = ""
    else:
        name = name[0]
    name = name.strip()
    if 'B-STOCK' in name.upper():
        return

    # Prefer the discounted price, then the regular one.
    price = hxs.select(
        '//span[@class="product-variation-value discount-value"]//*[@itemprop="price"]/text()'
    ).extract()
    if not price:
        price = hxs.select('//*[@itemprop="price"]/text()').extract()
    if not price:
        self.log("ERROR price not found")
        price = ""
    else:
        price = extract_price(price[0].strip())

    sku = hxs.select('//*[@itemprop="sku"]/strong/text()').extract()
    if not sku:
        sku = hxs.select('//*[@itemprop="sku"]/text()').extract()
    if not sku:
        self.log("ERROR sku not found")
    else:
        sku = sku[0]

    product_id = hxs.select(
        '//*[@id="sub"]/input[@name="product"]/@value').extract()
    if not product_id:
        self.log("ERROR ID not found")
        return
    product_id = product_id[0]

    img_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    if not img_url:
        self.log("ERROR img not found")
    else:
        img_url = urljoin_rfc(URL_BASE, img_url[0])

    category = hxs.select(
        '//div[@id="breadcrumbs"]/a[@class="breadlink"]/text()').extract()
    category = category[-1] if category else ''

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('sku', sku)
    loader.add_value('image_url', img_url)
    loader.add_value('identifier', product_id.strip())
    loader.add_value('category', category)

    # A missing price is logged above and the item is still meant to be
    # produced, but float() on an empty loader value raised and dropped
    # it.  Treat a missing price as 0 for the shipping threshold.
    loaded_price = loader.get_output_value('price') or 0
    shipping_cost = '5.50' if float(loaded_price) < 50.00 else '0.00'
    loader.add_value('shipping_cost', shipping_cost)

    stock = hxs.select(
        '//div[@class="delivery-availability"]//text()[normalize-space()]'
    ).extract()
    if 'DISCONTINUED' in stock:
        return
    if not ('In Stock' in stock or 'In stock' in stock):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Crawl category listings and yield the product of a detail page.

    Listing pages produce one request per product link and nothing else.
    Detail pages also request every link in the description container,
    then yield the product unless its title contains 'MSDS' or 'ABC' or
    no identifier is found.  Stock is set to 0 when no price was loaded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Category page: follow each listed product and stop.
    product_links = hxs.select(
        '//div[@id="CategoryContent"]//div[@class="ProductDetails"]/strong/a/@href'
    ).extract()
    for link in product_links:
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_product,
                      meta=response.meta)
    if product_links:
        return
    headings = hxs.select('//h1/text()')
    if not headings:
        return

    # Sub products linked from the description.
    for link in hxs.select(
            '//div[@class="ProductDescriptionContainer"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_product,
                      meta=response.meta)

    name = headings[0].extract()
    upper_name = name.upper()
    if 'MSDS' in upper_name or 'ABC' in upper_name:
        return

    category = response.meta.get('category', '')
    brand = hxs.select(
        '//div[@class="DetailRow" and div[text()="Brand:"]]/div[@class="Value"]//text()[normalize-space()]'
    ).extract()
    image_url = hxs.select(
        '//meta[@property="og:image"]/@content').extract()
    price_texts = hxs.select(
        '//em[@class="ProductPrice VariationProductPrice"]/text()'
    ).extract()
    if price_texts:
        price = price_texts[0]
    else:
        price = '0.00'

    id_nodes = hxs.select(
        '//form[@id="productDetailsAddToCartForm"]//input[@type="hidden" and @name="product_id"]/@value'
    )
    if not id_nodes:
        log.msg('Product without identifier: ' + response.url)
        return
    identifier = id_nodes[0].extract()

    sku_texts = hxs.select('//div[@id="sku"]/text()').extract()
    sku = sku_texts[0] if sku_texts else None

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('identifier', identifier)
    if sku:
        loader.add_value('sku', sku)
    loader.add_value('category', category)
    if brand:
        loader.add_value('brand', brand[0].strip())
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse(self, response):
    """Follow relevant categories and keep the cheapest matching offer.

    If the page lists no products and ``meta['model_search']`` is set, a
    free-text search for ``meta['name']`` is requested instead (with the
    flag cleared).  Otherwise every row whose name matches
    ``meta['name']`` is loaded, and the one with the lowest price is
    appended to ``self.items`` if not already present.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    for category_href in hxs.select(
            '//div[@class="catsMI"]/div/a/@href').extract():
        yield Request(urljoin_rfc(base_url, category_href), meta=meta)

    rows = hxs.select('//table[@id="productlist-table"]/tbody/tr')
    if not rows and meta.get('model_search', False):
        search_url = ('http://www.trovaprezzi.it/categoria.aspx?libera=' +
                      meta['name'].replace(' ', '+') +
                      '&id=-1&prezzomin=&prezzomax=')
        meta['model_search'] = False
        yield Request(search_url, meta=meta)
        return

    category = hxs.select('//div[@id="divTitle"]/h1/text()').extract()[0]
    cheapest = None
    for row in rows:
        row_name = row.select('td[@class="descCol"]/a/b/text()').extract()[0]
        if not self.match_name(meta['name'], row_name, match_threshold=70):
            continue

        loader = ProductLoader(item=Product(), selector=row)
        image_src = row.select('td[@class="imgCol"]/a/img/@src').extract()
        if image_src:
            image_url = urljoin_rfc(base_url, image_src[0])
        else:
            image_url = ''
        loader.add_value('image_url', image_url)
        loader.add_xpath('dealer', 'td[@class="mercCol"]/a/img/@alt')
        loader.add_xpath('name', 'td[@class="descCol"]/a/b/text()')
        loader.add_value('category', category)
        loader.add_value('sku', response.meta.get('sku'))

        href = row.select('td[@class="descCol"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, href))

        # Prices use '.' for thousands and ',' for decimals.
        price_text = row.select(
            'td[@class="prodListPrezzo"]/text()').extract()[0]
        price_text = price_text.strip().replace('.', '').replace(',', '.')
        loader.add_value('price', price_text)

        shipping_text = row.select(
            'td[@class="prodListPrezzo"]/' +
            'span[@class="deliveryCost nobr"]/' +
            'text()').extract()[0]
        shipping_text = shipping_text.strip().replace('.', '').replace(',', '.')
        loader.add_value('shipping_cost', shipping_text)
        loader.add_value('identifier', response.meta.get('identifier'))

        if loader.get_output_value('price') and (
                cheapest is None or
                cheapest.get_output_value('price') >
                loader.get_output_value('price')):
            cheapest = loader

    if cheapest:
        item = cheapest.load_item()
        if item not in self.items:
            self.items.append(item)
def parse_product(self, response):
    """Yield one tyre Product with MicheldeverMeta metadata.

    The full tyre name is used to derive metadata (run flat, XL,
    manufacturer mark), while the parsed pattern name becomes the
    product's name.  Pages missing the identifier form or the brand
    field are retried through ``self.retry_request``; names that
    ``parse_pattern`` cannot handle are logged and recorded in
    ``self.errors``.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    fitting_method = 'Delivered'

    loader.add_value('url', response.url)

    image_src = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_src:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), image_src[0]))

    form_action = hxs.select('//form[@name="form1"]/@action').extract()
    if not form_action:
        yield self.retry_request(response)
        return
    loader.add_value('identifier', form_action[0])

    price = hxs.select(
        '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    producer = hxs.select(
        '//div[@class="hidden"]/input[@class="producerName"]/@value'
    ).extract()
    if not producer:
        yield self.retry_request(response)
        return
    brand = producer[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))

    # Normalise the brand before removing it from the product title.
    brand = re.sub(u'\u0119', u'e', brand)
    product_name = hxs.select(
        '//h1[@itemprop="name"]/text()')[0].extract().strip()
    product_name = re.sub(u'[:\u2122]', u'', product_name)
    product_name = product_name.replace(brand, '').strip()

    data = parse_pattern(product_name)
    if not data:
        message = 'ERROR parsing "{}" [{}]'.format(product_name, response.url)
        log.msg(message)
        self.errors.append(message)
        return
    loader.add_value('name', data['Name'])

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = fitting_method
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''

    metadata['xl'] = 'Yes' if 'XL' in product_name else 'No'

    lowered_name = product_name.lower()
    is_run_flat = 'run on flat' in lowered_name or 'run flat' in lowered_name
    metadata['run_flat'] = 'Yes' if is_run_flat else 'No'

    # The first known mark appearing as a whole word in the name wins.
    name_words = product_name.split(' ')
    found_marks = [
        mark for mark in self.all_man_marks.keys() if mark in name_words
    ]
    mark_value = ''
    if found_marks:
        mark_key = found_marks[0].strip()
        if mark_key:
            mark_value = self.all_man_marks.get(mark_key, '')
    metadata['manufacturer_mark'] = mark_value

    metadata['full_tyre_size'] = '/'.join(
        (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
         metadata['load_rating'], metadata['speed_rating']))

    product = loader.load_item()
    product['metadata'] = metadata

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    # Use the computed alternative speed; failing that, keep the old
    # speed rating as the alternative when it is about to be replaced.
    if new_alt_speed:
        alternative = new_alt_speed
    elif product['metadata']['speed_rating'] != new_speed_rating:
        alternative = product['metadata']['speed_rating']
    else:
        alternative = ''
    product['metadata']['alternative_speed_rating'] = alternative
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def _get_prices(self, price_response): unpriced = re.search('eBunpriced="(.*)"', price_response.body) if unpriced: unpriced = [prod_id for prod_id in unpriced.groups()[0].split(',') if prod_id.strip()] eBzp = [None] * 200 eBzpp = [None] * 200 eBzp_assignments = re.findall('(eBzp\[\d+\]=.*);', price_response.body) for assignment in eBzp_assignments: exec assignment.replace('eBop', "''").replace('eBspl', "'&'").replace('eBsp', "'&'") eBzpp_assignments = re.findall('(eBzpp\[\d+\]=.*);', price_response.body) for assignment in eBzpp_assignments: exec assignment.replace('eBop', "'&'").replace('eBspl', "'&'").replace('eBsp', "'&'") prices = {} for i, prod in enumerate(eBzp): if prod: prices[prod] = eBzpp[i] hxs = price_response.meta['hxs'] main_name = hxs.select('//h1/text()').extract()[0].strip() products = hxs.select('//form[@id="eBvariant1"]//option') subprods = hxs.select('//div[@id="TabbedPanels1"]//em/strong[contains(text(), "//")]/text()').extract() if not products and subprods: subprods = subprods[0].split('//') for prod in subprods: r = prod.split(':') if len(r) == 2: p = Product() loader = ProductLoader(response=price_response.meta['main_response'], item=p) loader.add_value('name', main_name + ' ' + r[0].strip()) loader.add_value('price', r[1]) loader.add_value('url', price_response.meta['main_response'].url) yield loader.load_item() return if not products and prices: product_id = hxs.select('//span[@class="eBprice"]/@id').re('pP(.*)') if product_id: price = prices.get(product_id[0]) or eBzpp[0] else: price = eBzpp[0] p = Product() loader = ProductLoader(response=price_response.meta['main_response'], item=p) loader.add_value('name', main_name) loader.add_value('price', price) loader.add_value('url', price_response.meta['main_response'].url) yield loader.load_item() for product in products: subprods = product.select('./@value').extract()[0].split(',') if len(subprods) == 1 and subprods[0] in prices and subprods[0] not in unpriced: p = Product() loader = 
ProductLoader(response=price_response.meta['main_response'], item=p) subname = product.select('./text()').extract()[0].strip() loader.add_value('name', main_name + ' ' + subname) loader.add_value('price', prices[subprods[0]]) loader.add_value('url', price_response.meta['main_response'].url) yield loader.load_item() elif len(subprods) > 1: subprods = subprods[1:] for i, subprod in enumerate(subprods): if subprod in prices and subprod not in unpriced: p = Product() loader = ProductLoader(response=price_response.meta['main_response'], item=p) loader.add_value('url', price_response.meta['main_response'].url) first_subname = product.select('./text()').extract()[0].strip() subname = subprods[i - 1].strip() loader.add_value('name', unquote(main_name + ' ' + first_subname + ' ' + subname)) loader.add_value('price', prices[subprod]) yield loader.load_item() alternate_prices = hxs.select('//a[@class="green2"]') for alt in alternate_prices: subprods = alt.select('./following-sibling::em//text()').extract() for subprod in subprods: prod_data = subprod.split(':') if len(prod_data) > 1: loader = ProductLoader(selector=alt, item=Product()) loader.add_value('url', price_response.meta['main_response'].url) loader.add_value('name', main_name) loader.add_value('name', prod_data[0]) loader.add_value('price', prod_data[1]) if not loader.get_output_value('price'): continue yield loader.load_item()
def parse(self, response):
    """Yield tyre Products from a JSON listing, skipping winter tyres.

    The body is a JSON list of product dicts.  Size fields (width,
    aspect ratio, rim) come from ``response.meta['row']``; brand, load
    and speed rating, price and flags come from each JSON entry.  A
    manufacturer mark is cut from the model name when one of
    ``self.all_man_marks`` / ``self.custom_man_marks`` matches.
    """
    base_url = get_base_url(response)
    row = response.meta['row']
    products = json.loads(response.body_as_unicode())

    for product_el in products:
        # Skip winter tyres.
        if product_el['winter'] != '0':
            continue

        loader = ProductLoader(item=Product(), selector=product_el)
        brand = product_el['tyreMake'].title()
        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        load_rating = product_el['loadrating']
        speed_rating = product_el['tyreSpeed']
        loader.add_value('price', product_el['priceVat'])
        loader.add_value('identifier', product_el['id'])
        loader.add_value(
            'url',
            urljoin('http://www.etyres.co.uk/tyre-detail/',
                    product_el['URLString']))

        # Only set an image when this product has one.  The previous code
        # tested a variable that was unbound for a first image-less
        # product and stale afterwards, so a product could get the
        # previous product's image.
        if product_el['tyreModelImage2']:
            image_url = 'images/' + product_el['tyreModelImage2']
            loader.add_value('image_url', urljoin(base_url, image_url))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['xl'] = 'Yes' if product_el['tyreReinforced'] == 'T' else 'No'
        metadata['run_flat'] = 'Yes' if product_el['runflat'] == '1' else 'No'

        name = product_el['tyreModel']
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                man_code = man_mark
                break
        if not man_code:
            for code, man_mark in self.custom_man_marks.iteritems():
                if name.endswith(code):
                    name = name.partition(code)[0]
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
             speed_rating))

        name = name.replace(' EXTRA LOAD', '')
        name = name.replace(' RUNFLAT', '')
        loader.add_value('name', name.strip())

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Use the computed alternative speed; failing that, keep the old
        # speed rating as the alternative when it is about to be replaced.
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_products(self, response):
    """Yield tyre Products from a listing page and follow the next page.

    ``response.meta['product_data']`` supplies the searched size (Width,
    Aspect Ratio, Rim) and the Speed rating / Alt Speed used to split
    each listed name into brand, load rating and pattern name.  Entries
    without a link or a title, and names that cannot be parsed, are
    skipped (the latter with log messages).
    """
    hxs = HtmlXPathSelector(response)
    product_data = response.meta['product_data']
    width = product_data['Width']
    aspect_ratio = product_data['Aspect Ratio']
    rim = product_data['Rim']
    speed_rating = product_data['Speed rating']
    alt_speed = product_data['Alt Speed']

    # Tried in order: with the speed rating, with the alternative speed,
    # then without any rating.
    name_reg = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (
        width, rim, speed_rating.upper())
    name_reg2 = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (
        width, rim, alt_speed.upper())
    name_reg3 = r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim)

    products = hxs.select(
        '//div[@id="product-listing"]//div[@class="product"]/..')
    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        # Only a missing node is expected here, so catch IndexError
        # rather than everything.
        try:
            url = product_el.select(
                './/div[@class="title"]/a/@href')[0].extract()
        except IndexError:
            continue
        loader.add_value('url', url)
        loader.add_value(
            'identifier',
            product_el.select(".//span[@class='addcompare']/input/@id").
            extract()[0].split(":")[1])
        loader.add_xpath('price', './/span[@class="prodPirce"]/text()')

        try:
            name = product_el.select(
                './/div[@class="title"]/a/text()')[0].extract()
        except IndexError:
            continue

        run_flat_found = is_run_flat(name)

        if not re.search(r'(\(.*\))', name):
            match = re.search(name_reg, name)
            if not match:
                match = re.search(name_reg2, name)
            if not match:
                match = re.search(name_reg3, name)
            if match:
                name_parts = match.groups()
            else:
                self.log('Failed parsing ' + name)
                self.log('URL: ' + response.url)
                self.log('Params: ' + ", ".join(
                    map(str, [width, rim, speed_rating.upper()])))
                continue
        else:
            # Names with a parenthesised part: brand is the first word,
            # load rating is the number before the speed letter, and the
            # pattern is everything else before the parenthesis.
            name_parts = []
            name_parts.append(name.split()[0])
            load_rating_reg = re.search(r'(\d+)%s' % speed_rating.upper(),
                                        name)
            if not load_rating_reg:
                load_rating_reg = re.search(r'(\d+)%s' % alt_speed.upper(),
                                            name)
            if not load_rating_reg:
                self.log('Failed parsing ' + name)
                self.log('URL: ' + response.url)
                self.log('Params: ' + ", ".join(
                    map(str, [width, rim, speed_rating.upper()])))
                continue
            name_parts.append(load_rating_reg.groups()[0])
            name_parts.append(' '.join(name.split()[1:]).split('(')[0])

        loader.add_value(
            'name',
            name_parts[-1].replace('XL', '').replace('ROF', '').replace('RFT', ''))
        brand = name_parts[0]
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        loader.add_xpath('image_url',
                         './/a[contains(@class, "tyre")]/img/@src')

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['width'] = width
        metadata['speed_rating'] = speed_rating.upper()
        metadata['load_rating'] = name_parts[1]
        upper_name = name.upper()
        if 'ROF' in upper_name or 'RFT' in upper_name or run_flat_found:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'
        if 'XL' in upper_name:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        metadata['fitting_method'] = 'Fitted'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(
            name_parts[-1])

        # Fuel / grip / noise values are encoded in link class names.
        fuel = product_el.select(
            './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "fuel_")]/@class'
        ).re(r'fuel_(\w)')
        metadata['fuel'] = fuel[0] if fuel else ''
        grip = product_el.select(
            './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "grip_")]/@class'
        ).re(r'grip_(\w)')
        metadata['grip'] = grip[0] if grip else ''
        noise = product_el.select(
            './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "noise_")]/@class'
        ).re(r'_(\d+)')
        metadata['noise'] = noise[-1] if noise else ''

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            self.log('The product is not correct: %r' % product)
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product

    next_page = hxs.select('//span[@class="nextlink"]/a/@href')
    if next_page:
        yield Request(next_page.extract()[0],
                      callback=self.parse_products,
                      meta=response.meta)
def parse_product(self, response):
    """Crawl one page: follow its product links and yield the products on it.

    The page is handled in four passes:

    1. Links taken from ``onclick="assign('...')"`` handlers and from
       anchors are requested again with this same callback.
    2. The page itself is treated as a single product whose name is the
       ``<title>`` text; it is yielded only if a price was extracted.
    3. Every ``td.DefaultFont`` cell that has a price is yielded.
    4. Every ``td#LIGHTSTRIP`` cell that has a name and a price is yielded.

    Pages whose URL contains ``TERMS`` or ``ABOUTUS`` and non-HTML
    responses produce nothing.

    :param response: the downloaded page.
    :returns: generator of ``Request`` objects and loaded product items.
    """
    if 'TERMS' in response.url or 'ABOUTUS' in response.url:
        return
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    # The base URL does not change while parsing one response.
    base_url = get_base_url(response)

    # --- 1. follow links to further pages -------------------------------
    links = hxs.select(
        u'//td[@class="linkcell"]/div[@onclick]/@onclick'
        u' | //div[@id="DARKSTRIP"]//td[@onclick]/@onclick'
    ).re(r"assign\('(.*)'")
    links += hxs.select(u'//div[@id="DARKSTRIP"]//a/@href').extract()
    # NOTE(review): the predicate ["HDlistTitlefont"] is a constant string,
    # which XPath treats as true, so this matches the href of *every* <a>.
    # Possibly @class="HDlistTitlefont" was intended -- confirm before
    # narrowing, since that would change which pages get crawled.
    links += hxs.select(u'//a["HDlistTitlefont"]/@href').extract()
    for link in set(links):
        link = urljoin_rfc(base_url, '/' + link)
        if 'javascript' not in link and 'Javascript' not in link:
            yield Request(link, callback=self.parse_product)

    # --- 2. the page itself as a product --------------------------------
    titles = hxs.select(u'//title/text()').extract()
    # A page without a <title> cannot be named; skip only this pass so the
    # listing passes below still run instead of dying on an IndexError.
    if titles:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', re.sub('\n', ' ', titles[0]))
        product_loader.add_value('url', response.url)
        product_loader.add_xpath(
            'price', u'//div[@class="HDPriceHD"]//span/text()',
            re='\xa3(.*)')
        product_loader.add_xpath(
            'price', u'//div[@id="MASTER"]//td[@valign]/text()',
            re='\xa3(.*)')
        product_loader.add_xpath(
            'price', u'//div[@class="HDPriceRRP"]//text()',
            re='.*?\xa3(.*)')
        if product_loader.get_output_value('price'):
            yield product_loader.load_item()

    # --- 3. products listed in td.DefaultFont cells ---------------------
    for cell in hxs.select(u'//td[@class="DefaultFont"]'):
        price = cell.select(u'.//p/text()').re('\xa3(.*)\\)')
        if not price:
            continue
        name = cell.select(u'.//p/strong/text()').extract()
        hrefs = cell.select(u'.//a[child::u]/@href').extract()
        # Fall back to the listing page when the cell has no own link.
        url = urljoin_rfc(base_url, '/' + hrefs[0]) if hrefs else response.url

        product_loader = ProductLoader(item=Product(), selector=cell)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        yield product_loader.load_item()

    # --- 4. products listed in td#LIGHTSTRIP cells ----------------------
    for cell in hxs.select(u'//td[@id="LIGHTSTRIP"]'):
        name_parts = cell.select(
            u'.//a[@class="DefaultFont" and contains(@style,"#000000")]/text()'
        ).extract()
        if not name_parts:
            # No name anchor in this cell; indexing [0] here used to raise
            # IndexError and abort the rest of the page.
            continue
        name = ' '.join(part.strip() for part in name_parts)

        # Prefer the "Hot Deal" price, fall back to the RRP.
        price = cell.select(
            u'.//span[contains(text(),"Hot Deal - only")]/span/text()'
        ).re('\xa3(.*)')
        if not price:
            price = cell.select(
                u'.//span[@class="DefaultFont" and contains(text(),"RRP")]/text()'
            ).re('\xa3(.*)')
        if not price:
            continue

        hrefs = cell.select(u'.//a[@class="DefaultFont"]/@href').extract()
        url = urljoin_rfc(base_url, '/' + hrefs[0]) if hrefs else response.url

        product_loader = ProductLoader(item=Product(), selector=cell)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        yield product_loader.load_item()