def parse_product(self, response):
    """Parse a product detail page and yield a single loaded product item.

    The name, SKU, image URL, price, identifier, page URL and (when the
    page marks the product as out of stock) a zero stock value are fed
    into a ``ProductLoader``.  A shipping cost of ``'49'`` is added when
    the first collected price is below 1000.

    Args:
        response: the response for the product page.

    Yields:
        The item returned by ``product_loader.load_item()``.

    Raises:
        IndexError: if no price text is found inside the element marked
            ``itemprop="offers"``.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    name = hxs.select('//*[@itemprop="name"]/text()').extract()
    name = name[0].strip() if name else ''
    product_loader.add_value('name', name)

    # The SKU is the longest run of digits, commas and dots in the name;
    # max() returns the first maximal element, so ties go to the earliest
    # run.  An empty SKU is used when the name has no such run.
    numeric_runs = re.findall(r"([\d,\.]+)", name)
    sku = max(numeric_runs, key=len) if numeric_runs else ''
    product_loader.add_value('sku', sku)

    image_url = hxs.select(
        '//img[contains(@class, "productimage") and contains(@class, "main")]/@src'
    ).extract()
    product_loader.add_value('image_url', image_url)

    # Only the first numeric-looking fragment of the price text is used.
    price_text = hxs.select(
        '//div[@itemprop="offers"]//*[@itemprop="price"]/text()'
    ).re(r'[\d,. ]+')[0]
    # Drop spaces, a trailing ",-" and non-breaking spaces, then turn the
    # decimal comma into a dot before handing the text to extract_price.
    price = (price_text.strip()
             .replace(' ', '')
             .replace(',-', '')
             .replace(u'\xa0', '')
             .replace(',', '.'))
    product_loader.add_value('price', extract_price(price))

    collected_price = product_loader.get_collected_values('price')
    if collected_price and collected_price[0] < 1000:
        product_loader.add_value('shipping_cost', '49')

    # Raw string: same pattern as before, without an invalid "\d" escape
    # in a normal string literal.
    identifier = hxs.select(
        '//div[@id="description-extra"]/text()').re(r'\d+')
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('url', response.url)

    out_stock = hxs.select(
        '//*[@itemprop="availability" and contains(@href, "OutOfStock")]')
    if out_stock:
        product_loader.add_value('stock', 0)

    yield product_loader.load_item()
def parse_product_list(self, response):
    """Parse a product listing page.

    Yields one loaded product item per ``div.product-wrapper`` element,
    followed by one ``Request`` per pagination link, each handled again
    by this same method.

    Args:
        response: the response for the listing page.

    Raises:
        IndexError: if a product element lacks a name, price, info-link
            or product-id node.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for node in hxs.select('//div[@class="product-wrapper"]'):
        loader = ProductLoader(item=Product(), selector=node)

        name = node.select('.//h3//text()').extract()[0]
        loader.add_value('name', name)

        # Longest run of digits/commas/dots in the name; the earliest run
        # wins a tie, and the SKU is empty when there is no run at all.
        numeric_runs = re.findall(r"([\d,\.]+)", name)
        if numeric_runs:
            sku = max(numeric_runs, key=len)
        else:
            sku = ''
        loader.add_value('sku', sku)

        image_sources = node.select(
            './div[@class="product-image"]//img/@data-original').extract()
        if image_sources:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_sources[0]))

        raw_price = node.select(
            './div[@class="product-price"]//span[@class="price-amount"]/text()'
        ).extract()[0]
        # strip(' Kr') removes any of the characters ' ', 'K', 'r' from
        # both ends; the dots are then removed entirely.
        price = raw_price.strip().strip(' Kr').replace('.', '')
        loader.add_value('price', extract_price(price))

        collected_price = loader.get_collected_values('price')
        if collected_price and collected_price[0] < 1500:
            loader.add_value('shipping_cost', '49')

        # No buy button on the tile means the product is reported with
        # zero stock.
        has_buy_button = node.select(
            './div[@class="product-buttons"]/a[@class="buy-button"]')
        if not has_buy_button:
            loader.add_value('stock', 0)

        info_href = node.select(
            './div[@class="product-buttons"]/a[@class="button-info"]/@href'
        ).extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, info_href))

        product_id = node.select(
            './div[@class="product-name"]//@data-productid').extract()[0]
        loader.add_value('identifier', product_id)

        yield loader.load_item()

    # Follow every pagination link after all products have been yielded.
    for page_href in hxs.select('//a[@class="paging-link-box"]/@href').extract():
        yield Request(urljoin_rfc(base_url, page_href),
                      callback=self.parse_product_list)
def parse_product(self, response):
    """Parse a product detail page and yield a single loaded product item.

    The page URL, name, absolute image URL, identifier, SKU (the same
    value as the identifier), price and the fixed brand ``'Lego'`` are
    fed into a ``ProductLoader``.  A shipping cost of ``'49'`` is added
    when the first collected price is below 1000.

    Args:
        response: the response for the product page.

    Yields:
        The item returned by the loader's ``load_item()``.

    Raises:
        IndexError: if the name, image or SKU node is missing.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_value('url', response.url)

    title = hxs.select('//div[@class="product-name"]/h1/text()').extract()[0]
    loader.add_value('name', title)

    image_src = hxs.select(
        '//div[contains(@class, "img-box")]//img/@src').extract()[0]
    loader.add_value('image_url', urljoin_rfc(base_url, image_src))

    identifier = hxs.select('//span[@class="sku"]/text()').extract()[0].strip()
    loader.add_value('identifier', identifier)
    # The SKU is the identifier text, unchanged.
    loader.add_value('sku', identifier)

    price_texts = hxs.select(
        '//div[@class="product-type-data"]/div[@class="price-box"]//span[@class="price"]/text()'
    ).extract()
    # The last price node is used; fall back to '0.00' when none exists.
    if price_texts:
        price = price_texts[-1].strip()
    else:
        price = '0.00'
    normalized_price = (price.replace(',', '.')
                        .replace(' ', '')
                        .replace(u'\xa0', ''))
    loader.add_value('price', normalized_price)

    collected_price = loader.get_collected_values('price')
    if collected_price and collected_price[0] < 1000:
        loader.add_value('shipping_cost', '49')

    loader.add_value('brand', 'Lego')

    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page; yield the product or follow its option links.

    Identifier, URL, name, SKU, price, category, image URL and the fixed
    brand ``'lego'`` are fed into a ``ProductLoader``.  A shipping cost
    of ``'29'`` is added when the first collected price is below 600.

    If the loaded item has a price it is yielded.  Otherwise every link
    found under ``div.cont`` is requested and parsed by this same method,
    carrying over ``response.meta``.

    Args:
        response: the response for the product page.

    Yields:
        Either the loaded product item or ``Request`` objects for the
        option pages.
    """
    import re

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_xpath('identifier', '//input[@id="product_id"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')

    # SKU: first run of three or more digits in the heading.  The match
    # is tested explicitly instead of wrapping the lookup in a bare
    # ``except``, which would also hide unrelated errors.
    heading = ''.join(hxs.select('//h1/text()').extract())
    sku_match = re.search(r'(\d{3}\d*)', heading)
    if sku_match:
        loader.add_value('sku', sku_match.group(1))
    else:
        # Fall back to the hidden product id input.
        fallback_sku = hxs.select(
            '//input[@id="product_productID"]/@value').extract()
        if fallback_sku:
            loader.add_value('sku', fallback_sku[0].strip())
        else:
            self.log('No SKU for %s' % (response.url))

    loader.add_xpath('price', '//*[@itemprop="price"]/text()')
    loader.add_xpath(
        'category',
        '//ul[@id="mnu_main"]/li[contains(@class, "selected")]//a/text()')

    img = hxs.select('//div[@class="img"]//a/@href').extract()
    if img:
        loader.add_value('image_url', urljoin_rfc(base_url, img[0]))

    loader.add_value('brand', 'lego')

    collected_price = loader.get_collected_values('price')
    if collected_price and collected_price[0] < 600:
        loader.add_value('shipping_cost', '29')

    prod = loader.load_item()
    if prod.get('price'):
        yield prod
    else:
        # No price on this page: follow the option links instead.
        for opt in hxs.select('//div[@class="cont"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, opt),
                          callback=self.parse_product,
                          meta=response.meta)
def parse_product(self, response):
    """Parse a product detail page and yield a single loaded product item.

    The identifier is taken from the ``wl.addProductItem(<id>,`` onclick
    handler in the raw page body.  URL, name, SKU, price, image URL, the
    fixed category ``'Lego'`` and brand ``'lego'`` are fed into a
    ``ProductLoader``.  A shipping cost of ``'49'`` is added when the
    first collected price is below 400.  Stock is ``'1'`` when the body
    contains an ``ItemData`` line mentioning "Finns i lager", else ``'0'``.

    Args:
        response: the response for the product page.

    Yields:
        The item returned by ``loader.load_item()``.

    Raises:
        AttributeError: if the body has no ``wl.addProductItem`` onclick
            handler, because the identifier regex then returns ``None``.
    """
    import re

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_value(
        'identifier',
        re.search(r'onclick="wl\.addProductItem\((\d+),',
                  response.body).groups()[0])
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//text()')

    # SKU: first run of three or more digits in the heading.  The match
    # is tested explicitly instead of wrapping the lookup in a bare
    # ``except``, which would also hide unrelated errors.
    heading = ''.join(hxs.select('//h1//text()').extract())
    sku_match = re.search(r'(\d{3}\d*)', heading)
    if sku_match:
        loader.add_value('sku', sku_match.group(1))
    else:
        self.log('No SKU for %s' % (response.url))

    price_text = ''.join(
        hxs.select('//div[@class="price"]/text()').extract())
    loader.add_value('price', extract_price(price_text.replace(' ', '')))

    collected_price = loader.get_collected_values('price')
    if collected_price and collected_price[0] < 400:
        loader.add_value('shipping_cost', '49')

    loader.add_value('category', 'Lego')

    img = hxs.select('//div[@class="image"]//img/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))

    loader.add_value('brand', 'lego')

    if re.search('ItemData .*Finns i lager.*', response.body):
        loader.add_value('stock', '1')
    else:
        loader.add_value('stock', '0')

    yield loader.load_item()
def parse_product(self, response):
    """Parse a product detail page and yield a single loaded product item.

    The identifier (also used as SKU) comes from the add-to-cart form.
    Name, brand, category and image URL come from ``response.meta``.  The
    price is read from the page and falls back to ``meta['price']`` when
    the page has none.  Nothing is yielded when no usable price was
    collected.

    Stock is 1 when the add-to-cart button text contains any of
    "Add to Cart", "Coming Soon", "Very Limited Stock!" or
    "Back Order Only"; otherwise it is 0.

    Args:
        response: the response for the product page.

    Yields:
        The item returned by ``loader.load_item()``.

    Raises:
        IndexError: if the add-to-cart form has no ``product`` input.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta.copy()

    # The last matching input value is used as the identifier.
    identifier = hxs.select(
        '//form[@id="product_addtocart_form"]'
        '//input[@name="product"]/@value').extract()[-1].strip()
    shipping_cost = hxs.select(
        '//li[contains(string(), "Shipping cost:")]/span[@class="price"]/text()'
    ).extract()
    price = "".join(
        hxs.select('//span[@id="product-price-%s"]//text()'
                   % identifier).extract()).strip()

    loader = ProductLoader(item=Product(), selector=response)
    loader.add_value('sku', identifier)
    loader.add_value('identifier', identifier)
    loader.add_value('name', meta.get('name'))

    if price:
        loader.add_value('price', price)
    else:
        loader.add_value('price', meta.get('price'))

    # Skip products without a usable price.  The list may be empty when
    # neither the page nor meta supplied one, so test that before
    # indexing; indexing it directly would raise IndexError.
    collected_price = loader.get_collected_values("price")
    if not collected_price or not collected_price[0]:
        return

    loader.add_value('url', response.url)
    loader.add_value('brand', meta.get('brand', ''))
    loader.add_value('category', meta.get('category', ''))
    if shipping_cost:
        loader.add_value('shipping_cost', shipping_cost[-1])
    loader.add_value('image_url', meta.get('image_url', ''))

    # Button labels that count as "in stock".
    button_xpath = ('//form//div[@class="add-to-cart-btn"]'
                    '/button[contains(string(), "%s")]')
    in_stock_labels = (
        "Add to Cart",
        "Coming Soon",
        "Very Limited Stock!",
        "Back Order Only",
    )
    in_stock = any(hxs.select(button_xpath % label)
                   for label in in_stock_labels)
    loader.add_value('stock', 1 if in_stock else 0)

    yield loader.load_item()