def parse_product(self, response):
    """Scrape one product page and yield a single ``Product`` item.

    Non-HTML responses are ignored.  The name and the price are both
    required: when either XPath matches nothing, the resulting
    ``IndexError`` is caught and the page is dropped without yielding
    anything.  A missing model number is stored as an empty SKU.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    item_loader = WindowsCleaningProductLoader(item=Product(),
                                               response=response)

    try:
        item_loader.add_value('url', response.url)

        names = selector.select(
            '//div[@class="bigbox"]/div[@class="top"]/text()').extract()
        item_loader.add_value('name', names[0])

        prices = selector.select(
            '//div[@class="priceAmount"]/text()').extract()
        item_loader.add_value('price', prices[0])

        # The model number is optional; fall back to an empty SKU.
        models = selector.select(
            '//font[@class="content" and contains(text(), "Model")]/../../td[2]/font/text()'
        ).extract()
        item_loader.add_value('sku', models[0] if models else '')

        yield item_loader.load_item()
    except IndexError:
        # A mandatory field (name or price) was not found on the page.
        return
def parse_product(self, response):
    """Scrape a product page, fanning out over the size ``<select>`` first.

    Non-HTML responses are ignored.  When the detail section contains a
    ``<select>`` and ``response.meta`` has no ``'size'`` key, one form
    request is issued per option (the first option is skipped) with the
    option text stored in ``meta['size']``; each of those responses comes
    back to this same callback.  Otherwise a single ``Product`` item is
    yielded, with the size appended to the name when one is present.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    select_names = selector.select(
        '//div[@id="product-detail-div"]//select/@name').extract()
    options = selector.select(
        '//div[@id="product-detail-div"]//select/option')

    has_size = 'size' in response.meta
    if select_names and not has_size:
        # The first <option> is dropped -- presumably a placeholder entry;
        # confirm against the live page.
        for option in options[1:]:
            size_request = FormRequest.from_response(
                response,
                formdata={
                    select_names[0]: option.select('./@value').extract()
                },
                dont_click=True,
                callback=self.parse_product)
            option_texts = option.select('./text()').extract()
            size_request.meta['size'] = option_texts[0].strip()
            yield size_request
        return

    item_loader = WindowsCleaningProductLoader(item=Product(),
                                               response=response)
    item_loader.add_value('url', response.url)

    headings = selector.select(
        '//div[@id="product-detail-div"]//h1/text()').extract()
    title = headings[0].strip()
    if has_size:
        title = title + ' ' + response.meta['size']
    item_loader.add_value('name', title)

    # The sale price wins over the regular ("cost") price when both exist.
    prices = selector.select(
        '//span[@class="prod-detail-sale-value"]/text()').extract()
    if not prices:
        prices = selector.select(
            '//span[@class="prod-detail-cost-value"]/text()').extract()
    item_loader.add_value('price', prices[0])

    part_numbers = selector.select(
        '//span[@class="prod-detail-man-part-value"]/text()').extract()
    item_loader.add_value('sku', part_numbers[0] if part_numbers else '')

    yield item_loader.load_item()
def parse_product(self, response):
    """Yield ``Product`` items from a page in one of three layouts.

    Non-HTML responses are ignored.  The layouts are tried in order and
    the first one that matches is the only one used:

    1. a product list (containers of ``DarkCell`` cells) -- one item per
       container;
    2. a compound product with a ``variants`` ``<select>`` -- one item
       per option that carries a price;
    3. a simple product -- a single item.

    Layouts 2 and 3 need the ``ProductNameText`` span; without it the
    page is dropped.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)

    # --- Layout 1: product list -----------------------------------------
    list_nodes = selector.select('//td[@class="DarkCell"]/../..')
    if list_nodes:
        for entry in list_nodes:
            entry_loader = WindowsCleaningProductLoader(item=Product(),
                                                        response=response)
            entry_loader.add_value('url', response.url)

            font_texts = entry.select('.//font/text()').extract()
            if not font_texts:
                # A nameless entry ends processing of the whole page.
                return
            entry_loader.add_value('name', font_texts[0])

            # Variant price first, sale price as fallback; IndexError
            # propagates if neither is present.
            price_hits = entry.select(
                './/span[@class="variantprice"]//text()').re('Price:.*?(\d.*)')
            if not price_hits:
                price_hits = entry.select(
                    './/span[@class="SalePrice"]//text()').re('.*?\$(\d.*)')
            entry_loader.add_value('price', price_hits[0])

            sku_texts = entry.select(
                './/*[contains(text(), "SKU")]/../td[2]/text()').extract()
            entry_loader.add_value('sku', sku_texts[0] if sku_texts else '')

            yield entry_loader.load_item()
        return

    # --- Layouts 2 and 3 share the product name -------------------------
    descriptions = selector.select(
        '//span[@class="ProductNameText"]/text()').extract()
    if not descriptions:
        return
    common_desc = descriptions[0]

    # --- Layout 2: compound product -------------------------------------
    variant_options = selector.select('//select[@name="variants"]/option')
    if variant_options:
        for option in variant_options:
            option_loader = WindowsCleaningProductLoader(item=Product(),
                                                         response=response)
            option_loader.add_value('url', response.url)

            # Keep only the label part before the first non-breaking space.
            label = option.select('./text()')[0].extract().split(u'\xa0')[0]
            option_loader.add_value('name', common_desc + ' ' + label)

            try:
                option_price = option.select(
                    './span/text()').re('([\d\.,]+)')[0]
                option_loader.add_value('price', option_price)
            except IndexError:
                # Options without a price are skipped.
                continue
            yield option_loader.load_item()
        return

    # --- Layout 3: simple product ---------------------------------------
    simple_loader = WindowsCleaningProductLoader(item=Product(),
                                                 response=response)
    simple_loader.add_value('url', response.url)
    simple_loader.add_value('name', common_desc)

    price_hits = selector.select(
        '//span[@class="variantprice"]/text()').re('.*?\$(.*)')
    if not price_hits:
        price_hits = selector.select(
            '//span[@class="SalePrice"]/text()').re('.*?\$(.*)')
    simple_loader.add_value('price', price_hits[0])

    sku_texts = selector.select(
        '//td[contains(text(), "SKU")]/../td[2]/text()').extract()
    simple_loader.add_value('sku', sku_texts[0] if sku_texts else '')

    yield simple_loader.load_item()
def parse_product(self, response):
    """Yield ``Product`` items from a page in one of four layouts.

    Non-HTML responses are ignored.  The numeric product id is taken from
    the ``/p-<id>-`` part of the URL; a URL without it is logged and
    skipped.  The product name comes from the ``ProductNameText`` span or,
    failing that, the first ``<h1>``; a page with neither is dropped.

    Layouts are tried in order and the first match is the only one used:

    1. rows of a table that has a ``DarkCell`` header row;
    2. tables containing ``GreyCell`` cells;
    3. options of the ``variants`` ``<select>``;
    4. a simple product, expanded into one item per ``Size`` option
       when such a ``<select>`` exists.

    Sub-product identifiers are ``<product id>.<variant id>`` (or
    ``<product id>.<index>`` for sizes).  When no price is found,
    ``'0.00'`` is used; ``stock`` is set to 0 whenever the loader's
    price output is falsy.
    """
    if not isinstance(response, HtmlResponse):
        return

    # re.search() returns None on a miss; guard it instead of letting
    # .group() raise AttributeError.
    id_match = re.search(r'/p-(\d+)-', response.url)
    if id_match is None:
        self.log('Product identifier not found in URL: %s' % response.url)
        return
    identifier = id_match.group(1)

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    image_url = hxs.select(
        '//img[contains(@id,"ProductPic")]/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else None
    category = hxs.select(
        '//span[@id="nevTabLink"]/span/a/text()').extract()

    common_desc = hxs.select(
        '//span[@class="ProductNameText"]/text()').extract()
    if not common_desc:
        common_desc = hxs.select('//h1/text()').extract()
    if not common_desc:
        return
    common_desc = common_desc[0]

    # --- Layout 1: rows of the "DarkCell" table -------------------------
    product_list_nodes = hxs.select(
        '//table[tr[@class="DarkCell"]]/tr[not(@class="DarkCell")]')
    if product_list_nodes:
        for node in product_list_nodes:
            loader = WindowsCleaningProductLoader(item=Product(),
                                                  response=response)
            sub_product_id = node.select(
                './/input[@name="VariantID"]/@value').extract()
            if not sub_product_id:
                continue
            loader.add_value('identifier',
                             '%s.%s' % (identifier, sub_product_id[0]))
            loader.add_value('url', response.url)
            if image_url:
                loader.add_value('image_url', image_url)
            if category:
                loader.add_value('category', category)

            # The first <font> text is the name; the second one doubles
            # as the SKU when no explicit SKU cell exists.
            font_texts = node.select('.//font/text()').extract()
            if not font_texts:
                return
            loader.add_value('name', common_desc + ' ' + font_texts[0])

            price = node.select(
                './/span[@class="variantprice"]//text()').re(r'Price:.*?(\d.*)')
            if not price:
                price = node.select(
                    './/span[@class="SalePrice"]//text()').re(r'.*?\$(\d.*)')
            price = price[0] if price else '0.00'
            loader.add_value('price', price)
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)

            sku = node.select(
                './/*[contains(text(), "SKU")]/../td[2]/text()').extract()
            if not sku:
                # The fallback needs a *second* font text.  The previous
                # guard only checked for emptiness, which can never be
                # true here, and then indexed [1].
                if len(font_texts) < 2:
                    return
                sku = font_texts[1]
            loader.add_value('sku', sku)
            yield loader.load_item()
        return

    # --- Layout 2: one table per sub-product ("GreyCell") ---------------
    product_list_nodes = hxs.select('//table[tr/td[@class="GreyCell"]]')
    if product_list_nodes:
        for node in product_list_nodes:
            loader = WindowsCleaningProductLoader(item=Product(),
                                                  response=response)
            loader.add_value('url', response.url)
            sub_product_id = node.select(
                './/input[@name="VariantID"]/@value').extract()
            if not sub_product_id:
                continue
            sub_product_id = sub_product_id[0]
            loader.add_value('identifier',
                             '%s.%s' % (identifier, sub_product_id))

            # Each sub-product has its own picture, keyed by variant id.
            sub_prod_image = node.select(
                './/img[@id="ProductPic' + sub_product_id + '"]/@src'
            ).extract()
            if sub_prod_image:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, sub_prod_image[0]))
            if category:
                loader.add_value('category', category)

            name = node.select(
                'tr/td[@class="DarkCell"]/font/text()').extract()
            if not name:
                return
            loader.add_value('name', common_desc + ' ' + name[0])

            sku = node.select(
                './/tr[td[contains(text(), "SKU")]]/td[not(contains(text(), "SKU"))]/text()'
            ).extract()
            if sku:
                loader.add_value('sku', sku)

            price = node.select(
                './/span[@class="variantprice"]/text()').re(r'([\d\.,]+)')
            price = price if price else '0.00'
            loader.add_value('price', price)
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)
            yield loader.load_item()
        return

    # --- Layout 3: options of the "variants" <select> -------------------
    sub_products = hxs.select('//select[@name="variants"]/option')
    if sub_products:
        for node in sub_products:
            loader = WindowsCleaningProductLoader(item=Product(),
                                                  response=response)
            loader.add_value('url', response.url)
            sub_product_id = node.select('@value').extract()
            if not sub_product_id:
                continue
            loader.add_value('identifier',
                             '%s.%s' % (identifier, sub_product_id[0]))
            if image_url:
                loader.add_value('image_url', image_url)
            if category:
                loader.add_value('category', category)

            # Keep only the label part before the first non-breaking space.
            name = common_desc + ' ' + node.select(
                './text()')[0].extract().split(u'\xa0')[0]
            loader.add_value('name', name)

            price = node.select('./span/text()').re(r'([\d\.,]+)')
            price = price if price else '0.00'
            loader.add_value('price', price)
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)
            yield loader.load_item()
        return

    # --- Layout 4: simple product ---------------------------------------
    loader = WindowsCleaningProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    if image_url:
        loader.add_value('image_url', image_url)
    if category:
        loader.add_value('category', category)
    loader.add_value('name', common_desc)
    loader.add_value('identifier', identifier)

    price = hxs.select('//span[@class="variantprice"]/text()').re(
        r'.*?\$(.*)')
    if not price:
        price = hxs.select('//span[@class="SalePrice"]/text()').re(
            r'.*?\$(.*)')
    price = price[0] if price else '0.00'
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    sku = hxs.select(
        '//td[contains(text(), "SKU")]/../td[2]/text()').extract()
    loader.add_value('sku', sku if sku else '')

    size_options = hxs.select(
        '//select[@name="Size"]/option[not(contains(text(),"Size"))]/text()'
    ).extract()
    base_item = loader.load_item()
    if not size_options:
        yield base_item
        return

    # One item per size, each a copy of the base item with the size index
    # appended to the identifier and the size label appended to the name.
    for index, size in enumerate(size_options):
        item = copy(base_item)
        item['identifier'] += '.%s' % index
        item['name'] += ' %s' % size
        yield item
def parse_product(self, response):
    """Scrape a product page, fanning out over the size ``<select>`` first.

    Non-HTML responses are ignored.  When the detail section contains a
    ``<select>`` and ``response.meta`` has no ``'size'`` key, one form
    request is issued per option (the first option is skipped) with the
    option text stored in ``meta['size']``; those responses come back to
    this callback.  Otherwise a single ``Product`` item is yielded.

    The identifier is ``<productDetailsID>.<item number>``; when either
    part is missing the page is logged and skipped.  When no price is
    found, ``'0.00'`` is used; ``stock`` is set to 0 whenever the
    loader's price output is falsy.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name = hxs.select(
        '//div[@id="product-detail-div"]//select/@name').extract()
    subproducts = hxs.select(
        '//div[@id="product-detail-div"]//select/option')
    if name and 'size' not in response.meta:
        # The first <option> is dropped -- presumably a placeholder entry;
        # confirm against the live page.
        for subproduct in subproducts[1:]:
            request = FormRequest.from_response(
                response,
                formdata={
                    name[0]: subproduct.select('./@value').extract()
                },
                dont_click=True,
                callback=self.parse_product)
            request.meta['size'] = subproduct.select(
                './text()')[0].extract().strip()
            yield request
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    name = hxs.select(
        '//div[@id="product-detail-div"]/h1/text()').extract()[0].strip()
    if 'size' in response.meta:
        name += ' ' + response.meta['size']
    loader.add_value('name', name)

    # Sale price wins over the regular price.  The regular price used to
    # be read with ``select(...)[0]``, which raised IndexError on a page
    # without that span and made the '0.00' fallback unreachable.
    price = hxs.select(
        '//span[@class="prod-detail-sale-value"]/text()').extract()
    if not price:
        cost = hxs.select(
            '//span[@class="prod-detail-cost-value"]/text()').extract()
        price = cost[0] if cost else ''
    price = price if price else '0.00'
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    # The same two page values feed both fields, with opposite priority:
    # the SKU prefers the mpn, the identifier prefers the part number.
    mpn = hxs.select('//*[@itemprop="mpn"]/text()').extract()
    part_number = hxs.select(
        '//span[@class="prod-detail-part-value"]/text()').extract()
    sku = mpn or part_number
    if sku:
        sku = sku[0]
    item_number = part_number or mpn

    product_id = hxs.select(
        '//input[@type="hidden" and @class="productDetailsID"]/@value'
    ).extract()
    if item_number and product_id:
        identifier = '%s.%s' % (product_id[0], item_number[0])
    else:
        log.msg('Identifier not found: [%s]' % response.url)
        return

    loader.add_value('sku', sku)
    loader.add_value('identifier', identifier)

    image_url = hxs.select('//a[@id="Zoomer"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    brand = hxs.select('//*[@itemprop="manufacturer"]/text()').extract()
    if brand:
        loader.add_value('brand', brand[0])

    # The last breadcrumb link is used as the category.
    category = hxs.select(
        '//div[contains(@id,"breadcrumb")]//a/text()').extract()
    if category:
        loader.add_value('category', category[-1])

    yield loader.load_item()
def parse_product(self, response):
    """Yield ``Product`` items for a grouped, configurable or simple product.

    Non-HTML responses are ignored.  The identifier is read from the
    hidden ``product`` input (``IndexError`` if it is absent).

    * Grouped product (``super-product-table`` has rows after the first):
      one item per row, sorted by the text of the first cell, with
      identifier ``<id>.<row index>``; ``stock`` is set to 0 when the
      third cell has no ``<input>``.
    * Configurable product (page body contains ``spConfig``): one item
      per attribute option, with identifier ``<id>.<attribute id>.<option
      id>`` and price ``basePrice + option price``.
    * Otherwise a single item.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    identifier = hxs.select(
        '//input[@type="hidden" and @name="product"]/@value')[0].extract()
    image_url = hxs.select(
        '//div[@class="onsale-product-container"]/a/img/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//p[@class="product-image"]/a[@id="zoom1"]/@href').extract()
    category = hxs.select('//div[@class="breadcrumbs"]//a/text()').extract()

    loader = WindowsCleaningProductLoader(item=Product(), selector=hxs)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    if category:
        loader.add_value('category', category[-1])

    # --- Grouped product ------------------------------------------------
    # The first table row is skipped.
    sub_products = hxs.select('//table[@id="super-product-table"]//tr')[1:]
    if sub_products:
        # Fields shared by every row (image, category).
        base_item = loader.load_item()
        sub_products.sort(
            key=lambda row: row.select('td[1]//text()')[0].extract())
        for index, row in enumerate(sub_products):
            name = row.select('td[1]//text()')[0].extract()
            price = ''.join(row.select('td[2]//text()').extract()).strip()
            in_stock = row.select('td[3]/input')

            # Each row gets its own copy of the shared item.  Handing the
            # same instance to every loader made all yielded rows one
            # mutated object and let a stock=0 from one row leak into the
            # following rows.
            row_loader = WindowsCleaningProductLoader(
                item=Product(base_item), selector=hxs)
            row_loader.add_value('url', response.url)
            row_loader.add_value('name', name)
            row_loader.add_value('price', price)
            row_loader.add_value('sku', '')
            row_loader.add_value('identifier', '%s.%s' % (identifier, index))
            if not in_stock:
                row_loader.add_value('stock', 0)
            yield row_loader.load_item()
        return

    # --- Simple / configurable product ----------------------------------
    name = hxs.select('//div[@class="product-name"]/h1/text()')[0].extract()
    loader.add_value('url', response.url)
    loader.add_value('sku', '')
    loader.add_value('identifier', identifier)
    loader.add_value('name', name)

    out_of_stock = hxs.select(
        '//p[contains(@class, "availability") and contains(@class, "out-of-stock")]')
    if out_of_stock:
        loader.add_value('stock', 0)

    price = hxs.select(
        '//div[@class="product-shop"]//p[@class="special-price"]/span[2]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            '//div[@class="product-shop"]//span[@class="regular-price"]/span/text()'
        ).extract()
    price = price if price else '0.00'
    loader.add_value('price', price)

    # TODO stock
    options = re.search(
        r'var spConfig = new Product\.Config\(({.*})\);', response.body)
    if not options:
        yield loader.load_item()
        return

    item = loader.load_item()
    config = json.loads(options.group(1))
    base_price = float(config['basePrice'])
    for attribute in config['attributes'].values():
        for option in attribute['options']:
            opt_item = Product(item)
            opt_item['identifier'] += '.%s.%s' % (attribute['id'],
                                                  option['id'])
            opt_item['name'] += ' %s' % option['label']
            # Go through str() so the Decimal gets the short repr of the
            # float sum rather than its full binary expansion.
            opt_item['price'] = Decimal(
                str(base_price + float(option['price'])))
            yield opt_item
def parse_product(self, response):
    """Follow product links on the page, then scrape the page itself.

    Non-HTML responses are ignored.  Every link whose href contains
    ``products_id`` (excluding review, notify, language and buy_now
    links) is requested with this same callback.  The page is then
    scraped as a product unless its heading is one of two known
    non-product headings.  The identifier is the ``products_id`` value
    from the URL.  When no price is found, ``'0.00'`` is used; ``stock``
    is set to 0 whenever the loader's price output is falsy.
    """
    if not isinstance(response, HtmlResponse):
        return

    base_url = get_base_url(response)
    selector = HtmlXPathSelector(response)

    product_hrefs = selector.select(
        '//a[contains(@href, "products_id") and '
        'not(contains(@href, "review")) and '
        'not(contains(@href, "notify")) and '
        'not(contains(@href, "language")) and '
        'not(contains(@href, "buy_now"))]/@href').extract()
    for href in product_hrefs:
        if 'language=' not in href:
            yield Request(urljoin_rfc(base_url, href),
                          callback=self.parse_product)

    item_loader = ProductLoader(item=Product(), selector=selector)

    heading = selector.select(
        '//td[@class="pageHeading" and not(@align="right")]/text()'
    ).extract()[0]
    # Headings of pages that are not product pages.
    if heading in ['Welcome, Please Sign In', "Let's See What We Have Here"]:
        return

    # Special price first, the right-aligned heading price as fallback.
    price_hits = selector.select(
        '//form//span[@class="productSpecialPrice"]/text()').re('\$(.*)')
    if not price_hits:
        price_hits = selector.select(
            '//td[@class="pageHeading" and (@align="right")]/text()').re(
            '\$(.*)')
    product_price = price_hits[0] if price_hits else '0.00'

    # The SKU is shown in square brackets; strip them.
    sku_texts = selector.select(
        '//td[@class="pageHeading"]//span[@class="smallText"]/text()'
    ).extract()
    if sku_texts:
        product_sku = sku_texts[0].replace(']', '').replace('[', '')
    else:
        product_sku = ''

    product_id = re.findall(r'products_id=(\d+)', response.url)[0]
    breadcrumbs = selector.select(
        '//td[not(@align) and @class="headerNavigation"]//a[@class="headerNavigation"]/text()'
    ).extract()

    item_loader.add_value('name', heading)
    item_loader.add_value('identifier', product_id)
    item_loader.add_value('price', product_price)
    if not item_loader.get_output_value('price'):
        item_loader.add_value('stock', 0)

    brand_alts = selector.select(
        '//td[@class="boxText"]/table//tr/td[@align="center"]/img/@alt'
    ).extract()
    if brand_alts:
        item_loader.add_value('brand', brand_alts[0].strip())

    item_loader.add_value('sku', product_sku)
    item_loader.add_value('url', response.url)

    image_srcs = selector.select(
        '//td[@class="main"]//a/img[not(contains(@src,"reviews"))]/@src'
    ).extract()
    picture = urljoin_rfc(base_url, image_srcs[0]) if image_srcs else None
    if not picture:
        # No image on the page: use the popup image URL for this product.
        picture = ('http://windows101.com/shop/popup_image.php?pID=%s'
                   % product_id)
    item_loader.add_value('image_url', picture)

    # The second-to-last breadcrumb is used as the category.
    if len(breadcrumbs) > 1:
        item_loader.add_value('category', breadcrumbs[-2])

    yield item_loader.load_item()
def parse_product(self, response):
    """Scrape one product page, re-requesting it when key fields are missing.

    Non-HTML responses are ignored.  The identifier is the "Item Code"
    shown on the page; when that is absent it falls back to the
    third-from-last ``/``-separated part of the URL, or to the ``ic``
    query parameter when that part is the site host.  A page with no
    usable identifier is skipped.

    The name and the price are mandatory.  When either is missing the
    resulting ``IndexError`` triggers a retry of the same URL, up to 10
    times, counted in ``response.meta['retry']``.  ``stock`` is set to 0
    whenever the loader's price output is falsy.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    try:
        identifier = hxs.select(
            '//font[@class="content" and contains(text(), "Item Code:")]'
            '/parent::td/following-sibling::td/font/text()').extract()[0]
    except IndexError:
        # Only a missing "Item Code" cell is expected here; a bare
        # ``except:`` would also swallow unrelated errors.
        identifier = response.url.split('/')[-3]
        if 'jracenstein.com' in identifier:
            identifier = url_query_parameter(response.url, 'ic')
            if not identifier:
                # NOTE(review): the 'kc' value is looked up but discarded
                # by the return below -- confirm whether 'kc' pages were
                # meant to be scraped or skipped.
                identifier = url_query_parameter(response.url, 'kc')
                return

    try:
        name = hxs.select(
            '//div[@class="bigbox"]/div[@class="top"]/text()').extract()[0]
        price = hxs.select(
            '//div[@class="priceAmount"]/text()').extract()[0]

        sku = hxs.select(
            '//font[@class="content" and contains(text(), "Model")]/../../td[2]/font/text()'
        ).extract()
        sku = sku[0] if sku else None
        category = hxs.select(
            '//div[@class="pageHeaderCrumbs"]/a/text()').extract()
        brand = hxs.select(
            '//font[@class="content" and contains(text(), "Brand")]/../../td[2]/font//text()[normalize-space()]'
        ).extract()
        image_url = hxs.select(
            '//img[@class="imagePanelLarge"]/@src').extract()

        loader = WindowsCleaningProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        if sku:
            loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        if brand:
            # Only the first word of the brand text is kept.
            loader.add_value('brand', brand[0].strip().split(' ')[0])
        if category:
            # The last breadcrumb link is used as the category.
            loader.add_value('category', category[-1])
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        yield loader.load_item()
    except IndexError:
        # Name or price missing: request the same URL again.  dont_filter
        # keeps the duplicate filter from discarding the repeated URL.
        retry = response.meta.get('retry', 0)
        if retry < 10:
            retry += 1
            self.log('Retrying No. %s => %s' % (retry, response.url))
            meta = response.meta.copy()
            meta['retry'] = retry
            yield Request(response.url,
                          callback=self.parse_product,
                          meta=meta,
                          dont_filter=True)