def parse_frames(self, response):
    """Yield one product per (option column, size row) of a price table.

    The table is located through a header cell reading ``Code`` (plain
    text) or ``CODE`` (inside a ``span``).  The row directly above the
    header names the options; the rows below it carry one size each with
    a sku/price column pair per option.

    :param response: response of the page holding the table(s).
    :returns: generator of loaded product items.
    """
    base_url = get_base_url(response)

    # The two header layouts place the first sku in a different column.
    products = response.xpath('//tr/td[text()="Code"][1]')
    margin = 3
    if not products:
        products = response.xpath('//tr/td[span/text()="CODE"][1]')
        margin = 2
    if not products:
        self.log('No products found on %s' % response.url)
        return

    thumbs = response.xpath(
        '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
    ).extract()
    image = urljoin(base_url, thumbs[0]) if thumbs else None

    # File name of the page without "_..." suffix and extension; it is the
    # same for every row, so compute it once.
    page_key = response.url.split('/')[-1].split('_')[0].split('.')[0]

    identifiers = set()
    for product in products:
        option_cells = product.xpath(
            './../preceding-sibling::tr[1]/td[position()>1]')
        for idx, option in enumerate(option_cells):
            name = option.xpath('.//text()').extract()
            sku_column = idx * 2 + margin
            for size in product.xpath('./../following-sibling::tr'):
                # The next header row marks the start of another table.
                if size.xpath('td[(text()="Code") or (span/text()="CODE")]'):
                    break
                if not size.xpath('./td[1][contains(.//text(), " x")]'):
                    continue

                loader = ProductLoader(item=Product(), selector=size)
                loader.add_value('name', name)
                size_name = size.xpath('td[1]/text()').extract()
                loader.add_value('name', size_name)
                loader.add_xpath('sku', 'td[%d]/text()' % sku_column)
                loader.add_xpath('price', 'td[%d]/text()' % (sku_column + 1))

                sku = loader.get_output_value('sku')
                if not sku:
                    continue

                # The size text may live in a nested element only, in which
                # case td[1]/text() is empty; do not crash on size_name[0].
                size_text = size_name[0] if size_name else ''
                identifier = '%s-%s-%s' % (
                    sku, '-'.join(re.findall(r'\d+', size_text)), page_key)
                # Disambiguate clashes within this page and across pages.
                while identifier in identifiers or identifier in self.ids_seen:
                    identifier += '-d'
                identifiers.add(identifier)
                self.ids_seen.append(identifier)

                loader.add_value('identifier', identifier)
                loader.add_value('url', response.url)
                if image:
                    loader.add_value('image_url', image)
                yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page and yield it unless it was already seen.

    The identifier is the extracted sku value(s); when the page exposes no
    sku it falls back to the last URL segment without its extension.
    Pages without a product name are logged and skipped.  A product whose
    price cannot be read is emitted with ``stock`` set to 0.

    :param response: product page response.
    :returns: generator yielding at most one loaded item.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    sku = hxs.select('//p/span[@itemprop="sku"]/text()').extract()
    # NOTE: identifier is the extracted list when a sku exists, a plain
    # string otherwise; self.seen_ids stores whichever form was used.
    identifier = sku if sku else response.url.split('/')[-1].split('.')[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)

    if identifier in self.seen_ids:
        return

    name = hxs.select(
        '//h1[@class="first"]/span[@itemprop="name"]/text()').extract()
    if not name:
        # Previously an IndexError; skip without marking the id as seen.
        self.log('No product name found on %s' % response.url)
        return
    self.seen_ids.append(identifier)

    name = name[0].strip()
    try:
        loader.add_value('name', name)
    except Exception:
        # Fallback kept from the original code for names the loader cannot
        # process as-is; narrowed from a bare ``except``.
        loader.add_value('name', name.decode('utf-8', 'replace'))

    category = hxs.select('//ol[@class="breadcrumb"]//a/text()').extract()
    # Drop the first crumb, then keep at most the last three levels.
    loader.add_value('category', ' > '.join(category[1:][-3:]))

    image_url = hxs.select('//a[@class="lightbox"]/img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('url', response.url)

    # Take the first price node if any; a missing price must reach the
    # out-of-stock branch below instead of raising IndexError.
    price = hxs.select('//span[@class="price-big orange"]/text()').extract()
    loader.add_value('price', price[:1])
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page whose URL contains ``p-<id>.html``.

    The price is read from the ``jsProductPrice`` JavaScript assignment;
    a shipping cost of 35.00 is added for prices below 400.0.  Pages whose
    URL does not carry an identifier are logged and skipped.

    :param response: product page response.
    :returns: generator yielding at most one loaded item.
    """
    id_match = re.search(r'p-(.*)\.html', response.url)
    if id_match is None:
        # Previously an AttributeError on ``None.group``.
        self.log('No product identifier in URL %s' % response.url)
        return
    identifier = id_match.group(1)

    sel = Selector(response)
    price = sel.re(re.compile(r"jsProductPrice = '(.*)';"))
    # The first breadcrumb link is skipped.
    categories = sel.xpath(
        '//div[@id="navBreadCrumb"]/a/text()')[1:].extract()

    brand = sel.xpath('//span[@class="product_manufacturer"]/text()').re(
        'Manufactured by: (.*)')
    brand = brand[0].strip() if brand else ''
    sku = sel.xpath('//span[@class="product_model"]/text()').re('Ref: (.*)')
    sku = sku[0].strip() if sku else ''

    image_url = response.xpath(
        '//div[@id="replace_image_zoom"]//img[@class="zoom_pic"]/@src'
    ).extract()
    if image_url:
        image_url = response.urljoin(image_url[0])
    name = sel.xpath('//h1[@class="productGeneral"]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    loader.add_value('price', price)

    price = loader.get_output_value('price')
    if price and Decimal(price) < Decimal('400.0'):
        loader.add_value('shipping_cost', Decimal('35.00'))

    loader.add_value('url', response.url)
    if image_url:
        loader.add_value('image_url', image_url)
    for category in categories:
        loader.add_value('category', category)
    loader.add_value('brand', brand)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page by running regexes over the raw body.

    Nothing is yielded when no ``tier_price_total`` value is found or when
    the captured text is not a valid number.  A shipping cost of 4.98 is
    added for prices below 70.

    :param response: product page response.
    :returns: generator yielding at most one loaded item.
    """
    body = response.body

    prices = re.findall(r'tier_price_total".+?([\d.]+)', body)
    if not prices:
        return
    try:
        price = Decimal(prices[0]).quantize(Decimal('.01'))
    except ArithmeticError:
        # decimal.InvalidOperation derives from ArithmeticError; the
        # character class also matches strings such as "." or "1.2.3".
        self.log('Invalid price %r on %s' % (prices[0], response.url))
        return

    # The same capture feeds both identifier and sku; run the regex once.
    product_ids = re.findall(r'product_id.+?(\d+)', body)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_ids)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('name', re.findall('"name":"(.+?)"', body))
    loader.add_value('price', price)
    loader.add_value('sku', product_ids)

    # Prefer the "Lenstype" label, fall back to "Producttype".
    category = re.findall(
        '<span class="technical_label">Lenstype:</span><a href.+?>(.+?)</a',
        body
    ) or re.findall(
        '<span class="technical_label">Producttype:</span><a href.+?>(.+?)</a',
        body)
    loader.add_value('category', category)
    loader.add_value(
        'image_url',
        re.findall(r'<img src="(\S+media/catalog/product\S+)"', body))
    loader.add_value(
        'brand',
        re.findall(
            '<span class="technical_label">Merk:</span><a href.+?>(.+?)</a',
            body))

    if loader.get_output_value('price') < 70:
        loader.add_value('shipping_cost', '4.98')
    yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page and guess a sku from its name and keywords.

    The sku starts as the ``product_id`` input value and is replaced by the
    first heuristic that applies:

    1. the shortest meta keyword equals the last word of the name;
    2. the shortest meta keyword is all upper case;
    3. the last word of the name is all upper case;
    4. the longest word of the name that contains a digit.

    A candidate containing ``(`` is rejected in favour of the identifier.
    Pages without a product name are logged and skipped.

    :param response: product page response.
    :returns: generator yielding at most one loaded item.
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    category = response.xpath(
        '//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
    if not response.xpath(
            '//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
        loader.add_value('stock', 0)

    name = loader.get_output_value('name')
    if not name:
        # Previously a TypeError from re.search on None.
        self.log('No product name found on %s' % response.url)
        return

    last_word = re.search(r'\S+$', name)
    name_end = last_word.group(0).strip(' ()') if last_word else ''

    # A missing keywords meta used to raise on ``None.split``.
    raw_keywords = response.xpath(
        '//meta[@name="keywords"]/@content').extract_first() or ''
    # Strip first, then filter: blank entries such as " " must not win the
    # "shortest keyword" contest and end up as an empty sku.
    keywords = [word.strip() for word in raw_keywords.split(',')]
    keywords = [word for word in keywords if word]
    shortest_keyword = min(keywords, key=len) if keywords else 'none'
    from_name = re.findall(r'\S*\d+\S*', name)

    sku = identifier
    if shortest_keyword.lower() == name_end.lower():
        sku = name_end
    elif shortest_keyword.upper() == shortest_keyword:
        sku = shortest_keyword
    elif name_end and name_end.upper() == name_end:
        # An empty name_end (e.g. the name ends in "()") is not a sku.
        sku = name_end
    elif from_name:
        sku = max(from_name, key=len)
    if sku and '(' in sku:
        sku = identifier

    loader.replace_value('sku', sku)
    yield loader.load_item()
def parse_doors(self, response):
    """Yield one product per ``offers`` block of a listing page.

    Identifiers come from the ``ecomm_prodid`` array found in an inline
    script and are matched to the offer blocks by position.  A shipping
    cost of 36 is added for prices below 750.

    :param response: listing page response.
    :returns: generator of loaded product items.
    """
    # Local import: only this callback needs it.
    import ast

    url = response.xpath('//link[@rel="canonical"]/@href').extract()
    category = response.xpath(
        '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()

    raw_ids = response.xpath('//script/text()').re(r'ecomm_prodid.*(\[.+\])')
    if not raw_ids:
        self.log('No product ids found on %s' % response.url)
        return
    # SECURITY: this text comes from a remote page.  It used to be passed
    # to eval(); literal_eval only accepts Python literals, which covers
    # the list-of-ids case without executing arbitrary expressions.
    try:
        ids = ast.literal_eval(raw_ids[0])
    except (ValueError, SyntaxError):
        self.log('Unparseable product ids on %s' % response.url)
        return

    for i, product in enumerate(response.xpath('//div[@itemprop="offers"]')):
        if i >= len(ids):
            # More offer blocks than ids: positions can no longer be paired.
            self.log('More products than ids on %s' % response.url)
            break

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h3[@itemprop="name"]/a/text()[1]')
        loader.add_value('identifier', ids[i])
        loader.add_value('sku', ids[i])
        loader.add_xpath('price', './/span[@itemprop="price"]/text()')

        # Prefer the product's own link, else the page's canonical URL.
        local_url = product.xpath('.//h3[@itemprop="name"]/a/@href').extract()
        if local_url:
            local_url = response.urljoin(local_url[0])
        else:
            local_url = url
        loader.add_value('url', local_url)

        image_url = product.xpath('.//a/img/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('category', category)

        if not product.xpath(
                'link[@itemprop="availability"][@href="http://schema.org/InStock"]'
        ):
            loader.add_value('stock', 0)
        if loader.get_output_value('price') < 750:
            loader.add_value('shipping_cost', 36)
        yield loader.load_item()
def parse_product(self, response):
    """Follow facet links and scrape the product described by meta tags.

    Every ``.facet-nav`` link is requested with this same callback, so the
    method also runs on pages that may not describe a product; an item is
    only yielded when an identifier was found.  A shipping cost of 3.99 is
    added for prices below 50.

    :param response: page response.
    :returns: generator of follow-up requests and at most one item.
    """
    for url in response.css('.facet-nav a::attr(href)').extract():
        yield Request(response.urljoin(url), self.parse_product)

    xpath = '//meta[@property="%s"]/@content'
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', xpath % 'product:retailer_part_no')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', xpath % 'og:title')
    loader.add_xpath('price', xpath % 'product:price:amount')
    loader.add_xpath('sku', xpath % 'product:retailer_part_no')

    category = response.xpath(
        '//ul[@itemprop="breadcrumb"]//a/text()').extract()
    # list.remove()/pop() raised on pages lacking these crumbs, aborting
    # the callback before the identifier check below could drop the page.
    for crumb in ('Home', 'Products'):
        if crumb in category:
            category.remove(crumb)
    category = category[:-1]  # drop the last crumb
    loader.add_value('category', category[-3:])

    loader.add_xpath('image_url', xpath % 'og:image')
    loader.add_xpath('brand', xpath % 'product:brand')
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', '3.99')

    item = loader.load_item()
    if item.get('identifier'):
        yield item
def parse_category(self, response):
    """Yield the products listed on a category page, then fan out by filter.

    Each ``div[typeof="Product"]`` becomes one item; 9.95 shipping is added
    for prices below 50 and stock is zeroed when an ``outOfStock`` button
    is present.  Filter sub-pages are requested unless the URL has a ``pn``
    query parameter or matches ``/cat_.+/.``.

    :param response: category page response.
    :returns: generator of items followed by filter requests.
    """
    last_crumb = response.css('li.last::text').extract()

    for node in response.xpath('//div[@typeof="Product"]'):
        href = node.xpath('.//*[@property="url"]/@href').extract_first()

        item_loader = ProductLoader(Product(), selector=node)
        item_loader.add_xpath('identifier', './/*[@property="url"]/@sku')
        item_loader.add_value('url', response.urljoin(href))
        item_loader.add_xpath('name', './/*[@property="url"]/text()')
        item_loader.add_xpath('price', './/*[@property="price"]/text()')
        item_loader.add_xpath('sku', './/*[@property="url"]/@sku')
        # Page-level breadcrumb first, then the text of the last crumb.
        item_loader.add_xpath('category',
                              '//li[@typeof="v:Breadcrumb"]/a/text()')
        item_loader.add_value('category', last_crumb)
        item_loader.add_xpath('image_url', './/*[@property="image"]/@content')

        if item_loader.get_output_value('price') < 50:
            item_loader.add_value('shipping_cost', '9.95')
        sold_out = node.xpath('.//button[starts-with(@id, "outOfStock")]')
        if sold_out:
            item_loader.add_value('stock', 0)

        yield item_loader.load_item()

    # Only unfiltered first pages expand into their filter sub-pages.
    is_paginated = url_query_parameter(response.url, 'pn')
    if is_paginated or re.search('/cat_.+/.', response.url):
        return

    # Filter inputs whose id is exactly five non-space characters.
    filter_ids = response.css('ul.filters input::attr(id)').re('^\S{5}$')
    for filter_id in filter_ids:
        yield Request(response.url + '/' + filter_id, self.parse_category)
def parse_product(self, response):
    """Scrape one product page identified by its "Product Code".

    The category is taken from ``response.meta['product']`` when present;
    otherwise it is resolved through ``self.get_category`` with the
    website category from meta as fallback.  A shipping cost of 4.95 is
    added for prices below 50, and stock is '1' exactly when a price was
    found.

    :param response: product page response; ``meta`` must carry either
        ``product`` or ``website_categories`` and ``category``.
    :returns: generator yielding at most one loaded item.
    """
    loader = ProductLoader(item=Product(), response=response)
    identifier = response.xpath('//div[@id="habtat-sku"]/text()').re(
        r'Product Code: (\d+)')
    if not identifier:
        return

    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@class="product-name"]/h1//text()')
    loader.add_xpath('name', '//div/text()', re='Colour.*:(.+)')
    product_name = loader.get_output_value('name')

    if 'product' in response.meta:
        category = response.meta['product']['category'].split(' > ')
    else:
        website_category = response.meta['website_categories']
        categories = response.meta['category']
        kwrds = response.meta.get('kwrds', '')
        category = self.get_category(product_name, categories, kwrds)
        if not category:
            category = website_category.split(' > ')
    loader.add_value('category', category)

    loader.add_xpath(
        'price',
        '//div[@class="price-info"]//span[contains(@id, "product-price")]//span/text()'
    )
    if not loader.get_output_value('price'):
        # Fall back to the special-price markup.
        loader.add_xpath(
            'price', '//p[@class="special-price"]/span[@class="price"]/text()')

    price = loader.get_output_value('price')
    # Compare Decimal with Decimal: mixing in a float literal gives an
    # arbitrary result on interpreters older than Python 2.7.
    if price and Decimal(price) < Decimal('50'):
        loader.add_value('shipping_cost', '4.95')

    img = response.xpath(
        '//div[@class="product-img-box"]/div/a/@href').extract()
    if img:
        loader.add_value('image_url', response.urljoin(img[0]))

    loader.add_value('stock', '1' if price else '0')
    yield loader.load_item()
def parse_product(self, response):
    """Scrape the displayed product and request every option combination.

    The identifier is ``<item id>-<product id>`` read from two hidden
    inputs; pages missing either are logged and skipped.  A shipping cost
    of 2.99 is added for prices below 50.  The item is yielded once per
    identifier (tracked in ``self.identifiers``), and one AJAX request per
    combination of ``<select>`` option values is sent to
    ``self.parse_option``.

    :param response: product page response; ``meta['category']`` required.
    :returns: generator of at most one item plus the option requests.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('category', response.meta['category'])
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    option_name = response.css('.label-select-container').xpath(
        './/option[@selected]/text()').extract()
    loader.add_value('name', option_name)

    item_identifier = response.xpath(
        '//input[@id="item_details_item_id"]/@value').extract_first()
    product_identifier = response.xpath(
        '//input[@id="item_details_product_id"]/@value').extract_first()
    if not item_identifier or not product_identifier:
        # The warning used to fall through into ``None + '-'`` (TypeError).
        self.logger.warning('No identifier on %s' % response.url)
        return
    identifier = item_identifier + '-' + product_identifier

    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

    sku = []
    sku.append(
        response.css('.order-code').xpath('text()').extract_first().strip())
    sku.extend(response.css('.order-code span::text').extract())
    loader.add_value('sku', ' '.join(sku))

    loader.add_xpath('image_url', '//img[@id="imageMain"]/@src')
    loader.add_css('brand', '.sku_kc_brand_id_ ::text')
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', '2.99')

    # A missing availability meta is treated as an unknown status rather
    # than crashing on ``None.replace``.
    stock = response.xpath(
        '//meta[@itemprop="availability"]/@content').extract_first() or ''
    stock = stock.replace(' ', '').lower()
    if stock not in self.instock:
        loader.add_value('stock', 0)
        if stock not in self.outofstock:
            self.logger.warning('Undefined stock status for %s' % response.url)

    item = loader.load_item()
    if item['identifier'] not in self.identifiers:
        self.identifiers.add(item['identifier'])
        yield item

    # One attribute name and one list of values per <select>.
    attributes = []
    options = []
    for attribute in response.css('.label-select-container select'):
        attribute_name = attribute.xpath('@id').extract_first()
        attribute_name = attribute_name.replace('_%s' % item_identifier, '')
        attributes.append(attribute_name)
        options.append(attribute.xpath('option/@value').extract())

    base_url = (
        'http://www.kiddicare.com/ajax.get_exact_product.php?instart_disable_injection=true&item_id=%s'
        % item_identifier)
    for variant in itertools.product(*options):
        url = base_url
        for n, option in enumerate(variant):
            url += '&attributes[%s]=%s' % (attributes[n], option)
        # A literal "+" would otherwise be decoded as a space.
        url = url.replace('+', '%2B')
        meta = response.meta
        meta['sku'] = sku
        meta['attributes'] = attributes
        yield Request(url, self.parse_option, meta=meta)
def parse_price_from_cart(self, response):
    """Complete a product with the price shown on the cart page.

    The partially filled product travels in ``response.meta['product']``.
    Its price is replaced by the first ``prodetail-price`` value and the
    shipping cost is set to 9.9 for prices below 200, otherwise 0.

    :param response: cart page response.
    :returns: generator yielding the completed item.
    """
    pending_product = response.meta['product']
    cart_loader = ProductLoader(item=pending_product, response=response)
    cart_loader.replace_xpath(
        'price',
        '//td[@class="right"]/div[@class="prodetail-price"][1]/text()')

    if cart_loader.get_output_value('price') < 200:
        delivery_fee = 9.9
    else:
        delivery_fee = 0
    cart_loader.replace_value('shipping_cost', delivery_fee)

    yield cart_loader.load_item()
def parse_node(self, response, node):
    """Convert one feed entry into a product item.

    Entries whose id is not a key of ``self.id_code_map`` are ignored.
    The name is the parent title (or the title) followed by material,
    colour and size when present.  When the description states a
    "Pack Size m", the price is multiplied by it and the item is sent on
    to ``self.parse_pack_price`` through a request to its URL; otherwise
    the item is yielded directly.

    :param response: feed response.
    :param node: selector for a single entry of the feed.
    :returns: generator yielding one item or one request.
    """
    # extract_first() yields None for a missing id (no IndexError); None
    # is then rejected by the map lookup below.
    identifier = node.xpath('./*[local-name()="id"]/text()').extract_first()
    if identifier not in self.id_code_map:
        return
    product_code = self.id_code_map[identifier]

    loader = ProductLoader(item=Product(), selector=node)
    size = node.xpath('./*[local-name()="size"]/text()').extract()
    color = node.xpath('./*[local-name()="color"]/text()').extract()
    material = node.xpath('./*[local-name()="material"]/text()').extract()

    name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
    if not name:
        name = node.xpath('./title/text()').extract()
    if not name:
        self.log('ERROR: no title for feed entry %r' % identifier)
        return
    name = name[0]
    for extra in (material, color, size):
        if extra:
            name += u' {}'.format(extra[0])

    price = node.xpath('./*[local-name()="price"]/text()').extract_first()
    pack_size = node.xpath('./description/text()').re(
        r'Pack Size m: *([\d.]+)')
    if pack_size:
        price = extract_price(price) * extract_price(pack_size[0])

    loader.add_value('name', name)
    loader.add_xpath('url', './link/text()')
    loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
    loader.add_value('identifier', identifier)
    loader.add_value('price', price)
    loader.add_xpath(
        'shipping_cost',
        './*[local-name()="shipping"]/*[local-name()="price"]/text()')
    loader.add_xpath('brand', './*[local-name()="brand"]/text()')
    loader.add_xpath('category',
                     './*[local-name()="google_product_category"]/text()')
    loader.add_xpath('sku', './*[local-name()="mpn"]/text()')

    stock = node.xpath('./*[local-name()="availability"]/text()').extract()
    if stock and stock[0] == 'out of stock':
        loader.add_value('stock', 0)

    item = loader.load_item()
    if product_code in self.cost_prices:
        try:
            cost_price = Decimal(self.cost_prices[product_code])
        except (ArithmeticError, TypeError, ValueError):
            # decimal.InvalidOperation derives from ArithmeticError.
            self.log('ERROR: unable to set cost price for item %r' % item)
        else:
            item['metadata'] = {'cost_price': str(cost_price)}

    if pack_size:
        yield Request(loader.get_output_value('url'),
                      self.parse_pack_price,
                      meta={'item': item})
    else:
        yield item
def parse_product(self, response):
    """Scrape a product from its ``.nosto_product`` block, with options.

    The item price is the list price.  When the sales price differs it is
    stored as ``metadata['SalesPrice']``.  If the page embeds a
    ``Product.Config`` JSON with exactly one attribute, one item is
    yielded per option, offset by the option's ``oldPrice`` (list) and
    ``price`` (sales) amounts.

    :param response: product page response.
    :returns: generator of loaded product items.
    """
    loader = ProductLoader(item=Product(), response=response)
    css = '.nosto_product .%s ::text'
    loader.add_css('identifier', css % 'product_id')
    loader.add_css('sku', css % 'product_id')
    for field in ('url', 'name', 'image_url', 'brand'):
        loader.add_css(field, css % field)

    list_price = response.css(css % 'list_price').extract_first()
    sales_price = response.css(css % 'price').extract_first()
    loader.add_value('price', list_price)

    # Missing nodes give None; treat them as "not in stock" / "no category"
    # instead of raising on ``in None`` / ``None.split``.
    availability = response.css(css % 'availability').extract_first() or ''
    if 'InStock' not in availability:
        loader.add_value('stock', 0)
    category = response.css(css % 'category').extract_first()
    if category:
        loader.add_value('category', category.split('/')[-1])

    # Without a sales price there is nothing to record as SalesPrice.
    sales_price_value = Decimal(sales_price) if sales_price else None

    options_data = response.xpath('//script/text()').re(
        'Product.Config.({.+})')
    if not options_data:
        item = loader.load_item()
        if sales_price_value is not None and sales_price != list_price:
            item['metadata'] = {'SalesPrice': sales_price_value}
        yield item
        return

    attributes = json.loads(options_data[0])['attributes']
    if len(attributes) > 1:
        self.log('More than one options attributes found on %s' %
                 response.url)
        return
    if not attributes:
        self.log('No options attributes found on %s' % response.url)
        return

    price = loader.get_output_value('price')
    name = loader.get_output_value('name')
    # list(...) keeps this working where dict.values() is a view.
    for option in list(attributes.values())[0]['options']:
        option_id = option['products'][0]
        option_price = price + Decimal(option['oldPrice'])
        loader.replace_value('price', option_price)
        loader.replace_value('name', name + ' ' + option['label'])
        loader.replace_value('identifier', option_id)
        loader.replace_value('sku', option_id)
        loader.replace_xpath(
            'image_url',
            '//li[@id="simple-product-image-%s"]/a/@href' % option_id)

        # Copy the loaded item: if the loader hands back the same object
        # on every load_item() call (as Scrapy's ItemLoader does), metadata
        # set for one option would otherwise carry over to the next.
        item = Product(loader.load_item())
        if sales_price_value is not None:
            new_price = sales_price_value + Decimal(option['price'])
            if option_price != new_price:
                item['metadata'] = {'SalesPrice': new_price}
        yield item
def parse_product(self, response):
    """Scrape a product and its variants from structured data and inline JS.

    The base product is yielded first.  If a body line contains
    ``variants:`` followed by an object ending in ``};``, that object is
    decoded and one extra item per variant is yielded with its own name
    suffix, sku, identifier and price.  A shipping cost of 3.5 is added
    for prices below 45.

    :param response: product page response.
    :returns: generator of product items.
    """
    try:
        pdata = SpiderSchema(response).get_product()
    except Exception:
        self.logger.error('No structured product data on %s' % response.url)
        return

    options = None
    for line in response.body.split('\n'):
        if 'variants:' in line:
            # The capture group is optional, so it is None when the line
            # has no "};" terminator; that used to raise on ``None[:-2]``.
            raw = re.search(r'variants:(.*};)?', line).groups()[0]
            if raw is not None:
                options = demjson.decode(raw[:-2].strip())
            break

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_css('sku', 'span.pd_productVariant::text')
    product_loader.add_xpath('identifier', '//input[@name="productId"]/@value')
    product_loader.add_value('url', response.url)
    try:
        product_loader.add_value('name', pdata['name'])
    except KeyError:
        return

    # First and last breadcrumb links are dropped.
    category = response.xpath(
        '//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_loader.add_value('category', category)

    img = response.xpath('//meta[@property="og:image"]/@content').extract()
    if img:
        product_loader.add_value('image_url', response.urljoin(img.pop()))

    price = response.xpath('//p[@class="productOfferPrice"]/text()').extract()
    if not price:
        # Previously an IndexError.
        self.logger.error('No price on %s' % response.url)
        return
    product_loader.add_value('price', price[0])
    if product_loader.get_output_value('price') < 45:
        product_loader.add_value('shipping_cost', '3.5')

    brand = response.xpath('//*[@id="brandHeader"]/a/@href').extract()
    if brand:
        # Strip the "/en/" prefix and the last character of the link.
        brand = brand[0].replace('/en/', '')[:-1]
        if '/' not in brand:
            product_loader.add_value('brand', brand)

    stock = response.xpath(
        '//link[@itemprop="availability"]/@href').extract_first()
    if stock != 'http://schema.org/InStock':
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()
    yield product

    if options:
        for key, val in options.items():
            option_product = Product(product)
            option_product['name'] = (
                product['name'] + ' ' + key.replace('_', ' '))
            option_product['sku'] = val['productCode']
            option_product['identifier'] = val['variantId']
            option_product['price'] = extract_price(val['nowPrice'])
            yield option_product
def parse_product(self, response):
    """Scrape a product page, expanding configurable options when present.

    Without a ``Product.Config(...)`` script the loaded product is yielded
    as is.  With one, an item is yielded for every combination of
    attribute options: identifier and name get the option id/label
    appended, and the price is (base price + option prices) * 1.2.

    The shipping cost is ``self.shipping_cost`` when set, overridden to 33
    for the "Doors, Joinery & Windows" category and to 20 for "Flooring".

    :param response: product page response.
    :returns: generator of product items.
    """
    category = response.xpath(
        '//div[@class="breadcrumbs"]//li[position()>1]/a/@title').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_xpath('sku', '//meta[@itemprop="sku"]/@content')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '//div[@itemprop="name"]/h1/text()')
    loader.add_xpath(
        'price', '//meta[@property="og:product:price:amount"]/@content')
    loader.add_xpath(
        'price',
        '//span[@id="product-price-%s"]//span[@class="price"]/text()' %
        loader.get_output_value('identifier'))
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//div[@class="product-img-box"]//img/@src')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    if self.shipping_cost:
        loader.add_value('shipping_cost', self.shipping_cost)
    if not response.xpath('//*[@class="availability in-stock"]'):
        loader.add_value('stock', 0)

    product = loader.load_item()
    if 'Doors, Joinery & Windows' in category:
        product['shipping_cost'] = Decimal('33')
    elif 'Flooring' in category:
        product['shipping_cost'] = Decimal('20')

    config = response.xpath('//script/text()').re(r'Product.Config\((.+)\);')
    if not config:
        yield product
        return

    data = json.loads(config[0])
    # Convert through str(): if the JSON holds a bare number, json.loads
    # returns a float, and Decimal(float) is either inexact or rejected.
    baseprice = Decimal(str(data['basePrice']))
    attributes = data['attributes']
    options = [attributes[attribute_id]['options']
               for attribute_id in attributes]

    for variant in itertools.product(*options):
        item = Product(product)
        item['price'] = baseprice
        for option in variant:
            item['identifier'] += '-' + option['id']
            item['name'] += ' ' + option['label'].strip()
            item['price'] += Decimal(str(option['price']))
        # NOTE(review): the 1.2 factor looks like a 20% tax uplift - confirm.
        item['price'] *= Decimal('1.2')
        yield Product(item)
def parse_product(self, response):
    """Scrape one product page; fetch the price from the cart when absent.

    The price is the digit groups of the ``price-strike`` block joined by
    a dot.  When no digits are found, an add-to-cart request is issued and
    the item is completed by ``self.parse_price_from_cart``; otherwise the
    item is yielded directly.  Shipping is 9.9 for prices below 200, else 0.

    :param response: product page response.
    :returns: generator yielding one item or one request.
    """
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_css('name', '.pro-des::text')

    digit_groups = response.xpath(
        '//div[@class="price-strike"]/div/span//text()').re('\d+')
    price = '.'.join(digit_groups)
    product_loader.add_value('price', price)

    product_loader.add_xpath('sku', '//div[@class="short-desc"]/span//text()')

    # identifier = "<product id>-<price id>"
    productid = response.xpath(
        '//input[@id="selectedProductIdd"]/@value').extract()[0]
    priceid = response.xpath('//input[@id="priceId"]/@value').extract()[0]
    product_loader.add_value('identifier', '-'.join((productid, priceid)))

    product_loader.add_xpath(
        'category',
        "//div[@class='bread']//li[position() > 1]//text()[not(contains(., '>'))]"
    )
    product_loader.add_xpath('image_url',
                             "//meta[@property='og:image']/@content")
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('brand', '//div[@class="added-item"]/h2/text()')

    if product_loader.get_output_value('price') < 200:
        shipping_cost = 9.9
    else:
        shipping_cost = 0
    product_loader.add_value('shipping_cost', shipping_cost)

    product = product_loader.load_item()
    if price:
        yield product
        return

    # No price on the page: add the product to a cart and read it there.
    storeid = response.xpath('//input[@id="storeId"]/@value').extract()[0]
    cart_url = 'http://www.courts.com.sg/home/addtocart.html?isAdd=true&newProduct=true&productId=%s&selectedCurrency=SGD&quantity=1&cartId=na&addQuantity=true&newQuantity=1&shippingOption=&shippingCity=&deliveryOption=&shippingDate=&cityId=&title=&inventorysensible=yes&priceId=%s&storeId=%s'
    request_meta = {'product': Product(product), 'dont_merge_cookies': True}
    yield Request(cart_url % (productid, priceid, storeid),
                  callback=self.parse_price_from_cart,
                  meta=request_meta)
def parse_product(self, response):
    """Scrape a product variant and request its sibling variants.

    For every variant option on the page the ``variant-form`` is
    re-submitted with that option's value, using this same callback.  When
    the page's ``skuIdVal`` differs from the ``skuId`` query parameter,
    the page is re-requested under the matching URL instead of being
    scraped.  An item is only yielded when a price was found; shipping
    cost is fixed at 3.

    :param response: product page response.
    :returns: generator of requests and at most one item.
    """
    if response.url.endswith('page-not-found.page'):
        return

    formdata = {}
    for inp in response.xpath('//form[@id="variant-form"]//input'):
        formdata[inp.xpath('@name').extract_first()] = inp.xpath(
            '@value').extract_first()
    if not formdata:
        self.logger.warning('No data on %s' % response.url)
        return
    # Inputs without a name attribute end up under the None key.  pop()
    # with a default instead of ``del``: the key is absent when every
    # input is named, which used to raise KeyError.
    formdata.pop(None, None)

    for option in response.css('.vContainer .variantDataElement'):
        formdata[option.xpath('@name').extract_first()] = option.xpath(
            '@data-variant-value').extract_first()
        yield FormRequest.from_response(
            response,
            formxpath='//form[@id="variant-form"]',
            formdata=formdata,
            callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
    if sku != url_query_parameter(response.url, 'skuId'):
        # The URL does not identify the displayed variant; fetch it under
        # a URL that does.
        url = add_or_replace_parameter(
            url_query_cleaner(response.url), 'skuId', sku)
        yield Request(url, self.parse_product)
        return

    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
    loader.add_css('price', '.current-price ::text')
    loader.add_value('sku', sku)

    category = response.xpath(
        '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
    ).extract()
    # Up to three levels preceding the last crumb.
    loader.add_value('category', category[-4:-1])

    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath(
        'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
    loader.add_value('shipping_cost', 3)

    # No add-to-basket element means the product is out of stock.
    if not response.css('.add-to-basket'):
        loader.add_value('stock', 0)
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and one item per entry of its size selector.

    Without a ``js-simple-selector`` the base item is yielded, provided it
    has an identifier.  Otherwise every ``<option>`` produces a copy of the
    base item with its own identifier, name suffix, price, image and stock
    flag.

    :param response: product page response.
    :returns: generator of product items.
    """
    categories = response.xpath(
        '//li[@class="blockBreadcrumb__item"]/a/text()').extract()[-3:]

    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_xpath('identifier', '//input[@name="simpleSku"]/@value')
    base_loader.add_xpath('sku', '//input[@id="configSku"]/@value')
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//h1[contains(@class, "__heading")]/text()')
    base_loader.add_xpath('name', '//input[@name="simpleSku"]/../span/text()')
    base_loader.add_xpath('image_url', '//div[@class="layoutImage"]//img/@src')
    base_loader.add_xpath('price', '//input[@id="price"]/@value')
    base_loader.add_xpath('brand', '//input[@id="brand"]/@value')
    base_loader.add_value('category', categories)
    base_loader.add_xpath('stock', '//@data-instock')
    item = base_loader.load_item()

    choices = response.xpath('//select[@id="js-simple-selector"]/option')
    if choices:
        # Per-option price and image live in blocks keyed by the simple sku.
        price_xpath = '//div[@data-simple-sku="%s"]//span[contains(@class, "actualPrice")]/text()'
        image_xpath = '//div[@data-simple-sku="%s"]/a[contains(@class, "link_selected")]/@data-product-image'
        for choice in choices:
            option_loader = ProductLoader(item=Product(item), selector=choice)
            option_loader.replace_xpath('identifier', './@value')
            option_loader.add_xpath('name', './text()')
            simple_sku = option_loader.get_output_value('identifier')
            option_loader.replace_value(
                'price', response.xpath(price_xpath % simple_sku).extract())
            option_loader.replace_value(
                'image_url', response.xpath(image_xpath % simple_sku).extract())
            option_loader.replace_xpath('stock', './@data-instock')
            yield option_loader.load_item()
    elif base_loader.get_output_value('identifier'):
        yield item
def parse_products(self, response):
    """Scrape a product grid page, following manufacturer filters and paging.

    Identifiers come from the ``ecomm_prodid`` array of an inline script
    and are matched to ``.grid`` elements by position; the page is
    abandoned when no such array exists.  Products without a listed price
    are sent to ``self.parse_product`` with the partial item in
    ``meta['product']``.  A shipping cost of 4.99 is added for prices
    below 200.

    :param response: listing page response.
    :returns: generator of requests and product items.
    """
    manufacturer_links = response.css(
        '.leftoption :contains("Filter by Manufacturers")').xpath(
            'following-sibling::*//a/@href').extract()
    for url in manufacturer_links:
        yield Request(response.urljoin(url), callback=self.parse_products)

    # Work on a copy of the body with the "Estimated <" spacing collapsed.
    text = re.sub('Estimated *<', 'Estimated <', response.body)
    selector = Selector(text=text)
    category = selector.css('.crumword').xpath(
        './/*[@itemprop="title"]/text()').extract()

    try:
        identifiers = selector.xpath('//script/text()').re(
            r'ecomm_prodid: *\[(.+)\]')[0].replace("'", '').split(',')
    except IndexError:
        return

    next_page_url = response.xpath(
        '//div[@class="pagination"]/a[@class="next"]/@href').extract()
    if next_page_url:
        yield Request(response.urljoin(next_page_url[0]),
                      callback=self.parse_products)

    for num, product in enumerate(selector.css('.grid')):
        if num >= len(identifiers):
            # More grid cells than ids: positions can no longer be paired
            # (previously an IndexError).
            self.log('More products than identifiers on %s' % response.url)
            break
        identifier = identifiers[num]

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('identifier', identifier)
        url = product.xpath('@href').extract_first()
        loader.add_value('url', response.urljoin(url))
        loader.add_value('name',
                         product.css('.gridname').xpath('text()').extract())

        price = product.css('.gridPriceVat').xpath('text()').extract()
        if not price:
            price = 0
        loader.add_value('price', price)
        loader.add_value('sku', identifier)
        loader.add_value('category', category)
        loader.add_value(
            'image_url',
            product.css('.gridimage').xpath('.//@src').extract())

        if price and loader.get_output_value('price') < 200:
            loader.add_value('shipping_cost', '4.99')

        # A missing stock label counts as out of stock instead of raising
        # on ``None.strip``.
        stock_text = product.css('.pItemStock').xpath(
            'text()').extract_first() or ''
        if 'in stock' not in stock_text.strip().lower():
            loader.add_value('stock', 0)

        item = loader.load_item()
        if price:
            yield item
        else:
            # The price is only available on the product page.
            yield Request(response.urljoin(url),
                          self.parse_product,
                          meta={'product': Product(item)})
def parse_category(self, response):
    """Request every product of a listing page, then the next listing page.

    Products come from the page's structured data; entries without a sku
    are ignored.  Each product is requested with ``self.parse_product``
    and the partial item in ``meta['item']``.  Paging stops when a page
    contributes no product.

    :param response: listing (or AJAX listing) response; ``meta`` may
        carry ``category`` for AJAX pages without breadcrumbs.
    :returns: generator of requests.
    """
    try:
        data = SpiderSchema(response).get_products()
    except Exception:
        # Still best-effort, but no longer silent.
        self.log('No structured product data on %s' % response.url)
        return

    # Page-level value: computed once instead of inside the product loop,
    # which also keeps it defined for the paging request below.
    category = response.css('a.GTM-breadcumb::text').extract(
    )[1:] or response.meta.get('category')

    products = False
    for product in data:
        if not product.get('sku'):
            continue
        products = True

        loader = ProductLoader(Product(), response=response)
        loader.add_value('identifier', product['sku'])
        loader.add_value('url', product['url'][0])
        loader.add_value('name', product['name'])
        loader.add_value('sku', product['sku'])
        loader.add_value('category', category)
        loader.add_value('image_url', product['image'])
        loader.add_value('brand', product['brand'])
        if product['offers']['properties']['availability'] != 'in stock':
            loader.add_value('stock', 0)

        yield Request(loader.get_output_value('url'),
                      self.parse_product,
                      meta={'item': Product(loader.load_item())})

    if not products:
        return

    page = url_query_parameter(response.url, 'page')
    if page:
        url = add_or_replace_parameter(response.url, 'page', int(page) + 1)
    else:
        # First page: switch to the AJAX listing endpoint.
        id_families = response.xpath(
            '//input[@data-key="idFamilies"]/@value').extract_first()
        if id_families:
            url = add_or_replace_parameter(
                'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                'idFamilies[]', id_families)
        elif response.url.endswith('/novedades/'):
            return
        elif response.url.endswith('/'):
            url = response.url + 'ajax?page=0&order=price-desc'
        else:
            return

    yield Request(url, self.parse_category, meta={'category': category})
def parse_treatment(self, response):
    """Yield the products listed in the rows below the "Code" header cell.

    Up to four rows following the header are read; each gives name
    (column 1), sku and identifier (column 2) and price (column 3).  Rows
    without a sku are skipped.  ``self.treatment`` is set to True once the
    rows have been processed.

    :param response: page response containing the table.
    :returns: generator of loaded product items.
    """
    headers = response.xpath('//tr/td[(text()="Code")][1]')
    if not headers:
        # Previously an IndexError.
        self.log('No products found on %s' % response.url)
        return
    header = headers[0]

    for size in header.xpath('./../following-sibling::tr[position()<5]'):
        loader = ProductLoader(item=Product(), selector=size)
        loader.add_value('name', size.xpath('td[1]/text()').extract())
        loader.add_xpath('sku', 'td[2]/text()')
        loader.add_xpath('price', 'td[3]/text()')
        if not loader.get_output_value('sku'):
            continue
        loader.add_xpath('identifier', 'td[2]/text()')
        loader.add_value('url', response.url)
        yield loader.load_item()

    # The original used ``for ... else`` without any ``break``, so this
    # always ran once the loop finished; a plain statement says the same.
    self.treatment = True
def parse_product(self, response):
    """Build a product item from the page's structured product data.

    Nothing is yielded when no structured product is found.  A shipping
    cost of 9.95 is added for prices below 50, and stock is zeroed unless
    the offer's availability is ``http://schema.org/InStock``.

    :param response: product page response.
    :returns: generator yielding at most one loaded item.
    """
    schema_product = SpiderSchema(response).get_product()
    if not schema_product:
        return

    item_loader = ProductLoader(Product(), response=response)
    sku = schema_product['sku']
    item_loader.add_value('identifier', sku)
    item_loader.add_value('url', response.url)
    item_loader.add_value('name', schema_product['name'])

    offer = schema_product['offers']['properties']
    item_loader.add_value('price', offer['price'])
    item_loader.add_value('sku', sku)
    item_loader.add_xpath('category', '//a[@id="breadCrumbDetails"]/text()')
    item_loader.add_value('image_url', schema_product['image'])

    if item_loader.get_output_value('price') < 50:
        item_loader.add_value('shipping_cost', '9.95')

    in_stock = offer['availability'] == 'http://schema.org/InStock'
    if not in_stock:
        item_loader.add_value('stock', 0)

    yield item_loader.load_item()
def parse_products(self, response):
    """Load every product tile of a listing and request its reviews.

    For each ``.productList .product`` element an item is loaded (with a
    ``ToyMonitorMeta`` in ``item['metadata']``) and a request to the
    Bazaarvoice batch API is yielded, carrying the item in
    ``meta['product']`` for ``self.parse_reviews``.

    A tile with missing pieces no longer aborts the rest of the page:
    a missing or empty name is passed to the loader as-is, a missing stock
    element is treated as out of stock, and when no numeric id can be read
    from the product URL the item is yielded directly, without a reviews
    request.
    """
    category = response.xpath(
        '//div[@id="breadcrumb"]//span[@itemprop="name"]/text()').extract()[2:]

    for tile in response.css('.productList .product'):
        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_xpath('identifier', '@id', re='product-(.+)')
        loader.add_xpath('url', './/@href')

        brand = tile.xpath('.//h3/em/text()').extract_first()
        name = tile.xpath('.//h3/span/text()').extract_first()
        # When the name starts with a lower-case letter the brand is loaded
        # in front of it.  ``name`` may be None or empty, hence the guard.
        if name and name[0].islower():
            loader.add_value('name', brand)
        loader.add_value('name', name)

        loader.add_css('price', '.productPrice dd:last-child::text')
        loader.add_xpath('sku', '@id', re='product-(.+)')
        loader.add_value('category', category)
        loader.add_css('image_url', '.productMainImage img::attr(src)')

        image_url = loader.get_output_value('image_url')
        promotion = None
        if image_url and '3for2' in image_url:
            promotion = '3 for 2'

        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', '3.99')

        # ``extract_first()`` returns None when the element is absent; an
        # empty string then falls into the out-of-stock branch below.
        stock = (tile.css('.productStock dd').extract_first() or '').title()
        if 'In Stock' not in stock and 'Low Stock' not in stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        if promotion:
            metadata['promotions'] = promotion
        item['metadata'] = metadata

        # Raw string: same pattern as before, without the invalid ``\d``
        # escape in a normal string literal.
        id_matches = re.findall(r"/(\d+).prd", item['url'])
        if not id_matches:
            # Without a product id no reviews URL can be built.
            yield item
            continue
        prod_id = id_matches[0]

        reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=35w0b6mavcfmefkhv3fccjwcc&apiversion=5.5&displaycode=17045-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + prod_id + "&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_57043"
        yield Request(reviews_url,
                      meta={'product': item, 'offset': 0},
                      callback=self.parse_reviews)
def parse_product(self, response):
    """Yield one item per combination of the product's selectable options.

    The base item is loaded from the page first.  Rows of
    ``table.variabletable`` whose first-cell text is in
    ``self.options_to_extract`` contribute one group of ``<option>``
    elements each ("Please Select" entries excluded).  For every combination
    of one option per group a copy of the base item is yielded, with the
    option values appended to identifier/sku, the option labels added to the
    name, and any "(Add ...)" amount added to the price.

    The old ``if not variants`` check tested an ``itertools.product``
    object, which is always truthy, so that branch never ran.  Worse, a row
    without usable options made the product empty and silently dropped the
    item.  Empty groups are now ignored; with no groups at all
    ``itertools.product()`` yields a single empty combination, so the base
    product is still emitted through the same code path as before.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    category = response.css('div.treemenu a::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', 'div#mainimage_holder img::attr(data-zoom-image)')
    identifier = response.xpath('//input[@name="fproduct_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_css('price', 'li.shelfBnormalprice::text')
    if loader.get_output_value('price') < 100:
        loader.add_value('shipping_cost', 10)
    item = loader.load_item()

    option_groups = []
    for row in response.css('table.variabletable tr'):
        if row.xpath('td[1]/text()').extract_first() not in self.options_to_extract:
            continue
        choices = row.xpath('td/select/option[not(contains(.,"Please Select"))]')
        if choices:
            # An empty group would make the cartesian product empty.
            option_groups.append(choices)

    for variant in itertools.product(*option_groups):
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        price = item['price']
        for option in variant:
            identifier += '-' + option.xpath('@value').extract_first()
            # Labels of the form "<name> (Add <amount>" carry a surcharge.
            name_and_price = option.xpath('text()').extract_first().split('(Add')
            loader.add_value('name', name_and_price[0])
            if len(name_and_price) > 1:
                price += extract_price(name_and_price[1])
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        loader.replace_value('price', price)
        if price >= 100:
            loader.replace_value('shipping_cost', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Load a product from the page and yield it only when a price was found.

    Name, price, image, brand and category are read with fixed XPath
    expressions; the identifier is the last path segment of the response
    URL.  The image source is prefixed with ``http:``.
    """
    price_xpath = ".//span[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" inline price bold productInfo-orgPrice product-info-price-current \")]/text()"
    image_xpath = ".//div[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" productPage_image_default \")]/img[1][not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" photo \")]/@src"
    brand_xpath = ".//dl[not(@id)][not(@class)][not(@style)]/dd[1][not(@id)][not(@class)][not(@style)]/text()"
    category_xpath = ".//nav[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" breadcrumbs module small \")]/div[2][not(@id)][not(@class)][not(@style)]/a[1][not(@id)][not(@class)][not(@style)]//text()"

    hxs = HtmlXPathSelector(response)
    name_parts = hxs.select('//h1[@class="product-info-head"]/div[1]/text()').extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('name', ''.join(name_parts).strip())
    loader.add_xpath('price', price_xpath)

    image_sources = hxs.select(image_xpath).extract()
    if image_sources:
        loader.add_value('image_url', 'http:' + image_sources[0])

    loader.add_xpath('brand', brand_xpath)

    category_parts = hxs.select(category_xpath).extract()
    if category_parts:
        loader.add_value('category', ''.join(category_parts).strip())

    loader.add_value('url', response.url)
    loader.add_value('identifier', response.url.split('/')[-1])

    # Pages without a loaded price produce no item.
    if not loader.get_output_value('price'):
        return
    yield loader.load_item()
def parse_product(self, response):
    """Load one product from a product page and yield it.

    The identifier is the run of digits preceding ``_BQ`` in the URL.  The
    category is the fixed value ``'Bedroom'`` followed by the last
    breadcrumb link text, when the page has breadcrumb links.  A shipping
    cost of 5 is added when the loaded price is below 50.

    Raises:
        AttributeError: if the URL contains no ``<digits>_BQ`` segment
            (unchanged from before; the item cannot be identified).
    """
    loader = ProductLoader(Product(), response=response)
    # Raw string avoids the invalid ``\d`` escape of a normal literal.
    identifier = re.search(r'(\d+)_BQ', response.url).group(1)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_css('name', '.product-summary h1.product-title::text')
    loader.add_css('price', '.product-price::attr(content)')
    loader.add_css('sku', 'dl.product-code dd::text')
    loader.add_value('category', 'Bedroom')

    breadcrumb = response.css('.breadcrumb').xpath('.//li/a/text()').extract()
    if breadcrumb:
        # An empty breadcrumb used to raise IndexError on ``[-1]``.
        loader.add_value('category', breadcrumb[-1])

    image_url = response.css('.main-img img::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath('brand',
                     '//th[text()="Brand"]/following-sibling::td/text()')
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', 5)
    yield loader.load_item()
def parse_product(self, response):
    """Load one product from structured data plus page markup and yield it.

    Nothing is yielded when ``SpiderSchema(response).get_product()`` returns
    a falsy value.  The identifier is the trailing number of the URL after
    ``url_query_cleaner`` has been applied.  The category is the last three
    breadcrumb link texts.  A shipping cost of 5.25 is added when the loaded
    price is below 90.

    Raises:
        AttributeError: if the cleaned URL does not end in ``/<digits>``
            (unchanged from before).
    """
    schema = SpiderSchema(response)
    pdata = schema.get_product()
    if not pdata:
        return

    loader = ProductLoader(Product(), response=response)
    # Raw string avoids the invalid ``\d`` escape of a normal literal.
    identifier = re.search(r'/(\d+)$',
                           url_query_cleaner(response.url)).group(1)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_value('name', pdata['name'])
    loader.add_xpath('price', '//span[@id="product_priceExVAT"]/text()')
    loader.add_value('sku', pdata['productID'])
    category = response.css('p.breadcrumb a::text').extract()[-3:]
    loader.add_value('category', category)
    loader.add_value('image_url', pdata.get('image'))

    # The brand is optional, like the image: a product without a ``brand``
    # entry no longer raises KeyError.
    brand = pdata.get('brand') or {}
    brand_properties = brand.get('properties')
    if brand_properties:
        loader.add_value('brand', brand_properties['name'])

    if loader.get_output_value('price') < 90:
        loader.add_value('shipping_cost', '5.25')
    yield loader.load_item()
def parse_product(self, response):
    """Load a product and store it in ``self.products`` under its sku.

    Nothing is yielded or returned; the loaded item is appended to
    ``self.products[sku]``.  Pages with neither an ``<h1>`` text nor an
    ``<h2 itemprop="name">`` text are ignored.  The brand is the first
    listed brand title that the product name starts with (compared in title
    case), or an empty string when none matches.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    loader.add_xpath('sku', '//script/@data-flix-sku')
    sku_shipping = self.shipping_costs.get(loader.get_output_value('sku'), None)
    if sku_shipping:
        loader.add_value('shipping_cost', extract_price(sku_shipping))
    loader.add_xpath('identifier', '//input[contains(@id, "SKUID")]/@value')

    titles = response.xpath('//h1/text()').extract()
    if not titles:
        titles = response.xpath('//h2[@itemprop="name"]/text()').extract()
    if not titles:
        return
    name = titles[0]
    loader.add_value('name', name)
    loader.add_xpath('price', '//span[@class="TotalPrice"]/text()')

    breadcrumb = response.xpath(
        '//a[@class="CMSBreadCrumbsLink"]/text()').extract() or ''
    loader.add_value('category', breadcrumb)

    brand_titles = hxs.select(
        '//div[@title="Brand"]/following-sibling::div//span/@title').extract()
    brand = next(
        (candidate for candidate in brand_titles
         if name.title().startswith(candidate.title())),
        '')
    loader.add_value('brand', brand)

    # Added after any sku-specific shipping cost loaded above.
    loader.add_value('shipping_cost', 19.99)

    stock_texts = hxs.select('//span[@class="stock available"]/text()').extract()
    if 'In stock' not in stock_texts:
        loader.add_value('stock', 0)

    item = loader.load_item()
    self.products[item['sku']].append(item)
def parse_item(self, response):
    """Yield one item per ``buy_box internals`` block of a product page.

    URL, image and category are shared by all items of the page; name,
    identifier, sku, price and availability come from each block.  Stock is
    set to 0 unless the block links to ``http://schema.org/InStock``, and a
    shipping cost of 36 is added when the loaded price is below 750.

    A page without a zoom image link used to raise IndexError and lose all
    of its products; the image is now optional.  When the page has no
    canonical link, the response URL is used instead.
    """
    url = (response.xpath('//link[@rel="canonical"]/@href').extract()
           or response.url)

    image_links = response.xpath('//a[@id="zoom1"]/@href').extract()
    image_url = response.urljoin(image_links[0]) if image_links else None

    category = response.xpath(
        '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()

    for product in response.xpath('//div[@class="buy_box internals"]'):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', 'label[@itemprop="name"]/text()[1]')
        loader.add_xpath('identifier', 'input[@name="product[]"]/@value')
        loader.add_xpath('sku', 'input[@name="product[]"]/@value')
        loader.add_xpath('price', 'label/meta[@itemprop="price"]/@content')
        loader.add_value('url', url)
        if image_url:
            loader.add_value('image_url', image_url)
        loader.add_value('category', category)
        if not product.xpath(
                'link[@itemprop="availability"][@href="http://schema.org/InStock"]'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price') < 750:
            loader.add_value('shipping_cost', 36)
        yield loader.load_item()
def parse_product(self, response): base_url = get_base_url(response) product_loader = ProductLoader(item=Product(), response=response) product_loader.add_value('url', response.url) name = ''.join( response.xpath( '//h1[@class="PrpdocutName"]//text()').extract()).strip() product_loader.add_value('name', name) brand = response.xpath( '//span[@class="parent_product_manufacture_logo"]/img/@alt' ).extract() brand = brand[0].strip() if brand else '' product_loader.add_value('brand', brand) identifier = response.xpath( '//input[@name="products_id"]/@value').extract() if not identifier: identifier = re.findall('custom_product_id=(\d+)', response.body) product_loader.add_value('identifier', identifier[0]) product_loader.add_value('sku', identifier[0]) category = response.xpath( '//div[@class="breadcrumb"]//span[@itemprop="title"]/text()' ).extract()[1:-1] product_loader.add_value('category', category) image_url = response.xpath( '//span[@class="image_container"]/img/@src').extract() if image_url: image_url = response.urljoin(image_url[0]) product_loader.add_value('image_url', image_url) product = product_loader.load_item() options = response.xpath( '//table[@id="product_price_list"]//tr[not(contains(@class, "HeadingRow"))]' ) if options: for option in options: prod = Product(product) product_loader = ProductLoader(item=prod, response=response) option_name = option.xpath( 'td/div[@class="subproduct_name"]/text()').extract() if option_name: option_name = name + ' ' + option_name[0].strip() product_loader.add_value('name', option_name) identifier = option.xpath( './/input[@name="sub_products_id[]"]/@value').extract() if not identifier: identifier = option.xpath( './/input[@name="email_me_products_id"]/@value' ).extract() if not identifier: identifier = option.xpath( './/input[@name="products_id"]/@value').extract() if identifier: product_loader.add_value( 'identifier', product['identifier'] + '-' + identifier[0]) else: log.msg(' >>>>>> Possible wrong identifier: ' + 
response.url) sku = product_loader.get_output_value('identifier') product_loader.add_value('sku', sku) price = option.xpath( './/span[@class="productSpecialPrice"]/text()').extract() if not price: price = option.xpath( './/span[@class="listing-price"]/text()').extract() price = price[0] if price else 0 product_loader.add_value('price', price) in_stock = option.xpath( './/span[@class="instock" and text()="In Stock"]').extract( ) if not in_stock or not product_loader.get_output_value( 'price'): product_loader.add_value('stock', 0) if product_loader.get_output_value('price') < 70: product_loader.add_value('shipping_cost', Decimal('9.90')) yield product_loader.load_item() else: log.msg(' >>>>> ERROR: NO OPTIONS' + response.url) #if product['price'] < 70: # product['shipping_cost'] = Decimal('9.90') '''