def spider_closed(self, spider):
    """Yield cached products that are not in ``seen``, then copy the crawl CSV.

    Each key of ``self.all_prod_data`` that is absent from ``seen`` is turned
    into a ``Product`` built from the cached row: the text columns are decoded
    from UTF-8 and the price is passed through unchanged.  When
    ``self.sold_as`` holds an entry for the key, it is attached as
    ``TigerChefMeta`` metadata.  After the loop,
    ``data/<crawl_id>_products.csv`` is copied to ``self.all_products_file``.

    NOTE(review): ``seen`` is not defined in this method; it is resolved from
    an enclosing scope that is not visible here -- confirm it exists.
    """
    self.log('Loading remaining products')

    # Columns that are stored as byte strings, in the order they are loaded.
    text_fields = ('identifier', 'brand', 'category', 'url', 'name', 'sku',
                   'image_url')

    for key in self.all_prod_data:
        if key in seen:
            continue

        row = self.all_prod_data[key]
        loader = ProductLoader(item=Product(), selector=HtmlXPathSelector())
        for field in text_fields:
            loader.add_value(field, row[field].decode('utf8'))
        loader.add_value('price', row['price'])
        item = loader.load_item()

        if key in self.sold_as:
            meta = TigerChefMeta()
            meta['sold_as'] = self.sold_as[key].decode('utf8')
            item['metadata'] = meta

        yield item

    source_csv = 'data/%s_products.csv' % spider.crawl_id
    shutil.copy(source_csv, self.all_products_file)
def parse_product(self, response):
    """Build a product from the page's structured data and yield it with options.

    The product fields come from ``SpiderSchema(response).get_product()``; the
    category is the first name found under the second schema item's
    ``itemListElement``.  The ``sold_as`` metadata is parsed out of the text of
    the ``product-details`` element and defaults to ``'each'``.  After the
    main product, everything produced by ``self._parse_options`` is yielded.
    """
    schema = SpiderSchema(response)
    data = schema.get_product()

    sku = data['sku']
    name = data['name']
    raw_price = data['offers']['properties']['price']
    price = extract_price(raw_price.replace(' ', ''))
    brand = data['brand']
    image_url = data['image']
    list_elements = schema.data['items'][1]['properties']['itemListElement']
    category_names = [entry['properties']['name'] for entry in list_elements]
    category = category_names[0]

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (('name', name),
                         ('identifier', sku),
                         ('price', price),
                         ('sku', sku),
                         ('brand', brand),
                         ('category', category),
                         ('url', response.url),
                         ('image_url', image_url)):
        loader.add_value(field, value)

    details_text = response.xpath(
        '//*[@class="product-details"]//text()').extract()
    description = ' '.join(details_text)

    # When several markers occur, the last one in this tuple wins.
    sold_as = ''
    for marker in ('Priced per', 'Priced by', 'Price per'):
        if marker in description:
            sold_as = description.split(marker)[1]

    # Cut at the first ';', then the first '.', then the first ','.
    for delimiter in (';', '.', ','):
        sold_as = sold_as.split(delimiter)[0]

    sold_as = sold_as or 'each'

    product = loader.load_item()
    metadata = TigerChefMeta()
    metadata['sold_as'] = sold_as
    product['metadata'] = metadata
    yield product

    for option in self._parse_options(response, product):
        yield option
def parse_product(self, response):
    """Parse a product page and yield one product with ``sold_as`` metadata.

    The page is skipped (with an error log) when no item number can be read
    from the wishlist link id.  A missing price is logged and replaced by
    ``'0.00'``.  A missing SKU is logged and the product is still emitted.

    Fixes over the previous version:

    * A missing SKU no longer raises ``TypeError``: the identifier used to be
      built as ``itemno + ' ' + sku`` while ``sku`` was still an empty list.
      In that case the identifier now falls back to the item number alone.
    * A missing "Sold As:" row no longer raises ``IndexError``; ``sold_as``
      falls back to ``'1 ea'``.
    """
    itemno = response.xpath(
        '//div[@id="product-main-info"]//a[contains(@id, '
        '"wishlist_link_")]/@id').re(r'(\d+)')
    if not itemno:
        self.log('ERROR: itemno not found => %s' % response.url)
        return
    itemno = itemno[0]

    # Only the last two numeric fragments of the price element are joined.
    price = ''.join(
        response.xpath('//span[@id="the-price"]//text()').re(r'[\d\.,]+')
        [-2:])
    if not price:
        self.log('WARNING: price not found => %s' % response.url)
        price = '0.00'

    sku_nodes = response.xpath('//li[@itemprop="sku"]/text()').extract()
    if sku_nodes:
        sku = sku_nodes[0].replace('Model #:', '').strip()
        # Same expression as before, so existing identifiers do not change.
        identifier = itemno + ' ' + sku
    else:
        self.log('WARNING: SKU not found => %s' % response.url)
        sku = ''
        identifier = itemno

    brand = response.xpath('//li[@itemprop="name"]/text()').extract()
    image_url = response.xpath(
        '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
    category = response.xpath('//span[@class="breadcrumb-element"]'
                              '//*[@itemprop="name"]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
    loader.add_value('price', price)
    if sku:
        loader.add_value('sku', sku)
    if image_url:
        loader.add_value('image_url', image_url)
    if brand:
        loader.add_value('brand', brand)
    loader.add_value('identifier', identifier)
    if category:
        loader.add_value('category', category[0].strip())

    product = loader.load_item()

    sold_as_nodes = response.xpath(
        '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
    if sold_as_nodes:
        sold_as = sold_as_nodes[0].strip()
    else:
        self.log('WARNING: sold_as not found => %s' % response.url)
        sold_as = '1 ea'

    metadata = TigerChefMeta()
    metadata['sold_as'] = sold_as
    product['metadata'] = metadata

    yield product
def parse_products(self, response):
    """Yield every product on a listing page, then follow the "next" link.

    Category and brand are taken from ``response.meta``.  The identifier is
    whatever follows ``product_`` in the list item's ``id`` attribute.  The
    ``sold_as`` metadata is the text after ``'Sold As: '`` and defaults to
    ``'1 ea'``.  The next-page request re-enters this callback with the same
    meta.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    for node in hxs.select('//li[contains(@itemtype, "Product")]'):
        loader = ProductLoader(Product(), node, spider_name=self.name)
        loader.add_xpath('name', './/a[@itemprop="name"]/text()')
        loader.add_xpath('url', './/a[@itemprop="name"]/@href')
        loader.add_xpath('price', './/span[@itemprop="price"]/text()')
        loader.add_xpath('image_url', 'div/a/img/@src')

        node_id = node.select('@id').extract()[0]
        loader.add_value('identifier', node_id.split('product_')[-1])
        loader.add_value('category', meta.get('category'))
        loader.add_value('brand', meta.get('brand'))

        model = node.select('.//span[@itemprop="model"]/text()')
        if model:
            loader.add_value('sku', model.extract()[0])

        sold_as_text = node.select(
            'div/div/div/div/span[contains(text(), "Sold As")]/text()'
        ).extract()

        item = loader.load_item()
        metadata = TigerChefMeta()
        if sold_as_text:
            metadata['sold_as'] = sold_as_text[0].split('Sold As: ')[-1].strip()
        else:
            metadata['sold_as'] = '1 ea'
        item['metadata'] = metadata
        yield item

    next_links = hxs.select(
        '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
    if next_links:
        next_url = urljoin_rfc(base_url, next_links[0])
        yield Request(next_url, callback=self.parse_products, meta=meta)
def parse_product(self, response):
    """Complete the product passed in ``response.meta`` with page details.

    Price is loaded from the ``og:price:amount`` meta tag, followed by a
    literal ``0`` (presumably a fallback -- this depends on the loader's
    output processor).  Image, category (last breadcrumb link) and brand are
    added when found; each miss is only logged.  The name, when present,
    overwrites the loaded item's name.  ``sold_as`` is the text after the last
    ``'/ '`` in the price span and defaults to ``'1 ea'``.
    """
    seed = response.meta['product']
    loader = ProductLoader(Product(seed), response=response)
    loader.add_xpath('price', '//meta[@property="og:price:amount"]/@content')
    loader.add_value('price', 0)

    title = response.xpath(
        '//div[@class="product-info"]/p[@class="h1"]/text()').extract()

    images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
    if images:
        loader.add_value('image_url', images[0])
    else:
        self.log("ERROR img not found")

    crumbs = response.xpath(
        '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
    if crumbs:
        loader.add_value('category', crumbs[-1])
    else:
        self.log("ERROR category not found")

    # The logo title is preferred; the "Manufacturer" table cell is second.
    brands = response.xpath('//div[@class="logo-area"]/a/@title').extract()
    if not brands:
        brands = response.xpath(
            '//td[contains(text(), "Manufacturer")]/following-sibling::td/text()'
        ).extract()
    if brands:
        loader.add_value('brand', brands[0])
    else:
        self.log("ERROR brand not found")

    item = loader.load_item()
    if title:
        item['name'] = title[0].strip()

    unit_text = response.xpath(
        '//strong[@class="price"]/span/text()').extract()
    metadata = TigerChefMeta()
    if unit_text:
        metadata['sold_as'] = unit_text[0].split('/ ')[-1]
    else:
        metadata['sold_as'] = '1 ea'
    item['metadata'] = metadata

    yield item
def parse_product(self, response):
    """Parse a product page into one product with ``sold_as`` metadata.

    Nothing is yielded when the page has no ``itemprop="sku"`` span.  Name,
    identifier and price are taken from ``response.meta`` when present there
    and from the page otherwise.  The brand is derived, in order, from a
    trailing ``by <brand>`` in the name, from the text that precedes the SKU
    in the name, and finally from the ``manufacturer`` span.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    sku_nodes = hxs.select('.//span[@itemprop="sku"]/text()').extract()
    if not sku_nodes:
        return
    sku = sku_nodes[0].strip()

    name = meta.get('name', None)
    if not name:
        name = ''.join(hxs.select('//span[@itemprop="name"]/text()').extract())

    by_match = re.search(r'by (.*)$', name)
    if by_match:
        brand = by_match.group(1)
    elif sku in name:
        # Whatever sits in front of " <sku>" in the name is taken as brand.
        prefix_match = re.search(r'^(.*) %s' % re.escape(sku), name)
        brand = prefix_match.groups()[0].strip() if prefix_match else ''
    else:
        brand = ''

    if not brand:
        manufacturer = response.xpath(
            '//span[@itemprop="manufacturer"]/text()').extract()
        brand = manufacturer[0].strip() if manufacturer else ''

    loader = TigerChefLoader(Product(), response=response,
                             spider_name=self.name)
    loader.add_value('name', name)

    if 'identifier' in meta:
        loader.add_value('identifier', meta['identifier'])
    elif 'item' in meta and 'identifier' in meta['item']:
        loader.add_value('identifier', meta['item']['identifier'])

    price = meta.get('price', None)
    for price_xpath in ('//div[@itemprop="price"]/span/span/text()',
                        '//div[@itemprop="price"]/span/text()'):
        if not price:
            price = hxs.select(price_xpath).extract()
    loader.add_value('price', price or '0')

    loader.add_value('url', response.url)
    loader.add_value('sku', sku)

    category_nodes = hxs.select('//span[@class="SectionTitleText"]/li/a/text()')
    if category_nodes:
        category = category_nodes[-1].extract()
    else:
        category = ''
    loader.add_value('category', category)
    loader.add_value('brand', brand)

    image_src = hxs.select(
        '//div[@id="prodImageMediumBox"]//div/div/img/@src').extract()
    if image_src:
        image_url = urljoin_rfc(base_url, image_src[0])
    else:
        image_url = ''
    loader.add_value('image_url', image_url)

    details = hxs.select(
        '//table[@id="prodInfo"]/tr/td[div/div[@itemprop="price"]]'
        '/span[@class="details"]/text()').extract()

    item = loader.load_item()
    metadata = TigerChefMeta()
    if details:
        # Drop slashes and collapse every whitespace run to a single space.
        metadata['sold_as'] = ' '.join(details[0].replace('/', '').split())
    else:
        metadata['sold_as'] = ''
    item['metadata'] = metadata

    yield item
def get_products(self, hxs, url):
    """Return the products found on a listing page or a single product page.

    Every ``<li>`` of the ``productListResultOL`` list becomes one product.
    Brand, category and SKU are topped up from ``self.prod_data`` and the
    ``sold_as`` metadata comes from ``self.sold_as``, both keyed by
    identifier.  When the list yields nothing and the page has a
    ``productName fn`` heading, the page is parsed as a single product whose
    URL is the ``url`` argument.

    Fixes over the previous version:

    * The ``url`` parameter is no longer overwritten by the loop-local href.
    * A list entry without a link is logged and skipped instead of raising
      ``IndexError``.
    * A list entry without an identifier is logged and skipped instead of
      raising ``KeyError`` and aborting the whole page.  This matches the
      single-product branch, which also drops items lacking an identifier.
    * The item is loaded once and that same object, which carries the
      metadata, is appended.

    :param hxs: selector for the page being parsed.
    :param url: URL of the page; used only by the single-product branch.
    :return: list of loaded product items (possibly empty).
    """
    root_url = 'https://www.instawares.com'
    res = []

    listing = hxs.select('//ol[starts-with(@class, "productListResultOL")]/li')
    for node in listing:
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('name', './/div[@class="listResultsDescriptionDiv"]/a/text()')
        loader.add_xpath('identifier', './/div[@class="listResultsDescriptionDiv"]/dl/dd[1]/text()')
        loader.add_xpath('price', './/div[@class="listResultPrice"]/text()')
        loader.add_xpath('brand', './/div[@class="listResultsDescriptionDiv"]/dl/dt[contains(text(), "By")]/following-sibling::dd/text()')

        href = node.select('.//div[@class="listResultsDescriptionDiv"]/a/@href').extract()
        if not href:
            self.log("ERROR product url not found")
            continue
        loader.add_value('url', urljoin_rfc(root_url, href[0]))

        identifier = loader.get_output_value('identifier')
        if not identifier:
            self.log("ERROR identifier not found")
            continue

        if identifier in self.prod_data:
            row = self.prod_data[identifier]
            loader.add_value('brand', row['brand'].decode('utf8'))
            loader.add_value('category', row['category'].decode('utf8'))
            loader.add_value('sku', row['sku'].decode('utf8'))

        image_url = node.select('.//img[@class="productimagelarge"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(root_url, image_url[0]))

        item = loader.load_item()
        item_id = item.get('identifier')
        if item_id in self.sold_as:
            metadata = TigerChefMeta()
            metadata['sold_as'] = self.sold_as[item_id]
            item['metadata'] = metadata
        res.append(item)

    if not res and hxs.select('//h1[@class="productName fn"]/text()'):
        loader = ProductLoader(selector=hxs, item=Product(), spider_name=self.name)
        loader.add_value('url', url)
        loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
        loader.add_xpath('price', '//li[@class="price"]//text()')
        loader.add_xpath('sku', '//div[starts-with(@class, "specificationContent")]' +
                         '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')
        loader.add_xpath('identifier', '//td[@itemprop="productID"]/text()')

        brand = hxs.select('//td[@class="brand"]/text()').extract()
        if not brand:
            self.log("ERROR brand not found")
        else:
            loader.add_value("brand", brand[0].strip())

        image_url = hxs.select('//div[@class="productImageDiv"]/a/img/@src').extract()
        if not image_url:
            self.log("ERROR image_url not found")
        else:
            loader.add_value("image_url", urljoin_rfc(root_url, image_url[0]))

        category = hxs.select('(//ol[@class="breadcrumbOL"]/a)[last()]/text()').extract()
        if not category:
            self.log("ERROR category not found")
        else:
            loader.add_value("category", category[0].strip())

        sold_as = hxs.select('//dl[@class="soldAsPackedAsDL"]/dd[1]/text()').extract()

        product = loader.load_item()
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as[0].strip() if sold_as else '1 ea'
        product['metadata'] = metadata

        if product.get('identifier'):
            res.append(product)

    return res
def parse_product(self, response):
    """Parse a product page; yield the product or a cart request for its price.

    Fields come from ``SpiderSchema(response).get_product()`` plus a few page
    XPaths.  Nothing is yielded when the loaded identifier is blank.  When the
    page shows a struck-out price element, the product form is posted to the
    cart URL and the product travels in the request meta to
    ``self.parse_price``; otherwise the product is yielded directly.
    """
    schema = SpiderSchema(response)
    data = schema.get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', data['Name'])
    loader.add_xpath('category',
                     u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')

    form_price = response.xpath(
        '//form[@id="productform"]/input[@name="price"]/@value').extract()
    if form_price:
        loader.add_value('price', form_price[0])
    else:
        offers = data.get('offers', {})
        schema_price = offers.get('properties', {}).get('price', '0.0')
        loader.add_value('price', schema_price)

    mfr_numbers = [
        text.strip() for text in response.xpath(
            '//span[contains(@class, "mfr-number")]/text()').extract()
    ]

    product_id = data['productID']
    loader.add_value('identifier', product_id)
    if mfr_numbers:
        loader.add_value('sku', mfr_numbers)
    else:
        loader.add_value('sku', product_id.replace('#', ''))

    image_url = data.get('image', '').replace('www.example.com',
                                              'www.webstaurantstore.com')
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    brand = data.get('brand', '')
    if not brand:
        brand_cells = response.xpath(
            '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
        ).extract()
        brand = brand_cells[0].strip() if brand_cells else ''
    if brand:
        loader.add_value('brand', brand)

    unit_text = response.xpath(
        '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
    ).extract()

    product = loader.load_item()
    if product.get('identifier', '').strip() == '':
        return

    metadata = TigerChefMeta()
    if unit_text:
        metadata['sold_as'] = unit_text[0].replace('/', '')
    else:
        metadata['sold_as'] = ''
    product['metadata'] = metadata

    # Add to cart to see the price
    struck_out = response.xpath(
        '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'
    )
    if not struck_out:
        yield product
        return

    cart_url = 'http://www.webstaurantstore.com/viewcart.html'
    inputs = response.xpath('//form[@id="productform"]/input')
    names = inputs.select('./@name').extract()
    values = inputs.select('./@value').extract()
    formdata = dict(zip(names, values))
    # quantity
    formdata[u'qty'] = '1'

    request_meta = {'product': product, 'dont_merge_cookies': True}
    yield FormRequest(url=cart_url,
                      method='POST',
                      formdata=formdata,
                      callback=self.parse_price,
                      meta=request_meta,
                      dont_filter=True)
def parse_products(self, response, hxs):
    """Yield products, or cart requests, for every ``.product-result`` node.

    A product whose price text contains ``'Instant Rebate'`` or
    ``'Add to Cart'`` is loaded with price ``'0.0'`` and produces two
    requests instead of an item: one that adds the SKU to the cart (carrying
    the loader and metadata in its meta) and one that empties the cart again.
    Both are handled by ``self.parse_cart`` and go through
    ``self.CART_PROXY``.

    Fixes over the previous version:

    * A missing SKU no longer raises ``IndexError`` when the identifier is
      built.  The identifier needs both the product id and the SKU, so it is
      now added only when both are present.
    * A missing link text (name) or ``atc-skuid`` attribute is logged and the
      entry skipped, instead of raising ``IndexError`` and aborting the page.
    * Regex patterns are raw strings with the same pattern text, which removes
      the invalid-escape warnings.
    * ``get_base_url(response)`` is computed once instead of per product.

    :param response: the listing page response.
    :param hxs: unused in this method; kept so existing callers still work.
    """
    base_url = get_base_url(response)

    for node in response.css('.product-result'):
        loader = ProductLoader(selector=node, item=Product(), spider_name=self.name)

        href = node.select('.//a/@href').extract()
        if not href:
            self.log('ERROR: no product URL found! URL:{}'.format(response.url))
            continue
        url = urljoin_rfc(base_url, href[0])
        loader.add_value('url', url)

        sku = node.select('.//a/text()').re(r'\((.*?)\)')
        if not sku:
            self.log('ERROR: no SKU found!')
        else:
            loader.add_value('sku', sku[0])

        product_id = node.select('.//a/@href').re(r'p(\d+)\.aspx')
        if not product_id:
            self.log('ERROR: no product ID found!')
        elif sku:
            loader.add_value('identifier', product_id[0] + '_' + sku[0])

        product_image = node.select('.//a/img/@psrc').extract()
        if not product_image:
            product_image = node.select('.//div/img/@src').extract()
        if product_image:
            image = urljoin_rfc(base_url, product_image[0].strip())
            loader.add_value('image_url', image)
        else:
            self.log('ERROR: no product Image found!')

        price = ''.join(node.select('./div[contains(@class,"-price")]/text()').extract()).strip()
        check_cart = False
        if 'Instant Rebate' in price or 'Add to Cart' in price:
            # The real price is only visible in the cart; it is resolved later.
            price = '0.0'
            check_cart = True
        if not price:
            price = ''.join(node.select('./div[contains(@class,"-price")]/span/text()').extract()).strip()
        if not price:
            self.log('ERROR: no price found! URL:{} Product URL:{}'.format(response.url, url))
            continue
        loader.add_value('price', price.strip())

        # Absolute XPath: this reads the page heading, not the product node.
        category = node.select('//div[contains(@class, "content")]/h1/text()').extract()
        if not category:
            self.log("ERROR: category not found")
        else:
            loader.add_value('category', category[0].strip())

        names = node.select('.//a/text()').extract()
        if not names:
            self.log('ERROR: no product name found! URL:{} Product URL:{}'.format(response.url, url))
            continue
        name = names[0]
        loader.add_value('name', name)
        # Brand is taken as the part of the name before the first " (".
        loader.add_value('brand', name.split(' (')[0])

        sold_as = node.select('div//span[@class="unit-of-sale"]/text()').extract()
        sold_as = sold_as[0].split('/')[-1] if sold_as else '1 ea'
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as

        if not check_cart:
            item = loader.load_item()
            item['metadata'] = metadata
            yield item
            continue

        sku_ids = node.select('div[@class="adcWinnowedItem"]/button/@atc-skuid').extract()
        if not sku_ids:
            self.log('ERROR: no cart SKU id found! URL:{} Product URL:{}'.format(response.url, url))
            continue
        sku_id = sku_ids[0]

        add_cart_url = "https://www.foodservicewarehouse.com/ViewCart/AddSkuToCart?skuID=" + sku_id + "&quantity=1"
        req = Request(add_cart_url,
                      dont_filter=True,
                      callback=self.parse_cart,
                      meta={'loader': loader, 'metadata': metadata, 'sku_id': sku_id})
        req.meta['proxy'] = self.CART_PROXY
        yield req

        req = Request('https://www.foodservicewarehouse.com/ViewCart/RemoveAll/',
                      dont_filter=True,
                      callback=self.parse_cart,
                      meta={'clean_cart': True})
        req.meta['proxy'] = self.CART_PROXY
        yield req
def parse_product(self, response):
    """Parse a product page into one product with ``sold_as`` metadata.

    The quantity text next to the quantity label is whitespace-normalised.
    Unless it is empty or equals ``each`` (case-insensitive), it is also
    added as a second ``name`` value.  The same text is stored as ``sold_as``,
    which defaults to ``'1 ea'``.  Missing name, brand, image or category are
    only logged.
    """
    hxs = HtmlXPathSelector(response)

    name_nodes = hxs.select('//h1[@id="partNameId"]/text()').extract()

    quantity = ''
    quantity_nodes = hxs.select(
        '//label[@class="productdetail-qtytxt"]/../text()[last()]'
    ).extract()
    if quantity_nodes:
        text = quantity_nodes[0]
        for control_char in ('\n', '\r', '\t'):
            text = text.replace(control_char, ' ')
        quantity = re.sub(' +', ' ', text.strip())

    loader = ProductLoader(response=response, item=Product(),
                           spider_name=self.name)

    if name_nodes:
        loader.add_value('name', name_nodes[0].strip())
    else:
        self.log("ERROR name not found")

    brand_nodes = hxs.select(
        '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()'
    ).extract()
    if brand_nodes:
        loader.add_value("brand", brand_nodes[0].strip())
    else:
        self.log("ERROR brand not found")

    image_nodes = hxs.select(
        '//div[@class="productdetail-productimage"]/a/img/@src').extract()
    if image_nodes:
        loader.add_value("image_url", image_nodes[0])
    else:
        self.log("ERROR img_url not found")

    category_nodes = hxs.select(
        '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()'
    ).extract()
    if category_nodes:
        loader.add_value("category", category_nodes[0].strip())
    else:
        self.log("ERROR category not found")

    if quantity and quantity.lower() != 'each':
        loader.add_value('name', quantity)

    loader.add_value('url', response.url)
    loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')

    model_text = hxs.select(
        '//b[contains(text(), "Model #:")]/../text()').extract()
    sku = ''.join(model_text).strip()
    # A model number that appears twice in a row is collapsed to one copy.
    tokens = sku.split()
    if len(tokens) == 2 and tokens[0] == tokens[1]:
        sku = tokens[0]
    loader.add_value('sku', sku)

    loader.add_xpath('identifier', '//form//input[@name="productId"]/@value')

    item = loader.load_item()
    metadata = TigerChefMeta()
    metadata['sold_as'] = quantity or '1 ea'
    item['metadata'] = metadata

    yield item