def parse_cart(self, response):
    """Read one cart row's price and yield the product carrying it.

    ``response.meta`` must provide ``item_id`` (used to locate the row via
    its ``Qty<item_id>`` input) and ``product`` (handed to the loader).
    Nothing is yielded when the row's price cell has no numeric text.
    """
    meta = response.meta
    # Unclassed row containing this item's quantity input; the price text is
    # read from its right-aligned, top-aligned cell.
    price_xpath = (
        '//table[@width="980"]//tr[not(@class) and '
        './/input[@name="Qty%s"]]/td[@align="right" and @valign="top"]/text()'
    ) % meta['item_id']
    amounts = response.xpath(price_xpath).re(r'[\d\.,]+')
    if not amounts:
        return

    cart_loader = ProductLoader(meta['product'], response=response)
    cart_loader.add_value('price', amounts)
    yield cart_loader.load_item()
def parse_products(self, response):
    """Walk a listing page and schedule a request for every product link.

    Two layouts are handled: ``productResult`` blocks, which also pass the
    listing name and price along in the request meta, and the table layout
    under ``ageContent_pnlContent``, which supplies links only.  When the
    page shows neither layout and no ``featuredProductsTable`` cell, the
    same URL is requested again (at most 10 times) with a ``pagesize``
    cookie.

    Yields:
        ``Request`` objects targeting ``self.parse_product`` or, for a
        retry, ``self.parse_products``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products = hxs.select('//div[contains(@class, "productResult")]')
    log.msg(">>>>>>>> FOUND %s ITEMS >>>" % len(products))
    for product in products:
        names = product.select(
            './/h2[@class="productResultName"]/a/text()').extract()
        if names:
            name = names[0]
        else:
            # Reset explicitly: otherwise the name would be unbound on the
            # first product or silently inherited from the previous one.
            name = ''
            self.log('Cannot find name %s' % response.url)

        hrefs = product.select(
            './/h2[@class="productResultName"]/a/@href').extract()
        if not hrefs:
            # Skip this entry instead of aborting the rest of the page.
            self.log('Cannot find product url %s' % response.url)
            continue
        url = canonicalize_url(urljoin_rfc(base_url, hrefs[0]))

        price = ' '.join(product.select(
            './/span[@class="variantprice"]//text()').extract())

        id_match = identifier_regex.search(url)
        if id_match is None:
            self.log('Cannot find identifier in url %s' % url)
            continue

        yield Request(url,
                      callback=self.parse_product,
                      meta={'name': name,
                            'price': price,
                            'identifier': id_match.group(1)})

    products2 = hxs.select(
        '//div[contains(@id, "ageContent_pnlContent")]'
        '/table/tr/td/table/tr[2]/td/a/@href').extract()
    for url in products2:
        id_match = identifier_regex.search(url)
        if id_match is None:
            self.log('Cannot find identifier in url %s' % url)
            continue
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product,
                      meta={'identifier': id_match.group(1)})

    if products or products2 or hxs.select('//td[@id="featuredProductsTable"]'):
        return

    # Nothing recognisable on the page: retry the same URL a bounded number
    # of times, tracking the attempt count in the request meta.
    retry = int(response.meta.get('retry', 0))
    if retry < 10:
        self.log('WARNING: No products and no subcategories, Retry => %s' % response.url)
        new_meta = response.meta.copy()
        new_meta['retry'] = retry + 1
        yield Request(response.url,
                      meta=new_meta,
                      cookies={'pagesize': 10000},
                      callback=self.parse_products,
                      dont_filter=True)
    else:
        self.log('ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'.format(response.url))
def parse_product(self, response):
    """Build one product item from a product detail page.

    Pages without an ``itemprop="sku"`` span produce nothing.  Name, price
    and identifier are taken from ``response.meta`` when present; name and
    price fall back to the page markup otherwise.  The brand is guessed
    from the name (text after ``by``, or text preceding the SKU) before
    falling back to the ``manufacturer`` span.

    Yields:
        The loaded product with a ``TigerChefMeta`` under ``metadata``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    sku_texts = hxs.select('.//span[@itemprop="sku"]/text()').extract()
    if not sku_texts:
        return
    sku = sku_texts[0].strip()

    name = meta.get('name', None) or ''.join(
        hxs.select('//span[@itemprop="name"]/text()').extract())

    # Brand heuristics, in order of preference.
    brand = ''
    by_match = re.search(r'by (.*)$', name)
    if by_match:
        brand = by_match.group(1)
    elif sku in name:
        before_sku = re.search(r'^(.*) %s' % re.escape(sku), name)
        if before_sku is not None:
            brand = before_sku.groups()[0].strip()
    if not brand:
        manufacturer = response.xpath(
            '//span[@itemprop="manufacturer"]/text()').extract()
        brand = manufacturer[0].strip() if manufacturer else ''

    product_loader = TigerChefLoader(Product(), response=response,
                                     spider_name=self.name)
    product_loader.add_value('name', name)

    if 'identifier' in meta:
        product_loader.add_value('identifier', meta['identifier'])
    elif 'item' in meta and 'identifier' in meta['item']:
        product_loader.add_value('identifier', meta['item']['identifier'])

    # Listing price first, then the two page layouts in turn.
    price = meta.get('price', None)
    for price_xpath in ('//div[@itemprop="price"]/span/span/text()',
                        '//div[@itemprop="price"]/span/text()'):
        if price:
            break
        price = hxs.select(price_xpath).extract()
    product_loader.add_value('price', price or '0')

    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', sku)

    crumbs = hxs.select('//span[@class="SectionTitleText"]/li/a/text()')
    product_loader.add_value('category',
                             crumbs[-1].extract() if crumbs else '')
    product_loader.add_value('brand', brand)

    images = hxs.select(
        '//div[@id="prodImageMediumBox"]//div/div/img/@src').extract()
    product_loader.add_value(
        'image_url', urljoin_rfc(base_url, images[0]) if images else '')

    unit_texts = hxs.select(
        '//table[@id="prodInfo"]/tr/td[div/div[@itemprop="price"]]'
        '/span[@class="details"]/text()').extract()

    product = product_loader.load_item()
    metadata = TigerChefMeta()
    if unit_texts:
        # Drop slashes and collapse runs of whitespace.
        metadata['sold_as'] = ' '.join(unit_texts[0].replace('/', '').split())
    else:
        metadata['sold_as'] = ''
    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Complete a listing product with data from its detail page.

    The partially filled product arrives in ``response.meta['product']``.
    Price, image, category and brand are added through the loader; the name
    is overwritten on the loaded item when the page shows one.

    Yields:
        The product with a ``TigerChefMeta`` under ``metadata``.
    """
    product_loader = ProductLoader(Product(response.meta['product']),
                                   response=response)
    product_loader.add_xpath(
        'price', '//meta[@property="og:price:amount"]/@content')
    # Added after the page price; presumably acts as a fallback when the
    # meta tag is missing (depends on the loader's processors - verify).
    product_loader.add_value('price', 0)

    titles = response.xpath(
        '//div[@class="product-info"]/p[@class="h1"]/text()').extract()

    images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
    if images:
        product_loader.add_value('image_url', images[0])
    else:
        self.log("ERROR img not found")

    crumbs = response.xpath(
        '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
    if crumbs:
        product_loader.add_value('category', crumbs[-1])
    else:
        self.log("ERROR category not found")

    # Logo title first, then the "Manufacturer" table cell.
    brands = (
        response.xpath('//div[@class="logo-area"]/a/@title').extract()
        or response.xpath(
            '//td[contains(text(), "Manufacturer")]/following-sibling::td/text()'
        ).extract()
    )
    if brands:
        product_loader.add_value('brand', brands[0])
    else:
        self.log("ERROR brand not found")

    product = product_loader.load_item()
    if titles:
        product['name'] = titles[0].strip()

    unit_texts = response.xpath(
        '//strong[@class="price"]/span/text()').extract()
    metadata = TigerChefMeta()
    if unit_texts:
        metadata['sold_as'] = unit_texts[0].split('/ ')[-1]
    else:
        metadata['sold_as'] = '1 ea'
    product['metadata'] = metadata
    yield product
def parse(self, response):
    """Crawl a navigation/listing page.

    Yields requests for the main categories, for sub-category and category
    links (with ``sort=lowest`` forced into the query string), for the next
    results page, and one request per listed product.  Each product request
    targets ``self.parse_product`` and carries the partially loaded item in
    ``meta['product']``.
    """
    # Main categories
    for cat_url in response.xpath(
            '//ul[@id="main-nav"]/li/a/@href').extract():
        yield Request(response.urljoin(cat_url))

    sub_categories = response.xpath(
        '//div[contains(@class, "sub-categories")]'
        '/div/div//p/a/@href').extract()
    for sub_cat in sub_categories:
        yield Request(
            add_or_replace_parameter(response.urljoin(sub_cat), 'sort', 'lowest'))

    categories = response.xpath(
        '//ul[@class="category"]/li/a/@href').extract()
    categories += response.xpath(
        '//a[contains(@class, "shop-all-button")]/@href').extract()
    categories += response.css('.subcat-panel ::attr(href)').extract()
    for url in categories:
        yield Request(
            add_or_replace_parameter(response.urljoin(url), 'sort', 'lowest'))

    next_page = response.xpath(
        '//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
    if next_page:
        yield Request(url=response.urljoin(next_page[0]))

    for product_xs in response.xpath('//div[contains(@class, "product")]'):
        url = product_xs.xpath('a/@href').extract()
        if not url:
            # The class test is broad; blocks without a direct link are
            # not product tiles.
            continue

        product_loader = ProductLoader(item=Product(), selector=product_xs)
        product_loader.add_value('url', url)

        # Explicit empty check instead of a bare ``except:`` so that
        # unrelated errors are no longer swallowed.
        sku_matches = product_xs.xpath(
            'p[@class="product-sku"]/text()').re('KaTom #: (.*)')
        sku = sku_matches[0] if sku_matches else None

        product_loader.add_value('sku', sku)
        product_loader.add_value('identifier', sku)
        product_loader.add_xpath('name', 'a/@title')
        product_loader.add_css('image_url', '.img ::attr(src)')
        product_loader.add_xpath('category', '//h1[@class="title"]/text()')

        product = product_loader.load_item()
        # The identifier keeps the full code; the SKU drops everything up
        # to and including the first dash.
        sku_parts = product.get('sku', '').split('-')
        if len(sku_parts) > 1:
            product['sku'] = '-'.join(sku_parts[1:])

        yield Request(url=product_loader.get_output_value('url'),
                      meta={"product": product},
                      callback=self.parse_product)
def parse_products(self, response):
    """Extract every product on a listing page, then follow pagination.

    ``category`` and ``brand`` are copied from ``response.meta`` and the
    same meta is forwarded to the next-page request.

    Yields:
        Loaded products (each with a ``TigerChefMeta`` under ``metadata``)
        followed by an optional ``Request`` for the next page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    for node in hxs.select('//li[contains(@itemtype, "Product")]'):
        item_loader = ProductLoader(Product(), node, spider_name=self.name)
        item_loader.add_xpath('name', './/a[@itemprop="name"]/text()')
        item_loader.add_xpath('url', './/a[@itemprop="name"]/@href')
        item_loader.add_xpath('price', './/span[@itemprop="price"]/text()')
        item_loader.add_xpath('image_url', 'div/a/img/@src')

        # The element id has the form "...product_<identifier>".
        element_id = node.select('@id').extract()[0]
        item_loader.add_value('identifier', element_id.split('product_')[-1])
        item_loader.add_value('category', meta.get('category'))
        item_loader.add_value('brand', meta.get('brand'))

        model = node.select('.//span[@itemprop="model"]/text()')
        if model:
            # The model text is used as-is; a variant that cut everything
            # up to the first dash is disabled in the original source.
            item_loader.add_value('sku', model.extract()[0])

        unit_texts = node.select(
            'div/div/div/div/span[contains(text(), "Sold As")]/text()'
        ).extract()

        item = item_loader.load_item()
        metadata = TigerChefMeta()
        if unit_texts:
            metadata['sold_as'] = unit_texts[0].split('Sold As: ')[-1].strip()
        else:
            metadata['sold_as'] = '1 ea'
        item['metadata'] = metadata
        yield item

    next_page = hxs.select(
        '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_products,
                      meta=meta)
def parse(self, response):
    """Parse one page of search results.

    Products showing a price are yielded directly.  Products whose price
    cell only contains the ``see-price-sprite`` marker are not yielded;
    they are appended to ``self._add_to_cart_products`` as
    ``(item_id, url, product)`` tuples.

    Side effects:
        Sets ``self._search_done = True`` when the page has no "Next" link.

    Yields:
        A ``Request`` for the next page (if any) and loaded products.
    """
    title_xpath = './/*[@class="search-item-title"]'

    next_page = response.xpath(
        '//*[@class="pagelinks"]/following-sibling::td//a[contains(text(), "Next")]/@href'
    ).extract()
    if next_page:
        yield Request(response.urljoin(next_page[0]),
                      meta={'dont_merge_cookies': True},
                      dont_filter=True)
    else:
        self._search_done = True

    for product_xs in response.xpath('//td[contains(@class, "search-prod")]'):
        product_id = product_xs.xpath(
            title_xpath + '/a/@href').extract()[0].split('/')[-1].split('.')[2]

        # The links under the title are expected to be [brand, sku]; with
        # any other count only the first one (if present) is used as brand.
        labels = product_xs.xpath(
            title_xpath + '/following-sibling::div/a/text()').extract()
        sku = None
        if len(labels) == 2:
            brand, sku = labels
        else:
            brand = labels[0] if labels else None

        # A real list (not ``map``) so the truth test and indexing below
        # behave the same on Python 2 and Python 3.
        image_urls = [response.urljoin(src)
                      for src in product_xs.xpath('.//img/@src').extract()]
        price = product_xs.xpath('.//*[@class="search-item-price"]').re(
            r'[\d\.,]+')
        add_to_cart = bool(
            product_xs.xpath(
                './/*[@class="search-item-price"]/span[@class="see-price-sprite"]'
            ))

        loader = ProductLoader(item=Product(), selector=product_xs)
        identifier = product_id
        if sku:
            identifier = identifier + ' ' + sku.lower()
        loader.add_value('identifier', identifier)
        if sku:
            loader.add_value('sku', sku)
        loader.add_xpath('url', title_xpath + '/a/@href')
        if image_urls:
            # Swap the small thumbnail path/prefix for the medium one.
            loader.add_value(
                'image_url',
                image_urls[0].replace('/pics/sm/', '/pics/md/').replace('sm_', 'md_'))
        if brand:
            loader.add_value('brand', brand)
        loader.add_xpath('name', title_xpath + '/a/strong/text()')

        if price:
            loader.add_value('price', price[0])
            yield loader.load_item()
        elif add_to_cart:
            product = loader.load_item()
            url = response.urljoin(
                product_xs.xpath('.//a[@class="atc-primary"]/@href').extract()[0])
            item_id = url_query_parameter(url, 'ItemID')
            self._add_to_cart_products.append((item_id, url, product))
def parse_product(self, response):
    """Build a product from the page's structured data and markup.

    Name, identifier, image and brand come from ``SpiderSchema``; price,
    SKU, category and the "sold as" unit come from the page markup, with
    schema values as fallbacks where the code provides them.  When the page
    shows a struck-out price, the product form is posted to the cart URL
    and the product travels in the request meta to ``self.parse_price``;
    otherwise the product is yielded directly.

    Yields:
        Either the loaded product or a ``FormRequest`` to the cart.
    """
    schema = SpiderSchema(response)
    data = schema.get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', data['Name'])
    loader.add_xpath('category',
                     u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')

    price = response.xpath(
        '//form[@id="productform"]/input[@name="price"]/@value').extract()
    if price:
        loader.add_value('price', price[0])
    else:
        loader.add_value(
            'price',
            data.get('offers', {}).get('properties', {}).get('price', '0.0'))

    # Comprehension instead of ``map(unicode.strip, ...)`` so the result is
    # a list on every Python version.
    sku = [text.strip() for text in response.xpath(
        '//span[contains(@class, "mfr-number")]/text()').extract()]
    loader.add_value('identifier', data['productID'])
    if sku:
        loader.add_value('sku', sku)
    else:
        loader.add_value('sku', data['productID'].replace('#', ''))

    image_url = data.get('image', '').replace('www.example.com',
                                              'www.webstaurantstore.com')
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    brand = data.get('brand', '')
    if not brand:
        brand = response.xpath(
            '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
        ).extract()
        brand = brand[0].strip() if brand else ''
    if brand:
        loader.add_value('brand', brand)

    sold_as = response.xpath(
        '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
    ).extract()

    product = loader.load_item()
    # NOTE(review): the flattened source does not show whether the cart
    # logic sat inside this identifier check; it is treated as nested, so
    # products without an identifier are not emitted - confirm.
    if product.get('identifier', '').strip() == '':
        return

    metadata = TigerChefMeta()
    metadata['sold_as'] = sold_as[0].replace('/', '') if sold_as else ''
    product['metadata'] = metadata

    # Add to cart to see the price
    if response.xpath(
            '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'):
        cart_url = 'http://www.webstaurantstore.com/viewcart.html'
        # Read name and value from the same <input> so a field without a
        # value (or without a name) cannot shift the remaining pairs.
        formdata = {}
        for field in response.xpath('//form[@id="productform"]/input'):
            field_name = field.xpath('./@name').extract()
            if not field_name:
                continue
            field_value = field.xpath('./@value').extract()
            formdata[field_name[0]] = field_value[0] if field_value else u''
        # quantity
        formdata[u'qty'] = '1'
        yield FormRequest(url=cart_url,
                          method='POST',
                          formdata=formdata,
                          callback=self.parse_price,
                          meta={
                              'product': product,
                              'dont_merge_cookies': True
                          },
                          dont_filter=True)
    else:
        yield product
def parse_product(self, response):
    """Build the main product from structured data, then emit its options.

    All core fields come from ``SpiderSchema``.  The "sold as" unit is cut
    out of the product-details text: the text following the last matching
    "Priced per" / "Priced by" / "Price per" marker, truncated at the first
    ``;``, ``.`` or ``,``; it defaults to ``each``.

    Yields:
        The main product, then everything ``self._parse_options`` yields.
    """
    schema = SpiderSchema(response)
    info = schema.get_product()

    sku = info['sku']
    title = info['name']
    price = extract_price(
        info['offers']['properties']['price'].replace(' ', ''))
    brand = info['brand']
    image = info['image']
    list_elements = schema.data['items'][1]['properties']['itemListElement']
    category = [entry['properties']['name'] for entry in list_elements][0]

    loader = ProductLoader(item=Product(), response=response)
    field_values = (
        ('name', title),
        ('identifier', sku),
        ('price', price),
        ('sku', sku),
        ('brand', brand),
        ('category', category),
        ('url', response.url),
        ('image_url', image),
    )
    for field, value in field_values:
        loader.add_value(field, value)

    description = ' '.join(
        response.xpath('//*[@class="product-details"]//text()').extract())

    sold_as = ''
    # Checked in this order on purpose: a later marker overrides an earlier.
    for marker in ('Priced per', 'Priced by', 'Price per'):
        if marker in description:
            sold_as = description.split(marker)[1]
    # Keep only the text before the first separator of each kind.
    for separator in (';', '.', ','):
        sold_as = sold_as.split(separator)[0]
    sold_as = sold_as or 'each'

    product = loader.load_item()
    metadata = TigerChefMeta()
    metadata['sold_as'] = sold_as
    product['metadata'] = metadata
    yield product

    for option in self._parse_options(response, product):
        yield option
def parse_product(self, response):
    """Build a product from a detail page.

    The page is skipped (with an error log) when no item number can be read
    from the wishlist link id.  A missing price becomes ``'0.00'``.  A
    missing SKU or "Sold As" row is tolerated: the identifier then consists
    of the item number alone and ``sold_as`` is left empty.

    Yields:
        The loaded product with a ``TigerChefMeta`` under ``metadata``.
    """
    itemno = response.xpath(
        '//div[@id="product-main-info"]//a[contains(@id, '
        '"wishlist_link_")]/@id').re(r'(\d+)')
    if not itemno:
        self.log('ERROR: itemno not found => %s' % response.url)
        return
    itemno = itemno[0]

    # Join the last two numeric fragments of the price element.
    price = ''.join(
        response.xpath('//span[@id="the-price"]//text()').re(r'[\d\.,]+')[-2:])
    if not price:
        self.log('WARNING: price not found => %s' % response.url)
        price = '0.00'

    sku_texts = response.xpath('//li[@itemprop="sku"]/text()').extract()
    if sku_texts:
        sku = sku_texts[0].replace('Model #:', '').strip()
    else:
        # Keep ``sku`` a string: the identifier below concatenates it, and
        # an empty list would raise TypeError right after this warning.
        sku = ''
        self.log('WARNING: SKU not found => %s' % response.url)

    brand = response.xpath('//li[@itemprop="name"]/text()').extract()
    image_url = response.xpath(
        '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
    category = response.xpath('//span[@class="breadcrumb-element"]'
                              '//*[@itemprop="name"]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
    loader.add_value('price', price)
    if sku:
        loader.add_value('sku', sku)
    if image_url:
        loader.add_value('image_url', image_url)
    if brand:
        loader.add_value('brand', brand)
    # "<itemno> <sku>"; strip() removes the dangling space when sku is empty.
    loader.add_value('identifier', (itemno + ' ' + sku).strip())
    if category:
        loader.add_value('category', category[0].strip())

    product = loader.load_item()

    sold_as_texts = response.xpath(
        '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
    metadata = TigerChefMeta()
    metadata['sold_as'] = sold_as_texts[0].strip() if sold_as_texts else ''
    product['metadata'] = metadata
    yield product
def parse_products(self, response, hxs):
    """Extract the products of a listing page.

    Products with a visible price are yielded directly.  When the price
    cell reads "Instant Rebate" or "Add to Cart", two requests are yielded
    instead, both handled by ``self.parse_cart`` and both routed through
    ``self.CART_PROXY``: one adding the SKU to the cart (carrying the
    loader and metadata in its meta) and one emptying the cart.

    Args:
        response: the listing page response.
        hxs: unused here; kept for interface compatibility.

    Yields:
        Loaded products or cart ``Request`` objects.
    """
    base_url = get_base_url(response)

    for product in response.css('.product-result'):
        loader = ProductLoader(selector=product, item=Product(),
                               spider_name=self.name)

        url = product.select('.//a/@href').extract()
        if not url:
            self.log('ERROR: no product URL found! URL:{}'.format(response.url))
            continue
        url = urljoin_rfc(base_url, url[0])
        loader.add_value('url', url)

        sku = product.select('.//a/text()').re(r'\((.*?)\)')
        if sku:
            loader.add_value('sku', sku[0])
        else:
            self.log('ERROR: no SKU found!')

        product_id = product.select('.//a/@href').re(r'p(\d+)\.aspx')
        if not product_id:
            self.log('ERROR: no product ID found!')
        elif sku:
            # Only built when both parts exist; a missing SKU was already
            # logged above and must not raise IndexError here.
            loader.add_value('identifier', product_id[0] + '_' + sku[0])

        product_image = product.select('.//a/img/@psrc').extract()
        if not product_image:
            product_image = product.select('.//div/img/@src').extract()
            if not product_image:
                self.log('ERROR: no product Image found!')
        if product_image:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, product_image[0].strip()))

        price = ''.join(product.select(
            './div[contains(@class,"-price")]/text()').extract()).strip()
        check_cart = False
        if 'Instant Rebate' in price or 'Add to Cart' in price:
            # The real price is only visible in the cart.
            price = '0.0'
            check_cart = True
        if not price:
            price = ''.join(product.select(
                './div[contains(@class,"-price")]/span/text()').extract()).strip()
        if not price:
            self.log('ERROR: no price found! URL:{} Product URL:{}'.format(response.url, url))
            continue
        loader.add_value('price', price.strip())

        category = product.select(
            '//div[contains(@class, "content")]/h1/text()').extract()
        if not category:
            self.log("ERROR: category not found")
        else:
            loader.add_value('category', category[0].strip())

        names = product.select('.//a/text()').extract()
        if not names:
            # Without link text there is neither a name nor a brand.
            self.log('ERROR: no product name found! URL:{}'.format(response.url))
            continue
        name = names[0]
        loader.add_value('name', name)
        # The brand is the part of the name before the parenthesised SKU.
        loader.add_value('brand', name.split(' (')[0])

        sold_as = product.select(
            'div//span[@class="unit-of-sale"]/text()').extract()
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as[0].split('/')[-1] if sold_as else '1 ea'

        if check_cart:
            sku_id = product.select(
                'div[@class="adcWinnowedItem"]/button/@atc-skuid').extract()[0]
            add_cart_url = "https://www.foodservicewarehouse.com/ViewCart/AddSkuToCart?skuID=" + sku_id + "&quantity=1"
            req = Request(add_cart_url,
                          dont_filter=True,
                          callback=self.parse_cart,
                          meta={'loader': loader,
                                'metadata': metadata,
                                'sku_id': sku_id})
            req.meta['proxy'] = self.CART_PROXY
            yield req

            req = Request('https://www.foodservicewarehouse.com/ViewCart/RemoveAll/',
                          dont_filter=True,
                          callback=self.parse_cart,
                          meta={'clean_cart': True})
            req.meta['proxy'] = self.CART_PROXY
            yield req
        else:
            item = loader.load_item()
            item['metadata'] = metadata
            yield item
def parse(self, response):
    """Yield one product per entry of the JSON array embedded in a script.

    The array is taken from the first script text matching
    ``products', [{...}]``; pages without it yield nothing.  URL, image and
    category are read from the page and are therefore the same for every
    entry.  ``stock`` is 0 when ``inventoryStatus`` equals 3 and 1
    otherwise.
    """
    raw_json = response.xpath('//script/text()').re(r"products', (\[{.+}\])")
    if not raw_json:
        return

    # Page-level value: query it once instead of once per entry.
    breadcrumbs = response.css('.breadcrumb a::text').extract()

    for entry in json.loads(raw_json[0]):
        loader = ProductLoader(item=Product(), response=response,
                               spider_name=self.name)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('sku', entry['sku'])
        loader.add_value('identifier',
                         str(entry['sqlProductID']) + '_' + entry['sku'])
        loader.add_value('name', entry['name'])
        loader.add_value('price', entry['price'])
        if breadcrumbs:
            # The last breadcrumb link is used as the category; a page
            # without breadcrumbs no longer raises IndexError.
            loader.add_value('category', breadcrumbs[-1])
        loader.add_value('brand', entry['manufacturer'])
        loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
        loader.add_value('stock', int(entry['inventoryStatus'] != 3))
        yield loader.load_item()
def parse_product(self, response):
    """Build a product from a detail page.

    The quantity text next to the ``productdetail-qtytxt`` label is
    whitespace-normalised; unless it reads "each" it is added as an extra
    ``name`` value, and it is stored as ``sold_as`` (``'1 ea'`` when
    absent).  Missing name, brand, image or category are only logged.

    Yields:
        The loaded product with a ``TigerChefMeta`` under ``metadata``.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//h1[@id="partNameId"]/text()').extract()
    quantity = hxs.select(
        '//label[@class="productdetail-qtytxt"]/../text()[last()]'
    ).extract()
    if quantity:
        # Newlines, carriage returns and tabs become spaces, then runs of
        # spaces are collapsed.
        quantity = re.sub(r'[\n\r\t]', ' ', quantity[0]).strip()
        quantity = re.sub(' +', ' ', quantity)

    loader = ProductLoader(response=response, item=Product(),
                           spider_name=self.name)

    if name:
        loader.add_value('name', name[0].strip())
    else:
        self.log("ERROR name not found")

    # (field, xpath, message logged when absent, strip the value?)
    optional_fields = (
        ("brand",
         '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()',
         "ERROR brand not found",
         True),
        ("image_url",
         '//div[@class="productdetail-productimage"]/a/img/@src',
         "ERROR img_url not found",
         False),
        ("category",
         '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()',
         "ERROR category not found",
         True),
    )
    for field, xpath, missing_message, strip_value in optional_fields:
        found = hxs.select(xpath).extract()
        if not found:
            self.log(missing_message)
            continue
        loader.add_value(field, found[0].strip() if strip_value else found[0])

    if quantity and quantity.lower() != 'each':
        loader.add_value('name', quantity)

    loader.add_value('url', response.url)
    loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')

    sku = ''.join(
        hxs.select('//b[contains(text(), "Model #:")]/../text()').extract()
    ).strip()
    # Collapse a model number that appears twice in a row.
    tokens = sku.split()
    if len(tokens) == 2 and tokens[0] == tokens[1]:
        sku = tokens[0]
    loader.add_value('sku', sku)
    loader.add_xpath('identifier', '//form//input[@name="productId"]/@value')

    product = loader.load_item()
    metadata = TigerChefMeta()
    metadata['sold_as'] = quantity or '1 ea'
    product['metadata'] = metadata
    yield product