def parse_product(self, response):
    """Load a product from its detail page and follow its review iframe.

    Nothing is yielded when the page title does not start with the brand in
    ``response.meta['brand']`` and ``response.meta['got_brand']`` is present
    and falsy. Otherwise yields a ``Request`` for the ``BVFrame`` iframe
    (handled by ``self.parse_review``) or, when there is no such iframe, the
    loaded product.
    """
    selector = HtmlXPathSelector(response)
    meta = response.meta
    loader = ProductLoader(item=Product(), response=response)

    title = selector.select(u'//h1/text()').extract()[0].strip()
    title_has_brand = title.lower().startswith(meta['brand'].lower())
    if not (title_has_brand or meta.get('got_brand', True)):
        logging.error("%s [%s] not matched" % (title, response.url))
        return

    loader.add_value('name', title)
    loader.add_xpath('sku', u'//input[@id="primaryPartNumber"]/@value')
    loader.add_xpath('identifier', u'//input[@id="productId"]/@value')

    image_sources = selector.select(
        u'//div[@class="image"]/img/@src').extract()
    if image_sources:
        absolute_image = urljoin_rfc(get_base_url(response), image_sources[0])
        loader.add_value('image_url', absolute_image)

    # The first price node is required; a page without one raises IndexError.
    price_text = selector.select(
        '//span[@itemprop="price"]/text()').extract()[0]
    loader.add_value('price', price_text)
    loader.add_value('url', response.url)
    loader.add_value('brand', meta['brand'].strip().lower())
    item = loader.load_item()

    keter_meta = KeterMeta()
    keter_meta['brand'] = meta['brand']
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta
    # The review callback receives the product through the request meta.
    meta['product'] = item

    review_frames = selector.select(
        u'//iframe[@id="BVFrame"]/@src').extract()
    if not review_frames:
        yield item
        return
    yield Request(review_frames[0], meta=meta, callback=self.parse_review)
def parse_product(self, response):
    """Load a product whose brand matches the searched brand.

    The brand in ``response.meta['brand']`` must appear (case-insensitively)
    in one of the selling-attribute cells or in the page title; otherwise
    nothing is yielded. Yields a ``Request`` for the ``BVFrame`` review
    iframe when present, else the loaded product.
    """
    selector = HtmlXPathSelector(response)
    meta = response.meta
    loader = ProductLoader(item=Product(), response=response)

    title = selector.select(u'//h1/text()').extract()[0]
    attribute_values = selector.select(
        u'//td[starts-with(@id,"product_selling_attribute_value_")]/text()'
    ).extract()

    brand_in_attributes = any(
        meta['brand'].lower() in value.lower() for value in attribute_values)
    if not brand_in_attributes and meta['brand'].lower() not in title.lower():
        return

    loader.add_value('name', title)
    loader.add_xpath('sku', u'//span[@itemprop="productID"]/text()')
    loader.add_xpath('identifier', u'//span[@itemprop="productID"]/text()')
    loader.add_xpath('image_url', '//*[@id="product_image"]/@src')
    loader.add_xpath('category',
                     '//*[@id="breadcrumb_item_cat_top_1"]/text()')

    # The first price node is required; a page without one raises IndexError.
    price_text = selector.select(
        '//span[@itemprop="price"]/text()').extract()[0]
    loader.add_value('price', price_text)
    loader.add_value('url', response.url)
    loader.add_value('brand', meta['brand'].strip().lower())
    item = loader.load_item()

    keter_meta = KeterMeta()
    keter_meta['brand'] = meta['brand']
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta
    # The review callback receives the product through the request meta.
    meta['product'] = item

    review_frames = selector.select(
        u'//iframe[@id="BVFrame"]/@src').extract()
    if not review_frames:
        yield item
        return
    yield Request(review_frames[0], meta=meta, callback=self.parse_review)
def parse_sku(self, response):
    """Complete a partially loaded product with SKU and stock, then go to reviews.

    The product to complete is read from ``response.meta['product']``.
    Yields a ``Request`` for the review page when a ``bvPage`` URL is found
    in an inline script. Otherwise asks ``self.retry`` for a retry request
    and yields that; if ``self.retry`` returns nothing, the product is
    yielded as-is.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=response.meta.get('product'),
                                   selector=hxs)
    # Presence of the add-to-cart input is used as the in-stock signal.
    stock = hxs.select(
        '//div[@class="rightColumn rightColumnV2 productDetailsRightColumn"]//input[@name="/com/castorama/CastShoppingCartFormHandler.addItemToOrder"]'
    ).extract()
    # Reference number: the text after ':' and an optional non-breaking space.
    sku = hxs.select(
        '//div[@class="productDecription"]/span[@class="refNum"]/text()'
    ).re(u':[\xa0]?(.*)')
    product_loader.add_value('sku', sku[0] if sku else '')
    if not stock:
        product_loader.add_value('stock', 0)
    reviews_url = hxs.select('//script/text()').re('bvPage = \'(.*)\';')
    product = product_loader.load_item()
    # The same meta dict is forwarded so the review callback sees the product.
    meta = response.meta
    meta['product'] = product
    metadata = KeterMeta()
    metadata['reviews'] = []
    metadata['brand'] = response.meta.get('brand') or ''
    product['metadata'] = metadata
    if reviews_url:
        # pop() takes the last match when several scripts define bvPage.
        yield Request(reviews_url.pop(),
                      meta=meta,
                      callback=self.parse_review)
    else:
        # NOTE(review): self.retry is defined elsewhere; presumably it returns
        # a new Request or a falsy value once retries are exhausted -- confirm.
        request = self.retry(response,
                             "identifier not found on " + response.url)
        if request:
            yield request
            return
        yield product
def parse_item(self, response):
    """Load a product and emit its reviews when the name carries the brand.

    The product is stored in ``response.meta['product']``. Results of
    ``self.parse_review(response)`` are yielded only if the brand from
    ``response.meta['brand']`` (with or without its internal whitespace)
    occurs in the lower-cased product name; otherwise nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    meta = response.meta
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', u'//h1/text()')

    # The SKU is the part after the first ':' of the first text node.
    raw_sku = selector.select(
        u'//div[@id="idAndAvailable"]/text()[1]').extract()[0]
    sku = raw_sku.split(':')[1].strip()
    loader.add_value('brand', meta['brand'].lower())
    loader.add_value('sku', sku)

    # Regular price first, sale price as the fallback.
    prices = (
        selector.select('//span[@id="productPrice"]/text()').extract()
        or selector.select('//span[@id="productPriceonsale"]/text()').extract())
    loader.add_value('price', prices[0].replace('$', ''))
    loader.add_value('url', response.url)
    item = loader.load_item()

    keter_meta = KeterMeta()
    keter_meta['brand'] = meta['brand']
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta
    meta['product'] = item

    wanted = meta['brand'].lower()
    lowered_name = item['name'].lower()
    if wanted in lowered_name or ''.join(wanted.split()) in lowered_name:
        for result in self.parse_review(response):
            yield result
def parse_product(self, response):
    """Render a product page in PhantomJS, load the product, request reviews.

    The page is fetched again through a ``PhantomJS`` browser and the
    rendered source is used for the SKU, price, image and review lookups.
    Nothing is yielded when the lower-cased brand from
    ``response.meta['brand']`` occurs neither in the product name nor in the
    raw response body. Otherwise yields a ``Request`` for the Bazaarvoice
    ``reviews.djs`` endpoint, handled by ``self.parse_review_js``.
    """
    browser = PhantomJS()
    try:
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('>>> BROWSER: OK!')
        hxs = HtmlXPathSelector(text=browser.driver.page_source)
    finally:
        # Always release the browser, even when loading the page fails.
        browser.close()
        self.log('>>> BROWSER: Closed')

    sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()
    sku = sku[0].replace('#', '')

    # NOTE(review): the loader is bound to the original response, so the
    # add_xpath calls below do not see the PhantomJS-rendered source.
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath(
        'name', u'//div[contains(@class,"title")]//h1/text()')
    product_loader.add_value('sku', sku)
    product_loader.add_xpath(
        'category',
        u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
    product_loader.add_value('identifier', sku)

    price = hxs.select(
        u'//div[contains(@class, "product-price__reg-price")]/text()'
    ).extract()
    product_loader.add_value('price', price[0].replace('Reg.', ''))
    product_loader.add_value('brand', response.meta['brand'].lower())
    product_loader.add_value('url', response.url)

    image_url = hxs.select(
        u'/html/head/link[@rel="image_src"]/@href').extract()
    if image_url:
        product_loader.add_value('image_url', image_url[0])

    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    brand = response.meta['brand'].lower()
    # Compare case-insensitively: the brand is lower-cased, so the name must
    # be as well or a mixed-case name would never match.
    product_name = (product.get('name') or '').lower()
    if brand not in product_name and brand not in response.body.lower():
        return

    # Example product page:
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # Example reviews endpoint:
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # The first URL segment comes from the bvapi.js script tag:
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    part1 = hxs.select(
        u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
    ).extract()[0].split('/')[-2]
    part2 = hxs.select(
        '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()[0]
    yield Request(
        'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
        % (part1, part2),
        meta=response.meta,
        callback=self.parse_review_js)
def parse(response):
    """Parse a search-result page: follow filters and pagination, queue products.

    NOTE(review): ``self`` and ``brand`` are not parameters of this function;
    presumably they are closed over from an enclosing method (the callbacks
    below are built with ``self.load_products(brand)``) -- confirm.

    Yields ``Request`` objects for filter pages (Keter only), the next
    result page, and one product-page request per product box.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Search result does not contain all products
    # so we scrape filtering pages from left part of page for more products
    if response.meta['brand'] == 'Keter':
        # Only expand filters from an unfiltered page to avoid re-expanding.
        if not 'filter' in response.meta or not response.meta['filter']:
            for url in hxs.select(
                    "//div[@id='pdxttyperesults']//a/@href").extract():
                yield Request(urlparse.urljoin(base_url, url),
                              callback=self.load_products(brand),
                              meta={
                                  'filter': True,
                                  'brand': response.meta['brand']
                              })
    next_page = hxs.select(
        '//div[@class="pagnLinkNavigate"]//span[@class="pagnNext"]')
    if next_page:
        yield Request(urlparse.urljoin(
            base_url,
            next_page.select('.//a/@href').extract()[0]),
                      callback=self.load_products(brand),
                      meta={'brand': response.meta['brand']})
    for product_box in hxs.select(
            '//ul[contains(@class, "prodsGrid")]//form[starts-with(@id,"addproduct")]'
    ):
        product_loader = ProductLoader(item=Product(), selector=product_box)
        url = product_box.select('.//h3/a/@href').extract()[0]
        # The last path segment of the product URL serves as the identifier.
        identifier = url.split('/')[-1]
        product_loader.add_xpath('name', './/h3/a/text()')
        product_loader.add_xpath('image_url', 'div/a/img/@src')
        product_loader.add_value('url', url)
        product_loader.add_value('identifier', identifier)
        product_loader.add_xpath('price',
                                 './/span[@itemprop="price"]/text()')
        product_loader.add_value('brand', brand.strip().lower())
        product = product_loader.load_item()
        product['metadata'] = KeterMeta()
        product['metadata']['brand'] = brand
        # When the brand is in the name (or the brand is Keter) only the
        # category is checked; otherwise the product page is checked for brand.
        if brand.upper() in product['name'].upper()\
                or brand in ('Keter', ):
            yield Request(
                url=product['url'],
                callback=self.check_product_category(product))
        else:
            yield Request(url=product['url'],
                          callback=self.check_product_brand(
                              product, brand))
def parse_item(self, response):
    """Load a product from its detail page and emit its reviews.

    The name joins the ``<strong>`` part of the ``<h1>`` with its first
    direct text node, and the price joins the main amount with its ``<sup>``
    part. The SKU is used for both ``sku`` and ``identifier``. Category and
    image default to an empty string when missing. The product is stored in
    ``response.meta['product']`` and everything produced by
    ``self.parse_review(response)`` is yielded.

    Raises:
        IndexError: when the name, SKU or price nodes are missing.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)

    name = hxs.select('//h1/strong/text()').extract()[0]
    name += hxs.select('//h1/text()').extract()[0]
    product_loader.add_value('name', name)

    # Page format: "SKU #: YEH1020 | Part #: 03958604".
    # An older approach split that text on '|'; the SKU regex below replaced it.

    # Optional fields: take the last match, or fall back to an empty string.
    categories = hxs.select(
        '//a[contains(@class, "currentcat")]/text()').extract()
    category = categories[-1] if categories else u''
    images = hxs.select('//img[@id="lgimage"]/@src').extract()
    image_url = images[-1] if images else u''

    sku = hxs.select(
        '//div[contains(@class, "pdp_head_info")]/text()'
    ).re(r'SKU #:.([\w\d]+)')[0]
    # The "Part #" value used to be extracted here but was never used; the
    # lookup was removed so a missing part number cannot abort the product.

    product_loader.add_value('sku', sku)
    product_loader.add_value('identifier', sku)
    product_loader.add_value('brand', response.meta['brand'])
    product_loader.add_value('category', category)
    product_loader.add_value('image_url', image_url)

    price = hxs.select(
        '//*[@class="dynamic_sku_price"]/span/text()').extract()[0]
    price += hxs.select(
        '//*[@class="dynamic_sku_price"]/span/sup/text()').extract()[0]
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta.update({'product': product})

    for x in self.parse_review(response):
        yield x
def parse_product(self, response):
    """Load a product (without price) and request its reviews or its price.

    Missing image, category or identifier are logged and skipped. The
    product is stored in ``response.meta['product']``. Yields a ``Request``
    for the review iframe (``self.parse_review``) when one exists, otherwise
    a ``Request`` to the regional price endpoint (``self.parse_price``).
    """
    selector = HtmlXPathSelector(response)
    meta = response.meta
    loader = ProductLoader(item=Product(), response=response)

    titles = selector.select(
        u'//div[@class="column main-info"]/h1/text()').extract()
    loader.add_value('name', titles[-1].strip())
    loader.add_xpath('sku', u'//span[@class="store-sku"]/text()')
    # async JS
    # loader.add_xpath('price', u'//p[@class="offer-price"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', meta['brand'].strip().lower())

    image_sources = selector.select(
        '//*[@id="main-product-image"]/img/@src').extract()
    if image_sources:
        absolute_image = urljoin_rfc(get_base_url(response), image_sources[0])
        loader.add_value('image_url', absolute_image)
    else:
        self.log('ERROR no IMAGE found!')

    crumbs = selector.select(
        '//*[@id="global-crumb-trail"]//a[2]/text()').extract()
    if crumbs:
        loader.add_value('category', crumbs[0])
    else:
        self.log('ERROR no CATEGORY found!')

    catalogue_ids = selector.select('//*[@id="internet-cat"]/text()').extract()
    if catalogue_ids:
        loader.add_value('identifier', catalogue_ids[0])
    else:
        self.log('ERROR no IDENTIFIER found!')

    item = loader.load_item()
    keter_meta = KeterMeta()
    keter_meta['brand'] = meta['brand']
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta
    meta['product'] = item

    review_frames = selector.select(
        u'//div[@class="bv-reviews"]//iframe/@src').extract()
    if review_frames:
        yield Request(review_frames[0],
                      meta=meta,
                      callback=self.parse_review)
        return

    # The price endpoint is keyed by the last path segment of the product URL.
    price_endpoint = 'http://www.homedepot.ca/async-fetch-regional-price?storeId=9999&pnList='
    price_endpoint += item['url'].split('/')[-1]
    yield Request(price_endpoint, meta=meta, callback=self.parse_price)
def parse_product(self, response):
    """Yield one product per option listed on the product page.

    Category, base identifier, image and base name are read once from the
    page. Each option contributes its own name suffix, identifier suffix
    (when it has a ``data-option-id``) and price (``u'0.00'`` when no price
    node is found).
    """
    selector = HtmlXPathSelector(response)

    breadcrumb = selector.select(
        '//div[@id="top-breadcrumb"]/ol[@class="breadcrumb"]//a/text()'
    ).extract()
    category = breadcrumb[-1]
    base_identifier = selector.select(
        '//ul[@id="select-product-option"]/@data-product-id').extract()[0]
    image = selector.select(
        '//meta[@itemprop="image"]/@content').extract()[0]
    base_name = selector.select(
        '//meta[@itemprop="name"]/@content').extract()[0].strip()
    brand = response.meta['brand'].strip().lower()

    option_nodes = selector.select(
        '//ul[@id="select-product-option"]/li[contains(@class, "product-options-list")]'
    )
    for node in option_nodes:
        label = node.select(
            './/span[@class="product-option-name"]/text()'
        ).extract()[0].strip()

        option_ids = node.select('@data-option-id').extract()
        identifier = base_identifier
        if option_ids:
            identifier = base_identifier + '-' + option_ids[0]

        loader = ProductLoader(item=Product(), selector=selector)
        loader.add_value('url', response.url)
        loader.add_value('name', u'%s %s' % (base_name, label))
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        loader.add_value('image_url', image)

        # Regular price first, "now" price as the fallback.
        prices = (
            node.select('.//span[@class="price"]/strong/text()').extract()
            or node.select(
                './/div/div[@class="now-price"]/strong/text()').extract())
        loader.add_value('price', prices[0] if prices else u'0.00')
        loader.add_value('brand', brand)
        # reviews_url = hxs.select(u'//div[@id="reviews-container"]//a[@class="view-al-test"]/@href').extract()

        item = loader.load_item()
        keter_meta = KeterMeta()
        keter_meta['brand'] = brand
        keter_meta['reviews'] = []
        item['metadata'] = keter_meta
        yield item
def parse_product(self, response):
    """Complete the product from ``response.meta`` and request Yotpo reviews.

    The product passes through ``self.add_shipping_cost`` after loading.
    Yields a ``FormRequest`` to the Yotpo batch endpoint, carrying a
    hand-built ``methods`` payload and the product in its meta, with
    ``self.parse_review`` as the callback.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=response.meta['product'], response=response)
    loader.add_value('url', response.url)

    product_id = response.xpath('//@data-product-id').extract_first()
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    loader.add_css('name', 'h1.product-title::text')
    loader.add_value(
        'category',
        response.xpath('//script/text()').re_first('category: "(.+?)>'))

    image = response.xpath('//meta[@itemprop="image"]/@src').extract_first()
    if image:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image))
    loader.add_value('brand', response.meta.get('brand'))

    # The add-to-cart block is the in-stock signal.
    in_stock = response.css('div.product-add-to-cart')
    loader.add_value('stock', '1' if in_stock else '0')

    item = self.add_shipping_cost(loader.load_item())
    keter_meta = KeterMeta()
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta

    # Use the identifier as the loader outputs it, not the raw attribute.
    pid = loader.get_output_value('identifier')
    widget_call = '[{"method":"main_widget","params":{"pid":"' + pid + '"}},'
    page_link = selector.select('//div/@data-url').extract()[0]
    bottomline_call = ('{"method":"bottomline", "params":{"pid": ' + pid +
                       ',' + '"link":"' + page_link +
                       '", "skip_average_score":false,' +
                       '"main_widget_pid": ' + pid + '}}]')
    methods = widget_call + bottomline_call

    formdata = {
        'app_key': selector.select('//div/@data-appkey').extract()[0],
        'is_mobile': 'false',
        'methods': methods,
        'widget_version': '2015-08-30_11-33-24'
    }
    yield FormRequest("http://w2.yotpo.com/batch",
                      formdata=formdata,
                      callback=self.parse_review,
                      meta={'product': item})
def parse_product(self, response):
    """Load a brand-matching product and request its Bazaarvoice reviews.

    Nothing is yielded when the page has no name, the brand in
    ``response.meta['brand']`` is empty, or ``self._check_brand`` rejects
    the name. Yields a ``Request`` for the reviews endpoint when a
    ``productId`` form input exists, otherwise the product itself.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//div/h1/span[@itemprop="name"]/text()').extract()
    brand = response.meta['brand'].strip().lower()
    if not (name and brand and self._check_brand(name[0], brand)):
        return
    name = name[0]

    # Online price first, generic price as the fallback.
    price = hxs.select(
        '//span[@class="leftVal onlinePrice"]/text()').extract()
    if not price:
        price = hxs.select('//span[@class="leftVal"]/text()').extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath(
        'identifier',
        '//input[@name="/atg/commerce/order/purchase/CartModifierFormHandler.baseProductId"]/@value'
    )
    loader.add_value('name', name)
    loader.add_xpath('sku', '//div/div/span[@itemprop="model"]/text()')
    loader.add_xpath('image_url',
                     '//div/div/div[@id="plImageHolder"]/img/@src')
    loader.add_xpath('category',
                     '(//div[@id="breadcrumb"]/span/a)[last()]/text()')
    loader.add_value('price', price[0] if price else 0)
    loader.add_value('brand', brand)

    reviews_url = u'http://samsclub.ugc.bazaarvoice.com/1337/%s/reviews.djs?format=embeddedhtml'
    product = loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata

    # An explicit check replaces the old bare ``except:`` around the yield,
    # which would also have intercepted GeneratorExit on generator close.
    product_ids = hxs.select(
        '//form//input[contains(@name, "productId")]/@value').extract()
    if not product_ids:
        yield product
        return

    product_reviews_url = reviews_url % product_ids[0]
    yield Request(product_reviews_url,
                  meta={
                      'product': product,
                      'product_url': response.url,
                      'reviews_url': product_reviews_url
                  },
                  callback=self.parse_review)
def parse_product(self, response):
    """Load a product from its detail page and emit its reviews.

    Price falls back from the ``data-prix-origine`` attribute to the
    displayed price text, then to ``0``. Stock is ``'1'`` when an
    "En stock" availability block is present, else ``'0'``. The product is
    stored in ``response.meta['product']`` and everything produced by
    ``self.parse_review(response)`` is yielded.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath('identifier', '//div[@id="dv_ref"]/@title')
    loader.add_xpath('sku', '//div[@id="dv_ref"]/@title')

    prices = (
        selector.select(
            '//span[contains(@class, "prix")]/@data-prix-origine').extract()
        or selector.select(
            '//div[@class="fa-infos-prix"]/div//span[contains(@class, "prix")]/text()'
        ).extract())
    loader.add_value('price', prices[0] if prices else 0)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

    # Every breadcrumb entry except the last one is kept as a category.
    breadcrumb = selector.select(
        '//div[@class="breadcrumb"]/span/a/span[@itemprop="title"]/text()'
    ).extract()
    loader.add_value('category', breadcrumb[:-1])

    image = ''.join(
        selector.select('//img[@itemprop="image"]/@src').extract())
    if image:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image))
    loader.add_value('brand', response.meta.get('brand'))

    availability = selector.select(
        '//div[contains(@class, "text-dispo") and contains(text(), "En stock")]'
    )
    loader.add_value('stock', '1' if availability else '0')

    item = loader.load_item()
    keter_meta = KeterMeta()
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta
    response.meta['product'] = item

    for result in self.parse_review(response):
        yield result
def parse_product(self, response):
    """Load a product and start paginated review scraping when it has reviews.

    The product is stored in ``response.meta['product']``. When the review
    tab shows a non-zero count, ``review_sku``, ``review_pages`` and
    ``review_n`` are added to ``response.meta`` and a ``Request`` for the
    first review page is yielded (``self.parse_review``); otherwise the
    product itself is yielded.

    Raises:
        IndexError: when no price node is found on the page.
    """
    reviews_per_page = 10

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', u'//h1[@id="itemNameID"]/text()')
    product_loader.add_xpath('sku', u'//input[@id="cmItemNumber"]/@value')
    product_loader.add_xpath('image_url', '//*[@id="pThumbnail"]/@src')

    category = hxs.select('//*[@id="pagepath"]/a[2]/text()').extract()
    if category:
        product_loader.add_value('category', category[0].strip())
    product_loader.add_xpath(
        'identifier',
        '//form[@id="OrderItemAddForm"]/input[@name="catEntryId"]/@value')

    price = hxs.select('//td[@class="yourpricenumber"]/text()').extract()
    if not price:
        price = hxs.select(
            '//tr/td[contains(text(),"Your Price") and position() = 1]/../td[2]/text()'
        ).extract()
    product_loader.add_value('price', price[0])
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', response.meta['brand'].lower())
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    n_reviews = hxs.select(
        u'//a/span[@class="imageontab"]/text()').extract()
    # strip('(0)') removes '(', '0' and ')' characters from both ends, so a
    # count of "(0)" becomes empty (falsy) while any other count stays truthy.
    if n_reviews and n_reviews[0].strip('(0)'):
        n_reviews = int(n_reviews[0].strip('()'))
        review_sku = hxs.select(
            u'//input[@id="partNumber"]/@value').extract()[0]
        # Ceiling division with // keeps the page count an integer even when
        # true division is in effect.
        pages = (n_reviews + reviews_per_page - 1) // reviews_per_page
        response.meta['review_sku'] = review_sku
        response.meta['review_pages'] = pages
        response.meta['review_n'] = 1
        yield Request(review_url(response.meta['review_sku'],
                                 response.meta['review_n']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
def parse_product(self, response):
    """Load a product and either yield it or request its ratings page.

    The identifier is the last non-empty path segment of the add-to-cart
    form action. The product passes through ``self.add_shipping_cost``.
    The product is yielded directly when there is no rating link or the
    first link points at ``#review-form``; otherwise a ``Request`` for the
    ratings page is yielded (``self.parse_review``).
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=response.meta.get('product', Product()),
                           selector=selector)

    form_actions = selector.select(
        '//form[@id="product_addtocart_form"]/@action').extract()
    segments = [part for part in form_actions[0].split('/') if part]
    loader.add_value('url', response.url)
    loader.add_value('identifier', segments[-1])
    loader.add_xpath(
        'sku', '//td[contains(text(),"Reference Produit")]/../td[2]/text()')
    loader.add_xpath('name',
                     '//h2[contains(@class, "product-name")]/text()')
    loader.add_xpath('category',
                     '//div[@class="fil-ariane"]/a[2]/strong/text()')

    thumbnails = selector.select(
        '//div[@class="bloc-img-vignettes"]//img/@src').extract()
    if thumbnails:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), thumbnails[0]))
    loader.add_value('brand', response.meta.get('brand'))

    item = self.add_shipping_cost(loader.load_item())
    keter_meta = KeterMeta()
    keter_meta['reviews'] = []
    item['metadata'] = keter_meta

    rating_links = selector.select(
        '//p[@class="rating-links"]//a/@href').extract()
    if rating_links and '#review-form' not in rating_links[0]:
        yield Request(rating_links[0],
                      meta={'product': item},
                      callback=self.parse_review)
    else:
        yield item
def parse_reviews(self, response):
    """Attach the reviews found on the page to the product and yield it.

    The product comes from ``response.meta['product']``. When review blocks
    exist, a fresh ``KeterMeta`` holding one entry per review (built by
    ``self.create_review_loader``) replaces the product's metadata. The
    product is yielded whether or not reviews were found.
    """
    selector = HtmlXPathSelector(response)
    item = response.meta['product']

    review_nodes = selector.select(
        '//div[@class="detMainRating"]/div[contains(@class, "detRating")]')
    if review_nodes:
        keter_meta = KeterMeta()
        keter_meta['reviews'] = []
        keter_meta['brand'] = item.get('brand', '')

        for node in review_nodes:
            details = {}
            details['rating'] = node.select(
                'div//div[@class="rat"]/span/text()').extract()[0]

            # Dates arrive as YYYY-MM-DD and are stored as DD/MM/YYYY.
            raw_date = node.select(
                'div//div[@class="date"]/@content').extract()[-1]
            parsed_date = time.strptime(raw_date, "%Y-%m-%d")
            details['date'] = time.strftime('%d/%m/%Y', parsed_date)

            details['product_url'] = item['url']
            details['sku'] = item['sku']

            headline = node.select(
                'div//div[@class="title"]/span/text()').extract()
            body = node.select('div//div[@class="comm"]/text()').extract()
            details['full_text'] = "\n".join(headline + body)

            keter_meta['reviews'].append(
                self.create_review_loader(response, details))

        item['metadata'] = keter_meta

    yield item
def _get_reviews_url(self, product):
    """Build the request for the first review page of ``product``.

    Resets ``product['metadata']`` to a fresh ``KeterMeta`` with an empty
    review list and the product's brand.

    Args:
        product: item providing ``identifier`` and ``url`` (and optionally
            ``brand``).

    Returns:
        A ``Request`` for review page 1 with ``self.parse_review`` as the
        callback. Its meta holds the product, the current page number and a
        ``reviews_url`` template with one ``%s`` slot for the page number.
    """
    identifier = product['identifier']
    # NOTE(review): calculate_url is defined elsewhere; it is assumed to be
    # deterministic, so it is called once and the result reused.
    content_prefix = (u'http://www.bhphotovideo.com/pwr/content/' +
                      u'%s/%s' % (self.calculate_url(identifier), identifier))
    reviews_url = content_prefix + u'-en_US-1-reviews.js'

    metadata = KeterMeta()
    metadata['reviews'] = []
    metadata['brand'] = product.get('brand', '')
    product['metadata'] = metadata

    meta = {
        'dont_retry': True,
        'handle_httpstatus_list': [404],
        'cur_page': 1,
        'product': product,
        'product_url': product['url'],
        # The template previously lacked the 'pwr/content/' path segment that
        # the first-page URL has, so later pages pointed at another path.
        'reviews_url': content_prefix + u'-en_US-%s-reviews.js'
    }
    return Request(reviews_url, meta=meta, callback=self.parse_review)
def parse_product(self, response):
    """Load a product and request the first page of its reviews.

    Stock is set to 0 when no green checkout button is present. A shipping
    cost is added via ``self._get_shipping_cost`` when a weight ("Poids")
    row exists. Yields a ``Request`` for review page 1 whose meta carries
    the product, the page number, the product id and the URL template.

    Raises:
        IndexError: when the product id, the name or both breadcrumb
            variants are missing.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_id = hxs.select('//aside/span/span/text()')[0].extract()
    product_loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select('//article/header/h1/text()').extract()
    product_loader.add_value('name', u'{}'.format(name[0].strip()))
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', response.meta.get('brand') or '')
    product_loader.add_value('identifier', '{}'.format(product_id))
    product_loader.add_value('sku', product_id)

    # Prefer breadcrumb text wrapped in <i>; fall back to plain link text.
    # An explicit emptiness check replaces the former bare ``except:``.
    crumbs = hxs.select('//ul[@class="breadcrumb"]//a/i/text()')
    if not crumbs:
        crumbs = hxs.select('//ul[@class="breadcrumb"]//a/text()')
    category = crumbs[-1].extract()
    product_loader.add_value('category', category)

    image_url = hxs.select(
        '//img[@id="img-01"]/@data-zoom-image').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])
        product_loader.add_value('image_url', image_url)

    price = hxs.select(
        '//aside[contains(@class, "price-container")]/div/p[@class="price"]//text()'
    ).extract()
    product_loader.add_value('price',
                             extract_price(price[0]) if price else '0.00')

    if not hxs.select(
            '//div[@class="infos-checkout"]/a[contains(@class,"cta green")]'
    ):
        product_loader.add_value('stock', 0)

    weight = hxs.select(
        '//section[@id="description-technique"]//th[@scope="row" and contains(text(),"Poids")]/following-sibling::td/text()'
    ).extract()
    if weight:
        product_loader.add_value('shipping_cost',
                                 self._get_shipping_cost(weight[-1]))

    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['reviews'] = []
    metadata['brand'] = response.meta.get('brand') or ''
    product['metadata'] = metadata

    reviews_url = 'http://www.leroymerlin.fr/v3/bazaarvoice/viewReviews.do?reflm={}&page={}&maxItems=4'
    yield Request(reviews_url.format(product_id, '1'),
                  meta={
                      'product': product,
                      'page': 1,
                      'product_url': response.url,
                      'product_id': product_id,
                      'reviews_url': reviews_url
                  },
                  callback=self.parse_review,
                  dont_filter=True)
def parse_product(self, response):
    """Load a product and request its reviews from the Bazaarvoice batch API.

    Nothing is yielded (an error is logged) when the page has no name, the
    URL carries no ``skuid=`` value, or a breadcrumb word is ``books``. A
    missing price is logged and loaded as an empty string. Otherwise yields
    a ``Request`` for the first batch of reviews (``self.parse_reviews``)
    with the product, offset and SKU in its meta.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    brand = response.meta.get('brand', '')

    name = hxs.select(
        "//div[@class='primary-content']//div[@id='product-title']/h1/text()"
    ).extract()
    if not name:
        logging.error("ERROR! NO NAME! %s" % url)
        return
    name = name[0]

    price = hxs.select(
        "//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()"
    ).extract()
    if not price:
        logging.error("ERROR! NO PRICE! %s %s" % (url, name))
        price = ''
    else:
        price = "".join(price[:2])

    # str.split always returns at least one element, so the marker is only
    # present when there are two or more parts. The old ``> 0`` test was
    # always true and used the whole URL as the SKU when it was absent.
    url_parts = url.lower().split('skuid=')
    sku = url_parts[-1] if len(url_parts) > 1 else None
    if not sku:
        logging.error("ERROR! SKU! %s %s" % (url, name))
        return

    categories = " ".join(
        hxs.select("//div[@id='breadcrumbs']//li//a/text()").extract()
    ).lower().replace('\n', ' ').split(' ')
    if 'books' in categories:
        logging.error("ERROR! Product not valid %s %s" % (url, name))
        return
    # NOTE: an additional category whitelist check (against self.cats, for
    # non-Keter brands) used to run here and is disabled.

    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', sku)
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    l.add_value('brand', brand.strip().lower())
    # The 'sku' field is intentionally not loaded.
    product = l.load_item()

    metadata = KeterMeta()
    metadata['brand'] = brand.strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata

    review_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.4&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A' + sku + '&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=rating%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv182_28795'
    request = Request(review_url,
                      meta={
                          'product': product,
                          'offset': 0,
                          'sku': sku
                      },
                      callback=self.parse_reviews)
    yield request
def parse_item(self, response):
    """Load a product whose brand matches the search, then scrape its reviews.

    The brand is read from the page's "Brand" field. When that field is
    missing, the searched brand (``response.meta['brand']``) is assumed if
    it occurs in the product name. Nothing is yielded when the resolved
    brand differs from the searched one. When a review count is shown,
    ``review_sku``, ``review_pages`` and ``review_n`` are added to
    ``response.meta`` and a ``Request`` for review page 1 is yielded
    (``self.parse_review``); otherwise the product is yielded.
    """
    reviews_per_page = 5

    hxs = HtmlXPathSelector(response)

    # Ensure the search matched brand, not some part of name or description
    brand = hxs.select(
        u'//div/div/p/b[contains(text(),"Brand")]/../../../div[2]/p/text()'
    ).extract()
    brand = brand[0].strip().lower() if brand else ''

    # XXX No brand field for some suncast products, but they have brand in name
    if not brand:
        logging.warning('Brand not found [%s]' % response.url)
        brand = ''
        name = hxs.select(u'//h1/text()').extract()[0].strip()
        if response.meta['brand'].lower() in name.lower():
            logging.warning('Assume [%s] from name' % response.meta['brand'])
            brand = response.meta['brand'].lower()

    # Any brand string containing "keter" is treated as plain "keter".
    if 'keter' in brand.lower():
        brand = 'keter'

    if response.meta['brand'].lower() != brand:
        logging.warning(
            'Brand [%s] not equal to search result brand [%s] [%s]' %
            (response.meta['brand'], brand, response.url))
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', u'//h1/text()')
    sku = hxs.select(u'//meta[@property="eb:id"]/@content').extract()[0]
    product_loader.add_value('sku', sku)
    product_loader.add_value('identifier', sku)

    # The price is split across the span text and a nested span.
    price = hxs.select('//span[@class="ppPrice"]/text()').extract()[0]
    price += hxs.select(
        '//span[@class="ppPrice"]/span/text()').extract()[0]
    product_loader.add_value('price', price)
    product_loader.add_value('brand', brand.lower())
    product_loader.add_xpath('image_url', '//*[@id="jqzoom"]/@href')
    product_loader.add_value('url', response.url)
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    n_reviews = hxs.select(
        u'//div[@class="prSnippetReadReviews"]/a/text()').extract()
    if n_reviews:
        # The count is the second whitespace-separated token of the link text.
        n_reviews = int(n_reviews[0].split()[1])
        review_sku = hxs.select(
            u'//div[@id="HN_PP"]/@ppskunum').extract()[0]
        # Ceiling division with // keeps the page count an integer even when
        # true division is in effect.
        pages = (n_reviews + reviews_per_page - 1) // reviews_per_page
        response.meta['review_sku'] = review_sku
        response.meta['review_pages'] = pages
        response.meta['review_n'] = 1
        yield Request(review_url(response.meta['review_sku'],
                                 response.meta['review_n']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
def parse_product(self, response):
    """Load a Rubbermaid product, then expand all reviews in the browser.

    After loading the product, the page is opened in ``self._browser`` and
    the "load more" review button is clicked (at most 25 times, waiting 20
    seconds after each click) until it is no longer displayed. Browser
    errors are logged, not raised.

    NOTE(review): as visible here, nothing is yielded or returned after the
    browser loop -- confirm where the expanded reviews and the product are
    consumed.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    shipping_cost = hxs.select(
        './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
    ).extract()
    if not shipping_cost:
        shipping_cost = hxs.select(
            './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
        ).extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
    loader.add_value(
        'category',
        hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
    loader.add_xpath(
        'identifier',
        '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')

    price = hxs.select(
        './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
    ).extract()
    loader.add_value('price', price[0] if price else 0)

    # Shipping cost is best effort: a missing or unparsable surcharge must
    # not prevent the product from loading.
    if shipping_cost:
        try:
            loader.add_value('shipping_cost', shipping_cost[0].strip())
        except Exception as e:
            self.log('>> Shipping cost ignored => %s' % e)

    # The SKU is the second text node next to the first <strong> in a cell.
    item = hxs.select('//td/strong')
    if item:
        sibling_texts = item[0].select('../text()').extract()
        if len(sibling_texts) > 1:
            loader.add_value('sku', sibling_texts[1].strip('#() '))

    image_url = hxs.select(
        '//div[@id="divImageBlock"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('brand', 'Rubbermaid')

    product = loader.load_item()
    # The SKU is only loaded conditionally above, so guard the upper-casing.
    if product.get('sku'):
        product['sku'] = product['sku'].upper()

    metadata = KeterMeta()
    metadata['brand'] = 'Rubbermaid'
    metadata['reviews'] = []
    product['metadata'] = metadata

    self.log('>> BROWSER => GET < %s />' % response.url)
    self._browser.get(response.url)
    self.log('>> OK')
    self.log('>> BROWSER => Looking for more reviews ...')
    try:
        load_more_button = self._browser.find_element_by_xpath(
            '//div[@class="bv-content-pagination"]//button')
        more_reviews = load_more_button.is_displayed()
        max_pages = 25
        while more_reviews and max_pages:
            self.log('>> More reviews found...')
            load_more_button.click()
            self.log('>> BROWSER => CLICK "Load more"')
            # Wait for the next batch of reviews before re-checking the button.
            time.sleep(20)
            self.log('>> OK')
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            max_pages -= 1
        self.log('>> No more reviews...')
    except Exception as e:
        self.log('>> ERROR FOUND => %s' % e)
def parse_product(self, response):
    """Parse a Toys R Us product page and request its first review page.

    Non-HTML responses are ignored.  Pages whose URL carries no
    parenthesised product id, or that show no price, are logged and
    skipped.  A missing name, image, category or SKU is only logged.

    Yields a single ``Request`` for page 1 of the reviews feed.  Its
    ``meta`` carries the loaded product, the product URL, the current page
    number and a ``reviews_url`` template with a ``%s`` page placeholder.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    name = hxs.select(
        u'//div[@class="product_detail_main" or @class="product_lead"]/h1/text()'
    ).extract()
    if not name:
        self.log('ERROR: no product NAME found! URL:{}'.format(response.url))
    else:
        loader.add_value('name', name[0].strip())

    # The identifier is taken from the URL because the purchase form is not
    # available when the product is out of stock.
    id_match = re.search(r'\((.*)\)', response.url)
    if not id_match:
        # Previously an AttributeError on ``None.group``.
        self.log('ERROR: no product ID found! URL:{}'.format(response.url))
        return
    prod_id = id_match.group(1)
    loader.add_value('identifier', prod_id)

    loader.add_value('url', response.url)

    price = hxs.select(
        u'//div[@class="pricing prices_new"]/ul/li[@class="price"]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            u'//li[@class="price_bucket"]/ul/li[@class="total_price"]/text()'
        ).extract()
    if not price:
        self.log('ERROR: no product PRICE found! URL:{}'.format(response.url))
        return
    loader.add_value('price', price[0])

    product_image = hxs.select(
        '//*[@id="contentSubView:productImagesForm:productDetailImage"]/@src'
    ).extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        loader.add_value('image_url', image)

    category = hxs.select(
        '//div[@class="breadcrumb"]/ul/li[1]/a[1]/text()').extract()
    if not category:
        self.log('ERROR: category not found! URL:{}'.format(response.url))
    else:
        loader.add_value('category', category[0].strip())

    sku = hxs.select('//ul[@class="product_meta"]/li[1]/text()').re(r'(\d+)')
    if not sku:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))
    else:
        loader.add_value('sku', sku[0].strip())

    brand = response.meta['brand'].strip().lower()
    loader.add_value('brand', brand)
    product = loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata

    # Build the shared prefix once; only the page number differs between
    # the first request and the paging template.
    reviews_prefix = u'http://www.toysrus.co.uk/pwr/content/%s/%s' % (
        self.calculate_url(prod_id), prod_id)
    meta = {
        'dont_retry': True,
        'handle_httpstatus_list': [404],
        'cur_page': 1,
        'product': product,
        'product_url': response.url,
        'reviews_url': reviews_prefix + u'-en_GB-%s-reviews.js',
    }
    yield Request(reviews_prefix + u'-en_GB-1-reviews.js',
                  meta=meta,
                  callback=self.parse_review)
def parse_product(self, response):
    """Parse an Amazon product page and follow its customer reviews link.

    The name and price are read with BeautifulSoup first and with XPath as
    a fallback.  Pages without a name, price or ASIN are logged and
    skipped; a missing SKU, image or category is only logged.  Each new
    identifier is appended to ``self.ids``.

    Yields a ``Request`` for the reviews page (product in ``meta``) when a
    reviews link exists, otherwise the product itself.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    option_label = ' '.join(
        hxs.select('//div[@class="variationSelected"]'
                   '/*[@class="variationLabel"]/text()').extract())

    loader = ProductLoader(item=Product(), selector=hxs)
    soup = BeautifulSoup(response.body)

    try:
        name = ' '.join(
            [soup.find('span', id='btAsinTitle').text, option_label]).strip()
    except Exception:
        # Other page layout: fall back to the <h1 id="title"> text.
        title = hxs.select('//h1[@id="title"]/text()').extract()
        if not title:
            # Previously an uncaught IndexError.
            self.log('ERROR: no product NAME found! URL:{}'.format(
                response.url))
            return
        name = ' '.join([title[0].strip(), option_label]).strip()
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    no_price_ = False
    try:
        soup_form = soup.find(id='handleBuy')
        # Try the known price containers in order of preference.
        price = None
        for tag, css_class in (('b', 'priceLarge'),
                               ('span', 'price'),
                               ('span', 'pa_price')):
            price = soup_form.find(tag, css_class)
            if price:
                break
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price.text)
    except Exception:
        # No buy form (or an unexpected structure): use the price table.
        price = hxs.select('//div[@id="price"]//td[text()="Price:"]'
                           '/following-sibling::td/span/text()').extract()
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price[0])
    if no_price_:
        self.log('ERROR: no price found! URL:{}'.format(response.url))
        return

    reviews_url = hxs.select(
        u'//a[contains(text(),"customer review") and contains(@href, "product-reviews") '
        u'and not(contains(@href, "create-review"))]/@href').extract()

    brand = response.meta['brand'].strip().lower()
    loader.add_value('brand', brand)

    sku = hxs.select(
        '//span[@class="tsLabel" and contains(text(), "Part Number")]/../span[2]/text()'
    ).extract()
    if not sku:
        sku = hxs.select(
            '//b[contains(text(), "model number")]/../text()').extract()
    if sku:
        loader.add_value('sku', sku[0].strip().lower())
    else:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))

    identifier = hxs.select('//form/input[@name="ASIN"]/@value').extract()
    if not identifier:
        self.log('ERROR: no identifier found! URL:{}'.format(response.url))
        return
    loader.add_value('identifier', identifier)

    product_image = hxs.select(
        '//*[@id="main-image" or @id="prodImage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        # Reuse the base URL computed above.
        loader.add_value('image_url',
                         urljoin_rfc(base_url, product_image[0].strip()))

    category = hxs.select('//*[@id="nav-subnav"]/li[1]/a/text()').extract()
    if not category:
        self.log("ERROR: category not found")
    else:
        loader.add_value('category', category[0].strip())

    product = loader.load_item()

    if product['identifier'] not in self.ids:
        self.ids.append(product['identifier'])

    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata

    if reviews_url:
        yield Request(urljoin_rfc(base_url, reviews_url[0]),
                      meta={'product': product},
                      callback=self.parse_review)
    else:
        yield product
def parse_product(self, response):
    """Parse a Bass Pro product page into one product per SKU row.

    Each ``sku_*`` row of the chart table becomes a product.  When the
    page has no such rows, a single product is built from the hidden
    ``productId`` input and the price embedded in the page script.  The
    category is replaced by the ``Class`` value from
    ``self.bushnell_products`` when the manufacturer model number is
    listed there.

    Yields one ``Request`` for the Bazaarvoice reviews feed.  Its ``meta``
    carries the product list, the product URL and the reviews URL.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//h1[@itemprop="name"]/span[not(@class)]/text()'
                      )[0].extract().strip()

    category = hxs.select(
        '//a[contains(@class,"breadcrumb")]/text()').extract()
    if category:
        category = HTMLParser().unescape(category[-1])

    # Only the last path component of the scripted image URL is kept.
    image_match = re.search(r"var imageUrl = '(.*)'", response.body)
    if image_match:
        image_url = 'https://basspro.scene7.com/is/image/BassPro/%s' % (
            image_match.group(1).split('/')[-1])
    else:
        image_url = []

    sku = hxs.select('//div[@id="description"]/text()').re(
        r'Manufacturer model #: (.*)\.')
    brand = hxs.select(
        '//a[@class="breadcrumb brand name" and contains(@href,"Brand")]/text()'
    ).extract()

    if sku:
        bushnell_product = self.bushnell_products.get(
            sku[0].upper().strip(), None)
        if bushnell_product:
            category = bushnell_product['Class']
            log.msg('Extracts category "%s" from bushnell file, URL: %s' %
                    (category, response.url))

    def build_product(selector, identifier, price):
        # Shared by the per-row and the single-product paths, which
        # previously duplicated this code verbatim.
        loader = ProductLoader(item=Product(),
                               response=response,
                               selector=selector)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('name', '%s %s' % (name, identifier))
        loader.add_value('price', price)
        product = loader.load_item()
        metadata = KeterMeta()
        metadata['reviews'] = []
        metadata['brand'] = brand[0] if brand else ''
        product['metadata'] = metadata
        return product

    products = []
    for option in hxs.select(
            '//table[@id="chart"]//tr[starts-with(@id,"sku_")]'):
        identifier = option.select(
            './/span[@itemprop="sku"]/text()')[0].extract().strip()
        price = option.select('.//span[@itemprop="price"]/text()').extract()
        if not price:
            price = option.select(
                './/span[@itemprop="minPrice"]/text()').extract()
        if price:
            price = price[0]
        else:
            log.msg('No price: ' + response.url)
        products.append(build_product(option, identifier, price))

    if not products:
        identifier = hxs.select(
            './/input[@type="hidden" and @name="productId"]/@value'
        )[0].extract().strip()
        price_match = re.search(r"sku_\d+\.price = '(.*)' ;", response.body)
        if not price_match:
            price_match = re.search(r"sku_\d+\.regPrice = '(.*)' ;",
                                    response.body)
        price = price_match.group(1) if price_match else ''
        products.append(build_product(hxs, identifier, price))

    # The review feed is keyed by the second-to-last URL path segment.
    prod_id = response.url.split('/')[-2]
    reviews_url = (u'http://basspro.ugc.bazaarvoice.com/2010category/%s/'
                   u'reviews.djs?format=embeddedhtml') % prod_id
    yield Request(reviews_url,
                  meta={
                      'products': products,
                      'product_url': response.url,
                      'reviews_url': reviews_url
                  },
                  callback=self.parse_review)
def parse_product(self, response):
    """Parse an Argos product page and request its Bazaarvoice reviews.

    Pages without a name or price are logged and skipped.  When the brand
    in ``response.meta`` is "lifetime", products whose name does not
    contain it are skipped too.  A missing SKU, category or image is only
    logged.

    Yields a ``Request`` for the reviews feed (product in ``meta``).  When
    the URL carries no ``/partNumber/<digits>`` segment the product is
    yielded directly instead.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    brand = response.meta.get('brand', '')

    l = ProductLoader(item=Product(), response=response)

    name = hxs.select("//div[@id='pdpProduct']/h1/text()").extract()
    if not name:
        self.log("ERROR! NO NAME! %s" % url)
        log.msg('ERROR! NO NAME!')
        return
    name = name[0].strip()

    if brand.lower() == 'lifetime' and 'lifetime' not in name.lower():
        return

    price = hxs.select(
        "//div[@id='pdpPricing']/span[@class='actualprice']/span/text()"
    ).extract()
    if not price:
        self.log("ERROR! NO PRICE! %s %s" % (url, name))
        return
    price = "".join(price)

    sku = hxs.select(
        "//span[@class='identifier']/span[contains(@class, 'partnumber')]/text()"
    ).extract()
    if not sku:
        # A missing SKU is not fatal.
        self.log("ERROR! SKU! %s %s" % (url, name))
    else:
        l.add_value('sku', sku[0])

    # The category is the second '|'-separated field after 'category_root'
    # in the tracking script, with '+' standing in for spaces.
    category = ''
    s = hxs.select(
        "//script[contains(text(),'EFFECTIVE_URL')]/text()").extract()
    if s:
        s = s[0].strip()
        pos = s.find('category_root')
        if pos != -1:
            s = s[pos:].split('|')
            if len(s) > 1:
                category = s[1].replace('+', ' ')
                l.add_value('category', category)
    if category == '':
        self.log("ERROR! NO Category found! %s %s" % (url, name))

    product_image = hxs.select('//*[@id="mainimage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        l.add_value('image_url', image)

    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    l.add_value('brand', brand.strip().lower())
    l.add_xpath('identifier', u'//form/input[@name="productId"]/@value')
    product = l.load_item()

    metadata = KeterMeta()
    metadata['brand'] = brand.strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata

    reviews_url = 'http://argos.ugc.bazaarvoice.com/1493-en_gb/%s/reviews.djs?format=embeddedhtml'
    part_number = re.search(r'/partNumber/(\d+)', response.url)
    if not part_number:
        # Previously an AttributeError that discarded the finished product;
        # keep the product and just skip the reviews.
        self.log("ERROR! NO part number in URL! %s %s" % (url, name))
        yield product
        return
    yield Request(reviews_url % part_number.group(1),
                  callback=self.parse_review_page,
                  meta={'product': product})
def parse_product(self, response):
    """Parse a product page from its embedded variant JSON.

    Product data comes from the first entry of the
    ``walPP.variantDataRawArr`` array in a page script.  Pages without it
    add a warning to ``self.errors`` and are skipped.  When the JSON has
    no store price, the page is rendered in a PhantomJS browser and the
    price is scraped from the rendered HTML, defaulting to ``'0.00'``.

    Yields a ``Request`` for the Bazaarvoice reviews API (``response.meta``
    including the product is passed along), or the product itself when it
    has no SKU.
    """
    hxs = HtmlXPathSelector(response)

    try:
        product_data = json.loads(
            hxs.select(
                '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
            ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
    except (IndexError, KeyError, TypeError, ValueError):
        # Script missing, regex unmatched, invalid JSON or an empty array.
        self.errors.append('WARNING: No product data in %s' % response.url)
        return

    price = product_data.get(u'price_store_price', None)
    if price:
        price = price[0]
    else:
        browser = PhantomJS.create_browser()
        try:
            self.log('>>> BROWSER: GET => %s' % response.url)
            browser.get(response.url)
            self.log('>>> BROWSER: OK')
            time.sleep(5)
            hxs = HtmlXPathSelector(text=browser.page_source)
        finally:
            # Quit the browser even when loading the page fails.
            browser.quit()

        # Monitor all products even without a price (as requested in #248)
        price = '.'.join(
            hxs.select(
                '//div[@id="pricing"]/div[@class="price-main"]//text()'
            ).re(r'(\d+)')).strip()
        if not price:
            price_elem = hxs.select(
                '//span[@id="store-price"][1]/text()').extract()
            if price_elem:
                price = price_elem[0]
        if not price:
            store_prices = hxs.select(
                '//div[contains(@id, "store-")]//div[@class="price"]//text()'
            ).extract()
            price = '.'.join(
                re.findall(r'(\d+)', '.'.join(store_prices[:3])))
        if not price:
            # The old '0.00' fallback sat in an ``except`` that could never
            # fire, leaving price-less products with an empty price.
            price = '0.00'

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('category', product_data[u'Category'])
    product_loader.add_value('name', product_data[u'prod_name_en'])
    product_loader.add_value('sku', product_data[u'P_RollupKey'])
    product_loader.add_value('price', price.replace(',', ''))
    product_loader.add_value('identifier', product_data[u'P_UniqueKey'])
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', response.meta['brand'].strip().lower())
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    # the same as canadiantire.ca
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    try:
        part2 = product['sku']
    except KeyError:
        part2 = None

    if not part2:
        # Reviews are looked up by SKU; without one, emit the product as is.
        self.errors.append('WARNING: No sku in %s' % response.url)
        yield product
    else:
        reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=e6wzzmz844l2kk3v6v7igfl6i&apiversion=5.4&displaycode=2036-en_ca&resource.q2=reviews&filter.q2=isratingsonly%3Aeq%3Afalse&filter.q2=productid%3Aeq%3A' + part2
        yield Request(reviews_url,
                      meta=response.meta,
                      callback=self.parse_reviews)
def parse_(self, response):
    """Parse a product listing page for the brand in ``response.meta``.

    Yields a ``Request`` for the next listing page when there is one.
    For every product box whose title contains the brand and that shows a
    price, yields one ``Request`` to the product page per named option
    row.  A box without named option rows gets a single request for the
    base product, with price ``'0'`` when none is found.  The partially
    loaded product travels in ``meta['product']`` to
    ``self.visit_product_page``.
    """
    brand = response.meta['brand']
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    next_page = hxs.select('//div[@id="sub-nav-top"]//li[@class="next"]')
    if next_page:
        yield Request(urlparse.urljoin(
            base_url,
            next_page.select('.//a/@href').extract()[0]),
                      meta={'brand': brand})

    brand_upper = brand.upper()
    base_price_xpath = './/div[@class="price-container"]/span[@class="price"]/span/text()'

    for product_box in hxs.select('//div[@id="product-list"]/div'):
        titles = product_box.select(
            './/div[@class="product-info"]//h3/a/text()').extract()
        if not titles:
            # A box without a title used to raise IndexError and abort the
            # rest of the page.
            continue
        product_name = titles[0]
        if brand_upper not in product_name.upper():
            continue
        if not product_box.select('.//span[@class="price"]/text()').extract():
            continue

        product_url = product_box.select(
            './/div[@class="product-info"]//h3/a/@href').extract()[0]

        has_options = False
        for option_tr in product_box.select(
                './/table[@class="option-prices"]//tr'):
            option_names = option_tr.select(
                './/td[@class="option-name"]/text()').extract()
            if not option_names:
                continue
            has_options = True
            product_loader = ProductLoader(item=Product(),
                                           selector=option_tr)
            product_loader.add_value('url', product_url)
            product_loader.add_value(
                'name',
                product_name.strip() + " " + option_names[0].strip())
            product_loader.add_xpath(
                'price', './/td[starts-with(@class,"option-price")]/text()')
            product_loader.add_value('brand', brand.lower())
            product = product_loader.load_item()
            product['metadata'] = KeterMeta()
            product['metadata']['brand'] = brand
            yield Request(product_url,
                          callback=self.visit_product_page,
                          meta={'product': product})

        # The original ``for ... else`` had no ``break``, so its else branch
        # ran for every box and requested the base product even after the
        # option rows.  Request it only when no option was emitted.
        if not has_options:
            product_loader = ProductLoader(item=Product(),
                                           selector=product_box)
            product_loader.add_value('name', product_name)
            product_loader.add_value('url', product_url)
            product_loader.add_value('brand', brand.lower())
            if product_box.select(base_price_xpath).extract():
                product_loader.add_xpath('price', base_price_xpath)
            else:
                product_loader.add_value('price', '0')
            product = product_loader.load_item()
            product['metadata'] = KeterMeta()
            product['metadata']['brand'] = brand
            yield Request(product_url,
                          callback=self.visit_product_page,
                          meta={'product': product})
def parse_product(self, response):
    """Parse a Cabela's product page into one product per variant.

    Variants come from the product chart rows (first two and last row
    skipped).  Without chart rows, a single product is built from the page
    price, or one product per scripted ``labels`` entry when the page
    shows no price.  The category is replaced by the ``Class`` value from
    ``self.bushnell_products`` when the SKU is listed there.

    Yields one ``Request`` for the reviews feed with the product list in
    ``meta``.  The product id for the feed is taken from the page script,
    then the hidden ``productId`` input, then the URL.  Nothing is yielded
    when none of them provides an id.
    """
    hxs = HtmlXPathSelector(response)

    sku = hxs.select(u'//meta[@name="WT.pn_sku"]/@content').re(r'(\d+)')
    sku = sku[0] if sku else ''

    name = hxs.select(
        '//div[@id="productInfo"]/div/h1[@class="label"]/text()[normalize-space()]'
    )[0].extract().strip()

    category = hxs.select('//ul[@class="breadcrumb"]/li/a/text()').extract()
    if category:
        category = HTMLParser().unescape(category[-1])

    bushnell_product = self.bushnell_products.get(sku.upper().strip(), None)
    if bushnell_product:
        category = bushnell_product['Class']
        log.msg('Extracts category "%s" from bushnell file, URL: %s' %
                (category, response.url))

    image_url = hxs.select('//meta[@property="og:image"]/@content').extract()
    if image_url:
        image_url = image_url[0]

    def build_product(selector, identifier, product_name, price):
        # Shared by the three variant sources, which previously repeated
        # this code.
        loader = ProductLoader(item=Product(),
                               response=response,
                               selector=selector)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('name', product_name)
        loader.add_value('price', price)
        product = loader.load_item()
        metadata = KeterMeta()
        metadata['reviews'] = []
        metadata['brand'] = 'Bushnell'
        product['metadata'] = metadata
        return product

    sale_xpath = './/dl[@class="salePrice"]/dd[@class="saleprice"]/text()'
    range_xpath = './/div[@class="price"]/dl/dd[@class="nprange"]/text()'

    products = []
    for option in hxs.select('//div[@id="productChart"]/table//tr')[2:-1]:
        option_name = ' '.join(
            [x.strip() for x in option.select('.//td/text()').extract()
             if x.strip()])
        identifier = option.select(
            './/input[@type="hidden" and @name="productId"]/@value'
        )[0].extract().strip()
        variant_identifier = option.select(
            './/input[@type="hidden" and @name="productVariantId"]/@value'
        )[0].extract().strip()
        # The price lives in the last cell of the row.
        last_cell = option.select('.//td')[-1]
        price = last_cell.select(sale_xpath)
        if not price:
            price = last_cell.select(range_xpath)
        price = price[0].extract()
        products.append(build_product(
            option,
            '%s.%s' % (identifier, variant_identifier),
            '%s %s' % (name, option_name),
            price))

    if not products:
        identifier = hxs.select(
            './/input[@type="hidden" and @name="productId"]/@value'
        )[0].extract().strip()
        price = hxs.select(sale_xpath)
        if not price:
            price = hxs.select(range_xpath)
        if price:
            products.append(build_product(
                hxs, identifier, '%s' % name, price[0].extract()))
        else:
            options = re.findall(r'labels : \[(.*? Stock.)\]',
                                 response.body.replace('\n', ''))
            for i, option in enumerate(options):
                # ``option_value`` (quotes removed) was computed but never
                # used, so names were built from the still-quoted string.
                option_value = option.replace('\'', '')
                option_parts = option_value.split(' - ')
                option_price = option_parts[-2]
                option_name = ' - '.join(option_parts[:-2])
                products.append(build_product(
                    hxs,
                    '%s.%s' % (identifier, str(i)),
                    '%s %s' % (name, option_name),
                    option_price))

    # Resolve the review product id from script, form, then URL.  In the
    # original the last two lookups were unreachable: a missing script match
    # returned early and a successful ``groups()`` is always truthy.
    id_match = re.search(r'productId: "(\d+)', response.body)
    if id_match:
        prod_id = id_match.group(1)
    else:
        form_ids = hxs.select('//input[@name="productId"]/@value').extract()
        if form_ids:
            prod_id = form_ids[0]
        else:
            url_match = re.search(r'/(\d+)\.uts', response.url)
            if not url_match:
                return
            prod_id = url_match.group(1)

    reviews_url = (u'http://reviews.cabelas.com/8815/%s/reviews.djs'
                   u'?format=embeddedhtml&scrollToTop=true') % prod_id
    yield Request(reviews_url,
                  meta={'products': products,
                        'product_url': response.url,
                        'reviews_url': reviews_url},
                  callback=self.parse_review,
                  dont_filter=True)
def parse_product(self, response):
    """Parse a product page into one product per option.

    Non-HTML responses are ignored.  For each option block a product is
    built with the option's identifier suffix, name and price.  The sale
    price is preferred, then the regular option price, then ``u'0.00'``.

    For every option, yields a ``Request`` for the "View All" reviews page
    (product and the reviews shown on the product page in ``meta``) when
    that link exists, otherwise the product itself.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()').extract()[-1]
    identifier = hxs.select('//input[@id="prodID"]/@value').extract()[0]
    image_url = urljoin_rfc(
        base_url,
        hxs.select('//img[@class="Product-Main-Image"]/@src').extract()[0])
    name = hxs.select(
        u'//div[@id="product-options-container"]/h1/text()'
    ).extract()[0].strip()
    brand = response.meta['brand'].strip().lower()

    # The reviews link and the reviews shown on the product page are the
    # same for every option, so scrape them once instead of once per option.
    reviews_url = hxs.select(
        u'//div[@id="reviews-container"]//a[starts-with(text(), "View All")]/@href'
    ).extract()
    if reviews_url:
        reviews_url = urljoin_rfc(base_url, reviews_url[0])

    frontpage_reviews = []
    for review in hxs.select(u'//div[@class="review"]'):
        rating = review.select(u'.//img/@alt')[0].extract()
        frontpage_reviews.append({
            'rating': re.search(u'(\d) out of', rating).group(1),
            'full_text': review.select(
                u'./p[1]/text()')[0].extract().strip(),
        })

    for option in hxs.select(u'//div[@class="product-options"]/div'):
        option_name = option.select(u'.//label/text()').extract()[0].strip()
        option_id = option.select(u'input[@name="optionId"]/@value').extract()
        if option_id:
            option_identifier = identifier + '-' + option_id[0]
        else:
            option_identifier = identifier

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('identifier', option_identifier)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('name', u'%s %s' % (name, option_name))

        # Prefer the sale price.  A sale text without a pound amount used
        # to raise an uncaught AttributeError; now it falls through.
        price = None
        sale_text = option.select(
            u'.//div[@class="price-container sale"]/span[@class="now-price price-history-link"]/text()'
        ).extract()
        if sale_text:
            sale_match = re.search(u'\xa3([\d\.,]+)', sale_text[0])
            if sale_match:
                price = [sale_match.group(1)]
        if not price:
            price = option.select(
                u'.//div[@class="price-container"]/span[@class="option-price"]/text()'
            ).extract()
        loader.add_value('price', price[0] if price else u'0.00')

        loader.add_value('brand', brand)
        product = loader.load_item()

        metadata = KeterMeta()
        metadata['brand'] = brand
        metadata['reviews'] = []
        product['metadata'] = metadata

        if reviews_url:
            # Give each request its own copy of the review dicts, as
            # before, when they were rebuilt for every option.
            yield Request(reviews_url,
                          meta={'frontpage_reviews':
                                [dict(r) for r in frontpage_reviews],
                                'product': product},
                          callback=self.parse_review,
                          dont_filter=True)
        else:
            yield product
def parse_product(self, response):
    """Parse a product page, attach reviews and request the shipping cost.

    The product is stored in ``response.meta['product']`` and
    ``self.parse_review(response)`` is iterated.  For every item it
    produces, yields a ``Request`` to the add-to-basket URL found in a
    button's ``onclick`` (handled by ``self.parse_shipping_cost_url``,
    with a copy of the item and a cookie jar key in ``meta``).  When the
    page has no such button, the item itself is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_id = hxs.select('//td[@itemprop="sku"]/@content')[0].extract()
    brand = response.meta.get('brand') or ''

    product_loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select('//div[@id="titre"]/h1/text()').extract()
    product_loader.add_value('name', name[0].strip())
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', brand)
    product_loader.add_value('identifier', product_id)
    product_loader.add_value('sku', product_id)

    image_url = hxs.select('//img[@id="visuelprincipal"]/@src').extract()
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))

    price = hxs.select(
        '//div[@class="bloc-price-fiche"]/p/span[@class="price"]/text()'
    ).extract()
    if price:
        price = price[0]
    else:
        price = ''.join(
            hxs.select(
                '//div[@class="bloc-price-fiche"]/span[@class="price"]//text()'
            ).extract())
    # Prices use a decimal comma on this site.
    product_loader.add_value('price',
                             price.replace(',', '.') if price else '0.00')

    in_stock = hxs.select(
        '//div[@class="postitdelais"]/span[contains(@class,"postitfond-dom1") or contains(@class,"postitfond-2h")]'
    ).extract()
    if not in_stock:
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['reviews'] = []
    metadata['brand'] = brand
    product['metadata'] = metadata
    response.meta['product'] = product

    shipping_cost = hxs.select('//button/@onclick').re(
        r"ajoutPanier\('(.*)'\)")

    # The shipping URL and cookie jar key do not depend on the reviewed
    # item, so resolve them once instead of on every iteration.
    shipping_url = None
    cookiejar = None
    if shipping_cost:
        shipping_url = urljoin_rfc(base_url, shipping_cost[0])
        jar_match = re.search('product/(.*?)/', shipping_cost[0])
        # An unmatched URL used to raise AttributeError; fall back to the
        # page's product id so the request still gets a per-product jar.
        cookiejar = jar_match.group(1) if jar_match else product_id

    for reviewed in self.parse_review(response):
        if shipping_cost:
            yield Request(shipping_url,
                          callback=self.parse_shipping_cost_url,
                          meta={
                              'product': reviewed.copy(),
                              'cookiejar': cookiejar
                          },
                          # Bind the current item as a default so each
                          # errback reports its own product.
                          errback=lambda failure, product=reviewed: self.
                          parse_error(failure, product),
                          dont_filter=True)
        else:
            yield reviewed
def parse_product(self, response):
    """Parse a product page into one product per variant.

    Each variant block under ``fancy-options-variants`` becomes a product
    named after the variant.  A page without variants becomes a single
    product named after the page title.  The category is replaced by the
    ``Class`` value from ``self.bushnell_products`` when the page's
    manufacturer part number is listed there.

    Yields the products directly when the page is marked as not rated or
    has no "all reviews" link.  Otherwise yields one ``Request`` for the
    reviews page with the product list and product URL in ``meta``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('//div[@class="item"]/h1/text()')[0].extract().strip()

    category = hxs.select(
        '//span[@class="breadcrumbs"]/span/a/span/text()').extract()
    if category:
        category = category[-1].strip()

    image_url = hxs.select(
        '//div[@class="gallery-image"]/img/@data-src').extract()
    if image_url:
        image_url = image_url[0]

    brand = response.xpath(
        '//meta[@itemprop="brand manufacturer"]/@content').extract()

    # The part number is read from the whole page, so it and the category
    # override are identical for every variant; resolve them once.
    sku = hxs.select('.//div[@itemprop="mpn"]/span/text()').extract()
    if sku:
        bushnell_product = self.bushnell_products.get(
            sku[0].upper().strip(), None)
        if bushnell_product:
            category = bushnell_product['Class']
            log.msg('Extracts category "%s" from bushnell file, URL: %s' %
                    (category, response.url))

    def build_product(selector, product_name):
        # Shared by the variant and single-product paths, which previously
        # duplicated this code.
        loader = ProductLoader(item=Product(),
                               response=response,
                               selector=selector)
        identifier = selector.select(
            './/@data-variant-id')[0].extract().strip()
        loader.add_value('identifier', identifier)
        loader.add_value('brand', brand)
        if sku:
            loader.add_value('sku', sku[0])
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('name', product_name)
        price = selector.select(
            './/span[@class="variant-price"]/@content')[0].extract()
        loader.add_value('price', price)
        product = loader.load_item()
        metadata = KeterMeta()
        metadata['reviews'] = []
        metadata['brand'] = brand[0] if brand else ''
        product['metadata'] = metadata
        return product

    options = hxs.select(
        '//div[@id="fancy-options-variants"]/div[@class!="e-filtration-result-empty"]'
    )
    if options:
        products = [
            build_product(
                option, option.select('.//@data-variant-name')[0].extract())
            for option in options
        ]
    else:
        products = [build_product(hxs, name)]

    not_rated = hxs.select(
        u'//span[@id="product-social-header-ratings-text"]/span[@id="product-social-header-review-not-rated"]'
    )

    # Prefer the direct-child link, then any nested one.
    reviews_url = []
    if not not_rated:
        reviews_url = hxs.select(
            u'//div[@id="product-customer-reviews"]/span[@class="all-reviews"]/a/@href'
        ).extract()
        if not reviews_url:
            reviews_url = hxs.select(
                u'//div[@id="product-customer-reviews"]//span[@class="all-reviews"]/a/@href'
            ).extract()

    if not reviews_url:
        # Not rated, or no reviews link at all.  The latter used to end in
        # an IndexError inside a bare ``except`` and lose the products.
        for product in products:
            yield product
        return

    yield Request(urljoin_rfc(base_url, reviews_url[0]),
                  meta={'products': products, 'product_url': response.url},
                  callback=self.parse_review)