def parse_review(self, response):
    """Collect the reviews on one page and follow the "next page" link.

    Every ``div.pr-review-wrap`` node is turned into a ``Review`` and
    appended to ``product['metadata']['reviews']``, where ``product`` is
    read from ``response.meta``.  If the page links to another review
    page, a request for it (same meta, same callback) is yielded;
    otherwise the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    for review in hxs.select(u'//div[@class="pr-review-wrap"]'):
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%m/%d/%Y')
        loader.add_xpath(
            'date',
            u'.//div[contains(@class, "pr-review-author-date")]/text()')

        comments = review.select(
            u'.//p[@class="pr-comments"]/text()').extract()
        # A review without comment text should not abort the whole page.
        comments = comments[0] if comments else u''

        bottom_line = review.select(
            u'.//div[@class="pr-review-bottom-line-wrapper"]/p/text()[2]'
        ).extract()
        bottom_line = bottom_line[0] if bottom_line else ''

        # These XPaths are relative (".//"), so they must be evaluated on
        # the current review node.  Evaluating them on the page selector
        # returned the tags of *every* review on the page.
        pros = review.select(
            './/div[contains(@class,"pr-attribute-pros")]//li/text()'
        ).extract()
        cons = review.select(
            './/div[contains(@class,"pr-attribute-cons")]//li/text()'
        ).extract()
        best_uses = review.select(
            './/div[contains(@class,"pr-attribute-bestuses")]//li/text()'
        ).extract()

        loader.add_value(
            'full_text',
            u'%s\nBottom Line: %s\nPros: %s\nCons: %s\nBest Uses: %s\n' %
            (comments, bottom_line, u', '.join(pros), u', '.join(cons),
             u', '.join(best_uses)))

        rating_text = review.select(
            u'.//span[contains(@class,"pr-rating")]/text()').extract()[0]
        loader.add_value('rating', int(float(rating_text)))
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    next_url = hxs.select(
        u'//span[@class="pr-page-next"]/a/@href').extract()
    if next_url:
        yield Request(next_url[0],
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
def create_review_loader(self, response, data):
    """Build and return a ``Review`` item from the entries of ``data``.

    Only the keys 'url', 'date', 'rating', 'product_url', 'sku' and
    'full_text' are copied into the loader; every other key is ignored.
    The loader is created with the ``%d/%m/%Y`` date format.
    """
    accepted = ('url', 'date', 'rating', 'product_url', 'sku', 'full_text')
    review_loader = ReviewLoader(item=Review(),
                                 response=response,
                                 date_format=u'%d/%m/%Y')
    for name, content in data.items():
        if name not in accepted:
            continue
        review_loader.add_value(name, content)
    return review_loader.load_item()
def process_product_reviews(self, response):
    """Scrape the reviews of one review page and walk to the next one.

    Reviews are appended to ``product['metadata']['reviews']`` (product
    taken from ``response.meta``).  The set of already visited review
    URLs travels in ``response.meta['visited_reviews']``; the first
    unvisited "productreviews" link found in the pricing box is requested
    and the generator stops there.  When no such link exists the product
    is yielded.
    """
    hxs = HtmlXPathSelector(response)
    seen_urls = response.meta.get('visited_reviews', set())
    product = response.meta['product']
    seen_urls.add(response.url)
    base_url = get_base_url(response)

    rows = hxs.select('//div[@class="boxproductinfo"]//table//tr')
    for row in rows:
        loader = ReviewLoader(item=Review(),
                              selector=hxs,
                              date_format='%d/%m/%Y')
        span_texts = row.select("./td/div[1]//p//span/text()").extract()
        # The second span text is the one searched for a date.
        found_dates = review_date_format.findall(span_texts[1])
        if found_dates:
            loader.add_value('date', found_dates[0])

        body = row.select("./td/div[2]/text()").extract()[0]
        loader.add_value('full_text', body.strip('" \r\n"'))
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    links = hxs.select('//table[@class="pricingbox"]//a/@href').extract()
    for link in links:
        next_page = urlparse.urljoin(base_url, link)
        if "productreviews" not in link or next_page in seen_urls:
            continue
        yield Request(next_page,
                      callback=self.process_product_reviews,
                      meta={
                          'product': product,
                          'visited_reviews': seen_urls
                      })
        # Only one follow-up page is requested per response.
        return

    yield product
def parse_review(self, response, product=None):
    """Scrape ``div.pr-review-main`` reviews and follow pagination.

    ``product`` defaults to ``response.meta['product']`` when it is not
    passed (or is falsy).  Each review is appended to
    ``product['metadata']['reviews']``; its rating is the number of
    matching star ``<span>`` elements.  A request for the next page is
    yielded when a next link other than "#" exists, otherwise the product
    is yielded.
    """
    hxs = HtmlXPathSelector(response)
    if not product:
        product = response.meta['product']

    stars_xpath = ('.//div[@class="pr-stars pr-stars-small"]'
                   '/span[contains(@class, "pr-star")]')

    for node in hxs.select('//div[@class="pr-review-main"]'):
        loader = ReviewLoader(item=Review(),
                              selector=node,
                              date_format=u'%Y-%m-%d')
        loader.add_xpath('date', './/span[@itemprop="dtreviewed"]/@datetime')
        # Title first, then the comment body, both into ``full_text``.
        loader.add_xpath('full_text',
                         './/div[@class="pr-review-infos-title"]/text()')
        loader.add_xpath('full_text', './/div[@class="pr-comments"]/text()')
        loader.add_value('product_url', product['url'])
        loader.add_value('url', product['url'])
        loader.add_value('sku', product['sku'])

        star_nodes = node.select(stars_xpath).extract()
        loader.add_value('rating', len(star_nodes))
        product['metadata']['reviews'].append(loader.load_item())

    next_href = hxs.select(
        '//span[@class="pr-page-next"]/a[@href!="#"]/@href').extract()
    if next_href:
        yield Request(urljoin_rfc(get_base_url(response), next_href[0]),
                      callback=self.parse_review,
                      meta=response.meta)
    else:
        yield product
def parse_review(self, response):
    """Scrape the reviews listed under ``ul.liste-avis`` and yield the product.

    Each ``<li>`` becomes a ``Review`` appended to
    ``product['metadata']['reviews']`` (product from ``response.meta``).
    The date is read as ``dd/mm/yy`` from the first eight characters of
    the bold link text and re-emitted as ``mm/dd/YYYY``.  The rating is
    the last character of the ``note-produit note-produit-*`` class
    attribute.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']
    rating_xpath = (
        './/a[starts-with(@class, "note-produit note-produit-")]/@class')

    for review in hxs.select('//ul[@class="liste-avis"]/li'):
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%m/%d/%Y')

        raw_date = ''.join(
            review.select('.//a[@href="#"]/strong/text()').extract())
        review_date = datetime.datetime.strptime(raw_date[:8],
                                                 "%d/%m/%y").date()
        loader.add_value('date', review_date.strftime("%m/%d/%Y"))

        loader.add_value(
            'full_text',
            normalize_space(' '.join(review.select('.//text()').extract())))
        loader.add_value('product_url', product['url'])
        loader.add_value('url', product['url'])
        loader.add_value('sku', product['sku'])

        # Read the rating from the current review first.  Querying the
        # page selector gave every review the first rating on the page.
        # The page-level lookup is kept only as a fallback for markup
        # that carries no per-review rating element.
        rating_classes = (review.select(rating_xpath).extract()
                          or hxs.select(rating_xpath).extract())
        loader.add_value('rating', rating_classes[0][-1])

        product['metadata']['reviews'].append(loader.load_item())

    yield product
def parse_review(self, response):
    """Scrape ``tr.singlereview`` rows and follow the "Next" pagination link.

    Reviews are stored as plain ``Review`` items in
    ``product['metadata']['reviews']`` (product from ``response.meta``).
    The first two ``/``-separated date components are swapped before the
    date is stored.  When a "Next" link exists, the AJAX review endpoint
    is requested with the product sku and the ``curpage`` value taken
    from that link; otherwise the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    for review in hxs.select(u'//tr[@class="singlereview"]'):
        item = Review()

        date = review.select(
            u'.//div[contains(@class,"ltbodytext")]/text()').extract()[0]
        date = date.split('/')
        item['date'] = date[1] + '/' + date[0] + '/' + date[2]

        title = review.select(u'.//p[@class="subtitle"]/text()').extract()
        title = title[0] if title else ''
        text = review.select(
            u'.//div[@class="bodytext"]/p/text()').extract()
        text = text[0] if text else ''
        item['full_text'] = title + '\n' + text

        item['rating'] = int(float(review.select(
            u'.//div[contains(@class,"rating_avg_sm")]/text()'
        ).extract()[0]))
        item['url'] = response.url
        product['metadata']['reviews'].append(item)

    next_url = hxs.select(
        u'//div[contains(@class,"pagination")]'
        u'/a[contains(text(),"Next")]/@href').extract()
    # Diagnostic only: this used to be emitted at ERROR level on every
    # page, which flooded the error log with non-errors.
    logging.debug('next review page link: %s', next_url)

    if next_url:
        yield Request(
            'http://www.wayfair.com/ajax/view_reviews_action.php?prsku=%s&rvpg=%s&rvso=0'
            % (product['sku'], next_url[0].split('curpage=')[1]),
            meta=response.meta,
            callback=self.parse_review)
    else:
        yield product
def parse_review_page(self, response):
    """Scrape ``BVRRReviewDisplayStyle5`` reviews from the extracted HTML.

    The markup comes from ``self._extract_html(response)``.  Each review
    is appended to the ``metadata.reviews`` list of the product found in
    ``response.meta``.  Dates are parsed as ``%d %B %Y`` and stored as
    ``%m/%d/%Y``.  If a next-page reference (``data-bvjsref``) exists it
    is requested, otherwise the product is yielded.
    """
    product = response.meta.get('product', '')
    hxs = HtmlXPathSelector(text=self._extract_html(response))

    for node in hxs.select('//div[@class="BVRRReviewDisplayStyle5"]'):
        loader = ReviewLoader(item=Review(),
                              response=response,
                              date_format='%m/%d/%Y')

        rating = node.select(
            ".//span[contains(@class,'BVRRRatingNumber')]/text()"
        ).extract()[0]
        raw_date = node.select(
            ".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()"
        ).extract()[0]
        # The second matching text node is used as the review body.
        body = node.select(
            ".//span[contains(@class,'BVRRReviewText')]/text()"
        )[1].extract()

        loader.add_value('rating', rating)
        loader.add_value('url', response.url)
        parsed_date = datetime.strptime(raw_date, '%d %B %Y')
        loader.add_value('date', parsed_date.strftime('%m/%d/%Y'))
        loader.add_value('full_text', body)
        product['metadata']['reviews'].append(loader.load_item())

    next_ref = hxs.select(
        '//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref'
    ).extract()
    if not next_ref:
        yield product
    else:
        yield Request(next_ref[0],
                      callback=self.parse_review_page,
                      meta={'product': product})
def parse_review(self, response):
    """Scrape ``BVRRDisplayContentReviewID`` blocks and paginate.

    With no review blocks on the page the product (from
    ``response.meta``) is yielded immediately.  Otherwise each review is
    appended to ``product['metadata']['reviews']`` with a title (default
    ``Untitled``), the summary and content text, and the rating.  A
    "Next page" ``data-bvjsref`` link is then followed with
    ``dont_filter=True``; without one the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    reviews = hxs.select(
        u'//div[contains(@id,"BVRRDisplayContentReviewID")]')
    product = response.meta['product']
    if not reviews:
        yield product
        return

    for node in reviews:
        loader = ReviewLoader(item=Review(),
                              selector=node,
                              date_format=u'%m/%d/%Y')

        date_titles = node.select(
            u'.//span[contains(@class, "BVRRReviewDate")]/span[@class="value-title"]/@title'
        ).extract()
        if date_titles:
            # The last extracted value is the ISO date; convert to m/d/Y.
            parsed = time.strptime(date_titles.pop(), u'%Y-%m-%d')
            loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

        titles = node.select(
            u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
        title = titles[0] if titles else u'Untitled'

        summary_lines = node.select(
            './/div[@class="BVRRReviewDisplayStyle3Summary"]//text()[normalize-space()]'
        ).extract()
        content_lines = node.select(
            u'.//div[@class="BVRRReviewDisplayStyle3Content"]//text()[normalize-space()]'
        ).extract()
        text = '\n'.join(summary_lines) + '\n' + '\n'.join(content_lines)

        loader.add_value('full_text', u'%s\n%s' % (title, text))
        loader.add_value('product_url', product['url'])
        loader.add_value('url', product['url'])
        loader.add_value('sku', product.get('sku') or '')
        loader.add_xpath(
            'rating',
            u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()'
        )
        product['metadata']['reviews'].append(loader.load_item())

    next_page = hxs.select(
        u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      meta=response.meta,
                      callback=self.parse_review,
                      dont_filter=True)
    else:
        yield product
def parse_review(self, response):
    """Scrape ``BVRRDisplayContentReviewID_*`` reviews from the page.

    Each review is appended to ``product['metadata']['reviews']``
    (product from ``response.meta``).  The full text is the optional
    title, the review text, and optional "Pros:" / "Cons:" lines.  The
    rating is the first word of the rating image title.  A next-page
    ``data-bvjsref`` is requested with ``self.parse_review_js`` as the
    callback; without one the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    blocks = hxs.select(
        u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
    for block in blocks:
        loader = ReviewLoader(item=Review(),
                              selector=block,
                              date_format="%B %d, %Y")

        date_texts = block.select(
            u'.//span[contains(@class,"BVRRReviewDate")]/text()').extract()
        # The second text node carries the date.
        loader.add_value('date', date_texts[1])

        titles = block.select(
            u'.//span[contains(@class,"BVRRCustomFullTitle")]/text()'
        ).extract()
        body = ' '.join(
            block.select(
                u'.//span[contains(@class,"BVRRReviewText")]/text()'
            ).extract())
        full_text = titles[0] + '\n' + body if titles else body

        pros = block.select(
            u'.//span[contains(@class,"BVRRReviewProTags")]/span/text()'
        ).extract()
        cons = block.select(
            u'.//span[contains(@class,"BVRRReviewConTags")]/span/text()'
        ).extract()
        for label, tags in (('\nPros: ', pros), ('\nCons: ', cons)):
            if tags:
                full_text += label + ', '.join(tags)
        loader.add_value('full_text', full_text)

        rating_title = block.select(
            u'.//img[@class="BVImgOrSprite"]/@title').extract()[0]
        loader.add_value('rating', rating_title.split()[0])
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    next_url = hxs.select(
        u'//a[contains(@name,"BV_TrackingTag_Review_Display_NextPage")]/@data-bvjsref'
    ).extract()
    if not next_url:
        yield product
    else:
        # NOTE(review): follow-up pages go to parse_review_js, not to this
        # method -- presumably they are JS responses; confirm in the spider.
        yield Request(next_url[0],
                      meta=response.meta,
                      callback=self.parse_review_js)
def parse_review(self, response):
    """Scrape ``BVRRDisplayContentReviewID_*`` reviews, then fetch the price.

    Each review is appended to ``product['metadata']['reviews']``
    (product from ``response.meta``).  While a next-page link exists the
    next review page is requested with this method as callback.  After
    the last page a regional-price request is issued for the last path
    segment of the product URL, handled by ``self.parse_price``.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    blocks = hxs.select(
        u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
    for block in blocks:
        loader = ReviewLoader(item=Review(),
                              selector=block,
                              date_format="%B %d, %Y")
        loader.add_xpath(
            'date', u'.//span[contains(@class,"BVRRReviewDate")]/text()')

        titles = block.select(
            u'.//span[contains(@class,"BVRRReviewTitle")]/text()').extract()
        body = ' '.join(
            block.select(
                u'.//span[contains(@class,"BVRRReviewText")]/text()'
            ).extract())
        full_text = titles[0] + '\n' + body if titles else body

        pros = block.select(
            u'.//span[contains(@class,"BVRRReviewProTags")]/span/text()'
        ).extract()
        cons = block.select(
            u'.//span[contains(@class,"BVRRReviewConTags")]/span/text()'
        ).extract()
        for label, tags in (('\nPros: ', pros), ('\nCons: ', cons)):
            if tags:
                full_text += label + ', '.join(tags)

        loader.add_value('full_text', full_text)
        loader.add_xpath(
            'rating', u'.//span[contains(@class,"BVRRRatingNumber")]/text()')
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    next_url = hxs.select(
        u'//div[contains(@class,"BVRRNextPage")]/a/@href').extract()
    if next_url:
        yield Request(next_url[0],
                      meta=response.meta,
                      callback=self.parse_review)
        return

    # No further review pages: continue with the price lookup.
    product_code = product['url'].split('/')[-1]
    price_url = 'http://www.homedepot.ca/async-fetch-regional-price?storeId=9999&pnList=' + product_code
    yield Request(price_url, meta=response.meta, callback=self.parse_price)
def parse_review(self, response):
    """Scrape ``review-item`` blocks and request further pages by form post.

    ``response.meta['products']`` is a list of products; the reviews are
    attached to the first one, and all products are yielded once there
    is nothing more to fetch.  While an enabled "next" button is present,
    the next page is requested with a ``FormRequest`` carrying the page
    identifier and the incremented page number (tracked in
    ``meta['current_page']``).
    """
    hxs = HtmlXPathSelector(response)
    reviews = hxs.select(u'//div[contains(@class, "review-item")]')
    products = response.meta['products']

    # Without a product there is nothing to attach reviews to.  The old
    # code guarded the sku lookup for this case but then crashed on
    # ``products[0]`` when appending.
    if not reviews or not products:
        for product in products:
            yield product
        return

    target = products[0]
    sku = target.get('sku') or ''

    for review in reviews:
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%m/%d/%Y')
        review_id = review.select('@data-review-id').extract()[0]
        loader.add_value('review_id', review_id)

        date = review.select(u'.//div[@class="item-author"]//text()').re(
            r'Written on (.*)')[0].strip()
        date = time.strptime(date, u'%B %d, %Y')
        loader.add_value('date', time.strftime(u'%m/%d/%Y', date))

        title = review.select(u'./h2/a/text()').extract()
        title = title[0] if title else u'Untitled'

        text = ' '.join([
            s.strip().replace('\n', '')
            for s in review.select(
                u'.//div[@class="item-text"]//text()').extract()
            if s.strip()
        ])
        text = re.sub(' {2,}', ' ', text)
        loader.add_value('full_text', u'%s\n%s' % (title, text))

        loader.add_value('product_url', response.meta['product_url'])
        loader.add_value('url', response.url)
        loader.add_value('sku', sku)

        rating = review.select(
            u'./div[@class="item-rating"]/div[contains(@class, "stars")]/div/@style'
        ).re(r'width: (\d+)%;')[0]
        # The star bar width is a percentage; 20% per star.  ``//`` keeps
        # the integer result that ``/`` only gives on Python 2.
        loader.add_value('rating', int(rating) // 20)

        target['metadata']['reviews'].append(loader.load_item())

    has_next = hxs.select(
        '//div[contains(@class, "next-button") and not(contains(@class, "disabled"))]'
    )
    if has_next:
        identifier = hxs.select(
            '//input[@name="identifier"]/@value').extract()[0]
        next_page = int(response.meta.get('current_page', 1)) + 1
        meta = response.meta.copy()
        meta['current_page'] = next_page
        yield FormRequest(
            response.url.split('?')[0],
            headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01'
            },
            formdata={
                'identifier': identifier,
                'page': str(next_page),
                'page_size': '10',
                'sort': 'newest'
            },
            callback=self.parse_review,
            meta=meta)
    else:
        for product in products:
            yield product
def parse_reviews(self, response):
    """Parse one JSONP batch of reviews and request the next batch.

    The response body is a JSONP call wrapping a JSON document whose
    reviews live under ``BatchedResults.q0.Results``.  Each review is
    appended to ``product['metadata']['reviews']`` (``product``, ``sku``
    and ``offset`` come from ``response.meta``).  A full batch of 100
    reviews triggers a request for the next offset; otherwise the
    product is yielded if it has a truthy ``price``.
    """
    product = response.meta['product']
    sku = response.meta['sku']

    # Strip the JSONP wrapper ``callback( ... )``: keep what sits between
    # the first "(" and the last ")".  The previous approach replaced
    # every "})" in the body, which also altered review text containing
    # that sequence.
    payload = response.body.strip().partition('(')[-1].rpartition(')')[0]
    json_body = json.loads(payload)
    reviews = json_body['BatchedResults']['q0']['Results']

    for review in reviews:
        review_loader = ReviewLoader(item=Review(),
                                     response=response,
                                     date_format="%B %d, %Y")
        review_date = datetime.datetime.strptime(
            review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
        review_loader.add_value('date', review_date.strftime("%B %d, %Y"))

        title = review['Title']
        # A null ReviewText must not break the string concatenation.
        text = review['ReviewText'] or u''
        # ``title`` is a plain string here; indexing it with [0] kept
        # only its first character.
        full_text = title + '\n' + text if title else text

        pros = review['Pros']
        cons = review['Cons']
        if pros:
            full_text += '\nPros: ' + ', '.join(pros)
        if cons:
            full_text += '\nCons: ' + ', '.join(cons)

        review_loader.add_value('full_text', full_text)
        review_loader.add_value('rating', review['Rating'])
        review_loader.add_value('url', product['url'])
        product['metadata']['reviews'].append(review_loader.load_item())

    if len(reviews) == 100:
        # A full batch: there may be more reviews at the next offset.
        offset = response.meta['offset'] + 100
        next_reviews = (
            'http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.4&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A'
            + sku +
            '&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=rating%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=100&offset.q0='
            + str(offset) + '&limit_comments.q0=3&callback=bv182_28795')
        yield Request(next_reviews,
                      meta={
                          'product': product,
                          'offset': offset,
                          'sku': sku
                      },
                      callback=self.parse_reviews)
    elif product['price']:
        yield product
def parse_review(self, response):
    """Scrape reviews embedded in the ``var materials={...}`` JS blob.

    The escaped HTML fragment is cut out of the response body,
    unescaped and parsed.  Reviews are attached to the first entry of
    ``response.meta['products']``.  All products are yielded when the
    blob is missing, holds no reviews, or there is no "Next page" link;
    otherwise the next page is requested with ``dont_filter=True``.
    """
    products = response.meta['products']

    hxs = None
    reviews = []
    found = re.search('var materials={.*?(<div.*?)"},.initializers',
                      response.body, re.DOTALL)
    if found:
        html = re.sub(r'\\n', r'\n', found.group(1))
        html = re.sub(r'\\(.)', r'\1', html)
        hxs = HtmlXPathSelector(text=html)
        reviews = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')

    # A missing blob used to raise AttributeError and drop the products.
    # Treat it like a page without reviews.  Also stop when there is no
    # product to attach reviews to (``products[0]`` would fail).
    if not reviews or not products:
        for product in products:
            yield product
        return

    target = products[0]
    sku = target.get('sku') or ''

    for review in reviews:
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%m/%d/%Y')

        date = review.select(
            u'.//span[@class="BVRRValue BVRRReviewDate"]/text()'
        ).extract()[0]
        date = time.strptime(date, u'%B %d, %Y')
        loader.add_value('date', time.strftime(u'%m/%d/%Y', date))

        title = review.select(
            u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
        title = title[0] if title else u'Untitled'
        text = review.select(
            u'.//span[@class="BVRRReviewText"]/text()').extract()
        text = text[0] if text else u'No text supplied.'
        loader.add_value('full_text', u'%s\n%s' % (title, text))

        loader.add_value('product_url', response.meta['product_url'])
        review_id = review.select('@id').re(r'ReviewID_(\d+)$')[0]
        loader.add_value('review_id', review_id)
        loader.add_value('url', response.meta['product_url'])
        loader.add_value('sku', sku)
        loader.add_xpath(
            'rating',
            u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()'
        )
        target['metadata']['reviews'].append(loader.load_item())

    next_page = hxs.select(
        u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      meta=response.meta,
                      callback=self.parse_review,
                      dont_filter=True)
    else:
        for product in products:
            yield product
def parse_review(self, response):
    """Scrape reviews embedded in the ``var materials={...}`` JS blob.

    Reviews are appended to ``product['metadata']['reviews']`` (product
    from ``response.meta``).  Pagination uses the selected page number
    and the highest linked page number, cached in
    ``response.meta['last_page']``; the next page URL is
    ``meta['reviews_url']`` plus ``&page=N``.  The product is yielded
    when the blob is missing, holds no reviews, shows no current page,
    or the last page was reached.
    """
    product = response.meta['product']

    hxs = None
    reviews = []
    found = re.search('var materials={.*?(<div.*?)"},.initializers',
                      response.body, re.DOTALL)
    if found:
        html = re.sub(r'\\n', r'\n', found.group(1))
        html = re.sub(r'\\(.)', r'\1', html)
        hxs = HtmlXPathSelector(text=html)
        reviews = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')

    # A missing blob used to raise AttributeError and drop the product.
    # Treat it like a page without reviews.
    if not reviews:
        yield product
        return

    for review in reviews:
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%d/%m/%Y')

        date = review.select(
            u'.//span[@class="BVRRValue BVRRReviewDate"]/text()'
        ).extract()[0]
        date = time.strptime(date, u'%B %d, %Y')
        loader.add_value('date', time.strftime(u'%d/%m/%Y', date))

        title = review.select(
            u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
        title = title[0] if title else u'Untitled'
        text = review.select(
            u'.//span[@class="BVRRReviewText"]/text()').extract()
        text = text[0] if text else u'No text supplied.'
        loader.add_value('full_text', u'%s\n%s' % (title, text))

        loader.add_value('url', response.meta['product_url'])
        loader.add_xpath(
            'rating',
            u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()'
        )
        product['metadata']['reviews'].append(loader.load_item())

    cur_page = hxs.select(
        u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()'
    ).extract()
    if not cur_page:
        yield product
        return
    cur_page = int(cur_page[0])

    if 'last_page' not in response.meta:
        page_links = hxs.select(
            u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()'
        ).extract()
        # With no other page links the current page is the last one;
        # indexing the empty list used to raise IndexError here.
        response.meta['last_page'] = (int(page_links[-1])
                                      if page_links else cur_page)

    if cur_page < response.meta['last_page']:
        url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page + 1)
        yield Request(url, meta=response.meta, callback=self.parse_review)
    else:
        yield product
def parse_review(self, response):
    """Scrape reviews from the ``productReviews`` table and paginate.

    With no review blocks the product (from ``response.meta``) is
    yielded at once.  Otherwise each review gets a date (tried against
    three formats and stored as ``%d/%m/%Y``), an integer rating taken
    from the "N out of 5 stars" text, and a title plus body.  The "Next"
    link is located with BeautifulSoup; if its parent has an ``href`` the
    next page is requested, otherwise the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    product = response.meta['product']

    reviews = hxs.select(
        u'//table[@id="productReviews"]//div[@style="margin-left:0.5em;"]')
    if not reviews:
        yield product
        return

    known_formats = (u'%B %d, %Y', u'%d %b %Y', u'%d %B %Y')
    for node in reviews:
        loader = ReviewLoader(item=Review(),
                              selector=hxs,
                              date_format=u'%d/%m/%Y')

        raw_date = node.select(u'.//nobr/text()')[0].extract()
        parsed = None
        for fmt in known_formats:
            try:
                parsed = time.strptime(raw_date, fmt)
            except ValueError:
                continue
            if parsed:
                break
        loader.add_value('date', time.strftime(u'%d/%m/%Y', parsed))

        stars = node.select(u'.//text()').re(
            u'([\d\.]+) out of 5 stars')[0]
        loader.add_value('rating', int(float(stars)))
        loader.add_value('url', response.url)

        title = node.select(u'.//b/text()')[0].extract()
        fragments = node.select(u'div[@class="reviewText"]/text()').extract()
        body = ''.join([piece.strip() for piece in fragments])
        loader.add_value('full_text', u'%s\n%s' % (title, body))

        product['metadata']['reviews'].append(loader.load_item())

    next_link = soup.find('a', text=re.compile('Next'))
    if not (next_link and next_link.parent.get('href')):
        yield product
    else:
        yield Request(urljoin_rfc(base_url, next_link.parent['href']),
                      meta=response.meta,
                      callback=self.parse_review)
def parse_review(self, response):
    """Parse a JavaScript review payload and request the following page.

    The body is expected to end with ``= <data>;``; ``<data>`` is decoded
    with ``demjson`` and the ``r`` entry of each element is one review.
    Reviews are appended to ``product['metadata']['reviews']`` (product
    from ``response.meta``).  The product is yielded, and pagination
    stops, when the status is not 200, the payload is missing, or it
    decodes to no reviews.  Otherwise ``meta['cur_page']`` is
    incremented and ``meta['reviews_url'] % page`` is requested.
    """
    payload = re.search(u'= (.*);$', response.body, re.DOTALL)
    product = response.meta['product']
    if response.status != 200 or not payload:
        yield product
        return

    reviews = [entry.get('r') for entry in demjson.decode(payload.group(1))]
    if not reviews:
        # An empty page means there is nothing further to fetch.  Without
        # this check the spider kept requesting page after page.
        yield product
        return

    extra_fields = [u'Pros', u'Cons', u'Best Uses']
    for review in reviews:
        # NOTE(review): the review dict is passed as ``selector``; only
        # add_value() is used below, so it is never queried.
        loader = ReviewLoader(item=Review(),
                              selector=review,
                              date_format=u'%m/%d/%Y')
        loader.add_value('review_id', review['id'])

        date_review = datetime.strptime(review.get('d'), "%m/%d/%Y").date()
        loader.add_value('date', date_review.strftime("%m/%d/%Y"))

        title = review['h']
        text = review['p']

        review_data = {}
        for data in review.get('g') or []:
            # ``u'%s' % v`` keeps non-ASCII tag values intact; ``str(v)``
            # raises UnicodeEncodeError for them on Python 2.
            review_data[data['n']] = u', '.join(
                u'%s' % value for value in data['v'])

        text += u'\n'
        for field in extra_fields:
            if review_data.get(field):
                text += u'%s:\n%s\n' % (field, review_data.get(field))

        if review.get('b'):
            if review['b']['k'] == 'Yes':
                text += u'Yes, I would recommend this to a friend.'
            else:
                text += u'No, I would not recommend this to a friend.'

        loader.add_value('full_text', u'%s\n%s' % (title, text))
        loader.add_value('product_url', response.meta['product_url'])
        loader.add_value('url', response.meta['product_url'])
        loader.add_value('rating', review['r'])
        product['metadata']['reviews'].append(loader.load_item())

    next_page = response.meta['cur_page'] + 1
    url = response.meta['reviews_url'] % str(next_page)
    response.meta['cur_page'] = next_page
    yield Request(url, meta=response.meta, callback=self.parse_review)
def parse_review(self, response):
    """Parse a JavaScript review payload into ``Review`` items.

    The part of the body after ``'] = '`` (minus trailing ``;``) is
    decoded with ``load_js_objects``; the ``r`` entry of each element is
    one review.  Reviews are appended to
    ``product['metadata']['reviews']`` (product from ``response.meta``).
    The product is yielded once ``meta['review_n']`` equals
    ``meta['review_pages']``; until then the counter is incremented and
    the next review page is requested.
    """
    product = response.meta['product']
    body = response.body.split('] = ')[1].rstrip(';')
    data = load_js_objects(body)

    bottom = {
        "yes": "Yes, I would recommend this to a friend",
        "no": "No, I would not recommend this to a friend"
    }

    for entry in data:
        r = entry['r']
        review = Review()

        date = r['db']
        # Slices rearrange ``YYYY-MM-DD...`` into ``DD/MM/YYYY``.
        review['date'] = date[8:10] + '/' + date[5:7] + '/' + date[:4]
        comments = r['p']
        header = r['h']

        # One fresh list per tag group.  The old code aliased a single
        # list and stored best uses under a misspelled name
        # (``bestuses``), so "Best Uses" was always empty.
        tags = {'pros': [], 'cons': [], 'bestuses': []}
        for short in r.get('g', []):
            if short['k'] in tags:
                tags[short['k']] = short['v']

        try:
            bottom_line = bottom[r['b']['k'].lower()]
        except (KeyError, TypeError, AttributeError):
            # No recommendation block, or an unexpected value.
            bottom_line = ''

        review['full_text'] = (
            u'%s\n%s\nBottom Line: %s\nPros: %s\nCons: %s\nBest Uses: %s\n'
            % (header, comments, bottom_line, u', '.join(tags['pros']),
               u', '.join(tags['cons']), u', '.join(tags['bestuses'])))
        review['rating'] = r['r']
        review['url'] = response.url
        product['metadata']['reviews'].append(review)

    # XXX maybe there is a better way to yield product after all review have been fetched
    if response.meta['review_n'] == response.meta['review_pages']:
        yield product
    else:
        response.meta['review_n'] = response.meta['review_n'] + 1
        yield Request(review_url(response.meta['review_sku'],
                                 response.meta['review_n']),
                      meta=response.meta,
                      callback=self.parse_review)
def parse_review(self, response):
    """Scrape Yotpo reviews from the HTML carried in a JSON response.

    The HTML is the ``result`` entry of the first element of the decoded
    body.  French month names in the review date are replaced by their
    two-digit numbers before the date is parsed as ``%d/%m/%y`` and
    stored as ``%m/%d/%Y``.  Reviews without body text are skipped; the
    rest are appended to ``product['metadata']['reviews']`` (product from
    ``response.meta``), rated by the number of star spans.  The product
    is yielded at the end.
    """
    french_months = (
        (u'janvier', '01'), (u'f\xe9vrier', '02'), (u'mars', '03'),
        (u'avril', '04'), (u'mai', '05'), (u'juin', '06'),
        (u'juillet', '07'), (u'ao\xfbt', '08'), (u'septembre', '09'),
        (u'octobre', '10'), (u'novembre', '11'), (u'd\xe9cembre', '12'),
    )

    product = response.meta['product']
    html = json.loads(response.body)[0]['result']
    hxs = HtmlXPathSelector(text=html)

    boxes = hxs.select(
        '//div[contains(@class, "yotpo-reviews")]/div[contains(@class, "yotpo-regular-box")]'
    )
    for box in boxes:
        loader = ReviewLoader(item=Review(),
                              selector=box,
                              date_format=u'%m/%d/%Y')

        raw_date = box.select(
            './/label[contains(@class, "yotpo-review-date")]/text()'
        ).extract()[0]
        for month_name, month_number in french_months:
            raw_date = raw_date.replace(month_name, month_number)
        parsed = datetime.datetime.strptime(raw_date, "%d/%m/%y").date()
        loader.add_value('date', parsed.strftime("%m/%d/%Y"))

        loader.add_xpath(
            'full_text', './/div[contains(@class, "content-title")]/text()')
        body_parts = box.select(
            './/div[contains(@class, "content-review")]/text()').extract()
        content = ''.join(body_parts).strip()
        if not content:
            # Nothing is stored for reviews without body text.
            continue

        loader.add_value('full_text', content)
        loader.add_value('product_url', product['url'])
        loader.add_value('url', product['url'])
        loader.add_value('sku', product['sku'])
        stars = box.select(
            './/span[@class="yotpo-review-stars"]/span').extract()
        loader.add_value('rating', len(stars))
        product['metadata']['reviews'].append(loader.load_item())

    yield product
def parse_review(self, response):
    """Scrape the SEO review section (``bvseo-reviewsSection``).

    Each ``itemprop="review"`` block is appended to
    ``product['metadata']['reviews']`` (product from ``response.meta``).
    The ISO ``datePublished`` value is stored as ``%m/%d/%Y``; missing
    title and text fall back to ``Untitled`` and ``No text supplied.``.
    The product is yielded once, with or without reviews.
    """
    hxs = HtmlXPathSelector(response)
    reviews = hxs.select(
        '//div[@id="bvseo-reviewsSection"]/div[@itemprop="review"]')
    product = response.meta['product']

    if reviews:
        for node in reviews:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format=u'%m/%d/%Y')

            published = node.select(
                './meta[@itemprop="datePublished"]/@content').extract()[0]
            parsed = time.strptime(published, u'%Y-%m-%d')
            loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            title = ''.join(
                node.select('./span[@itemprop="name"]/text()').extract())
            title = title or u'Untitled'
            body = ''.join(
                node.select(
                    './span[@itemprop="description"]/text()').extract())
            body = body.strip() or u'No text supplied.'
            loader.add_value('full_text', u'%s\n%s' % (title, body))

            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product.get('sku') or '')

            rating = node.select(
                './/span[@itemprop="ratingValue"]/text()').extract()[0]
            loader.add_value('rating', rating)
            product['metadata']['reviews'].append(loader.load_item())

    yield product
def parse_review(self, response):
    """Scrape ``BVRRDisplayContentReviewID_*`` reviews and paginate.

    Each review is appended to ``product['metadata']['reviews']``
    (product from ``response.meta``) with its text (title on the first
    line when present), date, the first word of the rating image title,
    and the page URL.  A next-page link is followed with this method as
    callback; without one the product is yielded.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    blocks = hxs.select(
        u'//div[starts-with(@id,"BVRRDisplayContentReviewID_")]')
    for block in blocks:
        loader = ReviewLoader(item=Review(), selector=block,
                              date_format='%d %B %Y')

        titles = block.select(
            u'.//span[contains(@class,"BVRRReviewTitle")]/text()').extract()
        body_parts = block.select(
            u'.//span[contains(@class,"BVRRReviewText")]/text()').extract()
        text = ' '.join(body_parts)
        if titles:
            text = titles[0] + '\n' + text
        loader.add_value('full_text', text)

        loader.add_xpath(
            'date',
            u'.//span[contains(@class,"BVRRReviewDate") and position()=1]/text()')

        rating_title = block.select(
            u'.//div[@class="BVRRRatingNormalImage"]/img/@title').extract()[0]
        loader.add_value('rating', rating_title.split()[0])
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    next_url = hxs.select(
        u'//span[contains(@class,"BVRRNextPage")]/a/@href').extract()
    if not next_url:
        yield product
    else:
        yield Request(next_url[0], meta=response.meta,
                      callback=self.parse_review)
def parse_reviews(self, response):
    """Parse a JSON batch of reviews and yield the product.

    Reviews are read from ``BatchedResults.q2.Results`` of the decoded
    body and appended to ``product['metadata']['reviews']`` (product
    from ``response.meta``).  ``SubmissionTime`` is truncated at the
    fractional seconds and stored as ``%B %d, %Y``.
    """
    product = response.meta['product']
    json_body = json.loads(response.body)
    reviews = json_body['BatchedResults']['q2']['Results']

    for review in reviews:
        review_loader = ReviewLoader(item=Review(),
                                     response=response,
                                     date_format="%B %d, %Y")
        review_date = datetime.datetime.strptime(
            review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
        review_loader.add_value('date', review_date.strftime("%B %d, %Y"))

        title = review['Title']
        # A null ReviewText must not break the string concatenation.
        text = review['ReviewText'] or u''
        # ``title`` is a plain string here; indexing it with [0] kept
        # only its first character.
        full_text = title + '\n' + text if title else text

        pros = review['Pros']
        cons = review['Cons']
        if pros:
            full_text += '\nPros: ' + ', '.join(pros)
        if cons:
            full_text += '\nCons: ' + ', '.join(cons)

        review_loader.add_value('full_text', full_text)
        review_loader.add_value('rating', review['Rating'])
        review_loader.add_value('url', response.url)
        product['metadata']['reviews'].append(review_loader.load_item())

    yield product
def parse_review(self, response):
    """Scrape review rows from the ``boxproductinfo`` table and paginate.

    Nothing is yielded for responses that are not ``HtmlResponse``
    instances.  With no rows the product (from ``response.meta``) is
    yielded at once.  Otherwise every row becomes a review with a
    ``dd/mm/yyyy`` date, the text of the second ``div`` and the page
    URL.  A "Next" link is followed with only the product in the meta;
    without one the product is yielded.
    """
    if not isinstance(response, HtmlResponse):
        # NOTE(review): the product is not yielded on this path.
        return

    hxs = HtmlXPathSelector(response)
    rows = hxs.select(u'//div[@class="boxproductinfo"]/table/tr')
    product = response.meta['product']
    if not rows:
        yield product
        return

    for row in rows:
        loader = ReviewLoader(item=Review(),
                              selector=row,
                              date_format=u'%d/%m/%Y')
        found_dates = row.select(u'./td/div/p/span/text()').re(
            u'(\d{2}/\d{2}/\d{4})')
        loader.add_value('date', found_dates[0])
        loader.add_xpath('full_text', u'./td/div[2]/text()')
        loader.add_value('url', response.url)
        product['metadata']['reviews'].append(loader.load_item())

    next_href = hxs.select(
        u'//h4/a[contains(text(),"Next")]/@href').extract()
    if not next_href:
        yield product
    else:
        next_page = urljoin_rfc(get_base_url(response), next_href[0])
        yield Request(next_page,
                      meta={'product': product},
                      callback=self.parse_review)
class RubbermaidSpider(BaseSpider):
    """Spider for rubbermaid.com products and their reviews.

    Category pages are crawled with ordinary requests.  Each product
    page is additionally opened in a PhantomJS browser so the review
    "load more" button can be clicked before the reviews are scraped
    from the rendered page source.
    """

    name = 'keter-rubbermaid.com'
    allowed_domains = ['rubbermaid.com']
    start_urls = [
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=shed-accessories',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=VerticalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=HorizontalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=DeckBoxesPatioBenches',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=ResinCabinets',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=FastTrackGarageOrganizationSystem'
    ]

    # Button that loads another batch of reviews on a product page.
    _LOAD_MORE_XPATH = '//div[@class="bv-content-pagination"]//button'
    # Upper bound on "load more" clicks per product.
    _MAX_REVIEW_PAGES = 25
    # Seconds to wait after each click before looking for the button again.
    _LOAD_MORE_WAIT = 20
    # Page-load and script timeout of the browser, in seconds.
    _BROWSER_TIMEOUT = 60

    def __init__(self, *args, **kwargs):
        """Create the spider and the PhantomJS browser it drives."""
        super(RubbermaidSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self._browser = PhantomJS.create_browser()
        self._browser.set_page_load_timeout(self._BROWSER_TIMEOUT)
        self._browser.set_script_timeout(self._BROWSER_TIMEOUT)

    def spider_closed(self):
        """Quit the browser when the spider closes."""
        self._browser.quit()

    def parse(self, response):
        """Yield a product request for every link in the category block."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select(
                '//div[@id="foodStorageBlock"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    def parse_product(self, response):
        """Build the product item, attach its reviews and yield it."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        shipping_cost = hxs.select(
            './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
        ).extract()
        if not shipping_cost:
            shipping_cost = hxs.select(
                './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
            ).extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')

        categories = hxs.select(
            '//div[@class="breadcrum"]/div/a/text()').extract()
        if categories:
            # The last breadcrumb link is used as the category.
            loader.add_value('category', categories[-1])

        loader.add_xpath(
            'identifier',
            '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')

        price = hxs.select(
            './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
        ).extract()
        loader.add_value('price', price[0] if price else 0)

        if shipping_cost:
            try:
                loader.add_value('shipping_cost', shipping_cost[0].strip())
            except Exception as e:
                # Shipping cost stays best-effort, but the failure is
                # logged instead of being silently swallowed.
                self.log('>> Could not load shipping cost => %s' % e)

        item = hxs.select('//td/strong')
        sku_texts = item[0].select('../text()').extract() if item else []
        if len(sku_texts) > 1:
            # The second text node holds the sku, wrapped in "#( )".
            loader.add_value('sku', sku_texts[1].strip('#() '))

        image_url = hxs.select(
            '//div[@id="divImageBlock"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', 'Rubbermaid')

        product = loader.load_item()
        # The sku is only loaded when present on the page; upper-casing
        # it unconditionally raised KeyError for products without one.
        if product.get('sku'):
            product['sku'] = product['sku'].upper()

        metadata = KeterMeta()
        metadata['brand'] = 'Rubbermaid'
        metadata['reviews'] = []
        product['metadata'] = metadata

        page_source = self._load_all_reviews(response.url)

        hxs = HtmlXPathSelector(text=page_source)
        for review in hxs.select(
                '//ol[contains(@class, "bv-content-list-Reviews")]//li[contains(@class, "bv-content-review")]'
        ):
            review_loader = ReviewLoader(item=Review(),
                                         selector=review,
                                         date_format='%m/%d/%Y')
            review_loader.add_xpath(
                'date',
                u'.//div[@class="bv-content-datetime"][1]//meta[@itemprop="dateCreated"]/@content'
            )
            review_loader.add_xpath(
                'full_text', u'.//div[@itemprop="reviewBody"]/p/text()')
            review_loader.add_xpath(
                'rating',
                u'.//abbr[contains(@class, "bv-rating-stars-on")][1]/@title')
            review_loader.add_value('url', response.url)
            product['metadata']['reviews'].append(review_loader.load_item())

        yield product

    def _load_all_reviews(self, url):
        """Open ``url`` in the browser, expand the reviews, return the source.

        The "load more" button is clicked while it is displayed, at most
        ``_MAX_REVIEW_PAGES`` times.  Any browser error is logged and the
        page source as it stands is returned.
        """
        self.log('>> BROWSER => GET < %s />' % url)
        self._browser.get(url)
        self.log('>> OK')

        self.log('>> BROWSER => Looking for more reviews ...')
        try:
            load_more_button = self._browser.find_element_by_xpath(
                self._LOAD_MORE_XPATH)
            more_reviews = load_more_button.is_displayed()
            max_pages = self._MAX_REVIEW_PAGES
            while more_reviews and max_pages:
                self.log('>> More reviews found...')
                load_more_button.click()
                self.log('>> BROWSER => CLICK "Load more"')
                # NOTE(review): this sleep blocks the calling thread.
                time.sleep(self._LOAD_MORE_WAIT)
                self.log('>> OK')
                load_more_button = self._browser.find_element_by_xpath(
                    self._LOAD_MORE_XPATH)
                more_reviews = load_more_button.is_displayed()
                max_pages -= 1
            self.log('>> No more reviews...')
        except Exception as e:
            self.log('>> ERROR FOUND => %s' % e)

        return self._browser.page_source
def parse_review(self, response):
    """Collect the reviews on one review page and request the next page.

    Reads ``product``, ``product_url``, ``reviews_url``, ``product_id`` and
    ``page`` from ``response.meta``. Every ``rating-box`` block found on the
    page is turned into a review item and appended to
    ``product['metadata']['reviews']``.

    Yields the product itself when the page holds no review blocks (the end
    of pagination); otherwise yields a ``Request`` for the following page,
    handled by this same callback.
    """
    selector = HtmlXPathSelector(response)
    review_nodes = selector.select(u'//div[@class="rating-box"]')
    product = response.meta['product']
    if not review_nodes:
        yield product
        return

    # Month name -> 1-based month number, built from the French month names.
    month_names = (
        u'janvier, f\xe9vrier, mars, avril, mai, juin, juillet, ao\xfbt, septembre, octobre, novembre, d\xe9cembre'
        .split(', '))
    month_numbers = dict(
        (name, index) for index, name in enumerate(month_names, 1))

    for node in review_nodes:
        loader = ReviewLoader(item=Review(), selector=node,
                              date_format=u'%m/%d/%Y')

        # Swap the month name for its number (and drop the ' - ' separator)
        # so the date can be parsed with the numeric '%d %m %Y' format.
        raw_date = node.select(u'.//footer/p/text()').extract()[0]
        for name, number in month_numbers.items():
            if name in raw_date:
                raw_date = raw_date.replace(name, str(number))
                raw_date = raw_date.replace(' - ', '')
                break
        parsed_date = time.strptime(raw_date, u'%d %m %Y')
        loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed_date))

        headings = node.select(u'.//article/header/h3/text()').extract()
        title = headings[0] if headings else u'Untitled'

        # Body = one "<first> <second>" line per "infos-note" paragraph,
        # followed by every non-blank text node of the article paragraphs.
        parts = [
            u'{} {}\n'.format(
                *note.select('.//text()[normalize-space()]').extract())
            for note in node.select('.//div[@class="infos-note"]/p')
        ]
        paragraph_texts = node.select(
            './/article//p//text()[normalize-space()]').extract()
        parts.extend(
            u'{}\n'.format(fragment.strip()) for fragment in paragraph_texts)
        body = u''.join(parts) or u'No text supplied.'

        loader.add_value('full_text', u'%s\n%s' % (title, body))
        loader.add_value('product_url', response.meta['product_url'])
        loader.add_value('url', response.meta['product_url'])
        loader.add_value('sku', product.get('sku') or '')
        loader.add_xpath('rating',
                         u'.//span[@itemprop="ratingValue"]/text()')
        product['metadata']['reviews'].append(loader.load_item())

    reviews_url = response.meta.get('reviews_url')
    meta = response.meta
    meta['page'] += 1
    # `meta` is response.meta itself, so the counter was just advanced; the
    # page number placed in the URL is that advanced counter plus one.
    next_url = reviews_url.format(meta.get('product_id'),
                                  str(meta.get('page') + 1))
    yield Request(next_url,
                  meta=meta,
                  callback=self.parse_review,
                  dont_filter=True)
def parse_review(self, response):
    """Parse one page of reviews embedded in a JavaScript payload.

    The review markup is not plain HTML in the response: it is carried,
    backslash-escaped, inside a ``var materials={...}`` assignment in
    ``response.body``. It is cut out with a regular expression, unescaped
    and then parsed with XPath.

    Reads ``products``, ``product_url`` and ``reviews_url`` from
    ``response.meta`` and caches ``last_page`` there. Each review is
    appended to ``products[0]['metadata']['reviews']`` (first product only).

    Yields a ``Request`` for the next review page while the current page
    number is below ``last_page``. Yields every product instead when the
    payload is missing, the page has no reviews, no selected page number is
    shown, or the last page has been reached.
    """
    match = re.search('var materials={.*?(<div.*?)"},.initializers',
                      response.body, re.DOTALL)
    if match is None:
        # No embedded review markup: hand the products back rather than
        # failing on `.group(1)` and losing them.
        for product in response.meta['products']:
            yield product
        return

    html = match.group(1)
    # Undo the JavaScript string escaping: "\n" sequences first, then any
    # remaining backslash-escaped character.
    html = re.sub(r'\\n', r'\n', html)
    html = re.sub(r'\\(.)', r'\1', html)

    hxs = HtmlXPathSelector(text=html)
    reviews = hxs.select(
        u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
    products = response.meta['products']
    if not reviews:
        for product in products:
            yield product
        return

    # Reviews are attributed to the first product; fall back to an empty
    # mapping so the sku lookup still works when the list is empty.
    first_product = products[0] if products else {}

    for review in reviews:
        loader = ReviewLoader(item=Review(), selector=review,
                              date_format=u'%m/%d/%Y')

        review_id = review.select("@id").re(
            r'BVRRDisplayContentReviewID_(\d+)')[0]
        loader.add_value('review_id', review_id)

        # Convert e.g. "January 5, 2014" to the loader's "%m/%d/%Y" format.
        date = review.select(
            u'.//span[contains(@class, "BVRRValue BVRRReviewDate")]/text()'
        ).extract()[0]
        date = time.strptime(date, u'%B %d, %Y')
        date = time.strftime(u'%m/%d/%Y', date)
        loader.add_value('date', date)

        title = review.select(
            u'.//span[@class="BVRRValue BVRRReviewTitle summary"]/text()'
        ).extract()
        title = title[0] if title else u'Untitled'

        # Taken from the current review (previously always read from
        # reviews[0], which copied the first review's pros/cons everywhere).
        pros_cons_text = u' '.join(
            review.select(
                u'.//div[@class="BVRRReviewProsConsContainer"]//text()'
            ).extract())

        text = review.select(
            u'.//span[@class="BVRRReviewText"]/text()').extract()
        text = text[0] if text else u'No text supplied.'

        extra_information = u' '.join(
            review.select(
                u'.//div[@class="BVRRContextDataContainer"]//text()'
            ).extract())

        text = '%s\n%s\n%s' % (pros_cons_text, text, extra_information)
        loader.add_value('full_text', u'%s\n%s' % (title, text))
        loader.add_value('product_url', response.meta['product_url'])
        loader.add_value('url', response.meta['product_url'])
        loader.add_value('sku', first_product.get('sku') or '')
        loader.add_xpath(
            'rating',
            u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@itemprop="ratingValue"]/text()'
        )
        if products:
            # Guarded like the sku lookup: nothing to attach to when the
            # product list is empty.
            products[0]['metadata']['reviews'].append(loader.load_item())

    cur_page = hxs.select(
        u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()'
    ).extract()
    if not cur_page:
        for product in products:
            yield product
        return
    cur_page = int(cur_page[0])

    if 'last_page' not in response.meta:
        # Determined once, then carried along in meta for later pages.
        page_links = hxs.select(
            u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()'
        ).extract()
        # With no other page links, the current page is the last one.
        response.meta['last_page'] = (
            int(page_links[-1]) if page_links else cur_page)

    if cur_page < response.meta['last_page']:
        url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page + 1)
        yield Request(url, meta=response.meta, callback=self.parse_review)
    else:
        for product in products:
            yield product