def parse_review(self, response, reviewData, extra_parser=None):
    """Build a USER review item from a single review record.

    :param response: response whose ``meta['product']`` holds the product
        the review belongs to.
    :param reviewData: mapping describing one review; the keys read here
        are ``Rating``, ``RatingRange``, ``SubmissionTime``,
        ``UserNickname``, ``Title``, ``ReviewText``, ``Pros``, ``Cons`` and
        (optionally) ``TagDimensions``.
    :param extra_parser: optional callable ``(review, reviewData)`` whose
        return value replaces the review.
    :return: the review item.
    """
    def joined_tags(dimension):
        # Tag values of one dimension ('Pro' / 'Con') joined with ' ; '.
        return ' ; '.join(
            reviewData.get('TagDimensions', {})
                      .get(dimension, {})
                      .get('Values', []))

    product = response.meta['product']
    review = ReviewItem.from_product(
        product=product,
        rating=reviewData['Rating'],
        scale=reviewData['RatingRange'],
        date=date_format(reviewData['SubmissionTime'],
                         '%Y-%m-%dT%H:%M:%S'),
        author=reviewData['UserNickname'],
        title=reviewData['Title'],
        summary=reviewData['ReviewText'],
        pros=reviewData['Pros'],
        cons=reviewData['Cons'],
        tp='USER')

    # Fall back to the tag dimensions only when the item ended up without
    # pros / cons text.
    for field, dimension in (('TestPros', 'Pro'), ('TestCons', 'Con')):
        if not review.get(field, ''):
            review[field] = joined_tags(dimension)

    if extra_parser:
        review = extra_parser(review, reviewData)
    return review
def parse_reviews(self, response):
    """Yield USER reviews from a review list page and follow pagination.

    Reads the product from ``response.meta['product']`` and looks up the
    latest stored user-review date for it. Parsing stops (without
    following further pages) at the first review older than that date.

    :param response: review list page response.
    :return: generator of review items and, if a next page link exists,
        one follow-up request handled by this same callback.
    """
    product = response.meta['product']
    rating_xpath = ".//*[@class='review--header-rating']/text()"
    title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
    summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
    header_xpath = ".//div[@class='review--header-review-info']//text()"
    date_xpath = ".//div[@class='review--header-review-info']/time/@datetime"
    pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
    cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"
    next_page_xpath = "//a[@rel='next']/@href"

    reviews = response.xpath("//li[contains(@class, 'reviews__list-item')]")
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"]
    )

    for review in reviews:
        date = self.extract_xpath(review, date_xpath)
        if date:
            date = date_format(date, '')
            current_user_review = dateparser.parse(
                date, date_formats=['%Y-%m-%d'])
            # Either side may be None (unparsable date / nothing stored
            # yet); ordering a datetime against None raises TypeError, so
            # only compare when both are known.
            if (current_user_review is not None
                    and last_user_review is not None
                    and current_user_review < last_user_review):
                return

        title = self.extract_xpath(review, title_xpath)
        # Keep only the part before the first '/' of the rating text.
        rating = self.extract_xpath(review, rating_xpath).split('/')[0]
        summary = self.extract_all_xpath(review, summary_xpath)
        pros = self.extract_all_xpath(review, pros_xpath, separator=' ; ')
        cons = self.extract_all_xpath(review, cons_xpath, separator=' ; ')

        author = ''
        header = self.extract_all_xpath(review, header_xpath)
        if header:
            # The author is the first '|'-separated field of the header.
            # NOTE(review): `strip` is not a builtin; presumably a helper
            # imported at module level -- confirm.
            author = strip(header.split('|')[0])

        user_review = ReviewItem.from_product(
            product=product, tp='USER', rating=rating, title=title,
            date=date, summary=summary, pros=pros, cons=cons,
            author=author, scale=10)
        yield user_review

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_review(self, response):
    """Yield USER reviews from a comments page and follow pagination.

    ``response.meta`` must carry ``product`` and ``last_user_review``.
    Parsing stops at the first review dated before ``last_user_review``.

    :param response: comments page response.
    :return: generator of review items and, if a next page link exists,
        one follow-up request handled by this same callback (the whole
        ``response.meta`` is forwarded).
    """
    review_xpath = "//ul[@class='comments']/li"
    title_xpath = "./p[@class='hdr']/text()"
    summary_xpath = "./p[@class='msg']/text()"
    rating_xpath = "./ul[contains(@class, 'rating')]/@title"
    date_and_author_xpath = "./p[@class='auth']/text()"
    next_page_xpath = "//div[@class='pg']/a[@class='n']/@href"

    product = response.meta['product']
    last_user_review = response.meta['last_user_review']

    for review in response.xpath(review_xpath):
        # Reset per review: previously `author` was only bound when the
        # text contained 'by', so it was either undefined or leaked from
        # the previous review.
        author = ''

        date_and_author = self.extract_xpath(review, date_and_author_xpath)
        if date_and_author.startswith('Reviewed'):
            date_and_author = date_and_author[len('Reviewed'):]
        date_and_author = date_and_author.split(',')[0]

        # Split on the first 'by' only, so author names that themselves
        # contain "by" are not truncated.
        splitted = date_and_author.split('by', 1)
        date = splitted[0].strip()
        if len(splitted) > 1:
            author = splitted[1].strip()

        if date:
            date = date_format(date, '')
            current_user_review = dateparser.parse(
                date, date_formats=['%Y-%m-%d'])
            # Ordering a datetime against None raises TypeError; only
            # apply the cut-off when both dates are known.
            if (current_user_review is not None
                    and last_user_review is not None
                    and current_user_review < last_user_review):
                return

        title = self.extract_xpath(review, title_xpath)
        # The rating text is cut at ' out'; only the leading part is kept.
        rating = self.extract_xpath(review, rating_xpath).split(' out')[0]
        summary = self.extract_all_xpath(review, summary_xpath)

        user_review = ReviewItem.from_product(
            product=product, tp='USER', rating=rating, title=title,
            date=date, summary=summary, author=author, scale=5)
        yield user_review

    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        next_page_request = Request(
            url=get_full_url(response, next_page_url),
            callback=self.parse_review,
            meta=response.meta)
        yield next_page_request
def review_microdata_extruct(review_extruct, product=None, tp='', verdict='',
                             url='', pros='', cons='', award='',
                             award_pic=''):
    """Build a review item from an extracted microdata review node.

    :param review_extruct: mapping with a ``'properties'`` mapping; the
        keys read are ``reviewRating``, ``description``, ``name``,
        ``headline``, ``summary``, ``author`` and ``datePublished``.
    :param product: product forwarded to ``ReviewItem.from_product``.
    :param tp, verdict, url, pros, cons, award, award_pic: forwarded
        unchanged to ``ReviewItem.from_product``.
    :return: the review item.
    """
    properties = review_extruct['properties']

    def from_rating(key):
        # Look directly on the reviewRating node first, then inside its
        # nested 'properties' mapping.
        value = properties.get('reviewRating', {}).get(key, '')
        if value:
            return value
        nested = properties.get('reviewRating', {}).get('properties', {})
        return nested.get(key, '')

    rating = from_rating('ratingValue')
    scale = from_rating('bestRating')
    summary = properties.get('description', '')

    # Title candidates in priority order. 'summary' is last:
    # mm.de uses summary as review title. Makes a bit of
    # sense, therefore it is here
    title = ''
    for candidate in ('name', 'headline', 'summary'):
        title = properties.get(candidate, '')
        if title:
            break

    author = properties.get('author', '')
    if not isinstance(author, basestring):
        # Non-string author: take the name from its nested properties.
        author = author.get('properties', {}).get('name', '')

    date = properties.get('datePublished', '')

    return ReviewItem.from_product(
        product=product, tp=tp, rating=rating, scale=scale, date=date,
        author=author, title=title, summary=summary, verdict=verdict,
        url=url, pros=pros, cons=cons, award=award, award_pic=award_pic)
def parse_reviews(self, response):
    """Yield USER reviews from a ratings page and request the next page.

    ``response.meta`` must carry ``product``, ``current_page``,
    ``total_pages`` and ``latest_db_date``. The next page (selected through
    the ``CurrentPage`` query parameter) is requested unless the page had
    no dated review, there is no page count, the last page was reached, or
    the last review on this page is older than ``latest_db_date``.

    :param response: ratings page response.
    :return: generator of review items and at most one follow-up request.
    """
    reviews = response.xpath('//div[contains(@class,"detRating")]')
    product = response.meta['product']
    date_xpath = './/div[@class="date"]/@content'
    rating_xpath = './/div[@class="rat"]/span[1]/text()'
    title_xpath = './/div[@class="title"]//text()'
    summary_xpath = './/div[@class="comm"]//text()'

    # After the loop `date` holds the date of the last review on the page.
    date = None
    for review in reviews:
        date = self.extract_xpath(review, date_xpath)
        rating = self.extract_xpath(review, rating_xpath)
        title = self.extract_xpath(review, title_xpath)
        summary = self.extract_all_xpath(review, summary_xpath)
        user_review = ReviewItem.from_product(
            product=product, tp='USER', date=date, rating=rating,
            title=title, summary=summary)
        yield user_review

    current_page = response.meta['current_page']
    total_pages = response.meta['total_pages']
    latest_db_date = response.meta['latest_db_date']

    if not date:
        return
    latest_date_page = dateparser.parse(date, date_formats=["%Y-%m-%d"])
    if not total_pages:
        return
    if current_page == total_pages:
        return
    # dateparser returns None for an unparsable date; comparing a datetime
    # with None raises TypeError, so skip the cut-off in that case.
    if (latest_db_date and latest_date_page is not None
            and latest_db_date > latest_date_page):
        return

    next_page = current_page + 1
    next_page_url = set_query_parameter(response.url, 'CurrentPage',
                                        next_page)
    # (A leftover debug `print` of the URL was removed here.)
    request = Request(url=next_page_url, callback=self.parse_reviews)
    request.meta['product'] = product
    request.meta['current_page'] = next_page
    request.meta['total_pages'] = total_pages
    request.meta['latest_db_date'] = latest_db_date
    yield request
def parse_review(self, response):
    """Yield the product and its PRO review from a comparison/test page.

    Raises if ``self.is_not_logged(response)`` reports that the session is
    not logged in. The product id comes from the ``products`` query
    parameter of the response URL and the category from
    ``response.meta['category']``.

    :param response: test page response.
    :return: generator yielding the product item, then the review item.
    :raises Exception: when the response indicates a logged-out session.
    """
    if self.is_not_logged(response):
        raise Exception("Not Logged: %s" % response.url)

    product_model_xpath = "//tr[contains(@class, 'model')]/td[@colspan=0]/text()"
    product_manu_xpath = "//tr[contains(@class, 'manufacturer')]/td[@colspan=0]/text()"
    product_pic_url_xpath = "//td[@class='compare-table__image']//img/@src"
    test_date_xpath = "//span[@class='push-property' and contains(text(), 'Datum')]/../following-sibling::td/text()"
    rating_xpath = "//div[@class='c-big-rating__num']/text()"

    category = response.meta['category']
    source_internal_id = get_query_parameter(response.url, 'products')

    model = self.extract_xpath(response, product_model_xpath)
    manufacturer = self.extract_xpath(response, product_manu_xpath)
    # Product name is "<manufacturer> <model>".
    product_name = "%s %s" % (manufacturer, model)
    pic_url = get_full_url(
        response, self.extract_xpath(response, product_pic_url_xpath))

    product = ProductItem.from_response(
        response,
        category=category,
        source_internal_id=source_internal_id,
        product_name=product_name,
        url=response.url,
        manufacturer=manufacturer,
        pic_url=pic_url)
    yield product

    verdict = self.build_verdict(response)
    test_date = self.extract_xpath(response, test_date_xpath)
    rating = self.extract_xpath(response, rating_xpath)
    yield ReviewItem.from_product(
        product=product, tp='PRO', rating=rating, scale='100',
        date=test_date, verdict=verdict)
def _parse_reviews(self, response, product=None):
    """Yield USER reviews (scale 5) found in the page's review list.

    :param response: response containing the review list.
    :param product: product the reviews belong to; when falsy it is read
        from ``response.meta['product']``.
    :return: generator of review items.
    """
    review_list_xpath = "//ul[contains(@class, 'reviews-content')]/li"
    rating_string_xpath = ".//div[@class='rating']/div/@class"
    author_xpath = ".//div[@class='rating']/following::strong[1]/text()"
    date_xpath = ".//div[@class='rating']/following::small[1]/text()"
    title_xpath = './/h3/text()'
    summary_xpath = './/article/p[not(@class)]/text()'
    pros_xpath = (".//div[contains(@class, 'review-features') and "
                  "contains(@class, 'review-pros')]/text()")
    cons_xpath = (".//div[contains(@class, 'review-features') and "
                  "contains(@class, 'review-cons')]/text()")

    review_nodes = response.xpath(review_list_xpath)
    if not product:
        product = response.meta['product']

    for node in review_nodes:
        # The rating is captured from the class attribute of the rating
        # element by `self.rating_regex` (group 1); '' when no match.
        rating_string = self.extract_xpath(node, rating_string_xpath)
        rating_match = re.match(self.rating_regex, rating_string)
        rating = rating_match.group(1) if rating_match else ''

        title = self.extract_xpath(node, title_xpath)
        date = self.extract_xpath(node, date_xpath)
        author = self.extract_xpath(node, author_xpath)
        summary = self.extract_all_xpath(node, summary_xpath)
        pros = self.extract_all_xpath(node, pros_xpath)
        cons = self.extract_all_xpath(node, cons_xpath)

        yield ReviewItem.from_product(
            product=product, author=author, summary=summary, date=date,
            pros=pros, cons=cons, title=title, rating=rating, tp='USER',
            scale=5)
def _incremental_marker(self, incremental_value):
    """Record the incremental-scraping state for ``self.asin``.

    When ``self.incremental`` is None a ``ProductIdItem`` of kind
    ``incremental_scraping`` is built and returned for the caller to yield.
    Otherwise, if ``self.update_incremental_kind`` is set, the stored value
    is updated through ``update_incremental`` and None is returned.

    :param incremental_value: value to record (``'0'`` or ``'1'``).
    :return: a ``ProductIdItem`` to yield, or None.
    """
    if self.incremental is None:
        incremental = ProductIdItem()
        incremental['source_internal_id'] = self.asin
        incremental['ID_kind'] = 'incremental_scraping'
        incremental['ID_value'] = incremental_value
        return incremental
    if self.update_incremental_kind:
        update_incremental(self.mysql_manager,
                           self.spider_conf['source_id'],
                           self.amazon_kind, self.asin, incremental_value)
    return None

def parse_reviews(self, response):
    """Yield the product (first page only), its USER reviews and paging.

    The product is taken from ``response.meta['product']``; when absent it
    is built from the page and yielded. If the page yields no review date
    the request is retried (up to 8 times, without cookies); after that the
    incremental state is recorded as ``'0'``. When there is no next page
    the incremental state is recorded as ``'1'``.

    :param response: review list page response.
    :return: generator of product / review / ProductIdItem items and
        follow-up or retry requests.
    """
    product_name_xpath = "//div[contains(@class, 'product-title')]//text()"
    product_url_xpath = "(//a[@data-hook='product-link'])[1]/@href"
    reviews_xpath = "//div[@id='cm_cr-review_list']/div[@id]"
    next_page_xpath = "//div[@id='cm_cr-pagination_bar']//li[@class='a-last']/a/@href"
    title_xpath = ".//a[contains(@class,'review-title')]/text()"
    review_url_xpath = ".//a[contains(@class,'review-title')]/@href"
    summary_xpath = ".//span[contains(@class,'review-text')]/text()"
    author_xpath = ".//a[contains(@class,'author')]/text()"
    rating_xpath = ".//i[contains(@class, 'review-rating')]/@class"
    date_xpath = ".//span[contains(@class, 'review-date')]/text()"

    product = response.meta.get('product')
    if not product:
        product_url = self.extract_xpath(response, product_url_xpath)
        # Only trust the extracted link when it mentions our ASIN.
        if self.asin not in product_url:
            product_url = response.url
        else:
            product_url = get_full_url(response, product_url)
        product_name = self.extract_xpath(response, product_name_xpath)
        product = ProductItem.from_response(
            response, product_name=product_name,
            source_internal_id=self.asin, url=product_url)
        yield product

    reviews = response.xpath(reviews_xpath)
    # After the loop `date` holds the date of the last review on the page.
    date = ''
    for raw_review in reviews:
        rating = ''
        title = self.extract_xpath(raw_review, title_xpath)
        review_url = self.extract_xpath(raw_review, review_url_xpath)
        review_url = get_full_url(response.url, review_url)
        summary = self.extract_all_xpath(raw_review, summary_xpath)
        author = self.extract_xpath(raw_review, author_xpath)
        raw_rating = self.extract_xpath(raw_review, rating_xpath)
        match = re.search(self.rating_re, raw_rating)
        if match:
            rating = match.group(1)
        date = self._format_date(raw_review, date_xpath)
        review = ReviewItem.from_product(
            product=product, tp='USER', rating=rating, scale=5, date=date,
            author=author, summary=summary, url=review_url, title=title)
        yield review

    if not date:
        # No dated review on the page: retry the same request.
        retries = response.meta.get('ama_retries', 0)
        if retries >= 8:  # 8 tor processes
            marker = self._incremental_marker('0')
            if marker is not None:
                yield marker
            self.logger.warning("Max retries, blocked: %s" % response.url)
            return
        retryreq = response.request.copy()
        retryreq.meta['ama_retries'] = retries + 1
        retryreq.meta['dont_merge_cookies'] = True
        retryreq.dont_filter = True
        retryreq.cookies = {}
        yield retryreq
        return

    last_date_in_page = dateparser.parse(date, ["%Y:%m:%d"])
    if self.last_review_in_db and self.incremental:
        # dateparser returns None for an unparsable date; ordering a
        # datetime against None raises TypeError, so skip the cut-off then.
        if (last_date_in_page is not None
                and self.last_review_in_db > last_date_in_page):
            return

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response.url, next_page_url)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
    else:
        # Last page reached.
        marker = self._incremental_marker('1')
        if marker is not None:
            yield marker
def parse_reviews(self, response):
    """Parse one page of a (possibly multi-page) PRO review.

    A review item is built from the current page and merged, through
    ``self.merge_review``, into the review carried in
    ``response.meta['review']`` (if any). When the page links to a further
    review page, that page is requested with the accumulated review in
    ``meta``; otherwise the verdict is read from this page and the
    accumulated review is yielded.

    :param response: review page response; ``meta['product']`` required.
    :return: generator yielding either one request or one review item.
    """
    product = response.meta['product']

    title_xpath = "//meta[@property='og:title']/@content"
    summary_xpath = '//meta[@property="og:description"]/@content'
    alt_summary_xpath = '//meta[@name="Description"]/@content'
    rating_xpath = "//div[contains(@class, 'final-score')]//div[@class='score-fill']/@data-score"
    alt_rating_xpath = '//span[@class="score"]/text()'
    pros_xpath = "//td[contains(@class, 'content-plus')]//li/text()"
    cons_xpath = "//td[contains(@class, 'content-cons')]//li/text()"
    alt_pros_xpath = "//ul[contains(@class, 'article-blurb-features')]//li/text()"
    alt_cons_xpath = "//ul[contains(@class, 'disadvantages')]//li/text()"
    author_xpath = "//span[@class='reviewer']/text()"
    date_xpath = "//span[@class='dtreviewed']/text()"
    last_review_page_url_xpath = (
        '//ol[contains(@class, "page-options")]'
        '/li/a[@href=""]/parent::li/preceding-sibling::li[1]/a/@href')

    title = self.extract_xpath(response, title_xpath)

    # Each field tries its primary xpath and falls back to the alternative
    # one only when the primary yields nothing.
    summary = (self.extract_xpath(response, summary_xpath)
               or self.extract_xpath(response, alt_summary_xpath))
    rating = (self.extract_xpath(response, rating_xpath)
              or self.extract_xpath(response, alt_rating_xpath))
    scale = '5' if rating else ''
    pros = (self.extract_all_xpath(response, pros_xpath, separator=' ; ')
            or self.extract_all_xpath(response, alt_pros_xpath,
                                      separator=' ; '))
    cons = (self.extract_all_xpath(response, cons_xpath, separator=' ; ')
            or self.extract_all_xpath(response, alt_cons_xpath,
                                      separator=' ; '))

    author = self.extract_xpath(response, author_xpath)
    date = self.extract_xpath(response, date_xpath)
    if date:
        date = date_format(date, "%d %B %Y", languages=['en'])

    current_page_review = ReviewItem.from_product(
        product=product, tp='PRO', rating=rating, scale=scale, pros=pros,
        cons=cons, author=author, title=title, summary=summary, date=date,
        url=response.url)

    accumulated_review = response.meta.get('review')
    if not accumulated_review:
        accumulated_review = current_page_review
    else:
        self.merge_review(accumulated_review, current_page_review)

    last_review_page_url = self.extract_xpath(response,
                                              last_review_page_url_xpath)
    if not last_review_page_url:
        # If it's the last review page, try to get the verdict
        verdict_xpath = "//div[@id='review-body']/p[1]//text()"
        accumulated_review['TestVerdict'] = self.extract_xpath(
            response, verdict_xpath)
        yield accumulated_review
        return

    # If there are other pages on the review, go to the last one
    last_review_page_url = get_full_url(response, last_review_page_url)
    request = Request(last_review_page_url, callback=self.parse_reviews)
    request.meta['review'] = accumulated_review
    request.meta['product'] = response.meta['product']
    yield request
def parse_reviews(self, response):
    """Yield USER reviews from a review fragment and request the next one.

    Paging is driven by ``response.meta``: ``next_page_review_url``,
    ``paging_parameter``, ``current_index``, ``reviews_per_page``,
    ``total_reviews`` and ``last_review_db``. No further request is made
    when there is no ``next_page_review_url``, when ``current_index`` has
    reached ``total_reviews``, or when the last review on this page is
    older than ``last_review_db``.

    :param response: review fragment response; ``meta['product']``
        required.
    :return: generator of review items and at most one follow-up request.
    """
    product = response.meta['product']
    summary_xpath = ".//article/text()"
    rating_xpath = ".//meta[@itemprop='rating']/@content"
    title_xpath = ".//meta[@itemprop='summary']/@content"
    date_xpath = ".//meta[@itemprop='dtreviewed']/@content"
    author_xpath = ".//meta[@itemprop='reviewer']/@content"
    pros_xpath = (".//div[contains(@class, 'review-features') and "
                  "contains(@class, 'review-pros')]/text()")
    cons_xpath = (".//div[contains(@class, 'review-features') and "
                  "contains(@class, 'review-cons')]/text()")

    # Initialised so an empty page cannot leave `date` undefined below.
    date = ''
    for review_selector in response.xpath('//li'):
        rating = self.extract_xpath(review_selector, rating_xpath)
        title = self.extract_xpath(review_selector, title_xpath)
        date = self.extract_xpath(review_selector, date_xpath)
        author = self.extract_xpath(review_selector, author_xpath)
        summary = self.extract_all_xpath(review_selector, summary_xpath)
        pros = self.extract_all_xpath(review_selector, pros_xpath)
        cons = self.extract_all_xpath(review_selector, cons_xpath)
        # Collapse runs of whitespace into a single space.
        pros = re.sub(r"\s+", ' ', pros)
        cons = re.sub(r"\s+", ' ', cons)
        review = ReviewItem.from_product(
            product=product, title=title, rating=rating, tp='USER',
            scale=5, date=date, summary=summary, pros=pros, cons=cons,
            author=author)
        yield review

    next_page_url = response.meta.get('next_page_review_url', None)
    if not next_page_url:
        return

    paging_parameter = response.meta['paging_parameter']
    current_index = response.meta['current_index']
    reviews_per_page = response.meta['reviews_per_page']
    total_reviews = response.meta['total_reviews']
    last_review_db = response.meta['last_review_db']

    if current_index >= total_reviews:
        # We reached the end
        return

    last_date_in_page = None
    if date:
        last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])
    # Either date may be missing; ordering a datetime against None raises
    # TypeError, so the cut-off only applies when both are known.
    if (last_review_db is not None and last_date_in_page is not None
            and last_review_db > last_date_in_page):
        # Reached the end of new data
        return

    next_page_url = set_query_parameter(next_page_url, paging_parameter,
                                        current_index)
    # Replay the headers of the request that produced this response.
    headers = {
        'Referer': response.request.headers['Referer'],
        'X-Requested-With': response.request.headers['X-Requested-With']
    }
    meta = {
        'next_page_review_url': next_page_url,
        'reviews_per_page': reviews_per_page,
        'total_reviews': total_reviews,
        'current_index': current_index + reviews_per_page,
        'paging_parameter': paging_parameter,
        'last_review_db': last_review_db,
        'product': product
    }
    request = Request(next_page_url, meta=meta, headers=headers,
                      callback=self.parse_reviews)
    yield request
def parse_reviews(self, response):
    """Yield USER reviews for the ASIN in ``response.meta`` and paginate.

    ``response.meta`` must carry ``asin`` and, once a dated review has been
    found on the page, ``last_review_in_db``. If the page yields no review
    date the request is retried (up to 8 times, without cookies). Paging
    stops when the last review on the page is older than
    ``last_review_in_db`` or when there is no next page link.

    :param response: review list page response.
    :return: generator of review items and follow-up or retry requests.
    """
    asin = response.meta['asin']
    product_name_xpath = "//div[contains(@class, 'product-title')]//text()"
    reviews_xpath = "//div[@id='cm_cr-review_list']/div[@id]"
    next_page_xpath = "//div[@id='cm_cr-pagination_bar']//li[@class='a-last']/a/@href"
    title_xpath = ".//a[contains(@class,'review-title')]/text()"
    review_url_xpath = ".//a[contains(@class,'review-title')]/@href"
    summary_xpath = ".//span[contains(@class,'review-text')]/text()"
    author_xpath = ".//a[contains(@class,'author')]/text()"
    rating_xpath = ".//i[contains(@class, 'review-rating')]/@class"
    date_xpath = ".//span[contains(@class, 'review-date')]/text()"

    product_name = self.extract_xpath(response, product_name_xpath)
    # The product is only used to build the reviews; it is not yielded.
    product = ProductItem.from_response(response,
                                        product_name=product_name,
                                        source_internal_id=asin)

    reviews = response.xpath(reviews_xpath)
    # After the loop `date` holds the date of the last review on the page.
    date = ''
    for raw_review in reviews:
        rating = ''
        title = self.extract_xpath(raw_review, title_xpath)
        review_url = self.extract_xpath(raw_review, review_url_xpath)
        review_url = get_full_url(response.url, review_url)
        summary = self.extract_all_xpath(raw_review, summary_xpath)
        author = self.extract_xpath(raw_review, author_xpath)
        raw_rating = self.extract_xpath(raw_review, rating_xpath)
        match = re.search(self.rating_re, raw_rating)
        if match:
            rating = match.group(1)
        date = self._format_date(raw_review, date_xpath)
        review = ReviewItem.from_product(
            product, tp='USER', rating=rating, scale=5, date=date,
            author=author, summary=summary, url=review_url, title=title)
        yield review

    if not date:
        # No dated review on the page: retry the same request.
        retries = response.meta.get('ama_retries', 0)
        if retries >= 8:  # 8 tor processes
            self.logger.warning("Max retries, blocked: %s" % response.url)
            return
        retryreq = response.request.copy()
        retryreq.meta['ama_retries'] = retries + 1
        retryreq.meta['dont_merge_cookies'] = True
        retryreq.dont_filter = True
        retryreq.cookies = {}
        yield retryreq
        return

    last_review_in_db = response.meta['last_review_in_db']
    last_date_in_page = dateparser.parse(date, ["%Y:%m:%d"])
    # dateparser signals failure with None (not the string 'None'); fall
    # back to the spider's own date parser in that case.
    if last_date_in_page is None:
        last_date_in_page = self.parserdate(date)

    # Only apply the cut-off when both dates are known: ordering a datetime
    # against None raises TypeError.
    if last_review_in_db and last_date_in_page is not None:
        if last_review_in_db > last_date_in_page:
            return

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response.url, next_page_url)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['asin'] = asin
        request.meta['last_review_in_db'] = last_review_in_db
        yield request