示例#1
0
    def parse_review(self, response, reviewData, extra_parser=None):
        """Build a USER ``ReviewItem`` from a single review record.

        The product is taken from ``response.meta['product']`` and the item
        fields are read from ``reviewData`` by key (``Rating``,
        ``RatingRange``, ``SubmissionTime``, ``UserNickname``, ``Title``,
        ``ReviewText``, ``Pros``, ``Cons``).

        If the resulting item has no ``TestPros`` / ``TestCons`` value, the
        ``Values`` list found under ``TagDimensions`` -> ``Pro`` / ``Con`` is
        joined with ``' ; '`` and stored instead.

        When ``extra_parser`` is given it is called as
        ``extra_parser(review, reviewData)`` and its result is returned.
        """
        product = response.meta['product']

        item = ReviewItem.from_product(
            product=product,
            rating=reviewData['Rating'],
            scale=reviewData['RatingRange'],
            date=date_format(reviewData['SubmissionTime'],
                             '%Y-%m-%dT%H:%M:%S'),
            author=reviewData['UserNickname'],
            title=reviewData['Title'],
            summary=reviewData['ReviewText'],
            pros=reviewData['Pros'],
            cons=reviewData['Cons'],
            tp='USER')

        # (item field, TagDimensions key) pairs used as a fallback source.
        for field, tag in (('TestPros', 'Pro'), ('TestCons', 'Con')):
            if item.get(field, ''):
                continue
            tag_values = reviewData.get('TagDimensions', {}) \
                                   .get(tag, {}) \
                                   .get('Values', [])
            item[field] = ' ; '.join(tag_values)

        if extra_parser:
            return extra_parser(item, reviewData)
        return item
示例#2
0
    def parse_reviews(self, response):
        """Yield USER reviews from a listing page, then the next-page request.

        The product comes from ``response.meta['product']``. For each review
        node the date is formatted and parsed; as soon as a parsed date is
        older than the value returned by
        ``incremental_utils.get_latest_user_review_date_by_sii`` the generator
        returns, which also skips the next-page request. Ratings are stored
        on a scale of 10, keeping only the text before the first ``/``.
        """
        product = response.meta['product']

        # Selectors evaluated relative to a single review node.
        rating_xpath = ".//*[@class='review--header-rating']/text()"
        title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
        summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
        header_xpath = ".//div[@class='review--header-review-info']//text()"
        date_xpath =  ".//div[@class='review--header-review-info']/time/@datetime"
        pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
        cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"

        # Page-level selectors.
        next_page_xpath = "//a[@rel='next']/@href"
        review_nodes = response.xpath("//li[contains(@class, 'reviews__list-item')]")

        newest_stored = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )

        for node in review_nodes:
            date = self.extract_xpath(node, date_xpath)
            if date:
                date = date_format(date, '')
                review_date = dateparser.parse(date, date_formats=['%Y-%m-%d'])
                if review_date < newest_stored:
                    # Everything from here on is already stored.
                    return

            title = self.extract_xpath(node, title_xpath)
            # Keep only the part before the first '/'.
            rating = self.extract_xpath(node, rating_xpath).split('/')[0]

            summary = self.extract_all_xpath(node, summary_xpath)
            pros = self.extract_all_xpath(node, pros_xpath, separator=' ; ')
            cons = self.extract_all_xpath(node, cons_xpath, separator=' ; ')

            # The author is the first '|'-separated chunk of the header text.
            header = self.extract_all_xpath(node, header_xpath)
            author = strip(header.split('|')[0]) if header else ''

            yield ReviewItem.from_product(product=product, tp='USER',
                                          rating=rating, title=title,
                                          date=date, summary=summary,
                                          pros=pros, cons=cons,
                                          author=author, scale=10)

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if not next_page_url:
            return

        follow_up = Request(get_full_url(response, next_page_url),
                            callback=self.parse_reviews)
        follow_up.meta['product'] = product
        yield follow_up
示例#3
0
    def parse_review(self, response):
        """Yield USER reviews from a comments page, then the next-page request.

        ``response.meta`` must carry ``product`` and ``last_user_review``.
        Each comment's ``auth`` paragraph is expected to look like
        ``"Reviewed <date> by <author>, ..."``; the text before the first
        comma is split into date and author. As soon as a parsed review date
        is older than ``last_user_review`` the generator returns, which also
        skips the next-page request. Ratings are stored on a scale of 5,
        keeping only the text before ``' out'`` in the rating title.
        """
        review_xpath = "//ul[@class='comments']/li"
        title_xpath = "./p[@class='hdr']/text()"
        summary_xpath = "./p[@class='msg']/text()"
        rating_xpath = "./ul[contains(@class, 'rating')]/@title"
        date_and_author_xpath = "./p[@class='auth']/text()"
        next_page_xpath = "//div[@class='pg']/a[@class='n']/@href"

        product = response.meta['product']
        last_user_review = response.meta['last_user_review']

        for review in response.xpath(review_xpath):
            date_and_author = self.extract_xpath(review, date_and_author_xpath)
            if date_and_author.startswith('Reviewed'):
                date_and_author = date_and_author[len('Reviewed'):]
            date_and_author = date_and_author.split(',')[0]

            # Split on the first 'by' only, so an author name that itself
            # contains "by" (e.g. "Toby") is not truncated.
            splitted = date_and_author.split('by', 1)
            date = splitted[0].strip()
            # Always (re)bind the author for this review: previously it was
            # left unbound (first review) or carried over from the previous
            # review when the text had no "by" part.
            author = splitted[1].strip() if len(splitted) > 1 else ''

            if date:
                date = date_format(date, '')
                current_user_review = dateparser.parse(
                    date, date_formats=['%Y-%m-%d'])
                if current_user_review < last_user_review:
                    return

            title = self.extract_xpath(review, title_xpath)
            # str.split always returns at least one element, so the text
            # before ' out' can be taken directly.
            rating = self.extract_xpath(review, rating_xpath).split(' out')[0]

            summary = self.extract_all_xpath(review, summary_xpath)

            user_review = ReviewItem.from_product(product=product,
                                                  tp='USER',
                                                  rating=rating,
                                                  title=title,
                                                  date=date,
                                                  summary=summary,
                                                  author=author,
                                                  scale=5)
            yield user_review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_request = Request(url=get_full_url(
                response, next_page_url),
                                        callback=self.parse_review,
                                        meta=response.meta)
            yield next_page_request
示例#4
0
def review_microdata_extruct(review_extruct,
                             product=None,
                             tp='',
                             verdict='',
                             url='',
                             pros='',
                             cons='',
                             award='',
                             award_pic=''):
    """Build a ``ReviewItem`` from an extracted microdata review entry.

    Rating, scale, summary, title, author and date are read from
    ``review_extruct['properties']``; every other field is passed through
    unchanged from the keyword arguments.
    """
    props = review_extruct['properties']

    def from_review_rating(key):
        # The value is either directly on 'reviewRating' or one level down
        # under its own 'properties' mapping.
        direct = props.get('reviewRating', {}).get(key, '')
        return direct or props.get('reviewRating', {}) \
                              .get('properties', {}) \
                              .get(key, '')

    rating = from_review_rating('ratingValue')
    scale = from_review_rating('bestRating')

    summary = props.get('description', '')

    # Title preference: 'name', then 'headline', then 'summary'
    # (mm.de uses summary as review title).
    title = (props.get('name', '')
             or props.get('headline', '')
             or props.get('summary', ''))

    # The author is either a plain string or a nested item with a name.
    author = props.get('author', '')
    if not isinstance(author, basestring):
        author = author.get('properties', {}).get('name', '')

    date = props.get('datePublished', '')

    fields = dict(product=product, tp=tp, rating=rating, scale=scale,
                  date=date, author=author, title=title, summary=summary,
                  verdict=verdict, url=url, pros=pros, cons=cons,
                  award=award, award_pic=award_pic)
    return ReviewItem.from_product(**fields)
示例#5
0
    def parse_reviews(self, response):
        """Yield USER reviews from one page and, if needed, the next page.

        ``response.meta`` must carry ``product``, ``current_page``,
        ``total_pages`` and ``latest_db_date``. The date of the last review
        on the page decides whether paging continues: no request is issued
        when the page yielded no date, when ``total_pages`` is falsy, when
        the current page is the last one, or when ``latest_db_date`` is newer
        than that date. The next page is addressed through the
        ``CurrentPage`` query parameter.
        """
        reviews = response.xpath('//div[contains(@class,"detRating")]')
        product = response.meta['product']
        date_xpath = './/div[@class="date"]/@content'
        rating_xpath = './/div[@class="rat"]/span[1]/text()'
        title_xpath = './/div[@class="title"]//text()'
        summary_xpath = './/div[@class="comm"]//text()'
        # Stays None when the page has no reviews at all.
        date = None
        for review in reviews:
            date = self.extract_xpath(review, date_xpath)
            rating = self.extract_xpath(review, rating_xpath)
            title = self.extract_xpath(review, title_xpath)
            summary = self.extract_all_xpath(review, summary_xpath)
            user_review = ReviewItem.from_product(product=product,
                                                  tp='USER',
                                                  date=date,
                                                  rating=rating,
                                                  title=title,
                                                  summary=summary)
            yield user_review

        current_page = response.meta['current_page']
        total_pages = response.meta['total_pages']
        latest_db_date = response.meta['latest_db_date']

        if not date:
            return
        latest_date_page = dateparser.parse(date, ["%Y-%m-%d"])

        if not total_pages:
            return

        if current_page == total_pages:
            return

        if latest_db_date:
            # Everything older than what is already stored can be skipped.
            if latest_db_date > latest_date_page:
                return

        next_page = current_page + 1
        next_page_url = set_query_parameter(response.url, 'CurrentPage',
                                            next_page)

        # The paging state travels with the request.
        request = Request(url=next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        request.meta['current_page'] = next_page
        request.meta['total_pages'] = total_pages
        request.meta['latest_db_date'] = latest_db_date
        yield request
示例#6
0
    def parse_review(self, response):
        """Yield the product and its PRO review parsed from a test page.

        Raises ``Exception`` when ``self.is_not_logged(response)`` is truthy.
        The category comes from ``response.meta['category']`` and the source
        internal id from the ``products`` query parameter of the URL. The
        review is rated on a scale of ``'100'`` and carries the verdict built
        by ``self.build_verdict``.
        """
        if self.is_not_logged(response):
            raise Exception("Not Logged: %s" % response.url)

        # Product selectors.
        model_xpath = "//tr[contains(@class, 'model')]/td[@colspan=0]/text()"
        manufacturer_xpath = "//tr[contains(@class, 'manufacturer')]/td[@colspan=0]/text()"
        picture_xpath = "//td[@class='compare-table__image']//img/@src"

        # Review selectors.
        test_date_xpath = "//span[@class='push-property' and contains(text(), 'Datum')]/../following-sibling::td/text()"
        rating_xpath = "//div[@class='c-big-rating__num']/text()"

        category = response.meta['category']
        source_internal_id = get_query_parameter(response.url, 'products')

        model = self.extract_xpath(response, model_xpath)
        manufacturer = self.extract_xpath(response, manufacturer_xpath)
        picture = get_full_url(response,
                               self.extract_xpath(response, picture_xpath))

        product = ProductItem.from_response(
            response,
            category=category,
            source_internal_id=source_internal_id,
            product_name="%s %s" % (manufacturer, model),
            url=response.url,
            manufacturer=manufacturer,
            pic_url=picture)
        yield product

        verdict = self.build_verdict(response)
        tested_on = self.extract_xpath(response, test_date_xpath)
        score = self.extract_xpath(response, rating_xpath)

        yield ReviewItem.from_product(product=product,
                                      tp='PRO',
                                      rating=score,
                                      scale='100',
                                      date=tested_on,
                                      verdict=verdict)
示例#7
0
    def _parse_reviews(self, response, product=None):
        """Yield one USER ``ReviewItem`` (scale 5) per review list entry.

        ``product`` falls back to ``response.meta['product']`` when it is not
        given (or falsy). The rating is the first group of
        ``self.rating_regex`` matched against the ``class`` attribute of the
        rating element, or ``''`` when the pattern does not match.
        """
        review_list_xpath = "//ul[contains(@class, 'reviews-content')]/li"
        rating_string_xpath = ".//div[@class='rating']/div/@class"
        author_xpath = ".//div[@class='rating']/following::strong[1]/text()"
        date_xpath = ".//div[@class='rating']/following::small[1]/text()"
        title_xpath = './/h3/text()'
        summary_xpath = './/article/p[not(@class)]/text()'
        pros_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-pros')]/text()"
        cons_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-cons')]/text()"

        review_list = response.xpath(review_list_xpath)

        if not product:
            product = response.meta['product']

        for review_selector in review_list:
            rating_string = self.extract_xpath(review_selector,
                                               rating_string_xpath)
            rating_match = re.match(self.rating_regex, rating_string)
            rating = rating_match.group(1) if rating_match else ''

            title = self.extract_xpath(review_selector, title_xpath)
            # The date is passed through exactly as extracted; the former
            # ``if date: date = date`` was a no-op and has been dropped.
            date = self.extract_xpath(review_selector, date_xpath)
            author = self.extract_xpath(review_selector, author_xpath)
            summary = self.extract_all_xpath(review_selector, summary_xpath)
            pros = self.extract_all_xpath(review_selector, pros_xpath)
            cons = self.extract_all_xpath(review_selector, cons_xpath)

            review = ReviewItem.from_product(product=product, author=author, summary=summary,
                                             date=date, pros=pros, cons=cons, title=title,
                                             rating=rating, tp='USER', scale=5)
            yield review
示例#8
0
    def parse_reviews(self, response):
        """Yield reviews for ``self.asin`` and drive paging / retry logic.

        * When ``response.meta`` has no ``product``, a ``ProductItem`` is
          built from the page and yielded first.
        * Every review block is turned into a USER ``ReviewItem`` (scale 5).
        * When the last processed review produced no date (or there were no
          reviews), the request is re-issued with cleared cookies, up to 8
          times (``ama_retries`` in meta); after that the incremental marker
          is set to ``'0'`` and a warning is logged.
        * In incremental mode paging stops once ``self.last_review_in_db`` is
          newer than the last date on the page.
        * Otherwise the next page is requested; when there is none the
          incremental marker is set to ``'1'``.
        """
        product_name_xpath = "//div[contains(@class, 'product-title')]//text()"
        product_url_xpath = "(//a[@data-hook='product-link'])[1]/@href"
        reviews_xpath = "//div[@id='cm_cr-review_list']/div[@id]"
        next_page_xpath = "//div[@id='cm_cr-pagination_bar']//li[@class='a-last']/a/@href"

        title_xpath = ".//a[contains(@class,'review-title')]/text()"
        review_url_xpath = ".//a[contains(@class,'review-title')]/@href"
        summary_xpath = ".//span[contains(@class,'review-text')]/text()"
        author_xpath = ".//a[contains(@class,'author')]/text()"
        rating_xpath = ".//i[contains(@class, 'review-rating')]/@class"
        date_xpath = ".//span[contains(@class, 'review-date')]/text()"

        def incremental_marker(value):
            # Either returns a ProductIdItem for the caller to yield (when
            # self.incremental is None) or, when self.update_incremental_kind
            # is truthy, updates the stored value and returns None.
            if self.incremental is None:
                marker = ProductIdItem()
                marker['source_internal_id'] = self.asin
                marker['ID_kind'] = 'incremental_scraping'
                marker['ID_value'] = value
                return marker
            if self.update_incremental_kind:
                update_incremental(self.mysql_manager,
                                   self.spider_conf['source_id'],
                                   self.amazon_kind, self.asin,
                                   value)
            return None

        product = response.meta.get('product')

        if not product:
            product_url = self.extract_xpath(response, product_url_xpath)
            if self.asin in product_url:
                product_url = get_full_url(response, product_url)
            else:
                # The link does not point at this ASIN; use the page URL.
                product_url = response.url

            product_name = self.extract_xpath(response, product_name_xpath)
            product = ProductItem.from_response(response,
                                                product_name=product_name,
                                                source_internal_id=self.asin,
                                                url=product_url)
            yield product

        review_nodes = response.xpath(reviews_xpath)
        date = ''

        for node in review_nodes:
            title = self.extract_xpath(node, title_xpath)
            review_url = get_full_url(
                response.url, self.extract_xpath(node, review_url_xpath))
            summary = self.extract_all_xpath(node, summary_xpath)
            author = self.extract_xpath(node, author_xpath)
            rating_class = self.extract_xpath(node, rating_xpath)
            rating_match = re.search(self.rating_re, rating_class)
            rating = rating_match.group(1) if rating_match else ''
            date = self._format_date(node, date_xpath)

            yield ReviewItem.from_product(product=product,
                                          tp='USER',
                                          rating=rating,
                                          scale=5,
                                          date=date,
                                          author=author,
                                          summary=summary,
                                          url=review_url,
                                          title=title)

        if not date:
            retries = response.meta.get('ama_retries', 0)
            if retries >= 8:  #8 tor processes
                marker = incremental_marker('0')
                if marker is not None:
                    yield marker
                self.logger.warning("Max retries, blocked: %s" % response.url)
                return

            # Re-issue the same request with a clean cookie state.
            retryreq = response.request.copy()
            retryreq.meta['ama_retries'] = retries + 1
            retryreq.meta['dont_merge_cookies'] = True
            retryreq.dont_filter = True
            retryreq.cookies = {}
            yield retryreq
            return

        last_date_in_page = dateparser.parse(date, ["%Y:%m:%d"])
        if (self.last_review_in_db and self.incremental
                and self.last_review_in_db > last_date_in_page):
            return

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            follow_up = Request(get_full_url(response.url, next_page_url),
                                callback=self.parse_reviews)
            follow_up.meta['product'] = product
            yield follow_up
        else:
            marker = incremental_marker('1')
            if marker is not None:
                yield marker
示例#9
0
    def parse_reviews(self, response):
        """Parse one page of a PRO review and accumulate it across pages.

        A ``ReviewItem`` is built from the current page (several fields have
        an alternative XPath that is tried when the primary one yields
        nothing; the scale is ``'5'`` only when a rating was found). If
        ``response.meta`` already carries a ``review`` it is merged with the
        current one through ``self.merge_review``. When the page links to a
        further review page that page is requested with the accumulated
        review in meta; otherwise the first paragraph of the review body is
        stored as ``TestVerdict`` and the accumulated review is yielded.
        """
        product = response.meta['product']

        title_xpath = "//meta[@property='og:title']/@content"
        summary_xpath = '//meta[@property="og:description"]/@content'
        alt_summary_xpath = '//meta[@name="Description"]/@content'
        rating_xpath = "//div[contains(@class, 'final-score')]//div[@class='score-fill']/@data-score"
        alt_rating_xpath = '//span[@class="score"]/text()'
        pros_xpath = "//td[contains(@class, 'content-plus')]//li/text()"
        cons_xpath = "//td[contains(@class, 'content-cons')]//li/text()"
        alt_pros_xpath = "//ul[contains(@class, 'article-blurb-features')]//li/text()"
        alt_cons_xpath = "//ul[contains(@class, 'disadvantages')]//li/text()"
        author_xpath = "//span[@class='reviewer']/text()"
        date_xpath = "//span[@class='dtreviewed']/text()"

        last_review_page_url_xpath = '//ol[contains(@class, "page-options")]' \
            '/li/a[@href=""]/parent::li/preceding-sibling::li[1]/a/@href'

        def first_single(primary, fallback):
            # Single-value extraction; the fallback is only queried when the
            # primary XPath yields nothing.
            found = self.extract_xpath(response, primary)
            if found:
                return found
            return self.extract_xpath(response, fallback)

        def first_joined(primary, fallback):
            # Same idea for multi-value extraction joined with ' ; '.
            found = self.extract_all_xpath(response, primary, separator=' ; ')
            if found:
                return found
            return self.extract_all_xpath(response, fallback, separator=' ; ')

        title = self.extract_xpath(response, title_xpath)
        summary = first_single(summary_xpath, alt_summary_xpath)
        rating = first_single(rating_xpath, alt_rating_xpath)
        scale = '5' if rating else ''
        pros = first_joined(pros_xpath, alt_pros_xpath)
        cons = first_joined(cons_xpath, alt_cons_xpath)
        author = self.extract_xpath(response, author_xpath)
        date = self.extract_xpath(response, date_xpath)
        if date:
            date = date_format(date, "%d %B %Y", languages=['en'])

        page_review = ReviewItem.from_product(product=product,
                                              tp='PRO',
                                              rating=rating,
                                              scale=scale,
                                              pros=pros,
                                              cons=cons,
                                              author=author,
                                              title=title,
                                              summary=summary,
                                              date=date,
                                              url=response.url)

        merged_review = response.meta.get('review')
        if merged_review:
            self.merge_review(merged_review, page_review)
        else:
            merged_review = page_review

        last_page_url = self.extract_xpath(response,
                                           last_review_page_url_xpath)

        if not last_page_url:
            # Last page of the review: try to pick up the verdict.
            verdict_xpath = "//div[@id='review-body']/p[1]//text()"
            merged_review['TestVerdict'] = self.extract_xpath(response,
                                                              verdict_xpath)
            yield merged_review
            return

        # The review spans further pages: continue on the linked one and
        # carry the accumulated review along.
        follow_up = Request(get_full_url(response, last_page_url),
                            callback=self.parse_reviews)
        follow_up.meta['review'] = merged_review
        follow_up.meta['product'] = response.meta['product']
        yield follow_up
示例#10
0
    def parse_reviews(self, response):
        """Yield USER reviews (scale 5) and, when configured, the next page.

        One ``ReviewItem`` is produced per ``<li>`` of the response. Paging
        is driven entirely by ``response.meta``: it only happens when
        ``next_page_review_url`` is present, in which case
        ``paging_parameter``, ``current_index``, ``reviews_per_page``,
        ``total_reviews`` and ``last_review_db`` must be present too. Paging
        stops when ``current_index`` has reached ``total_reviews``, when the
        page provided no usable date, or when ``last_review_db`` is newer
        than the last date on the page.
        """
        product = response.meta['product']

        summary_xpath = ".//article/text()"
        rating_xpath = ".//meta[@itemprop='rating']/@content"
        title_xpath = ".//meta[@itemprop='summary']/@content"
        date_xpath = ".//meta[@itemprop='dtreviewed']/@content"
        author_xpath = ".//meta[@itemprop='reviewer']/@content"
        pros_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-pros')]/text()"
        cons_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-cons')]/text()"

        # Bound up front: the paging code below reads ``date`` after the
        # loop, which raised NameError when the page had no <li> at all.
        date = ''

        review_selectors = response.xpath('//li')
        for review_selector in review_selectors:
            rating = self.extract_xpath(review_selector, rating_xpath)
            title = self.extract_xpath(review_selector, title_xpath)
            date = self.extract_xpath(review_selector, date_xpath)
            author = self.extract_xpath(review_selector, author_xpath)
            summary = self.extract_all_xpath(review_selector, summary_xpath)
            pros = self.extract_all_xpath(review_selector, pros_xpath)
            cons = self.extract_all_xpath(review_selector, cons_xpath)

            # Collapse every run of whitespace into a single space.
            pros = re.sub(r"\s+", ' ', pros)
            cons = re.sub(r"\s+", ' ', cons)

            review = ReviewItem.from_product(product=product,
                                             title=title,
                                             rating=rating,
                                             tp='USER',
                                             scale=5,
                                             date=date,
                                             summary=summary,
                                             pros=pros,
                                             cons=cons,
                                             author=author)
            yield review

        next_page_url = response.meta.get('next_page_review_url', None)
        if not next_page_url:
            # No paging configured: nothing else to do (and no need to parse
            # the date at all).
            return

        paging_parameter = response.meta['paging_parameter']
        current_index = response.meta['current_index']
        reviews_per_page = response.meta['reviews_per_page']
        total_reviews = response.meta['total_reviews']
        last_review_db = response.meta['last_review_db']

        if current_index >= total_reviews:  #We reached the end
            return

        last_date_in_page = None
        if date:
            last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])
        if last_date_in_page is None:
            # Without a usable date the comparison below cannot be made;
            # stop paging cleanly instead of failing on it.
            return

        if last_review_db > last_date_in_page:  #reached the end of new data
            return

        next_page_url = set_query_parameter(next_page_url,
                                            paging_parameter,
                                            current_index)

        # Forward the headers of the request that produced this response.
        headers = {
            'Referer': response.request.headers['Referer'],
            'X-Requested-With':
            response.request.headers['X-Requested-With']
        }

        meta = {
            'next_page_review_url': next_page_url,
            'reviews_per_page': reviews_per_page,
            'total_reviews': total_reviews,
            'current_index': current_index + reviews_per_page,
            'paging_parameter': paging_parameter,
            'last_review_db': last_review_db,
            'product': product
        }

        request = Request(next_page_url,
                          meta=meta,
                          headers=headers,
                          callback=self.parse_reviews)
        yield request
示例#11
0
    def parse_reviews(self, response):
        """Yield USER reviews (scale 5) for an ASIN and follow pagination.

        ``response.meta`` must carry ``asin`` and ``last_review_in_db``.
        When the last processed review produced no date (or the page had no
        reviews), the request is re-issued with cleared cookies, up to 8
        times (``ama_retries`` in meta); after that a warning is logged and
        parsing stops. Paging also stops once ``last_review_in_db`` is newer
        than the last date on the page.
        """
        self.logger.debug("Parsing reviews: %s", response.url)
        asin = response.meta['asin']

        product_name_xpath = "//div[contains(@class, 'product-title')]//text()"
        reviews_xpath = "//div[@id='cm_cr-review_list']/div[@id]"
        next_page_xpath = "//div[@id='cm_cr-pagination_bar']//li[@class='a-last']/a/@href"

        title_xpath = ".//a[contains(@class,'review-title')]/text()"
        review_url_xpath = ".//a[contains(@class,'review-title')]/@href"
        summary_xpath = ".//span[contains(@class,'review-text')]/text()"
        author_xpath = ".//a[contains(@class,'author')]/text()"
        rating_xpath = ".//i[contains(@class, 'review-rating')]/@class"
        date_xpath = ".//span[contains(@class, 'review-date')]/text()"

        # NOTE(review): the product is only used to build the reviews; it is
        # not yielded from this callback - confirm that is intended.
        product_name = self.extract_xpath(response, product_name_xpath)
        product = ProductItem.from_response(response,
                                            product_name=product_name,
                                            source_internal_id=asin)

        reviews = response.xpath(reviews_xpath)
        date = ''

        for raw_review in reviews:
            rating = ''
            title = self.extract_xpath(raw_review, title_xpath)
            review_url = self.extract_xpath(raw_review, review_url_xpath)
            review_url = get_full_url(response.url, review_url)
            summary = self.extract_all_xpath(raw_review, summary_xpath)
            author = self.extract_xpath(raw_review, author_xpath)
            raw_rating = self.extract_xpath(raw_review, rating_xpath)
            match = re.search(self.rating_re, raw_rating)
            if match:
                rating = match.group(1)
            date = self._format_date(raw_review, date_xpath)

            review = ReviewItem.from_product(product,
                                             tp='USER',
                                             rating=rating,
                                             scale=5,
                                             date=date,
                                             author=author,
                                             summary=summary,
                                             url=review_url,
                                             title=title)
            yield review

        if not date:
            retries = response.meta.get('ama_retries', 0)
            if retries >= 8:  #8 tor processes
                self.logger.warning("Max retries, blocked: %s" % response.url)
                return

            # Re-issue the same request with a clean cookie state.
            retryreq = response.request.copy()
            retryreq.meta['ama_retries'] = retries + 1
            retryreq.meta['dont_merge_cookies'] = True
            retryreq.dont_filter = True
            retryreq.cookies = {}
            yield retryreq
            return

        last_review_in_db = response.meta['last_review_in_db']

        last_date_in_page = dateparser.parse(date, ["%Y:%m:%d"])
        # The original compared against the string 'None', which a parse
        # result never equals, so this fallback could never run.
        if last_date_in_page is None:
            last_date_in_page = self.parserdate(date)

        if last_review_in_db:
            if last_review_in_db > last_date_in_page:
                return

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response.url, next_page_url)
            request = Request(next_page_url, callback=self.parse_reviews)
            request.meta['asin'] = asin
            request.meta['last_review_in_db'] = last_review_in_db
            yield request