Пример #1
0
    def parse_review(self, node, response):
        """Build a "PRO" review item from one source ``node``.

        ``title``, ``description`` and ``url`` are read from ``node`` itself,
        the remaining fields from its ``meta`` mapping. ``response`` is not
        used in this method.
        """
        item = ReviewItem()

        # No author for the source page
        details = node.get('meta', {})
        item['ProductName'] = node.get('title', '')
        item['source_internal_id'] = details.get('id', '')
        item['TestDateText'] = details.get('review_date', '')
        item['TestSummary'] = node.get('description', '')
        item['TestTitle'] = item.get('ProductName')
        item['TestUrl'] = node.get('url', '')

        rating = details.get('expert_evaluation_float', '')
        item['SourceTestRating'] = rating
        if item.get('SourceTestRating'):
            # source rating scale based on scale of 10
            item['SourceTestScale'] = 10

        item['source_id'] = self.spider_conf['source_id']
        item['DBaseCategoryName'] = 'PRO'

        # Verdict and author are optional: only set them when present.
        optional_fields = (('TestVerdict', 'conclusion'),
                           ('Author', 'reviewer'))
        for field, meta_key in optional_fields:
            value = details.get(meta_key, '')
            if value:
                item[field] = value

        return item
Пример #2
0
    def parse_review(self, response):
        """Build a "PRO" review item for the page in ``response``.

        The item is seeded from the page's ``Review`` JSON-LD, falling back
        to its ``NewsArticle`` JSON-LD and finally to an empty ``ReviewItem``.
        Product name, internal id, pros and cons are then filled in from the
        page itself.
        """
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, "Review")

        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld)
        else:
            # Only look for the article markup when there is no Review.
            article_json_ld = extruct_helper.extract_json_ld(
                response.text, "NewsArticle")
            if article_json_ld:
                review = extruct_helper.review_item_from_article_json_ld(
                    article_json_ld)
            else:
                review = ReviewItem()

        review['DBaseCategoryName'] = 'PRO'
        if not review.get('TestUrl', ''):
            review['TestUrl'] = response.url

        review['ProductName'] = self.extract(
            response.xpath(
                "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
        if not review.get('ProductName', ''):
            review['ProductName'] = self.get_product_name(response)

        # The id is the fifth "/"-separated piece of the URL. Read it from
        # the URL itself instead of from the textual form of the response
        # object, and do not fail on URLs that are too short to contain it.
        url_parts = response.url.split("/")
        review['source_internal_id'] = (
            url_parts[4] if len(url_parts) > 4 else '')

        review['TestPros'] = self.extract(
            response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
        review['TestCons'] = self.extract(
            response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))

        return review
Пример #3
0
    def parse_review(self, response, reviewData, extra_parser=None):
        """Turn one ``reviewData`` record into a "USER" review item.

        The product comes from ``response.meta['product']``. When the record
        has no pros/cons, they are taken from its ``TagDimensions`` values.
        ``extra_parser``, if given, is called as
        ``extra_parser(review, reviewData)`` and its result is returned.
        """
        product = response.meta['product']

        item_fields = {
            'product': product,
            'rating': reviewData['Rating'],
            'scale': reviewData['RatingRange'],
            'date': date_format(reviewData['SubmissionTime'],
                                '%Y-%m-%dT%H:%M:%S'),
            'author': reviewData['UserNickname'],
            'title': reviewData['Title'],
            'summary': reviewData['ReviewText'],
            'pros': reviewData['Pros'],
            'cons': reviewData['Cons'],
            'tp': 'USER',
        }
        review = ReviewItem.from_product(**item_fields)

        # Fall back to the tag lists for whichever of pros/cons is empty.
        for field, tag in (('TestPros', 'Pro'), ('TestCons', 'Con')):
            if review.get(field, ''):
                continue
            dimensions = reviewData.get('TagDimensions', {})
            values = dimensions.get(tag, {}).get('Values', [])
            review[field] = ' ; '.join(values)

        if not extra_parser:
            return review
        return extra_parser(review, reviewData)
Пример #4
0
    def _parse_reviews(self, selector, browser, product):
        review_container_xpath = "//article[contains(@id, 'review_')]"

        author_xpath = ".//h4[@class='attribution-name']/text()"
        rating_xpath = ".//div[@class='overall_score_stars']/@title"
        pros_xpath = ".//dd[@class='pros']/text()"
        cons_xpath = ".//dd[@class='cons']/text()"
        next_page_xpath = "//a[@class='next_page']"
        review_containers = selector.xpath(review_container_xpath)

        for review_container in review_containers:
            review = ReviewItem()
            review['DBaseCategoryName'] = "USER"
            review['ProductName'] = product['ProductName']
            review['TestUrl'] = product['TestUrl']
            review['Author'] = self.extract(
                review_container.xpath(author_xpath))
            review['SourceTestRating'] = self.extract(
                review_container.xpath(rating_xpath))

            review['TestPros'] = self.extract_all(
                review_container.xpath(pros_xpath), separator=' ; ')

            review['TestCons'] = self.extract_all(
                review_container.xpath(cons_xpath), separator=' ; ')

            if review['TestPros'] and review['TestCons']:
                yield review

        next_page = selector.xpath(next_page_xpath)
        if next_page:
            next_page_selector = browser.click(next_page_xpath)
            for review in self._parse_reviews(next_page_selector, browser,
                                              product):
                yield review
Пример #5
0
    def init_item_by_xpaths(self, response, item_type, fields, selector=None):
        """Create an item of ``item_type`` and fill it from XPath queries.

        ``item_type`` must be one of ``'review'``, ``'product'``,
        ``'product_id'`` or ``'category'``; anything else raises
        ``Exception``. ``fields`` maps item field names to XPath expressions,
        which are evaluated against ``selector`` (or against a selector built
        from ``response`` when none is given). Review and product items also
        get ``TestUrl`` set to ``response.url``.

        Returns the populated item.
        """
        if not selector:
            selector = Selector(response=response)

        if item_type not in ('review', 'product', 'product_id', 'category'):
            raise Exception("Invalid item type: %s" % item_type)

        if item_type == "review":
            item = ReviewItem()
        elif item_type == "product":
            item = ProductItem()
        elif item_type == "product_id":
            item = ProductIdItem()
        elif item_type == "category":
            item = CategoryItem()

        if item_type in ('review', 'product'):
            item["TestUrl"] = response.url

        is_review = item_type == "review"
        for field in fields:
            # TODO: maybe check field.
            nodes = selector.xpath(fields[field])
            # Membership in a real tuple: the previous single string
            # "TestPros, TestCons" also matched any substring of it
            # (e.g. "Test" or "Pros").
            if is_review and field in ("TestPros", "TestCons"):
                item[field] = self.extract_all(nodes, " ; ")
            else:
                item[field] = self.extract_all(nodes)
        return item
Пример #6
0
 def parse_reviews(self, response):
     """Yield the page's product item, then one "USER" review per review box.

     ``response.meta`` must provide ``category`` (with a ``category_path``
     entry) and ``product_id``.
     """
     meta = response.meta
     category = meta['category']

     product = ProductItem()
     product['TestUrl'] = response.url
     product['OriginalCategoryName'] = category['category_path']
     name_nodes = response.xpath('//span[@class="fn"]/text()')
     product['ProductName'] = self.extract(name_nodes)
     product_id = meta['product_id']
     product['PicURL'] = ''.join(('http://geizhals.at/p/', product_id, '.jpg'))
     product['source_internal_id'] = product_id
     yield product

     for box in response.xpath('//li[contains(@class,"gh_box")]'):
         user_review = ReviewItem()
         user_review['DBaseCategoryName'] = "USER"
         user_review['ProductName'] = product['ProductName']
         user_review['TestUrl'] = product['TestUrl']

         # Drop the surrounding 'a', 'm' and space characters from the text.
         raw_date = self.extract(box.xpath('.//div[@class="userbox"]/text()'))
         date = raw_date.strip('am ')
         user_review['TestDateText'] = date_format(date, "%d.%m.%Y %H:%M")

         rating_nodes = box.xpath('.//span[@itemprop="rating"]/text()')
         user_review['SourceTestRating'] = self.extract(rating_nodes)
         author_nodes = box.xpath('.//span[contains(@class,"nick")]/text()')
         user_review['Author'] = self.extract(author_nodes)
         user_review['TestTitle'] = self.extract(box.xpath('.//h3//text()'))
         summary_nodes = box.xpath('.//div[@itemprop="description"]//text()')
         user_review['TestSummary'] = self.extract_all(summary_nodes)
         user_review['source_internal_id'] = product['source_internal_id']
         yield user_review
Пример #7
0
    def parse_review(self, response):
        """Yield the "USER" reviews on this page, then a request for the next.

        ``response.meta`` must provide ``product`` and ``review_url``. When
        the page has a ``rel="next"`` link, a ``Request`` for it is yielded
        with this method as callback and the same ``meta``.
        """
        meta = response.meta
        product = meta['product']
        review_url = meta['review_url']

        for node in response.xpath("//div[@itemprop='review']"):
            rating_nodes = node.xpath(".//*[@itemprop='ratingValue']/@content")
            date_nodes = node.xpath(".//span[@itemprop='datePublished']/text()")
            # Review body text, leaving out text that sits inside links.
            body_nodes = node.xpath(
                ".//p[@itemprop='reviewBody']//text()[not(ancestor::a)]")
            author_nodes = node.xpath(".//a[@itemprop='author']/text()")
            title_nodes = node.xpath(".//*[@itemprop='name']/text()")

            review = ReviewItem()
            review['SourceTestRating'] = self.extract(rating_nodes)
            review['TestDateText'] = self.extract(date_nodes)
            review['TestSummary'] = self.extract_all(body_nodes)
            review['Author'] = self.extract(author_nodes)
            review['TestTitle'] = self.extract(title_nodes)
            review['TestUrl'] = review_url
            review["SourceTestScale"] = "5"

            review['ProductName'] = product['ProductName']
            review['source_internal_id'] = product['source_internal_id']

            review["DBaseCategoryName"] = "USER"
            raw_date = review["TestDateText"]
            if raw_date:
                review["TestDateText"] = date_format(raw_date, '')

            yield review

        next_href = self.extract(response.xpath("//*[@rel='next']/@href"))
        if not next_href:
            return
        next_url = get_full_url(response.url, next_href)
        yield Request(next_url, callback=self.parse_review, meta=meta)
Пример #8
0
    def _parse_reviews(self, selector, browser, product):
        review_container_xpath = "//div[@data-review-id]"

        author_xpath = ".//p[@class='pr-review-author-name']/span/text()"
        rating_xpath = ".//span[contains(@class, 'pr-rating')]/text()"
        title_xpath = ".//p[@class='pr-review-rating-headline']"
        test_date_xpath = ".//div[contains(@class, 'pr-review-author-date')]/text()"
        summary_xpath = ".//p[@class='pr-comments']/text()"
        next_page_xpath = "//a[@class='next_page']"
        review_containers = selector.xpath(review_container_xpath)

        for review_container in review_containers:
            review = ReviewItem()
            review['DBaseCategoryName'] = "USER"
            review['ProductName'] = product['ProductName']
            review['TestUrl'] = product['TestUrl']
            review['Author'] = self.extract(review_container.xpath(author_xpath))
            review['SourceTestRating'] = self.extract(review_container.xpath(
                rating_xpath))

            review['TestTitle'] = self.extract(review_container.xpath(title_xpath))
            review['TestSummary'] = self.extract(review_container.xpath(summary_xpath))

            review['TestDateText'] = self.extract(review_container.xpath(test_date_xpath))
            review['TestDateText'] = date_format(review['TestDateText'],
                                                 '%d.%m.%Y')
            yield review
Пример #9
0
    def parse_review(self, response):
        """Yield the single "USER" review shown on this page.

        The product name comes from ``response.meta['product']``. Date and
        rating are only set when the page provides them, so a page without
        a recognisable "... star" title still produces a review instead of
        failing.
        """
        product = response.meta['product']

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = response.url
        date = self.extract(
            response.xpath(
                '//span[@class="dtreviewed"]/span[@class="value-title"]/@title'
            ))
        if date:
            user_review['TestDateText'] = date_format(date, '')
        rating_title = self.extract(
            response.xpath(
                '//div[@class="contentBox"]//a[contains(@class,"iReviewStars")]/@title'
            ))
        # First "... star" fragment of the title. Indexing the findall()
        # result directly raised IndexError when there was none.
        rating_match = re.search(r'[^"]+ star', rating_title)
        if rating_match:
            user_review['SourceTestRating'] = rating_match.group(0)
        user_review['Author'] = self.extract(
            response.xpath('//a[@class="memberName"]/text()'))
        user_review['TestTitle'] = self.extract(
            response.xpath('//h3[contains(@class,"reviewTitle")]/text()'))
        user_review['TestSummary'] = self.extract_all(
            response.xpath('//div[contains(@class,"reviewText")]//text()'))
        user_review['TestPros'] = self.extract_all(
            response.xpath('//span[@class="reviewPros"]/parent::div/text()'))
        user_review['TestCons'] = self.extract_all(
            response.xpath('//span[@class="reviewCons"]/parent::div/text()'))
        yield user_review
Пример #10
0
    def parse_reviews(self, response):
        """Yield a review item and a product item for every post on the page.

        Each ``article.post-content`` element produces its own fresh
        ``ReviewItem`` and ``ProductItem`` (review first, then product).
        The internal id is the second-to-last "/"-separated piece of the
        post URL, or an empty string when the URL has no such piece.
        """
        contents = response.xpath('//article[@class="post-content"]')
        for content in contents:
            title = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//text()'))
            test_url = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//a/@href'))
            author = self.extract(
                content.xpath('.//span[@itemprop="name"]/text()'))
            date_str = self.extract_all(
                content.xpath('.//meta[@itemprop="datePublished"]/@content'))
            date = date_format(date_str, '%Y-%m-%d')
            pic = self.extract(content.xpath('.//img/@src'))
            summary = self.extract_all(
                content.xpath('.//div[@itemprop="articleBody"]//text()'))

            # A missing/short URL must not abort the whole page.
            url_parts = test_url.split('/')
            sid = url_parts[-2] if len(url_parts) > 1 else ''

            # New items per post: re-yielding one shared instance would let
            # later posts overwrite the fields of items already handed out.
            product = ProductItem()
            product['ProductName'] = title
            product['PicURL'] = pic
            product['source_internal_id'] = sid
            product['TestUrl'] = test_url

            review = ReviewItem()
            review['ProductName'] = title
            review['TestTitle'] = title
            review['TestSummary'] = summary
            review['TestUrl'] = test_url
            review['DBaseCategoryName'] = 'pro'
            review['source_internal_id'] = sid
            review['TestDateText'] = date
            review['Author'] = author

            yield review
            yield product
Пример #11
0
    def parse_review(self, response):
        """Yield the page's product item and start building its review.

        The product category is taken from ``response.meta['category']``.

        NOTE(review): ``sourceTestRating_xpath`` is defined but not used
        below, and ``review`` is filled in but never yielded -- this method
        looks unfinished; confirm against the full spider.
        """
        product = ProductItem()

        # "hearder" was presumably a typo for the <header> element, and
        # <meta> tags carry their value in @content, not in text nodes.
        product_name_xpath = "//header[@class='gutter-top']/h1[@itemprop='name']/text()"
        pic_url_xpath = "//meta[@property='og:image']/@content"

        product['ProductName'] = self.extract(response.xpath(product_name_xpath))
        product['OriginalCategoryName'] = response.meta['category']
        product['PicURL'] = self.extract(response.xpath(pic_url_xpath))

        yield product

        testTitle_xpath = "//meta[@property='og:title']/@content"
        # The previous expression had no path separator before the inner
        # div and no "@" on class, which is not a valid XPath.
        testSummary_xpath = "//div[@class='segment-article gutter-bottom-lg']//div[@class='row']/div/p/text()"
        author_xpath = ".//span[@class='review-created-by']/text()"
        testDateText_xpath = ".//span[@class='review-created-by']/text()"
        sourceTestRating_xpath = ".//span[@class='review-rating']/img/@src"

        review = ReviewItem()
        review["TestUrl"] = response.url
        review["DBaseCategoryName"] = "USER"
        review["SourceTestScale"] = "5"
        review["ProductName"] = product["ProductName"]
        review["TestTitle"] = self.extract_all(response.xpath(testTitle_xpath))
        review["TestSummary"] = self.extract_all(response.xpath(testSummary_xpath), " ")
        review["Author"] = self.extract(response.xpath(author_xpath))
        review["TestDateText"] = self.extract(response.xpath(testDateText_xpath))
Пример #12
0
    def parse_reviews(self, response):
        """Yield one "USER" review per entry of the page's reviews list.

        Product name, URL and internal id come from
        ``response.meta['product']``. The rating is the ``width`` percentage
        found in the style attribute of the ``rateit-selected`` element, with
        a trailing ``.00`` removed (``"width:80.00%"`` -> ``"80"``); it is an
        empty string when no such percentage is present.
        """
        import re  # local import: only needed for the width parsing below

        product = response.meta['product']
        reviews = response.xpath('//ul[@class="reviews-list"]/li')
        width_re = re.compile(r'width:\s*(\d+(?:\.\d+)?)\s*%')

        for review in reviews:
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            date = self.extract(review.xpath('.//time/@datetime'))
            if date:
                user_review['TestDateText'] = date_format(date, "%Y %m %d")

            style = self.extract(
                review.xpath(
                    './/div[contains(@class,"rateit-selected")]/@style'))
            # str.strip() removes *characters*, not a prefix/suffix, so the
            # old strip('width:').strip('.00%') turned "80.00%" into "8" and
            # "100.00%" into "1". Parse the number explicitly instead.
            rating = ''
            width_match = width_re.search(style or '')
            if width_match:
                rating = width_match.group(1)
                if '.' in rating:
                    rating = rating.rstrip('0').rstrip('.')
            user_review['SourceTestRating'] = rating

            user_review['Author'] = self.extract(
                review.xpath('.//div[@class="customer"]/span/text()'))
            user_review['TestTitle'] = self.extract(
                review.xpath('.//div[@class="title"]/text()'))
            user_review['TestSummary'] = self.extract_all(
                review.xpath('.//div[@class="copy"]/p/text()'))
            yield user_review
Пример #13
0
    def parse_reviews(response):
        """Yield "USER" reviews scraped with regexes from ``response.body``.

        Every ``"CID":...}`` chunk that does not contain ``"Badges"`` is
        treated as one review. Product name and URL come from
        ``response.meta['product']``. Parsing is best effort: a chunk that
        cannot be turned into a review (e.g. no submission time or rating)
        is skipped.
        """
        reviews = re.findall(r'"CID":(((?!("Badges")).)+)}', response.body)

        for item in reviews:
            try:
                review = item[0]
                product = response.meta['product']
                user_review = ReviewItem()
                user_review['DBaseCategoryName'] = "USER"
                user_review['ProductName'] = product['ProductName']
                user_review['TestUrl'] = product['TestUrl']
                date = re.findall(r'"SubmissionTime":"([\d-]+)', review)
                user_review['TestDateText'] = date_format(date[0], "%Y-%m-%d")
                rate = re.findall(r'"Rating":([\d])', review)
                user_review['SourceTestRating'] = rate[0]
                author = re.findall(r'"UserNickname":"([^"]+)', review)
                if author:
                    user_review['Author'] = author[0]
                title = re.findall(r'"Title":"([^"]+)', review)
                if title:
                    user_review['TestTitle'] = title[0]
                summary = re.findall(r'"ReviewText":"([^"]+)', review)
                if summary:
                    user_review['TestSummary'] = summary[0]
            except Exception:
                # Still best effort, but no longer a bare ``except``: that
                # also swallowed GeneratorExit/KeyboardInterrupt.
                continue
            # Yield outside the try block so that closing the generator is
            # never intercepted by the error handling above.
            yield user_review
Пример #14
0
    def parse_reviews(response):
        """Yield "USER" reviews scraped with regexes from ``response.body``.

        A review chunk is the text between ``TagDimensions`` and
        ``ModerationStatus``. Only chunks whose ``ProductId`` matches the
        ``source_internal_id`` of ``response.meta['product']`` are kept.
        """
        chunk_pattern = (
            r'TagDimensions(((?!(TagDimensions|SyndicationSource)).)+)'
            r'ModerationStatus')
        optional_fields = (
            ('Author', r'"UserNickname":"([^"]+)'),
            ('TestTitle', r'"Title":"([^"]+)'),
            ('TestSummary', r'"ReviewText":"([^"]+)'),
        )

        for groups in re.findall(chunk_pattern, response.body):
            chunk = groups[0]
            product_ids = re.findall(r'"ProductId":"([\d-]+)', chunk)
            product = response.meta['product']
            if product['source_internal_id'] not in product_ids:
                continue

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']

            dates = re.findall(r'"SubmissionTime":"([\d-]+)', chunk)
            user_review['TestDateText'] = date_format(dates[0], "%Y-%m-%d")
            ratings = re.findall(r'"Rating":([\d])', chunk)
            user_review['SourceTestRating'] = ratings[0]

            # These fields are only set when the chunk contains them.
            for field, pattern in optional_fields:
                found = re.findall(pattern, chunk)
                if found:
                    user_review[field] = found[0]

            yield user_review
Пример #15
0
    def parse_reviews(self, response):
        """Yield the page's product item, then one "USER" review per opinion.

        The category path is read from
        ``response.meta['category']['category_path']``.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        category = response.meta['category']
        product['OriginalCategoryName'] = category['category_path']

        product_queries = (
            ('ProductName', '//h1/a/text()'),
            ('PicURL', '//meta[@property="og:image"]/@content'),
            ('ProductManufacturer', '//meta[@itemprop="brand"]/@content'),
            ('source_internal_id', '//@data-product-id'),
        )
        for field, query in product_queries:
            product[field] = self.extract(response.xpath(query))
        yield product

        for opinion in response.xpath('//li[@class="opinion-row"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            for shared in ('ProductName', 'TestUrl', 'source_internal_id'):
                user_review[shared] = product[shared]

            published = self.extract(
                opinion.xpath('.//meta[@itemprop="datePublished"]/@content'))
            user_review['TestDateText'] = date_format(published, "%Y %m %d")

            rating_nodes = opinion.xpath(
                './/meta[@itemprop="ratingValue"]/@content')
            user_review['SourceTestRating'] = self.extract(rating_nodes)
            user_review['Author'] = self.extract(opinion.xpath('.//h4/text()'))
            title_nodes = opinion.xpath(
                './/div[contains(@class,"grade-text")]/text()')
            user_review['TestTitle'] = self.extract(title_nodes)
            summary_nodes = opinion.xpath(
                './/div[@itemprop="description"]/text()')
            user_review['TestSummary'] = self.extract_all(summary_nodes)
            yield user_review
Пример #16
0
    def parse_product(self, response):
        """Yield the page's product item, then one "USER" review per article.

        The category path is read from
        ``response.meta['category']['category_path']``. A review's date is
        only set when its date text contains a 10-character run of digits
        and slashes.
        """
        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        product_queries = (
            ('ProductName', '//h1/text()'),
            ('PicURL', '//div[@class="images"]/a/img/@src'),
            ('ProductManufacturer',
             '//span[text()="Marca"]/parent::li/span[@class="value"]/text()'),
            ('source_internal_id', '//input[@id="prodId"]/@value'),
        )
        for field, query in product_queries:
            product[field] = self.extract(response.xpath(query))
        yield product

        for node in response.xpath('//article[@itemscope]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['source_internal_id'] = product['source_internal_id']
            user_review['TestUrl'] = product['TestUrl']

            raw_date = self.extract(node.xpath('.//div[@class="date"]/text()'))
            found = re.search(r'[\d/]{10}', raw_date)
            if found:
                user_review['TestDateText'] = date_format(found.group(0),
                                                          "%d/%m/%Y")

            rating_nodes = node.xpath('.//span[@itemprop="ratingValue"]/text()')
            user_review['SourceTestRating'] = self.extract(rating_nodes)
            user_review['Author'] = self.extract(node.xpath('.//h2/a/text()'))
            user_review['TestTitle'] = self.extract(node.xpath('.//h3/a/text()'))
            body_nodes = node.xpath('.//p[@itemprop="reviewBody"]/text()')
            user_review['TestSummary'] = self.extract_all(body_nodes)
            pros_nodes = node.xpath('.//div[@class="pro"]//li/text()')
            user_review['TestPros'] = self.extract_all(pros_nodes, '; ')
            cons_nodes = node.xpath('.//div[@class="con"]//li/text()')
            user_review['TestCons'] = self.extract_all(cons_nodes, '; ')
            yield user_review
Пример #17
0
    def parse_product(self, response):
        """Yield the page's product item, then one "USER" review per opinion.

        Opinion blocks that contain a link whose text includes
        "Opinia z serwisu Ceneo.pl" are left out. The category path is read
        from ``response.meta['category']['category_path']``.
        """
        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        name_nodes = response.xpath('//h1[@itemprop="itemreviewed"]/text()')
        product['ProductName'] = self.extract(name_nodes)
        picture_nodes = response.xpath(
            '//div[@class="productPhotoGallery"]/div/img/@src')
        product['PicURL'] = self.extract(picture_nodes)
        maker_nodes = response.xpath(
            '//div[@class="manufacturer"]//span[not(text()="brak")]/text()')
        product['ProductManufacturer'] = self.extract(maker_nodes)
        yield product

        opinion_query = (
            '//div[@class="opinion"]'
            '[not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]')
        for opinion in response.xpath(opinion_query):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']

            raw_date = self.extract(
                opinion.xpath('.//span[@class="date"]/text()'))
            user_review['TestDateText'] = date_format(raw_date, "%Y-%m-%d")

            points_nodes = opinion.xpath('.//span[@class="points"]/text()')
            user_review['SourceTestRating'] = self.extract(points_nodes)
            author_nodes = opinion.xpath('.//*[@class="profileName"]//text()')
            user_review['Author'] = self.extract_all(author_nodes)
            text_nodes = opinion.xpath('.//div[@class="text"]//text()')
            user_review['TestSummary'] = self.extract_all(text_nodes)
            plus_nodes = opinion.xpath('.//ul[@class="pluses"]//span/text()')
            user_review['TestPros'] = self.extract_all(plus_nodes, '; ')
            minus_nodes = opinion.xpath('.//ul[@class="minuses"]//span/text()')
            user_review['TestCons'] = self.extract_all(minus_nodes, '; ')
            yield user_review
Пример #18
0
def review_item_from_review_json_ld(json_ld, _review=None, overwrite=False):
    """Fill a review item from a JSON-LD object of type ``Review``.

    ``json_ld`` is the decoded JSON-LD mapping. ``_review`` is the item to
    fill; a new ``ReviewItem`` is created when it is ``None``. Fields that
    already have a value are kept unless ``overwrite`` is true.

    Rating scale, rating and product name are handled here; the remaining
    fields are delegated to ``review_item_from_article_json_ld``. Returns
    the filled item.
    """
    review = ReviewItem() if _review is None else _review
    html_parser = HTMLParser()

    review_rating_obj = json_ld.get('reviewRating', {})

    if review_rating_obj and (overwrite
                              or not review.get('SourceTestScale', '')):
        # according to Google Developers, 5 is the default best rating
        review['SourceTestScale'] = review_rating_obj.get('bestRating', 5)

    if review_rating_obj and (overwrite
                              or not review.get('SourceTestRating', '')):
        rating = review_rating_obj.get('ratingValue', None)
        review['SourceTestRating'] = rating
        if rating is not None:
            # Do not assign rating from JSON LD if its value is less than
            # that of worst rating
            try:
                # according to Google Developers, 1 is the default worst
                # rating
                worst_rating = float(review_rating_obj.get('worstRating', 1))
                if float(rating) < worst_rating:
                    review['SourceTestRating'] = ''
            except (TypeError, ValueError):
                # Non-numeric rating values cannot be compared; keep the
                # rating exactly as the source gave it.
                pass

    if overwrite or not review.get('ProductName', ''):
        item_reviewed = json_ld.get('itemReviewed', {})
        # Only an object can carry a ``name``; ignore any other shape
        # instead of failing on a missing ``.get``.
        product_name = ''
        if isinstance(item_reviewed, dict):
            product_name = item_reviewed.get('name', '')
        if product_name:
            review['ProductName'] = html_parser.unescape(product_name).strip()

    # For all the information we can extract from 'Article' JSON-LD, the way
    # to extract them from 'Review' JSON-LD is exactly the same
    review = review_item_from_article_json_ld(json_ld, review, overwrite)

    return review
Пример #19
0
    def parse_product(self, response):
        """Yield the page's product item, then one "USER" review per entry.

        The category path is read from
        ``response.meta['category']['category_path']``; the picture URL is
        passed through ``get_full_url`` together with ``response``.
        """
        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))
        picture_nodes = response.xpath(
            '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src')
        product['PicURL'] = get_full_url(response, self.extract(picture_nodes))
        maker_nodes = response.xpath(
            '//td[text()="Constructeur"]/following-sibling::td/text()')
        product['ProductManufacturer'] = self.extract(maker_nodes)
        yield product

        review_queries = (
            ('SourceTestRating', './/span[@itemprop="ratingValue"]/text()'),
            ('Author', './/span[@itemprop="author"]/text()'),
            ('TestTitle', './/div[@itemprop="name"]/text()'),
        )
        for node in response.xpath('//li[@itemprop="review"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']

            published = self.extract(
                node.xpath('.//span[@itemprop="datePublished"]/text()'))
            user_review['TestDateText'] = date_format(published, '%d/%m/%Y')

            for field, query in review_queries:
                user_review[field] = self.extract(node.xpath(query))
            user_review['TestSummary'] = self.extract_all(
                node.xpath('.//blockquote/text()'))
            yield user_review
Пример #20
0
    def parse_product(self, response):
        """Yield product, "USER" reviews and a request for the expert review.

        Nothing is yielded when the page has no section containing review
        articles. When an expert review link is found, a ``Request`` for it
        is yielded with ``self.parse_review`` as callback and the product
        stored in its ``meta``.
        """
        sections = response.xpath(
            '//section[article[contains(@class,"review")]]')
        if not sections:
            return

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = 'Cell Phones'
        name_nodes = response.xpath('//meta[@itemprop="name"]/@content')
        product['ProductName'] = self.extract(name_nodes)
        picture_nodes = response.xpath('//meta[@property="og:image"]/@content')
        product['PicURL'] = get_full_url(response, self.extract(picture_nodes))
        brand_nodes = response.xpath('//meta[@itemprop="brand"]/@content')
        product['ProductManufacturer'] = self.extract(brand_nodes)
        yield product

        for node in sections.xpath('./article[@itemprop="review"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']

            posted = self.extract(node.xpath('.//span[@class="time"]/text()'))
            user_review['TestDateText'] = date_format(posted, '')

            rating_nodes = node.xpath(
                './/meta[@itemprop="ratingValue"]/@content')
            user_review['SourceTestRating'] = self.extract(rating_nodes)
            author_nodes = node.xpath('.//span[@itemprop="author"]/text()')
            user_review['Author'] = self.extract(author_nodes)
            positive_nodes = node.xpath(
                './/div[contains(@class,"positives")]/text()')
            user_review['TestPros'] = self.extract_all(positive_nodes, '; ')
            negative_nodes = node.xpath(
                './/div[contains(@class,"negatives")]/text()')
            user_review['TestCons'] = self.extract_all(negative_nodes, '; ')
            yield user_review

        expert_href = self.extract(
            sections.xpath('./article[contains(@class,"expert")]/div/a/@href'))
        if expert_href:
            expert_url = get_full_url(response, expert_href)
            request = Request(url=expert_url, callback=self.parse_review)
            request.meta['product'] = product
            yield request
Пример #21
0
def review_item_from_article_json_ld(json_ld, _review=None, overwrite=False):
    """Populate a review item from an article-style JSON-LD mapping.

    Field mapping: ``description`` -> ``TestSummary``, ``name`` (falling back
    to ``headline``) -> ``TestTitle``, ``author`` -> ``Author`` and
    ``datePublished`` -> ``TestDateText`` (passed through ``date_format``).
    Text values are HTML-unescaped and stripped.

    Args:
        json_ld: mapping with the JSON-LD properties.
        _review: item to update; a new ``ReviewItem`` is created when None.
        overwrite: when true, fields that already hold a value are replaced;
            otherwise only empty or missing fields are filled.

    Returns:
        The updated review item.
    """
    # HTMLParser.unescape was removed in Python 3.9; prefer html.unescape and
    # only fall back to the parser method where the html module is missing.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape
    # str and unicode on Python 2, just str on Python 3.
    text_types = (str, type(u''))

    # Identity check rather than truthiness: an empty mapping-like item handed
    # in by the caller may be falsy but is still the object to fill.
    review = _review if _review is not None else ReviewItem()

    if overwrite or not review.get('TestSummary', ''):
        summary = json_ld.get('description', '')
        if summary:
            review['TestSummary'] = unescape(summary).strip()

    if overwrite or not review.get('TestTitle', ''):
        title = json_ld.get('name', '') or json_ld.get('headline', '')
        if title:
            review['TestTitle'] = unescape(title).strip()

    if overwrite or not review.get('Author', ''):
        # "author" may be a single object, a list of objects/strings, a bare
        # string, or absent/null; dispatch on the type instead of relying on
        # a catch-all exception handler.
        author = json_ld.get('author')
        if isinstance(author, dict):
            author_str = author.get('name', '')
        elif isinstance(author, (list, tuple)):
            names = [entry.get('name', '') if isinstance(entry, dict) else entry
                     for entry in author]
            author_str = ', '.join(
                name for name in names
                if isinstance(name, text_types) and name)
        elif isinstance(author, text_types):
            author_str = author
        else:
            author_str = ''

        if author_str:
            review['Author'] = unescape(author_str).strip()

    if overwrite or not review.get('TestDateText', ''):
        test_date_text = json_ld.get('datePublished', '')
        if test_date_text:
            review['TestDateText'] = date_format(test_date_text, '')

    return review
Пример #22
0
 def level_4(self, response):
     """Yield a USER review item for each entry of a batched JSONP payload.

     ``response.meta`` must carry ``ProductName`` and ``TestUrl``; both are
     copied onto every review. Entries that cannot be converted are skipped
     individually.
     """
     pname = response.meta["ProductName"]
     test_url = response.meta["TestUrl"]
     # Strip the JSONP wrapper (callback name plus the surrounding
     # parentheses) so that the remainder parses as plain JSON.
     # NOTE(review): passing str arguments assumes response.body is a str
     # (Python 2) -- confirm before running on Python 3.
     json_string = response.body.replace('bv_1111_60234', '').strip('()')
     data = json.loads(json_string)
     results = data['BatchedResults']['q0']['Results'] or []
     for item in results:
         try:
             review = ReviewItem()
             review['DBaseCategoryName'] = "USER"
             review['ProductName'] = pname
             review['TestUrl'] = test_url
             review['source_internal_id'] = item['ProductId']
             review['TestDateText'] = item['SubmissionTime']
             if review['TestDateText']:
                 review['TestDateText'] = date_format(
                     review['TestDateText'], '')
             review['SourceTestRating'] = item['Rating']
             review['SourceTestScale'] = '5'
             review['Author'] = item['UserNickname']
             review['TestTitle'] = item['Title']
             review['TestSummary'] = item['ReviewText']
             review['TestPros'] = item['Pros']
             review['TestCons'] = item['Cons']
         except Exception:
             # Still best effort, but a malformed entry is dropped on its own
             # instead of silently discarding every entry that follows it.
             continue
         yield review
Пример #23
0
    def parse_product(self, response):
        """Yield the product, its MPN id (when present) and its user reviews.

        The category name is read from the ``sectionValue`` assignment inside
        an inline script; the manufacturer is always set to 'HP'.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        ocn = self.extract(response.xpath(
            '//script[@type="text/javascript"][contains(text(),"sectionValue")]/text()'))
        ocn_match = re.findall(r'sectionValue = "([^"]+)"', ocn)
        # The script or the assignment may be absent; leave the category
        # unset rather than failing the whole page with an IndexError.
        if ocn_match:
            product['OriginalCategoryName'] = ocn_match[0]
        product['ProductName'] = self.extract(response.xpath('//h1/span[@itemprop="name"]/text()'))
        pic_url = self.extract(response.xpath('//ul/li[1]/img[@itemprop="image"]/@src'))
        if pic_url:
            product['PicURL'] = get_full_url(response, pic_url)
        product['ProductManufacturer'] = 'HP'
        yield product

        mpn = self.extract_list(response.xpath('//span[@class="prodNum"]/text()'))
        if mpn:
            product_id = self.product_id(product)
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = mpn[0]
            yield product_id

        # One USER review per microdata review block on the page.
        for review in response.xpath('//div[@itemprop="review"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['TestDateText'] = self.extract(review.xpath('./meta[@itemprop="datePublished"]/@content'))
            user_review['SourceTestRating'] = self.extract(review.xpath('.//span[@itemprop="ratingValue"]/text()'))
            user_review['Author'] = self.extract(review.xpath('.//span[@itemprop="author"]/text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//span[@itemprop="name"]/text()'))
            user_review['TestSummary'] = self.extract_all(review.xpath('.//span[@itemprop="description"]//text()'))
            yield user_review
Пример #24
0
 def parse_reviews(self, response):
     """Yield a USER review item for every comment box on the page.

     Product data is taken from ``response.meta['product']``. Each review may
     carry several "x/5" partial ratings; they are summed into one rating
     whose scale is 5 times the number of partial ratings found.
     """
     reviews = response.xpath('//div[@class="caja-comentarios"]')

     for review in reviews:
         product = response.meta['product']
         user_review = ReviewItem()
         user_review['DBaseCategoryName'] = "USER"
         user_review['ProductName'] = product['ProductName']
         user_review['TestUrl'] = product['TestUrl']
         user_review['source_internal_id'] = product['source_internal_id']
         date = self.extract(review.xpath('./p/text()[2]'))
         user_review['TestDateText'] = date_format(date, '%d-%m-%Y')

         rates = self.extract_list(review.xpath('.//li[@class="current-rating"]'))
         scale = 0
         rating = 0
         for rate in rates:
             rate_match = re.findall(r'([\d.]+)/5', rate)
             if not rate_match:
                 # Rating node without an "x/5" figure: ignore it instead of
                 # raising IndexError.
                 continue
             try:
                 value = float(rate_match[0])
             except ValueError:
                 # The pattern also matches strings such as "." or "1.2.3".
                 continue
             rating += value
             scale += 5
         # Only report a rating when at least one partial rating was parsed;
         # a "0" rating on a "0" scale carries no information.
         if scale:
             user_review['SourceTestRating'] = str(rating)
             user_review['SourceTestScale'] = str(scale)

         user_review['Author'] = self.extract(review.xpath('.//span[contains(@class,"nick")]/text()'))
         user_review['TestSummary'] = self.extract(review.xpath('.//div[@class="caja"]/text()[1]'))
         user_review['TestPros'] = self.extract(review.xpath(
                 './/strong[contains(text(),"Ventajas")]/following-sibling::text()[1]'))
         user_review['TestCons'] = self.extract(review.xpath(
                 './/strong[contains(text(),"Desventajas")]/following-sibling::text()[1]'))
         yield user_review
Пример #25
0
    def parse_reviews(self, response):
        """Yield USER reviews from the review list, then follow the next page.

        The product is read from ``response.meta['product']`` and forwarded
        unchanged to the request for the following page.
        """
        product = response.meta["product"]
        reviews = response.xpath('//ul[@id="reviews-list"]/li')

        next_page_xpath = "//div[@id='review-list']/div[@class='see-more-bar']//a/@href"
        # The paragraph right after a label holds its text; a lone '-' is
        # excluded by the predicate.
        pros_xpath = ".//p[contains(@class,'label-pros')]/following::p[1][not(text()='-')]/text()"
        cons_xpath = ".//p[contains(@class, 'label-cons')]/following::p[1][not(text()='-')]/text()"

        for review in reviews:
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = self.extract(review.xpath('.//h3/a/@href'))
            date = self.extract(review.xpath('.//meta[@itemprop="datePublished"]/@content'))
            if date:
                # Keep only the leading YYYY-MM-DD part of the timestamp.
                date = date[:10]
                user_review['TestDateText'] = date_format(date, "%Y-%m-%d")
            user_review['SourceTestRating'] = self.extract(review.xpath('.//span[@itemprop="reviewRating"]/@content'))
            if user_review['SourceTestRating']:
                user_review['SourceTestScale'] = 5
            user_review['Author'] = self.extract(review.xpath('.//a[@class="user-link"]//text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//h3//text()'))
            user_review['TestSummary'] = self.extract_all(
                    review.xpath('.//div[@class="review-text"]//span/span/text()'))
            # Pros come from the "label-pros" paragraph and cons from the
            # "label-cons" one (the two selectors used to be swapped).
            user_review['TestPros'] = self.extract_all(review.xpath(pros_xpath))
            user_review['TestCons'] = self.extract_all(review.xpath(cons_xpath))
            yield user_review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            request = Request(url=next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
Пример #26
0
    def parse_pro(self, response):
        """Emit the expert review found on the page, then request user reviews.

        ``response.meta['item']`` is flagged with ``has_review = 1`` when the
        expert block exists, and is always forwarded to ``parse_user`` through
        a request for the item's '/gebruikersreviews' page.
        """
        score_path = './/div[@class="block"]/div[contains(@class,"bp-review__intro__score")]//text()'
        item = response.meta['item']
        expert_block = response.xpath('//div[@id="besteproducttest"]')

        if expert_block:
            item['has_review'] = 1

            pro = ReviewItem()
            pro['DBaseCategoryName'] = "PRO"
            pro['ProductName'] = item['name']
            pro['TestUrl'] = response.url

            published = self.extract(expert_block.xpath('.//@datetime'))
            pro['TestDateText'] = date_format(published, '')

            # The score uses a decimal comma; normalise it to a dot.
            raw_score = self.extract(expert_block.xpath(score_path))
            pro['SourceTestRating'] = raw_score.replace(",", ".")

            author_nodes = expert_block.xpath('.//div[@class="avatar__title"]/text()')
            pro['Author'] = self.extract(author_nodes)

            title_nodes = expert_block.xpath('.//h1/text()')
            pro['TestTitle'] = self.extract(title_nodes)

            paragraph_nodes = expert_block.xpath('.//p/text()')
            pro['TestSummary'] = self.extract_all(paragraph_nodes)
            yield pro

        user_reviews_url = item['url'] + '/gebruikersreviews'
        follow_up = Request(url=user_reviews_url, callback=self.parse_user)
        follow_up.meta['item'] = item
        yield follow_up
Пример #27
0
    def parse_product_review(self, response):
        """Yield a PRO review and its product for pages newer than the stored date.

        Nothing is yielded when the page date (``self.get_date``) is not later
        than ``self.stored_last_date``. When ``response.meta`` sets
        ``check_next_page`` and provides ``next_page_url``, the next listing
        page is followed with ``self.parse`` as callback.
        """
        # Single-argument call form: valid and identical on Python 2 and 3.
        print("     ...PARSE_PRODUCT_REVIEW: " + response.url)

        date = self.get_date(response)

        if date > self.stored_last_date:

            # REVIEW ITEM ----------------------------------------------------
            review = ReviewItem()

            #  'TestTitle'
            test_title_xpath = '//div[@class="subheadtest"]/h4//text()'
            review['TestTitle'] = " ".join(
                response.xpath(test_title_xpath).getall())

            # 'ProductName'
            review['ProductName'] = \
                self.get_product_name_based_on_title(review['TestTitle'])

            # 'TestSummary'
            summary_xpath = '//div[@class="csc-textpic-text"]/*//text()'
            review['TestSummary'] = " ".join(
                response.xpath(summary_xpath).getall())

            # 'TestDateText'
            review['TestDateText'] = date.strftime("%Y-%m-%d")

            # 'DBaseCategoryName'
            review['DBaseCategoryName'] = 'PRO'

            # 'source_internal_id': last dot-separated token of the final
            # path segment, taken from the part of the URL before '.0.html'.
            sid = response.url.split('.0.html')[0]
            sid = sid.split('/')[-1]
            sid = sid.split('.')[-1]
            review['source_internal_id'] = sid

            # 'TestUrl'
            review['TestUrl'] = response.url
            # ----------------------------------------------------------------

            # PRODUCT ITEM ---------------------------------------------------
            product = ProductItem()
            product['source_internal_id'] = review['source_internal_id']
            product['ProductName'] = review['ProductName']
            product['PicURL'] = response.meta.get('pic_url')
            product['TestUrl'] = response.url
            # ----------------------------------------------------------------

            yield review
            yield product

            # In case this is the last review of the page. Only follow when a
            # next-page URL was actually supplied; following a missing URL
            # would raise instead of simply ending the crawl branch.
            next_page_url = response.meta.get('next_page_url')
            if response.meta.get('check_next_page') and next_page_url:
                yield response.follow(url=next_page_url,
                                      callback=self.parse)
Пример #28
0
    def parse_review(self, response):
        """Yield new USER reviews for a product and follow review pagination.

        ``response.meta['product']`` supplies the product fields; an optional
        ``response.meta['rating_xpath']`` overrides the default rating
        selector. Parsing stops at the first review dated before the latest
        stored user review, or at the first review with neither pros nor cons.
        """
        next_page_xpath = "(//*[@rel='next']/@href)[1]"
        default_rating_xpath = './/reevoo-score/@data-score'

        product = response.meta['product']
        reviews = response.xpath('//article[contains(@id,"review_")]')

        if not reviews:
            return

        # From observation, at least currys.co.uk uses a different format to present review rating
        rating_xpath = response.meta.get('rating_xpath', '') or default_rating_xpath

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])

        for review in reviews:
            user_review = ReviewItem()
            date = self.extract(
                review.xpath(
                    './/span[contains(@class, "date_publish")]/text()'))
            if date:
                test_date_text = date_format(date, '')
                user_review['TestDateText'] = test_date_text
                current_user_review = None
                if test_date_text:
                    current_user_review = dateparser.parse(
                        test_date_text, date_formats=['%Y-%m-%d'])
                # Either date may be unavailable (unparseable text, or no
                # stored review yet); only stop on a real, older date instead
                # of comparing against None.
                if (current_user_review is not None
                        and last_user_review is not None
                        and current_user_review < last_user_review):
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['SourceTestRating'] = self.extract(
                review.xpath(rating_xpath))
            user_review['Author'] = self.extract(
                review.xpath('.//h4[@class="attribution-name"]/text()'))
            user_review['TestPros'] = self.extract_all(
                review.xpath('.//dd[@class="pros"]/text()'))
            user_review['TestCons'] = self.extract_all(
                review.xpath('.//dd[@class="cons"]/text()'))
            user_review['source_internal_id'] = product['source_internal_id']

            # All reviews after first empty review are empty
            if not (user_review['TestPros'] or user_review['TestCons']):
                return
            yield user_review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url,
                              callback=self.parse_review,
                              meta=response.meta)
            yield request
Пример #29
0
    def test_review(self):
        """Check that a ReviewItem returns every field value assigned to it."""
        # (field, value) pairs, in the order they are assigned and verified.
        expected = (
            ('source_id', 19827398),
            ('source_internal_id', "FakeID"),
            ('ProductName', "Fake Product Name"),
            ('SourceTestRating', "9 million"),
            ('SourceTestScale', "10"),
            ('TestDateText', "29/02/2000"),
            ('TestPros', "Shiny"),
            ('TestCons', "Doesn't work"),
            ('TestSummary', "BUY THEM ALL"),
            ('TestVerdict', "BUY"),
            ('Author', "Steve"),
            ('DbaseCategoryName', "Fake Items"),
            ('TestTitle', "An amazingly shiny thing I bought"),
            ('TestUrl',
             "http://awesomejunk.com/shinythings/fake_shiny_thing.html"),
            ('Pay', "Maybe"),
            ('award', "AWESOME"),
            ('AwardPic', "http://somewhere.else.com/pic.png"),
            ('countries', "ALL OF THEM"),
        )

        review = ReviewItem()
        # Assign everything first so later assignments cannot mask earlier ones.
        for field, value in expected:
            review[field] = value

        assert review._name == "review", "ReviewItem _name incorrect"
        for field, value in expected:
            assert review[field] == value, \
                "ReviewItem %s incorrectly set" % field
Пример #30
0
    def _parse_review(self,
                      product,
                      review_selector,
                      extra_review_parser=None):
        """Build a USER review item from one BVRR-style review node.

        Args:
            product: mapping providing ``ProductName`` and ``TestUrl`` and,
                optionally, ``source_internal_id``.
            review_selector: selector scoped to a single review.
            extra_review_parser: optional callable
                ``(review_selector, review)``; when it succeeds its return
                value is returned in place of the review built here.

        Returns:
            The review item, or whatever ``extra_review_parser`` returned.
        """
        review = ReviewItem()
        date_xpath = './/meta[@itemprop="datePublished"]/@content'
        alt_date_xpath = './/*[contains(@class,"BVRRReviewDate")]/span[@class="value-title"]/@title'
        author_xpath = './/*[contains(@class,"BVRRNickname")]/text()|.//meta[@itemprop="author"]/@content'
        rating_xpath = './/*[contains(@class,"BVRRRatingOverall")]//*[contains(@class,"BVRRRatingNumber")]/text()'
        scale_xpath = './/*[contains(@class,"BVRRRatingOverall")]//*[contains(@class,"BVRRRatingRangeNumber")]//text()'
        pros_xpath = './/*[contains(@class,"BVRRReviewProTags") and contains(@class,"BVRRValue")]//text()'
        alt_pros_xpath = './/*[contains(@class,"BVRRTagsPrefix") and contains(text(),"Pro")]/following-sibling::*[contains(@class, "BVRRTags")][1]//text()'
        cons_xpath = './/*[contains(@class,"BVRRReviewConTags") and contains(@class,"BVRRValue")]//text()'
        alt_cons_xpath = './/*[contains(@class,"BVRRTagsPrefix") and contains(text(),"Cons")]/following-sibling::*[contains(@class, "BVRRTags")][1]//text()'
        summary_xpath = './/*[contains(@class,"BVRRReviewText")]//text()'
        title_xpath = './/*[contains(@class,"BVRRReviewTitle")]/text()'

        review['DBaseCategoryName'] = 'USER'
        if 'source_internal_id' in product:
            review['source_internal_id'] = product['source_internal_id']
        review['ProductName'] = product['ProductName']
        review['TestUrl'] = product['TestUrl']

        # Date, pros and cons each have a primary selector and an alternative
        # one that is consulted only when the primary yields nothing.
        review['TestDateText'] = self.extract(
            review_selector.xpath(date_xpath))
        if not review['TestDateText']:
            review['TestDateText'] = self.extract(
                review_selector.xpath(alt_date_xpath))
        review['Author'] = self.extract(review_selector.xpath(author_xpath))
        review['SourceTestRating'] = self.extract(
            review_selector.xpath(rating_xpath))
        # The scale is only meaningful alongside a rating.
        if review['SourceTestRating']:
            review['SourceTestScale'] = self.extract(
                review_selector.xpath(scale_xpath))
        review['TestPros'] = self.extract_all(
            review_selector.xpath(pros_xpath))
        if not review['TestPros']:
            review['TestPros'] = self.extract_all(
                review_selector.xpath(alt_pros_xpath))
        review['TestCons'] = self.extract_all(
            review_selector.xpath(cons_xpath))
        if not review['TestCons']:
            review['TestCons'] = self.extract_all(
                review_selector.xpath(alt_cons_xpath))
        review['TestSummary'] = self.extract_all(
            review_selector.xpath(summary_xpath))
        review['TestTitle'] = self.extract_all(
            review_selector.xpath(title_xpath))

        if extra_review_parser:
            try:
                return extra_review_parser(review_selector, review)
            except Exception:
                # The hook is best effort: on failure fall back to the review
                # built above. Catching Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                pass

        return review