Пример #1
0
    def parse_product(self, response):
        """Emit the product described by ``response`` and then its reviews.

        Yields one ``ProductItem`` first, followed by one ``ReviewItem`` for
        every node matched by the review XPath. Each selector below is a
        union of two alternative paths. The category path is read from
        ``response.meta['category']``.
        """
        name_xpath = '(//div[@id="community_icons"]|//div[contains(@class,"product-title")])/h1/text()'
        picture_xpath = '//a[@class="zoom"]/@rel|//div[@id="sync_main"]/img[1]/@data-src'
        brand_xpath = '//meta[@itemprop="brand"]/@content|//h3/a/text()'
        internal_id_xpath = (
            '//div[@data-gm_pagemode="product_info"]/@data-gm_pageid|'
            '//div[contains(@class,"product-comparison")]/@data-product-id')
        review_nodes_xpath = '//div[@itemprop="reviews"]|//div[contains(@class,"review-item")]'
        review_date_xpath = './/@datetime|.//div[@class="review-date"]/text()'
        review_rating_xpath = './/meta[@itemprop="rating"]/@content|.//@data-rating'
        review_author_xpath = './/span[@itemprop="reviewer"]/text()|.//span[@class="click"]/text()'
        review_title_xpath = './/span[@itemprop="summary"]/text()|.//div[contains(@class,"review-title")]/text()'
        review_body_xpath = './/p[@itemprop="description"]/span//text()|.//div[contains(@class,"review-content")]//text()'

        product = ProductItem()
        product['TestUrl'] = response.url
        category = response.meta['category']
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = self.extract(response.xpath(name_xpath))
        picture = self.extract(response.xpath(picture_xpath))
        product['PicURL'] = get_full_url(response, picture)
        product['ProductManufacturer'] = self.extract(
            response.xpath(brand_xpath))
        product['source_internal_id'] = self.extract(
            response.xpath(internal_id_xpath))
        yield product

        for node in response.xpath(review_nodes_xpath):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = product['ProductName']
            entry['source_internal_id'] = product['source_internal_id']
            entry['TestUrl'] = response.url
            posted = self.extract(node.xpath(review_date_xpath))
            # A value containing "am" is parsed with the dotted day-first
            # pattern; anything else is parsed as year-month-day.
            pattern = "am %d.%m.%Y" if 'am' in posted else "%Y-%m-%d"
            entry['TestDateText'] = date_format(posted, pattern)
            entry['SourceTestRating'] = self.extract(
                node.xpath(review_rating_xpath))
            entry['Author'] = self.extract(node.xpath(review_author_xpath))
            entry['TestTitle'] = self.extract(node.xpath(review_title_xpath))
            entry['TestSummary'] = self.extract_all(
                node.xpath(review_body_xpath))
            yield entry
Пример #2
0
    def parse_review(self, response):
        """Yield a user ReviewItem per review container, then follow pagination.

        ``response.meta`` must carry 'product' (source of ProductName and
        source_internal_id) and 'review_url' (stored as TestUrl). When a
        ``rel='next'`` link is present, a request for it is yielded with this
        same callback and the same meta.
        """
        meta = response.meta
        product = meta['product']
        review_url = meta['review_url']

        rating_xpath = ".//*[@itemprop='ratingValue']/@content"
        date_xpath = ".//span[@itemprop='datePublished']/text()"
        # Text inside links within the review body is excluded.
        body_xpath = ".//p[@itemprop='reviewBody']//text()[not(ancestor::a)]"
        author_xpath = ".//a[@itemprop='author']/text()"
        title_xpath = ".//*[@itemprop='name']/text()"

        for node in response.xpath("//div[@itemprop='review']"):
            review = ReviewItem()
            review['SourceTestRating'] = self.extract(node.xpath(rating_xpath))
            raw_date = self.extract(node.xpath(date_xpath))
            review['TestDateText'] = raw_date
            review['TestSummary'] = self.extract_all(node.xpath(body_xpath))
            review['Author'] = self.extract(node.xpath(author_xpath))
            review['TestTitle'] = self.extract(node.xpath(title_xpath))
            review['TestUrl'] = review_url
            review["SourceTestScale"] = "5"

            review['ProductName'] = product['ProductName']
            review['source_internal_id'] = product['source_internal_id']

            review["DBaseCategoryName"] = "USER"
            # Only a non-empty date string is handed to date_format.
            if raw_date:
                review["TestDateText"] = date_format(raw_date, '')

            yield review

        next_href = self.extract(response.xpath("//*[@rel='next']/@href"))
        if not next_href:
            return
        next_url = get_full_url(response.url, next_href)
        yield Request(next_url, callback=self.parse_review, meta=response.meta)
Пример #3
0
    def parse_review(self, response):
        """Yield the single user ReviewItem described by a review page.

        ``response.meta['product']`` supplies the ProductName. The rating is
        taken from the ``title`` attribute of the star widget: the first
        substring matching ``[^"]+ star`` is stored as-is, so the stored
        value keeps the trailing " star" text.

        If the star widget is missing or its title does not match, the review
        is still yielded, just without ``SourceTestRating`` (previously this
        raised IndexError and the whole review was lost).
        """
        product = response.meta['product']

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = response.url
        date = self.extract(
            response.xpath(
                '//span[@class="dtreviewed"]/span[@class="value-title"]/@title'
            ))
        if date:
            user_review['TestDateText'] = date_format(date, '')
        rating_title = self.extract(
            response.xpath(
                '//div[@class="contentBox"]//a[contains(@class,"iReviewStars")]/@title'
            ))
        # `or ''` keeps re.findall from failing should extract() return None.
        star_matches = re.findall(r'[^"]+ star', rating_title or '')
        if star_matches:
            user_review['SourceTestRating'] = star_matches[0]
        user_review['Author'] = self.extract(
            response.xpath('//a[@class="memberName"]/text()'))
        user_review['TestTitle'] = self.extract(
            response.xpath('//h3[contains(@class,"reviewTitle")]/text()'))
        user_review['TestSummary'] = self.extract_all(
            response.xpath('//div[contains(@class,"reviewText")]//text()'))
        user_review['TestPros'] = self.extract_all(
            response.xpath('//span[@class="reviewPros"]/parent::div/text()'))
        user_review['TestCons'] = self.extract_all(
            response.xpath('//span[@class="reviewCons"]/parent::div/text()'))
        yield user_review
Пример #4
0
    def parse_reviews(self, response):
        """Yield a ReviewItem and a ProductItem for every article on the page.

        Each ``<article class="post-content">`` node produces one review
        followed by one product. Both share the title as ProductName and use
        the second-to-last '/'-separated segment of the article link as
        ``source_internal_id``.

        A fresh pair of items is created per article. Re-yielding one shared
        instance would make every previously yielded item reflect the data of
        the last article as soon as the loop advanced.
        """
        contents = response.xpath('//article[@class="post-content"]')
        for content in contents:
            title = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//text()'))
            test_url = self.extract(
                content.xpath('.//div//h1[@class="post-title"]//a/@href'))
            author = self.extract(
                content.xpath('.//span[@itemprop="name"]/text()'))
            date_str = self.extract_all(
                content.xpath('.//meta[@itemprop="datePublished"]/@content'))
            date = date_format(date_str, '%Y-%m-%d')
            pic = self.extract(content.xpath('.//img/@src'))
            summary = self.extract_all(
                content.xpath('.//div[@itemprop="articleBody"]//text()'))
            # A link with fewer than two segments has no usable id; fall back
            # to an empty id instead of aborting the remaining articles.
            url_parts = (test_url or '').split('/')
            sid = url_parts[-2] if len(url_parts) >= 2 else ''

            # product item
            product = ProductItem()
            product['ProductName'] = title
            product['PicURL'] = pic
            product['source_internal_id'] = sid
            product['TestUrl'] = test_url

            # review item
            review = ReviewItem()
            review['ProductName'] = title
            review['TestTitle'] = title
            review['TestSummary'] = summary
            review['TestUrl'] = test_url
            review['DBaseCategoryName'] = 'pro'
            review['source_internal_id'] = sid
            review['TestDateText'] = date
            review['Author'] = author

            yield review
            yield product
Пример #5
0
    def parse_pro(self, response):
        """Yield the expert review on the page, then request the user reviews.

        ``response.meta['item']`` supplies the product data ('name', 'url').
        When the ``besteproducttest`` container is present, ``has_review`` is
        set to 1 on that item and a "PRO" ``ReviewItem`` is yielded. A request
        for ``<item url>/gebruikersreviews`` handled by ``self.parse_user``
        is always yielded last, carrying the item in its meta.
        """
        item = response.meta['item']
        expert_block = response.xpath('//div[@id="besteproducttest"]')

        score_xpath = './/div[@class="block"]/div[contains(@class,"bp-review__intro__score")]//text()'

        if expert_block:
            item['has_review'] = 1
            review = ReviewItem()
            review['DBaseCategoryName'] = "PRO"
            review['ProductName'] = item['name']
            review['TestUrl'] = response.url
            published = self.extract(expert_block.xpath('.//@datetime'))
            review['TestDateText'] = date_format(published, '')
            score = self.extract(expert_block.xpath(score_xpath))
            # The score is normalised to use a dot instead of a comma.
            review['SourceTestRating'] = score.replace(",", ".")
            review['Author'] = self.extract(
                expert_block.xpath('.//div[@class="avatar__title"]/text()'))
            review['TestTitle'] = self.extract(
                expert_block.xpath('.//h1/text()'))
            review['TestSummary'] = self.extract_all(
                expert_block.xpath('.//p/text()'))
            yield review

        user_reviews_url = item['url'] + '/gebruikersreviews'
        follow_up = Request(url=user_reviews_url, callback=self.parse_user)
        follow_up.meta['item'] = item
        yield follow_up
Пример #6
0
    def _parse_reviews(self, selector, browser, product):
        """Yield one user ReviewItem per ``data-review-id`` container.

        ``selector`` is queried with XPath for the review containers and
        ``product`` supplies ProductName and TestUrl. ``browser`` is not used
        by this method body. The review date is parsed with the ``%d.%m.%Y``
        pattern.
        """
        # Defined but not used anywhere in this method.
        next_page_xpath = "//a[@class='next_page']"

        field_xpaths = {
            'author': ".//p[@class='pr-review-author-name']/span/text()",
            'rating': ".//span[contains(@class, 'pr-rating')]/text()",
            'title': ".//p[@class='pr-review-rating-headline']",
            'date': ".//div[contains(@class, 'pr-review-author-date')]/text()",
            'summary': ".//p[@class='pr-comments']/text()",
        }

        for node in selector.xpath("//div[@data-review-id]"):
            review = ReviewItem()
            review['DBaseCategoryName'] = "USER"
            review['ProductName'] = product['ProductName']
            review['TestUrl'] = product['TestUrl']
            review['Author'] = self.extract(node.xpath(field_xpaths['author']))
            review['SourceTestRating'] = self.extract(
                node.xpath(field_xpaths['rating']))

            review['TestTitle'] = self.extract(node.xpath(field_xpaths['title']))
            review['TestSummary'] = self.extract(
                node.xpath(field_xpaths['summary']))

            raw_date = self.extract(node.xpath(field_xpaths['date']))
            review['TestDateText'] = date_format(raw_date, '%d.%m.%Y')
            yield review
Пример #7
0
    def parse_reviews(response):
        """Yield a user ReviewItem for each review record in ``response.body``.

        Records are cut out of the body with a regex that starts at ``"CID":``
        and stops before ``"Badges"``. ProductName and TestUrl come from
        ``response.meta['product']``. A record without a ``SubmissionTime``
        or ``Rating`` match is skipped; author, title and text are optional.

        Parsing stays best-effort: a record that fails to parse is logged and
        skipped, not raised. The ``yield`` sits outside the ``try`` and only
        ``Exception`` is caught, so closing the generator (GeneratorExit) or
        interrupting the process is no longer swallowed.
        """
        import logging

        reviews = re.findall(r'"CID":(((?!("Badges")).)+)}', response.body)

        for item in reviews:
            review = item[0]
            date = re.findall(r'"SubmissionTime":"([\d-]+)', review)
            rate = re.findall(r'"Rating":([\d])', review)
            if not date or not rate:
                # Records lacking a date or a rating were dropped before too.
                continue
            try:
                product = response.meta['product']
                user_review = ReviewItem()
                user_review['DBaseCategoryName'] = "USER"
                user_review['ProductName'] = product['ProductName']
                user_review['TestUrl'] = product['TestUrl']
                user_review['TestDateText'] = date_format(date[0], "%Y-%m-%d")
                user_review['SourceTestRating'] = rate[0]
                author = re.findall(r'"UserNickname":"([^"]+)', review)
                if author:
                    user_review['Author'] = author[0]
                title = re.findall(r'"Title":"([^"]+)', review)
                if title:
                    user_review['TestTitle'] = title[0]
                summary = re.findall(r'"ReviewText":"([^"]+)', review)
                if summary:
                    user_review['TestSummary'] = summary[0]
            except Exception:
                logging.getLogger(__name__).warning(
                    "Skipping review record that could not be parsed",
                    exc_info=True)
                continue
            yield user_review
    def parse_review(self, response):
        """Yield a ProductItem and a "PRO" ReviewItem built from one page.

        Both items are created by ``self.init_item_by_xpaths``. The product
        name is derived from the lower-cased review title: the word "review"
        is removed, surrounding colons are stripped, only the text before the
        first remaining colon is kept, and "- the times of india" is removed.
        A non-empty date is parsed with the ``%d %B %Y`` pattern.
        """
        product_xpaths = {"PicURL": "//*[@property='og:image']/@content"}

        review_xpaths = {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": "//*[@name='og:description']/@content",
            "Author": "//span[@itemprop='reviewer']/text()",
            "SourceTestRating":
            "//div[contains(@class, 'expert-rating')]//span[@itemprop='rating']/text()",
            "TestDateText":
            "//div[@class='review']//span[@class='metadata']/text()[last()]",
            "TestPros": "//div[@class='features']/div/text()",
            "TestVerdict": "//div[@class='Normal']/text()[last()]"
        }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        name = review["TestTitle"].lower().replace("review", "").strip(":")
        # split() returns the whole string when no colon is present, so this
        # covers both the "has colon" and "no colon" cases.
        name = name.split(":")[0]
        name = name.replace("- the times of india", "").strip()
        review["ProductName"] = name
        product["ProductName"] = name
        yield product

        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "5"
        review["ProductName"] = product["ProductName"]
        raw_date = review["TestDateText"]
        if raw_date:
            review["TestDateText"] = date_format(raw_date, "%d %B %Y")
        yield review
Пример #9
0
    def parse_review(self, response, reviewData, extra_parser=None):
        """Build and return a "USER" review from one review record.

        ``reviewData`` is indexed for Rating, RatingRange, SubmissionTime,
        UserNickname, Title, ReviewText, Pros and Cons; the product comes
        from ``response.meta['product']``. When the resulting TestPros or
        TestCons is empty, it is filled with the ' ; '-joined ``Values`` of
        the matching ``TagDimensions`` entry ('Pro' / 'Con'). If
        ``extra_parser`` is given, its return value for
        ``(review, reviewData)`` is returned instead of the review itself.
        """
        fields = {
            'product': response.meta['product'],
            'rating': reviewData['Rating'],
            'scale': reviewData['RatingRange'],
            'date': date_format(reviewData['SubmissionTime'],
                                '%Y-%m-%dT%H:%M:%S'),
            'author': reviewData['UserNickname'],
            'title': reviewData['Title'],
            'summary': reviewData['ReviewText'],
            'pros': reviewData['Pros'],
            'cons': reviewData['Cons'],
            'tp': 'USER',
        }
        review = ReviewItem.from_product(**fields)

        # Fall back to the tag dimensions when no pros/cons text was stored.
        for field_name, tag in (('TestPros', 'Pro'), ('TestCons', 'Con')):
            if review.get(field_name, ''):
                continue
            dimension = reviewData.get('TagDimensions', {}).get(tag, {})
            review[field_name] = ' ; '.join(dimension.get('Values', []))

        if extra_parser:
            review = extra_parser(review, reviewData)

        return review
Пример #10
0
    def parse_review(self, response):
        """Yield a ProductItem and a "PRO" ReviewItem built from one page.

        Both items are created by ``self.init_item_by_xpaths``. The review
        title doubles as the product name and the category is taken from
        ``response.meta['category_name']``. A non-empty date is parsed with
        the ``%d %B %Y`` pattern.
        """
        product_xpaths = {"PicURL": "//*[@property='og:image']/@content"}
        review_xpaths = {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": "//*[@property='og:description']/@content",
            "Author": "//a[@rel='author']/text()",
            "TestDateText":
            "//*[contains(@property, 'published_time')]/@content",
        }

        category_name = response.meta['category_name']
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        product["ProductName"] = review['TestTitle']
        product["OriginalCategoryName"] = category_name
        yield product

        review["ProductName"] = product["ProductName"]
        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "5"
        raw_date = review["TestDateText"]
        if raw_date:
            review["TestDateText"] = date_format(raw_date, "%d %B %Y")
        yield review
Пример #11
0
    def parse_reviews(self, response):
        """Yield one user ReviewItem per entry of the ``reviews-list`` list.

        ``response.meta['product']`` supplies ProductName, TestUrl and
        source_internal_id. The rating is read from the ``style`` attribute
        of the selected rating bar (e.g. ``width:80.00%``) and stored as the
        percentage with trailing zero decimals removed ("80").

        The percentage used to be cut out with ``str.strip('width:')`` and
        ``str.strip('.00%')``. ``strip`` removes any of the given
        *characters* from both ends, not a prefix/suffix, so "100.00%"
        collapsed to "1" and "80.00%" to "8". The value is now taken from
        between the ``width:`` marker and the ``%`` sign.
        """
        product = response.meta['product']
        reviews = response.xpath('//ul[@class="reviews-list"]/li')

        for review in reviews:
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            date = self.extract(review.xpath('.//time/@datetime'))
            if date:
                user_review['TestDateText'] = date_format(date, "%Y %m %d")
            style = self.extract(
                review.xpath(
                    './/div[contains(@class,"rateit-selected")]/@style'))
            # Text between "width:" and the first "%"; empty when absent.
            width = (style or '').partition('width:')[2]
            rating = width.split('%')[0].strip()
            if '.' in rating:
                # "80.00" -> "80", "85.50" -> "85.5"
                rating = rating.rstrip('0').rstrip('.')
            user_review['SourceTestRating'] = rating
            user_review['Author'] = self.extract(
                review.xpath('.//div[@class="customer"]/span/text()'))
            user_review['TestTitle'] = self.extract(
                review.xpath('.//div[@class="title"]/text()'))
            user_review['TestSummary'] = self.extract_all(
                review.xpath('.//div[@class="copy"]/p/text()'))
            yield user_review
Пример #12
0
    def parse_review(self, response):
        """Yield a "PRO" ReviewItem and a ProductItem built from one page.

        The review is created by ``self.init_item_by_xpaths``; when its
        summary is empty the ``og:description`` meta tag is used instead.
        The fifth '/'-separated segment of the page URL (index 4) becomes
        ``source_internal_id`` on both items. The product name is the review
        title with review/video wording and site-name suffixes removed. A
        rating, when present, is stored with a scale of '10'. The date text
        is parsed with the ``%B %d, %Y`` pattern.

        Text is kept as text throughout. The title used to be encoded to
        UTF-8 bytes and then searched with text literals (a TypeError on
        Python 3), and URL/date values were forced through ``str()``, which
        can fail on non-ASCII text under Python 2.
        """
        review_xpaths = {
            "TestTitle": "//meta[@property='og:title']/@content",
            "Author": "//div[@class='meta']/a/text()",
            "TestSummary": "//meta[@name='description']/@content"
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        product = ProductItem()
        if not review['TestSummary']:
            review['TestSummary'] = self.extract(
                response.xpath("//meta[@property='og:description']/@content"))

        test_url = response.url
        # A segment produced by split('/') can never contain '/', so no
        # further trimming is required.
        internal_source_id = test_url.split('/')[4]
        review['source_internal_id'] = internal_source_id
        product['source_internal_id'] = internal_source_id

        # product name
        title = review['TestTitle']
        if 'review' in title:
            product_name = title.replace(" review", "")
        elif 'Review' in title:
            product_name = title.replace(" Review", "")
        elif 'Video' in title:
            product_name = title.replace(" Video", "").split(":")[0]
        elif ':' in title:
            product_name = title.split(":")[0]
        else:
            product_name = title

        # Removed in this exact order; later entries clean up what the
        # earlier ones leave behind.
        for noise in (" - Carryology - Exploring better ways to carry",
                      " Video", "Drive By", ":", " |", " Carryology"):
            product_name = product_name.replace(noise, "")

        review['ProductName'] = product_name
        product['ProductName'] = product_name

        source_test_rating = self.extract(
            response.xpath("//div[@class='bar']/span[@class='score']/text()"))
        if source_test_rating:
            review['SourceTestRating'] = source_test_rating
            review['SourceTestScale'] = '10'
        review['TestUrl'] = test_url

        date_str = self.extract(
            response.xpath("//div[@class='meta']/text()[2]"))
        # The text node starts with ", " before the actual date.
        date = (date_str or '').lstrip(", ")
        review['TestDateText'] = date_format(date, "%B %d, %Y")
        review['DBaseCategoryName'] = 'PRO'

        product['TestUrl'] = test_url
        product['OriginalCategoryName'] = self.extract(
            response.xpath("//div[@class='breadcrumbs']//span/text()"))
        product['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        yield review
        yield product
Пример #13
0
    def parse_reviews(self, response):
        """Yield one user ReviewItem per list entry, then follow pagination.

        ``response.meta['product']`` supplies the ProductName. Only the first
        10 characters of the ``datePublished`` value are parsed (pattern
        ``%Y-%m-%d``). ``SourceTestScale`` is set to 5 only when a rating was
        found. When the "see more" link is present, a request for it is
        yielded with this same callback and the same product in its meta.

        The pros and cons selectors were swapped (TestPros read the paragraph
        after the ``label-cons`` label and vice versa); each field now reads
        the paragraph following its own label.
        """
        product = response.meta["product"]
        reviews = response.xpath('//ul[@id="reviews-list"]/li')

        next_page_xpath = "//div[@id='review-list']/div[@class='see-more-bar']//a/@href"

        for review in reviews:
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = self.extract(review.xpath('.//h3/a/@href'))
            date = self.extract(review.xpath('.//meta[@itemprop="datePublished"]/@content'))
            if date:
                date = date[:10]
                user_review['TestDateText'] = date_format(date, "%Y-%m-%d")
            user_review['SourceTestRating'] = self.extract(review.xpath('.//span[@itemprop="reviewRating"]/@content'))
            if user_review['SourceTestRating']:
                user_review['SourceTestScale'] = 5
            user_review['Author'] = self.extract(review.xpath('.//a[@class="user-link"]//text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//h3//text()'))
            user_review['TestSummary'] = self.extract_all(
                    review.xpath('.//div[@class="review-text"]//span/span/text()'))
            # A paragraph whose text is just "-" is excluded by the XPath.
            user_review['TestPros'] = self.extract_all(
                    review.xpath(".//p[contains(@class,'label-pros')]/following::p[1][not(text()='-')]/text()"))
            user_review['TestCons'] = self.extract_all(
                    review.xpath(".//p[contains(@class, 'label-cons')]/following::p[1][not(text()='-')]/text()"))
            yield user_review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            request = Request(url=next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
Пример #14
0
    def parse_reviews(self, response):
        """Yield category, product and "PRO" review items for a review page.

        Nothing is yielded unless the page has an ``itemReviewed`` heading.
        When ``self.category_re`` matches the page URL, a ``CategoryItem``
        using the first capture group as both leaf and path is yielded first
        and its path is copied onto the product. The product name is the
        review title with 'A Review of the' removed. If the "Overview"
        summary is empty, the "Overall" section is used instead. Pros and
        cons are joined with ' ; '. The date is parsed with ``%b %d, %Y``.
        """
        if not response.xpath("//h1[@itemprop='itemReviewed']"):
            return

        product_xpaths = {
            "PicURL":
            "(//*[@property='og:image'])[1]/@content",
            "ProductManufacturer":
            "(//span[@class='detail-label' and text()='Manufacture']/following-sibling::span[@class='detail-content'])[1]//text()"
        }
        review_xpaths = {
            "TestTitle": "//h1[@itemprop='itemReviewed']/text()",
            "TestSummary":
            "(//span[@class='detail-label' and text()='Overview']/following-sibling::span[@class='detail-content'])[1]/p[1]//text()",
            "Author": "//span[@itemprop='author']/text()",
            "SourceTestRating": "//meta[@itemprop='ratingValue']/@content",
            "TestDateText": "//meta[@itemprop='datePublished']/@content",
            "TestVerdict": "(//div[@class='bottomline']/p)[1]//text()"
        }
        fallback_summary_xpath = "(//span[@class='detail-label' and text()='Overall']/following-sibling::span[@class='detail-content'])[1]/p[1]//text()"

        category = None
        url_match = re.search(self.category_re, response.url)
        if url_match:
            leaf = url_match.group(1)
            category = CategoryItem()
            category["category_leaf"] = leaf
            category["category_path"] = leaf
            yield category

        product = self.init_item_by_xpaths(response, "product",
                                           product_xpaths)
        review = self.init_item_by_xpaths(response, "review",
                                          review_xpaths)

        if category:
            product['OriginalCategoryName'] = category["category_path"]
        cleaned_title = review['TestTitle'].replace('A Review of the', '')
        product['ProductName'] = strip(cleaned_title)
        review['ProductName'] = product['ProductName']
        review["DBaseCategoryName"] = "PRO"
        if not review['TestSummary']:
            review['TestSummary'] = self.extract_all(
                response.xpath(fallback_summary_xpath))

        # Pros and cons share the same markup and differ only in CSS class.
        for field_name, css in (('TestPros', ".procon.pro"),
                                ('TestCons', ".procon.con")):
            paragraphs = response.css(css).xpath('./p/text()')
            review[field_name] = self.extract_all(paragraphs,
                                                  separator=' ; ',
                                                  strip_unicode=[u'\u2022'])

        review['TestDateText'] = date_format(review['TestDateText'],
                                             '%b %d, %Y')

        yield product
        yield review
Пример #15
0
    def parse_product(self, response):
        """Emit the product described by ``response`` and then its reviews.

        Yields one ``ProductItem`` first, then one "USER" ``ReviewItem`` per
        ``opinion`` block. Blocks containing a link whose text includes
        "Opinia z serwisu Ceneo.pl" are excluded by the XPath. A manufacturer
        value of "brak" is excluded as well. Review dates are parsed with the
        ``%Y-%m-%d`` pattern.
        """
        opinions_xpath = '//div[@class="opinion"][not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]'
        manufacturer_xpath = '//div[@class="manufacturer"]//span[not(text()="brak")]/text()'

        product = ProductItem()
        product['TestUrl'] = response.url
        category = response.meta['category']
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = self.extract(
            response.xpath('//h1[@itemprop="itemreviewed"]/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//div[@class="productPhotoGallery"]/div/img/@src'))
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))
        yield product

        for opinion in response.xpath(opinions_xpath):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = product['ProductName']
            entry['TestUrl'] = product['TestUrl']
            posted = self.extract(opinion.xpath('.//span[@class="date"]/text()'))
            entry['TestDateText'] = date_format(posted, "%Y-%m-%d")
            entry['SourceTestRating'] = self.extract(
                opinion.xpath('.//span[@class="points"]/text()'))
            entry['Author'] = self.extract_all(
                opinion.xpath('.//*[@class="profileName"]//text()'))
            entry['TestSummary'] = self.extract_all(
                opinion.xpath('.//div[@class="text"]//text()'))
            entry['TestPros'] = self.extract_all(
                opinion.xpath('.//ul[@class="pluses"]//span/text()'), '; ')
            entry['TestCons'] = self.extract_all(
                opinion.xpath('.//ul[@class="minuses"]//span/text()'), '; ')
            yield entry
Пример #16
0
    def parse_reviews(response):
        """Yield a user ReviewItem for each matching record in the body.

        Records are cut out of ``response.body`` with a regex spanning from
        ``TagDimensions`` to ``ModerationStatus``. A record is used only when
        one of its ``ProductId`` values equals the ``source_internal_id`` of
        ``response.meta['product']``. Author, title and text are optional.

        Date and rating are now optional too: a record without a
        ``SubmissionTime`` or ``Rating`` match used to raise IndexError,
        which ended the generator and dropped every remaining record.
        """
        reviews = re.findall(
            r'TagDimensions(((?!(TagDimensions|SyndicationSource)).)+)ModerationStatus',
            response.body)

        for item in reviews:
            review = item[0]
            sii = re.findall(r'"ProductId":"([\d-]+)', review)
            product = response.meta['product']
            if product['source_internal_id'] not in sii:
                continue

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            date = re.findall(r'"SubmissionTime":"([\d-]+)', review)
            if date:
                user_review['TestDateText'] = date_format(date[0], "%Y-%m-%d")
            rate = re.findall(r'"Rating":([\d])', review)
            if rate:
                user_review['SourceTestRating'] = rate[0]
            author = re.findall(r'"UserNickname":"([^"]+)', review)
            if author:
                user_review['Author'] = author[0]
            title = re.findall(r'"Title":"([^"]+)', review)
            if title:
                user_review['TestTitle'] = title[0]
            summary = re.findall(r'"ReviewText":"([^"]+)', review)
            if summary:
                user_review['TestSummary'] = summary[0]
            yield user_review
Пример #17
0
    def parse_product(self, response):
        """Emit product, user reviews and an expert-review request for a page.

        Nothing is yielded unless the page has a ``section`` containing a
        review ``article``. Otherwise a ``ProductItem`` (category fixed to
        'Cell Phones') is yielded, then one "USER" ``ReviewItem`` per
        ``itemprop="review"`` article. If an expert article links to a full
        review, a request for it handled by ``self.parse_review`` is yielded
        last with the product in its meta.
        """
        review_section = response.xpath('//section[article[contains(@class,"review")]]')
        if not review_section:
            return

        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = 'Cell Phones'
        product['ProductName'] = self.extract(
            response.xpath('//meta[@itemprop="name"]/@content'))
        picture = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        product['PicURL'] = get_full_url(response, picture)
        product['ProductManufacturer'] = self.extract(
            response.xpath('//meta[@itemprop="brand"]/@content'))
        yield product

        for article in review_section.xpath('./article[@itemprop="review"]'):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = product['ProductName']
            entry['TestUrl'] = product['TestUrl']
            posted = self.extract(article.xpath('.//span[@class="time"]/text()'))
            entry['TestDateText'] = date_format(posted, '')
            entry['SourceTestRating'] = self.extract(
                article.xpath('.//meta[@itemprop="ratingValue"]/@content'))
            entry['Author'] = self.extract(
                article.xpath('.//span[@itemprop="author"]/text()'))
            entry['TestPros'] = self.extract_all(
                article.xpath('.//div[contains(@class,"positives")]/text()'),
                '; ')
            entry['TestCons'] = self.extract_all(
                article.xpath('.//div[contains(@class,"negatives")]/text()'),
                '; ')
            yield entry

        expert_href = self.extract(
            review_section.xpath('./article[contains(@class,"expert")]/div/a/@href'))
        if expert_href:
            follow_up = Request(url=get_full_url(response, expert_href),
                                callback=self.parse_review)
            follow_up.meta['product'] = product
            yield follow_up
Пример #18
0
    def parse_product(self, response):
        """Emit the product described by ``response`` and then its reviews.

        Yields one ``ProductItem`` first, then one "USER" ``ReviewItem`` per
        ``article`` node carrying an ``itemscope`` attribute. The review date
        is set only when a 10-character run of digits and slashes is found in
        the date text; that run is parsed with the ``%d/%m/%Y`` pattern.
        """
        product = ProductItem()

        product['TestUrl'] = response.url
        category = response.meta['category']
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//div[@class="images"]/a/img/@src'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//span[text()="Marca"]/parent::li/span[@class="value"]/text()'))
        product['source_internal_id'] = self.extract(
            response.xpath('//input[@id="prodId"]/@value'))
        yield product

        for article in response.xpath('//article[@itemscope]'):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = product['ProductName']
            entry['source_internal_id'] = product['source_internal_id']
            entry['TestUrl'] = product['TestUrl']
            date_text = self.extract(article.xpath('.//div[@class="date"]/text()'))
            found_dates = re.findall(r'[\d/]{10}', date_text)
            if found_dates:
                entry['TestDateText'] = date_format(found_dates[0], "%d/%m/%Y")
            entry['SourceTestRating'] = self.extract(
                article.xpath('.//span[@itemprop="ratingValue"]/text()'))
            entry['Author'] = self.extract(article.xpath('.//h2/a/text()'))
            entry['TestTitle'] = self.extract(article.xpath('.//h3/a/text()'))
            entry['TestSummary'] = self.extract_all(
                article.xpath('.//p[@itemprop="reviewBody"]/text()'))
            entry['TestPros'] = self.extract_all(
                article.xpath('.//div[@class="pro"]//li/text()'), '; ')
            entry['TestCons'] = self.extract_all(
                article.xpath('.//div[@class="con"]//li/text()'), '; ')
            yield entry
Пример #19
0
    def parse_product(self, response):
        """Emit the product described by ``response`` and then its reviews.

        Yields one ``ProductItem`` first, then one "USER" ``ReviewItem`` per
        ``li`` node marked ``itemprop="review"``. The picture URL is passed
        through ``get_full_url`` and review dates are parsed with the
        ``%d/%m/%Y`` pattern.
        """
        picture_xpath = '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src'
        manufacturer_xpath = '//td[text()="Constructeur"]/following-sibling::td/text()'

        product = ProductItem()

        product['TestUrl'] = response.url
        category = response.meta['category']
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))
        picture = self.extract(response.xpath(picture_xpath))
        product['PicURL'] = get_full_url(response, picture)
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))
        yield product

        for node in response.xpath('//li[@itemprop="review"]'):
            entry = ReviewItem()
            entry['DBaseCategoryName'] = "USER"
            entry['ProductName'] = product['ProductName']
            entry['TestUrl'] = product['TestUrl']
            posted = self.extract(
                node.xpath('.//span[@itemprop="datePublished"]/text()'))
            entry['TestDateText'] = date_format(posted, '%d/%m/%Y')
            entry['SourceTestRating'] = self.extract(
                node.xpath('.//span[@itemprop="ratingValue"]/text()'))
            entry['Author'] = self.extract(
                node.xpath('.//span[@itemprop="author"]/text()'))
            entry['TestTitle'] = self.extract(
                node.xpath('.//div[@itemprop="name"]/text()'))
            entry['TestSummary'] = self.extract_all(
                node.xpath('.//blockquote/text()'))
            yield entry
Пример #20
0
 def level_4(self, response):
     """Yield one user ``ReviewItem`` per entry of a JSONP review batch.

     The body is JSON wrapped in a ``bv_1111_60234(...)`` callback; the
     callback name and the surrounding parentheses are removed before
     decoding. ``ProductName`` and ``TestUrl`` come from ``response.meta``.

     An entry that cannot be converted (missing key, date that cannot be
     formatted, ...) is logged and skipped; the remaining entries are still
     yielded.
     """
     import logging

     pname = response.meta["ProductName"]
     test_url = response.meta["TestUrl"]

     body = response.body
     if not isinstance(body, str):
         # On Python 3 the body is ``bytes`` and cannot be mixed with the
         # text arguments used below.
         # NOTE(review): assumes a UTF-8 encoded payload - confirm.
         body = body.decode('utf-8')
     json_string = body.replace('bv_1111_60234', '').strip('()')
     data = json.loads(json_string)
     results = data['BatchedResults']['q0']['Results']

     for item in results:
         try:
             review = ReviewItem()
             review['DBaseCategoryName'] = "USER"
             review['ProductName'] = pname
             review['TestUrl'] = test_url
             review['source_internal_id'] = item['ProductId']
             review['TestDateText'] = item['SubmissionTime']
             if review['TestDateText']:
                 review['TestDateText'] = date_format(
                     review['TestDateText'], '')
             review['SourceTestRating'] = item['Rating']
             review['SourceTestScale'] = '5'
             review['Author'] = item['UserNickname']
             review['TestTitle'] = item['Title']
             review['TestSummary'] = item['ReviewText']
             review['TestPros'] = item['Pros']
             review['TestCons'] = item['Cons']
         except Exception:
             # Best effort: one malformed entry must not drop the others.
             logging.getLogger(__name__).warning(
                 'Skipping malformed review entry on %s', response.url,
                 exc_info=True)
             continue
         # Yield outside the ``try`` so that closing the generator is never
         # mistaken for a malformed entry.
         yield review
Пример #21
0
 def parse_reviews(self, response):
     """Emit the product for this page, then one item per user review box.

     The category and the product id are read from ``response.meta``; the
     product id doubles as the internal id and is used to build the picture
     URL.
     """
     category = response.meta['category']
     name_nodes = response.xpath('//span[@class="fn"]/text()')

     product = ProductItem()
     product['TestUrl'] = response.url
     product['OriginalCategoryName'] = category['category_path']
     product['ProductName'] = self.extract(name_nodes)
     product_id = response.meta['product_id']
     product['PicURL'] = ''.join(('http://geizhals.at/p/', product_id, '.jpg'))
     product['source_internal_id'] = product_id
     yield product

     # Review fields filled by a plain single-value extraction, in order.
     simple_fields = (
         ('SourceTestRating', './/span[@itemprop="rating"]/text()'),
         ('Author', './/span[contains(@class,"nick")]/text()'),
         ('TestTitle', './/h3//text()'),
     )
     for box in response.xpath('//li[contains(@class,"gh_box")]'):
         user_review = ReviewItem()
         user_review['DBaseCategoryName'] = "USER"
         user_review['ProductName'] = product['ProductName']
         user_review['TestUrl'] = product['TestUrl']
         stamp = self.extract(box.xpath('.//div[@class="userbox"]/text()'))
         # Drops leading/trailing 'a', 'm' and space characters.
         stamp = stamp.strip('am ')
         user_review['TestDateText'] = date_format(stamp, "%d.%m.%Y %H:%M")
         for field, path in simple_fields:
             user_review[field] = self.extract(box.xpath(path))
         user_review['TestSummary'] = self.extract_all(
             box.xpath('.//div[@itemprop="description"]//text()'))
         user_review['source_internal_id'] = product['source_internal_id']
         yield user_review
Пример #22
0
 def parse_reviews(self, response):
     """Yield one user ``ReviewItem`` per comment box on the page.

     Product name, URL and internal id come from ``response.meta['product']``.
     The rating is the sum of every ``N/5`` partial rating found in the
     comment and the scale is 5 times the number of partial ratings used.
     Partial ratings that cannot be parsed are ignored, and rating/scale are
     left unset when the comment carries no usable rating at all.
     """
     reviews = response.xpath('//div[@class="caja-comentarios"]')

     for review in reviews:
         product = response.meta['product']
         user_review = ReviewItem()
         user_review['DBaseCategoryName'] = "USER"
         user_review['ProductName'] = product['ProductName']
         user_review['TestUrl'] = product['TestUrl']
         user_review['source_internal_id'] = product['source_internal_id']
         date = self.extract(review.xpath('./p/text()[2]'))
         user_review['TestDateText'] = date_format(date, '%d-%m-%Y')

         rates = self.extract_list(review.xpath('.//li[@class="current-rating"]'))
         scale = 0
         rating = 0
         for rate in rates:
             rate_match = re.findall(r'([\d.]+)/5', rate)
             if not rate_match:
                 # Node without an "N/5" text: nothing to add.
                 continue
             try:
                 value = float(rate_match[0])
             except ValueError:
                 # The pattern also accepts strings such as "1.2.3".
                 continue
             rating += value
             scale += 5
         if scale:
             # A rating of "0" on a scale of "0" would be meaningless.
             user_review['SourceTestRating'] = str(rating)
             user_review['SourceTestScale'] = str(scale)

         user_review['Author'] = self.extract(review.xpath('.//span[contains(@class,"nick")]/text()'))
         user_review['TestSummary'] = self.extract(review.xpath('.//div[@class="caja"]/text()[1]'))
         user_review['TestPros'] = self.extract(review.xpath(
                 './/strong[contains(text(),"Ventajas")]/following-sibling::text()[1]'))
         user_review['TestCons'] = self.extract(review.xpath(
                 './/strong[contains(text(),"Desventajas")]/following-sibling::text()[1]'))
         yield user_review
Пример #23
0
    def parse_review(self, response):
        """Build a professional review from the page's metadata.

        Returns a follow-up request to the "Fazit" page (handled by
        ``get_test_verdict``, with the review passed in ``meta``) when such a
        link exists; otherwise the verdict is extracted from this response
        and the review itself is returned.
        """
        og_title = '//meta[@property="og:title"]/@content'
        review = self.init_item_by_xpaths(response, 'review', {
            'ProductName': og_title,
            'TestTitle': og_title,
            'Author': '(//a[@class="username"])[1]/text()',
            "TestDateText": '(//div[@class="pane-content"])[2]/text()',
            'TestSummary': '//meta[@property="og:description"]/@content',
        })

        review["ProductName"] = review["ProductName"].replace("Test:", '')
        review["source_internal_id"] = self.get_source_internal_id(response)
        raw_date = review["TestDateText"]
        review["TestDateText"] = date_format(raw_date, "%d %b %Y")
        review['DBaseCategoryName'] = 'PRO'

        verdict_link = self.extract(
            response.xpath("//a[contains(text(),'Fazit')]/@href"))
        if not verdict_link:
            # No separate verdict page: read the verdict from this response.
            self.extract_test_verdict(response, review)
            return review

        return response.follow(
            verdict_link,
            callback=self.get_test_verdict,
            meta={'review': review}
        )
Пример #24
0
    def parse_reviews(self, response):
        """Emit the product described by the page, then its user reviews.

        Each ``<li class="opinion-row">`` node becomes one ``ReviewItem`` that
        copies the product's name, URL and internal id.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['category']['category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/a/text()'))
        # The remaining product fields are single-value extractions.
        for field, path in (
                ('PicURL', '//meta[@property="og:image"]/@content'),
                ('ProductManufacturer', '//meta[@itemprop="brand"]/@content'),
                ('source_internal_id', '//@data-product-id')):
            product[field] = self.extract(response.xpath(path))
        yield product

        for row in response.xpath('//li[@class="opinion-row"]'):
            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            for shared in ('ProductName', 'TestUrl', 'source_internal_id'):
                user_review[shared] = product[shared]

            published = self.extract(
                row.xpath('.//meta[@itemprop="datePublished"]/@content'))
            user_review['TestDateText'] = date_format(published, "%Y %m %d")

            for field, path in (
                    ('SourceTestRating', './/meta[@itemprop="ratingValue"]/@content'),
                    ('Author', './/h4/text()'),
                    ('TestTitle', './/div[contains(@class,"grade-text")]/text()')):
                user_review[field] = self.extract(row.xpath(path))

            summary_nodes = row.xpath('.//div[@itemprop="description"]/text()')
            user_review['TestSummary'] = self.extract_all(summary_nodes)
            yield user_review
Пример #25
0
    def parse_review(self, response):
        """Yield the product and its professional review for an article page.

        The product name is the page title up to the first " Review" (the
        whole title when that text is absent). A rating, when present, is
        stored with a scale of 5. Nothing is yielded when the review date is
        older than ``self.stored_last_date``.
        """
        product_xpaths = {
            'OriginalCategoryName': '//div[@class="entry-crumbs"]/'
            'span[last()-1]/a/text()',
            'PicURL': '//meta[@property="og:image"]/@content',
            'source_internal_id': 'substring-after(//article/@id, "-")',
            'TestUrl': '//meta[@property="og:url"]/@content',
        }

        # 'TestDateText' used to be listed twice with the same expression;
        # the redundant second entry has been dropped.
        review_xpaths = {
            'TestDateText': 'substring-before(//time/@datetime, "T")',
            'TestPros': '//p[contains(., "Pros")]/following-sibling::p[1]'
            '/text()',
            'TestCons': '//p[contains(., "Cons")]/following-sibling::p[1]'
            '/text()',
            'TestSummary': '//meta[@property="og:description"]/@content',
            'TestVerdict': '//div[@class="td-review-summary-content"]/text()|'
            '//h3[last()]/following-sibling::p[1]/text()',
            'Author': '//div[@class="td-post-author-name"]/a/text()',
            'TestTitle': '//meta[@property="og:title"]/@content',
            'source_internal_id': 'substring-after(//article/@id, "-")',
            'TestUrl': '//meta[@property="og:url"]/@content',
        }

        product = self.init_item_by_xpaths(response, 'product', product_xpaths)
        review = self.init_item_by_xpaths(response, 'review', review_xpaths)

        product_name_xpath = '//h1[@class="entry-title"]/text()'
        p_name = self.extract(response.xpath(product_name_xpath))
        # split() returns the whole string as its only element when the
        # separator is absent, so no explicit membership test is needed.
        product_name = p_name.split(' Review')[0]

        product['ProductName'] = product_name
        review['ProductName'] = product_name

        rating_xpath = '//div[@class="td-review-final-score"]/text()'
        rating = self.extract(response.xpath(rating_xpath))
        if rating:
            review['SourceTestRating'] = rating
            review['SourceTestScale'] = 5

        review['DBaseCategoryName'] = 'PRO'

        # Incremental scraping: skip reviews older than the stored date.
        date_str = date_format(review['TestDateText'], '%Y-%m-%d')
        date_time = datetime.strptime(date_str, '%Y-%m-%d')
        if date_time < self.stored_last_date:
            return

        yield product
        yield review
Пример #26
0
    def parse_items(self, response):
        """Yield review, price id and product for one test page.

        Items are only produced when a price is present on the page and the
        review date is later than ``self.stored_last_date``; they are yielded
        in the order review, price ``ProductIdItem``, product.
        """
        product_xpaths = {
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductManufacturer":
            "//tr[@class='marke-hersteller']/td/a/text()"
        }
        review_xpaths = {
            "TestSummary": "//div[@id='review_body']/div[1]/p/text()",
            "TestVerdict": "(//div[@id='review_body']/div/p/text())[last()]",
            "TestTitle": "(//title/text())[1]",
            "Author": "//span/meta[@itemprop='author']/@content",
            "TestPros": "//div[@class='list-advantages']/ul/li/div/text()",
            "TestCons": "//div[@class='list-disadvantages']/ul/li/div/text()",
            "SourceTestRating": "//span/meta[@itemprop='ratingValue']/@content"
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # Product name is "<manufacturer> <model>", shared by both items.
        model = self.extract(
            response.xpath("//tr[@class='modell']/td/span/text()"))
        full_name = product['ProductManufacturer'] + " " + model
        review['ProductName'] = full_name
        product['ProductName'] = full_name

        sii = self.extract(
            response.xpath("//div/meta[@itemprop='productID']/@content"))
        review['source_internal_id'] = sii
        product['source_internal_id'] = sii

        if not product['PicURL']:
            # Fall back to the first lazily loaded image.
            product['PicURL'] = self.extract(
                response.xpath("(//div/a/img/@data-src)[1]"))

        if review['SourceTestRating']:
            review['SourceTestScale'] = "5"

        review["DBaseCategoryName"] = "PRO"

        # The date is the third space-separated token of the offers note.
        offers_note = self.extract(
            response.xpath("//div[@class='offers']/small/text()"))
        review['TestDateText'] = date_format(
            str(offers_note).split(" ")[2], '%d.%m.%Y')

        price = self.extract(
            response.xpath("//div[@class='price']/text()")).encode('utf-8')
        if not price:
            return

        product_id = ProductIdItem()
        product_id['ID_kind'] = 'price'
        product_id['ID_value'] = str(price).split(' ')[0]
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = product['source_internal_id']

        published = datetime.strptime(review['TestDateText'], "%Y-%m-%d")
        if published > self.stored_last_date:
            yield review
            yield product_id
            yield product
Пример #27
0
    def parse_review(self, selector, url):
        """Yield the product and the professional review found in ``selector``.

        ``url`` is stored as ``TestUrl`` on both items. The product name is
        the part of the review title that precedes the last occurrence of a
        whitespace character followed by "review" (case-insensitive), or the
        whole title when there is no such occurrence. Picture and summary
        fall back to the page's Open Graph metadata when the primary
        expressions yield nothing.
        """
        image_xpath = "//img[@itemprop='image']/@src"
        image_alt_xpath = "//meta[@property='og:image']/@content"
        manufacturer_xpath = "(//span[@class='taxName' and text()='Manufacturer']/following-sibling::span[@class='taxContent'])[1]//text()"

        title_xpath = "//span[@itemprop='itemReviewed']/text()"
        summary_xpath = "//*[@itemprop='description']//text()"
        summary_alt_xpath = "//meta[@property='og:description']/@content"
        author_xpath = "//span[@itemprop='author']//text()"
        date_xpath = "//meta[@itemprop='datePublished']/@content"
        pros_xpath = "//div[@class='positive-wrapper']/text()"
        cons_xpath = "//div[@class='negative-wrapper']/text()"
        rating_value_xpath = "//*[@itemprop='ratingValue']/text()"
        rating_scale_xpath = "//*[@itemprop='bestRating']/text()"

        review = ReviewItem()
        review["TestTitle"] = self.extract_all(selector.xpath(title_xpath))
        review["TestSummary"] = self.extract_all(selector.xpath(summary_xpath))
        review["Author"] = self.extract(selector.xpath(author_xpath))
        raw_date = self.extract(selector.xpath(date_xpath))
        review["TestDateText"] = date_format(raw_date, "%b %d,%Y")
        review["TestPros"] = self.extract_all(selector.xpath(pros_xpath),
                                              ' ; ')
        review["TestCons"] = self.extract_all(selector.xpath(cons_xpath),
                                              ' ; ')
        review["SourceTestRating"] = self.extract(
            selector.xpath(rating_value_xpath))
        review["SourceTestScale"] = self.extract(
            selector.xpath(rating_scale_xpath))
        review["DBaseCategoryName"] = "PRO"
        review["TestUrl"] = url

        product = ProductItem()
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        product_name_re = r"(.+)\sReview"
        name_match = re.search(product_name_re, review["TestTitle"],
                               re.IGNORECASE)
        if name_match:
            product["ProductName"] = name_match.group(1)
        else:
            product["ProductName"] = review["TestTitle"]

        review["ProductName"] = product["ProductName"]

        product["TestUrl"] = url
        product["PicURL"] = self.extract(selector.xpath(image_xpath))
        product["ProductManufacturer"] = self.extract(
            selector.xpath(manufacturer_xpath))
        if not product["PicURL"]:
            product["PicURL"] = self.extract(selector.xpath(image_alt_xpath))

        if not review["TestSummary"]:
            review["TestSummary"] = self.extract(
                selector.xpath(summary_alt_xpath))

        yield product
        yield review
Пример #28
0
    def parse_review(self, response):
        """Yield user reviews from one review page, then follow the next page.

        The product is taken from ``response.meta['product']``. Processing
        stops (without following pagination) as soon as a review is older
        than the latest stored review date for the product, or as soon as a
        review has neither pros nor cons.
        """
        product = response.meta['product']
        articles = response.xpath('//article[contains(@id,"review_")]')
        if not articles:
            return

        # From observation, at least currys.co.uk uses a different format to
        # present the review rating, so callers may override the expression.
        rating_xpath = (response.meta.get('rating_xpath', '')
                        or './/reevoo-score/@data-score')

        newest_stored = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])

        for article in articles:
            user_review = ReviewItem()
            published = self.extract(article.xpath(
                './/span[contains(@class, "date_publish")]/text()'))
            if published:
                user_review['TestDateText'] = date_format(published, '')
                parsed = dateparser.parse(
                    user_review['TestDateText'], date_formats=['%Y-%m-%d'])
                if parsed < newest_stored:
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['SourceTestRating'] = self.extract(
                article.xpath(rating_xpath))
            user_review['Author'] = self.extract(
                article.xpath('.//h4[@class="attribution-name"]/text()'))
            for field, path in (('TestPros', './/dd[@class="pros"]/text()'),
                                ('TestCons', './/dd[@class="cons"]/text()')):
                user_review[field] = self.extract_all(article.xpath(path))
            user_review['source_internal_id'] = product['source_internal_id']

            # All reviews after the first empty review are empty.
            if not (user_review['TestPros'] or user_review['TestCons']):
                return
            yield user_review

        next_href = self.extract(response.xpath("(//*[@rel='next']/@href)[1]"))
        if next_href:
            yield Request(get_full_url(response, next_href),
                          callback=self.parse_review,
                          meta=response.meta)
Пример #29
0
    def parse_items(self, response):
        """Yield the review and product for a 'Review' or 'Hardware' article.

        The internal id is the fifth ``/``-separated component of the URL.
        The product name is the headline with a trailing " review" word
        removed. When the page carries a publication date older than
        ``self.stored_last_date`` nothing is yielded; when it carries no date
        at all, ``TestDateText`` is simply left unset.
        """
        review_xpaths = {
            "TestTitle": "//h1/text()",
            "TestVerdict": "//p[@class='game-verdict']/text()",
            "TestPros": "//div[@class='sub-box'][1]/ul/li/text()",
            "TestCons": "//div[@class='sub-box'][2]/ul/li/text()",
            "TestSummary": "//meta[@name='description']/@content"
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        product = ProductItem()

        internal_source_id = str(response.url).split("/")[4]
        review['source_internal_id'] = internal_source_id
        product['source_internal_id'] = internal_source_id

        # Kept as text: encoding to bytes and calling str() on the result
        # yields "b'...'" on Python 3.
        product_name = self.extract(
            response.xpath('//h1[@itemprop="name headline"]//text()')).strip()
        # str.strip('review') removes any of the characters r/e/v/i/w from
        # both ends (mangling e.g. "iPhone"), so drop the word explicitly.
        suffix = ' review'
        if product_name.lower().endswith(suffix):
            product_name = product_name[:-len(suffix)].rstrip()
        review['ProductName'] = product_name
        product['ProductName'] = product_name

        source_test_rating = self.extract(
            response.xpath(
                "//span[@class='score no-graphic score-short']/text()"))
        if source_test_rating:
            review['SourceTestRating'] = source_test_rating
            review['SourceTestScale'] = '10'

        product['TestUrl'] = response.url

        date_str = self.extract(response.xpath("//time/@datetime"))
        if date_str:
            date_str = str(date_str).split("T")[0]
            date_time = date_format(date_str, "%Y-%m-%d")
            date_time_to_compare = datetime.strptime(date_time, '%Y-%m-%d')
            if self.stored_last_date > date_time_to_compare:
                return
            # Only assigned when a date exists; the previous unconditional
            # assignment raised UnboundLocalError on pages without <time>.
            review['TestDateText'] = date_time

        review['DBaseCategoryName'] = 'PRO'

        picture_src = self.extract(
            response.xpath(
                "//img[@class='TODO image-class block-image-ads']/@src"))
        # NOTE(review): get_full_url receives the URL string here rather
        # than a response object - confirm the helper accepts it.
        picture_url = get_full_url(response.url, picture_src)
        product['PicURL'] = picture_url

        cat = self.extract(
            response.xpath("//a[@class='chunk category']/text()"))
        if cat in ('Review', 'Hardware'):
            yield review
            yield product
Пример #30
0
    def level_2(self, response):
        """Yield the product, then the professional review or a verdict request.

        The product is yielded first. If the page links to a last chapter,
        a request for it is yielded with the review passed in ``meta`` and
        ``get_test_verdict`` as callback; otherwise the verdict is extracted
        from this response and the review itself is yielded.
        """
        product_xpaths = {"PicURL": '//meta[@property="og:image"]/@content'}

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['ProductName'] = self.get_product_name(response)
        product['source_internal_id'] = self.get_source_internal_id(response)

        original_category_name_xpath = "(//img[@alt='Themen']/ancestor::div/"\
            "following-sibling::div)[1]/a/text()"
        original_category_name = self.extract_all(
            response.xpath(original_category_name_xpath), " | ")
        if original_category_name:
            product["OriginalCategoryName"] = original_category_name

        # "TestDateText" used to be listed twice; only the later expression
        # ever took effect, so it is the one kept (in the key's original
        # position) and the dead "articlebox-content" entry is gone.
        review_xpaths = {
            'source_internal_id':
            "substring-after(//script["
            "@type='text/javascript']/text(),'print/')",
            "TestDateText":
            "(//img[@alt='Publikationsdatum']/ancestor::div"
            "/following-sibling::div)[1]/text()",
            "TestSummary":
            '//meta[@property="og:description"]/@content',
            "Author":
            "//meta[@itemprop='creator accountablePerson']/@content",
            "TestTitle":
            '//meta[@property="og:title"]/@content',
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review['ProductName'] = self.get_product_name(response)
        # Overrides the value extracted through review_xpaths.
        review['source_internal_id'] = self.get_source_internal_id(response)
        yield product

        test_date = review["TestDateText"]

        if test_date:
            test_date = test_date.strip()
            review["TestDateText"] = date_format(test_date, "%d. %B %Y",
                                                 ["de"])

        review["DBaseCategoryName"] = "PRO"

        verdict_url_xpath = "//div[@class='kapitel '][last()]/a/@href"
        verdict_page = self.extract(response.xpath(verdict_url_xpath))
        if verdict_page:
            yield response.follow(verdict_page,
                                  callback=self.get_test_verdict,
                                  meta={'review': review})

        else:
            self.extract_test_verdict(response, review)
            yield review