def parse_product(self, response):
    """Scrape a product page: yield the product, then its user reviews.

    The product item is yielded first. Afterwards one ``ReviewItem`` tagged
    "USER" is yielded for every matched review node. Each XPath is a union
    (``|``) of two alternatives -- presumably two page layouts.
    """
    name_xpath = '(//div[@id="community_icons"]|//div[contains(@class,"product-title")])/h1/text()'
    picture_xpath = '//a[@class="zoom"]/@rel|//div[@id="sync_main"]/img[1]/@data-src'
    brand_xpath = '//meta[@itemprop="brand"]/@content|//h3/a/text()'
    internal_id_xpath = (
        '//div[@data-gm_pagemode="product_info"]/@data-gm_pageid|'
        '//div[contains(@class,"product-comparison")]/@data-product-id')
    review_nodes_xpath = '//div[@itemprop="reviews"]|//div[contains(@class,"review-item")]'

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath(name_xpath))
    raw_picture_url = self.extract(response.xpath(picture_xpath))
    product['PicURL'] = get_full_url(response, raw_picture_url)
    product['ProductManufacturer'] = self.extract(response.xpath(brand_xpath))
    product['source_internal_id'] = self.extract(
        response.xpath(internal_id_xpath))
    yield product

    for node in response.xpath(review_nodes_xpath):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item['TestUrl'] = response.url

        raw_date = self.extract(
            node.xpath('.//@datetime|.//div[@class="review-date"]/text()'))
        # Dates containing "am" use the "am dd.mm.yyyy" form; the others are
        # parsed as year-month-day.
        date_pattern = "am %d.%m.%Y" if 'am' in raw_date else "%Y-%m-%d"
        item['TestDateText'] = date_format(raw_date, date_pattern)

        item['SourceTestRating'] = self.extract(node.xpath(
            './/meta[@itemprop="rating"]/@content|.//@data-rating'))
        item['Author'] = self.extract(node.xpath(
            './/span[@itemprop="reviewer"]/text()|.//span[@class="click"]/text()'))
        item['TestTitle'] = self.extract(node.xpath(
            './/span[@itemprop="summary"]/text()|.//div[contains(@class,"review-title")]/text()'))
        item['TestSummary'] = self.extract_all(node.xpath(
            './/p[@itemprop="description"]/span//text()|.//div[contains(@class,"review-content")]//text()'))
        yield item
def parse_review(self, response):
    """Yield one USER review per review container, then follow pagination.

    Reads ``product`` and ``review_url`` from ``response.meta``. When a
    ``rel="next"`` link is present, a request for it is yielded with this
    same method as callback and the same meta.
    """
    product = response.meta['product']
    review_url = response.meta['review_url']

    for container in response.xpath("//div[@itemprop='review']"):
        item = ReviewItem()
        item['SourceTestRating'] = self.extract(
            container.xpath(".//*[@itemprop='ratingValue']/@content"))
        raw_date = self.extract(
            container.xpath(".//span[@itemprop='datePublished']/text()"))
        item['TestDateText'] = raw_date
        # Body text, skipping text nodes that live inside links.
        item['TestSummary'] = self.extract_all(container.xpath(
            ".//p[@itemprop='reviewBody']//text()"
            "[not(ancestor::a)]"))
        item['Author'] = self.extract(
            container.xpath(".//a[@itemprop='author']/text()"))
        item['TestTitle'] = self.extract(
            container.xpath(".//*[@itemprop='name']/text()"))
        item['TestUrl'] = review_url
        item["SourceTestScale"] = "5"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item["DBaseCategoryName"] = "USER"
        # Only normalise the date when one was actually extracted.
        if raw_date:
            item["TestDateText"] = date_format(raw_date, '')
        yield item

    next_href = self.extract(response.xpath("//*[@rel='next']/@href"))
    if next_href:
        next_url = get_full_url(response.url, next_href)
        yield Request(next_url, callback=self.parse_review,
                      meta=response.meta)
def parse_review(self, response):
    """Build and yield the single USER review found on a review page.

    Reads ``product`` from ``response.meta``. The date and rating fields are
    only set when the page provides them, so a page without a star title no
    longer aborts the callback with an IndexError.
    """
    product = response.meta['product']

    user_review = ReviewItem()
    user_review['DBaseCategoryName'] = "USER"
    user_review['ProductName'] = product['ProductName']
    user_review['TestUrl'] = response.url

    date = self.extract(response.xpath(
        '//span[@class="dtreviewed"]/span[@class="value-title"]/@title'))
    if date:
        user_review['TestDateText'] = date_format(date, '')

    rating_title = self.extract(response.xpath(
        '//div[@class="contentBox"]//a[contains(@class,"iReviewStars")]/@title'))
    # The stored rating is the matched "... star" fragment of the title.
    # `or ''` guards against a missing attribute.
    star_matches = re.findall(r'[^"]+ star', rating_title or '')
    if star_matches:
        user_review['SourceTestRating'] = star_matches[0]

    user_review['Author'] = self.extract(
        response.xpath('//a[@class="memberName"]/text()'))
    user_review['TestTitle'] = self.extract(
        response.xpath('//h3[contains(@class,"reviewTitle")]/text()'))
    user_review['TestSummary'] = self.extract_all(
        response.xpath('//div[contains(@class,"reviewText")]//text()'))
    user_review['TestPros'] = self.extract_all(
        response.xpath('//span[@class="reviewPros"]/parent::div/text()'))
    user_review['TestCons'] = self.extract_all(
        response.xpath('//span[@class="reviewCons"]/parent::div/text()'))
    yield user_review
def parse_reviews(self, response):
    """Yield a review item and a product item for every post on the page.

    Each ``<article class="post-content">`` produces its own fresh
    ``ReviewItem`` and ``ProductItem``. Previously a single pair of items was
    created once and mutated on every iteration, so all yielded items were
    the same two objects and later posts overwrote earlier ones.

    Posts whose title link cannot be split into a source id are skipped
    instead of raising.
    """
    for content in response.xpath('//article[@class="post-content"]'):
        title = self.extract(
            content.xpath('.//div//h1[@class="post-title"]//text()'))
        test_url = self.extract(
            content.xpath('.//div//h1[@class="post-title"]//a/@href'))

        # The source id is the second-to-last path segment of the post URL.
        url_parts = test_url.split('/') if test_url else []
        if len(url_parts) < 2:
            continue
        sid = url_parts[-2]

        author = self.extract(
            content.xpath('.//span[@itemprop="name"]/text()'))
        date_str = self.extract_all(
            content.xpath('.//meta[@itemprop="datePublished"]/@content'))
        date = date_format(date_str, '%Y-%m-%d')
        pic = self.extract(content.xpath('.//img/@src'))
        summary = self.extract_all(
            content.xpath('.//div[@itemprop="articleBody"]//text()'))

        # product item
        product = ProductItem()
        product['ProductName'] = title
        product['PicURL'] = pic
        product['source_internal_id'] = sid
        product['TestUrl'] = test_url

        # review item
        review = ReviewItem()
        review['ProductName'] = title
        review['TestTitle'] = title
        review['TestSummary'] = summary
        review['TestUrl'] = test_url
        review['DBaseCategoryName'] = 'pro'
        review['source_internal_id'] = sid
        review['TestDateText'] = date
        review['Author'] = author

        yield review
        yield product
def parse_pro(self, response):
    """Yield the professional review (if any), then request the user reviews.

    ``item`` comes from ``response.meta``. When the pro-review block exists,
    ``item['has_review']`` is set to 1 and a "PRO" ``ReviewItem`` is yielded.
    A request for ``<item url>/gebruikersreviews`` is always yielded last,
    carrying ``item`` in its meta.
    """
    item = response.meta['item']
    pro_block = response.xpath('//div[@id="besteproducttest"]')

    if pro_block:
        score_xpath = './/div[@class="block"]/div[contains(@class,"bp-review__intro__score")]//text()'
        item['has_review'] = 1

        pro_item = ReviewItem()
        pro_item['DBaseCategoryName'] = "PRO"
        pro_item['ProductName'] = item['name']
        pro_item['TestUrl'] = response.url
        published = self.extract(pro_block.xpath('.//@datetime'))
        pro_item['TestDateText'] = date_format(published, '')
        raw_score = self.extract(pro_block.xpath(score_xpath))
        # The score uses a decimal comma; store it with a decimal point.
        pro_item['SourceTestRating'] = raw_score.replace(",", ".")
        pro_item['Author'] = self.extract(
            pro_block.xpath('.//div[@class="avatar__title"]/text()'))
        pro_item['TestTitle'] = self.extract(pro_block.xpath('.//h1/text()'))
        pro_item['TestSummary'] = self.extract_all(
            pro_block.xpath('.//p/text()'))
        yield pro_item

    user_reviews_url = item['url'] + '/gebruikersreviews'
    follow_up = Request(url=user_reviews_url, callback=self.parse_user)
    follow_up.meta['item'] = item
    yield follow_up
def _parse_reviews(self, selector, browser, product):
    """Yield a USER review for each ``div[@data-review-id]`` in ``selector``.

    ``product`` supplies the product name and URL copied onto every review.
    ``browser`` is accepted but not used in this method.
    """
    xpaths = {
        'container': "//div[@data-review-id]",
        'author': ".//p[@class='pr-review-author-name']/span/text()",
        'rating': ".//span[contains(@class, 'pr-rating')]/text()",
        'title': ".//p[@class='pr-review-rating-headline']",
        'date': ".//div[contains(@class, 'pr-review-author-date')]/text()",
        'summary': ".//p[@class='pr-comments']/text()",
        # Defined by the original code but not used in this method.
        'next_page': "//a[@class='next_page']",
    }

    for container in selector.xpath(xpaths['container']):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['Author'] = self.extract(container.xpath(xpaths['author']))
        item['SourceTestRating'] = self.extract(
            container.xpath(xpaths['rating']))
        item['TestTitle'] = self.extract(container.xpath(xpaths['title']))
        item['TestSummary'] = self.extract(container.xpath(xpaths['summary']))
        # Store the raw date first, then replace it with the formatted one.
        item['TestDateText'] = self.extract(container.xpath(xpaths['date']))
        item['TestDateText'] = date_format(item['TestDateText'], '%d.%m.%Y')
        yield item
def parse_reviews(response):
    """Yield USER reviews parsed by regex from a JSON-like response body.

    Each review fragment runs from ``"CID":`` to the closing brace before
    ``"Badges"``. A fragment that cannot be turned into a review (for example
    no submission time or rating) is skipped, as before, but the failure is
    now logged instead of being silently discarded.

    The ``yield`` sits outside the ``try`` block. The previous bare
    ``except:`` also caught ``GeneratorExit``, which makes closing the
    generator raise ``RuntimeError``.
    """
    import logging

    log = logging.getLogger(__name__)
    fragments = re.findall(r'"CID":(((?!("Badges")).)+)}', response.body)
    for item in fragments:
        try:
            review = item[0]
            product = response.meta['product']

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']

            # Date and rating are mandatory: a missing one raises IndexError
            # and the fragment is skipped below.
            date = re.findall(r'"SubmissionTime":"([\d-]+)', review)
            user_review['TestDateText'] = date_format(date[0], "%Y-%m-%d")
            rate = re.findall(r'"Rating":([\d])', review)
            user_review['SourceTestRating'] = rate[0]

            # Optional fields are only set when present.
            author = re.findall(r'"UserNickname":"([^"]+)', review)
            if author:
                user_review['Author'] = author[0]
            title = re.findall(r'"Title":"([^"]+)', review)
            if title:
                user_review['TestTitle'] = title[0]
            summary = re.findall(r'"ReviewText":"([^"]+)', review)
            if summary:
                user_review['TestSummary'] = summary[0]
        except Exception:
            # Best effort: one malformed fragment must not stop the others.
            log.debug("Skipping unparsable review fragment", exc_info=True)
            continue
        yield user_review
def parse_review(self, response):
    """Yield the product and the PRO review scraped from a review article.

    The product name is derived from the lower-cased review title: the word
    "review" is removed, everything after the first colon is dropped, and the
    "- the times of india" suffix is stripped.
    """
    product = self.init_item_by_xpaths(response, "product", {
        "PicURL": "//*[@property='og:image']/@content",
    })
    review = self.init_item_by_xpaths(response, "review", {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestSummary": "//*[@name='og:description']/@content",
        "Author": "//span[@itemprop='reviewer']/text()",
        "SourceTestRating": "//div[contains(@class, 'expert-rating')]//span[@itemprop='rating']/text()",
        "TestDateText": "//div[@class='review']//span[@class='metadata']/text()[last()]",
        "TestPros": "//div[@class='features']/div/text()",
        "TestVerdict": "//div[@class='Normal']/text()[last()]",
    })

    name = review["TestTitle"].lower().replace("review", "").strip(":")
    # Keep only the part before the first colon; a name without a colon is
    # returned unchanged by split().
    name = name.split(":")[0]
    name = name.replace("- the times of india", "").strip()

    review["ProductName"] = name
    product["ProductName"] = name
    yield product

    review["DBaseCategoryName"] = "PRO"
    review["SourceTestScale"] = "5"
    review["ProductName"] = product["ProductName"]
    raw_date = review["TestDateText"]
    if raw_date:
        review["TestDateText"] = date_format(raw_date, "%d %B %Y")
    yield review
def parse_review(self, response, reviewData, extra_parser=None):
    """Build a USER review from one review record and return it.

    :param response: response whose ``meta['product']`` is the reviewed product
    :param reviewData: mapping holding the review fields (Rating, Title, ...)
    :param extra_parser: optional callable ``(review, reviewData) -> review``
        applied last
    :returns: the review item (or whatever ``extra_parser`` returns)

    When the record has no pros/cons text, the values listed under
    ``TagDimensions`` -> ``Pro`` / ``Con`` are joined with " ; " instead.
    """
    fields = dict(
        product=response.meta['product'],
        rating=reviewData['Rating'],
        scale=reviewData['RatingRange'],
        date=date_format(reviewData['SubmissionTime'], '%Y-%m-%dT%H:%M:%S'),
        author=reviewData['UserNickname'],
        title=reviewData['Title'],
        summary=reviewData['ReviewText'],
        pros=reviewData['Pros'],
        cons=reviewData['Cons'],
        tp='USER',
    )
    review = ReviewItem.from_product(**fields)

    for field, dimension in (('TestPros', 'Pro'), ('TestCons', 'Con')):
        if review.get(field, ''):
            continue
        tag_values = reviewData.get('TagDimensions', {}).get(
            dimension, {}).get('Values', [])
        review[field] = ' ; '.join(tag_values)

    if extra_parser:
        review = extra_parser(review, reviewData)
    return review
def parse_review(self, response):
    """Yield the product and the PRO review read from the page's meta tags.

    The category name is taken from ``response.meta['category_name']`` and
    the product name is the review title.
    """
    review_xpaths = {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestSummary": "//*[@property='og:description']/@content",
        "Author": "//a[@rel='author']/text()",
        "TestDateText": "//*[contains(@property, 'published_time')]/@content",
    }
    product_xpaths = {"PicURL": "//*[@property='og:image']/@content"}

    category_name = response.meta['category_name']
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    product["ProductName"] = review['TestTitle']
    product["OriginalCategoryName"] = category_name
    yield product

    review["ProductName"] = product["ProductName"]
    review["DBaseCategoryName"] = "PRO"
    review["SourceTestScale"] = "5"
    published = review["TestDateText"]
    if published:
        review["TestDateText"] = date_format(published, "%d %B %Y")
    yield review
def parse_reviews(self, response):
    """Yield one USER review per entry of the ``reviews-list`` element.

    ``product`` comes from ``response.meta`` and supplies the name, URL and
    internal id copied onto each review.

    The rating is read from the inline style of the filled star bar, e.g.
    ``width:80.00%`` -> ``"80"``. The previous code used
    ``str.strip('.00%')``, which strips a *set of characters* from both ends
    rather than a suffix, so ``100.00%`` became ``"1"`` and ``80.00%``
    became ``"8"``.
    """
    product = response.meta['product']
    for review in response.xpath('//ul[@class="reviews-list"]/li'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = self.extract(review.xpath('.//time/@datetime'))
        if date:
            user_review['TestDateText'] = date_format(date, "%Y %m %d")

        style = self.extract(review.xpath(
            './/div[contains(@class,"rateit-selected")]/@style'))
        # Take the text between "width:" and "%", then drop a trailing ".00"
        # as a real suffix.
        rating = (style or '').partition('width:')[2].partition('%')[0].strip()
        if rating.endswith('.00'):
            rating = rating[:-len('.00')]
        user_review['SourceTestRating'] = rating

        user_review['Author'] = self.extract(
            review.xpath('.//div[@class="customer"]/span/text()'))
        user_review['TestTitle'] = self.extract(
            review.xpath('.//div[@class="title"]/text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//div[@class="copy"]/p/text()'))
        yield user_review
def parse_review(self, response):
    """Yield the PRO review and the product scraped from a review article.

    The internal id is the fifth "/"-separated piece of the page URL. The
    product name is the review title with "review"/"Video" markers, the site
    suffix and a few other fixed fragments removed.
    """
    review = self.init_item_by_xpaths(response, "review", {
        "TestTitle": "//meta[@property='og:title']/@content",
        "Author": "//div[@class='meta']/a/text()",
        "TestSummary": "//meta[@name='description']/@content",
    })
    product = ProductItem()

    # Fall back to the Open Graph description when the plain one is empty.
    if not review['TestSummary']:
        review['TestSummary'] = self.extract(
            response.xpath("//meta[@property='og:description']/@content"))

    page_url = response.url
    source_id = str(page_url).split('/')[4].rstrip('/')
    review['source_internal_id'] = source_id
    product['source_internal_id'] = source_id

    # product name
    title = (review['TestTitle']).encode('utf-8')
    if 'review' in title:
        name = title.replace(" review", "")
    elif 'Review' in title:
        name = title.replace(" Review", "")
    elif 'Video' in title:
        name = title.replace(" Video", "").split(":")[0]
    elif ':' in title:
        name = str(title).split(":")[0]
    else:
        name = title
    # Remove the remaining fixed fragments, in the original order.
    for fragment in (" - Carryology - Exploring better ways to carry",
                     " Video", "Drive By", ":", " |", " Carryology"):
        name = name.replace(fragment, "")
    review['ProductName'] = name
    product['ProductName'] = name

    score = self.extract(
        response.xpath("//div[@class='bar']/span[@class='score']/text()"))
    if score:
        review['SourceTestRating'] = score
        review['SourceTestScale'] = '10'

    review['TestUrl'] = page_url
    raw_date = self.extract(
        response.xpath("//div[@class='meta']/text()[2]"))
    cleaned_date = str(raw_date).lstrip(", ")
    review['TestDateText'] = date_format(cleaned_date, "%B %d, %Y")
    review['DBaseCategoryName'] = 'PRO'

    product['TestUrl'] = page_url
    product['OriginalCategoryName'] = self.extract(
        response.xpath("//div[@class='breadcrumbs']//span/text()"))
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))

    yield review
    yield product
def parse_reviews(self, response):
    """Yield one USER review per ``reviews-list`` entry, then follow "see more".

    ``product`` comes from ``response.meta`` and is passed on to the request
    for the next page of reviews.

    Pros are read from the paragraph following the ``label-pros`` label and
    cons from the one following ``label-cons``. The two selectors were
    previously swapped.
    """
    product = response.meta["product"]
    next_page_xpath = "//div[@id='review-list']/div[@class='see-more-bar']//a/@href"
    # Paragraph right after the label; a lone "-" placeholder is ignored.
    pros_xpath = ".//p[contains(@class,'label-pros')]/following::p[1][not(text()='-')]/text()"
    cons_xpath = ".//p[contains(@class, 'label-cons')]/following::p[1][not(text()='-')]/text()"

    for review in response.xpath('//ul[@id="reviews-list"]/li'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = self.extract(review.xpath('.//h3/a/@href'))

        date = self.extract(
            review.xpath('.//meta[@itemprop="datePublished"]/@content'))
        if date:
            # Keep only the leading YYYY-MM-DD part of the timestamp.
            date = date[:10]
            user_review['TestDateText'] = date_format(date, "%Y-%m-%d")

        user_review['SourceTestRating'] = self.extract(
            review.xpath('.//span[@itemprop="reviewRating"]/@content'))
        if user_review['SourceTestRating']:
            user_review['SourceTestScale'] = 5

        user_review['Author'] = self.extract(
            review.xpath('.//a[@class="user-link"]//text()'))
        user_review['TestTitle'] = self.extract(review.xpath('.//h3//text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//div[@class="review-text"]//span/span/text()'))
        user_review['TestPros'] = self.extract_all(review.xpath(pros_xpath))
        user_review['TestCons'] = self.extract_all(review.xpath(cons_xpath))
        yield user_review

    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        request = Request(url=next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_reviews(self, response):
    """Yield category (if matched), product and PRO review for a review page.

    Nothing is produced unless the page has an ``h1[@itemprop='itemReviewed']``
    heading. The category is taken from group 1 of ``self.category_re``
    matched against the page URL.
    """
    if not response.xpath("//h1[@itemprop='itemReviewed']"):
        return

    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "ProductManufacturer": "(//span[@class='detail-label' and text()='Manufacture']/following-sibling::span[@class='detail-content'])[1]//text()",
    }
    review_xpaths = {
        "TestTitle": "//h1[@itemprop='itemReviewed']/text()",
        "TestSummary": "(//span[@class='detail-label' and text()='Overview']/following-sibling::span[@class='detail-content'])[1]/p[1]//text()",
        "Author": "//span[@itemprop='author']/text()",
        "SourceTestRating": "//meta[@itemprop='ratingValue']/@content",
        "TestDateText": "//meta[@itemprop='datePublished']/@content",
        "TestVerdict": "(//div[@class='bottomline']/p)[1]//text()",
    }
    # Used when the "Overview" section yields no summary.
    fallback_summary_xpath = "(//span[@class='detail-label' and text()='Overall']/following-sibling::span[@class='detail-content'])[1]/p[1]//text()"

    category = None
    url_match = re.search(self.category_re, response.url)
    if url_match:
        category = CategoryItem()
        category["category_leaf"] = url_match.group(1)
        category["category_path"] = url_match.group(1)
        yield category

    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    if category:
        product['OriginalCategoryName'] = category["category_path"]
    product['ProductName'] = strip(
        review['TestTitle'].replace('A Review of the', ''))
    review['ProductName'] = product['ProductName']

    pros_block = response.css(".procon.pro")
    review["DBaseCategoryName"] = "PRO"
    if not review['TestSummary']:
        review['TestSummary'] = self.extract_all(
            response.xpath(fallback_summary_xpath))

    # Bullet characters are stripped from each pro/con entry.
    review['TestPros'] = self.extract_all(
        pros_block.xpath('./p/text()'),
        separator=' ; ', strip_unicode=[u'\u2022'])
    cons_block = response.css(".procon.con")
    review['TestCons'] = self.extract_all(
        cons_block.xpath('./p/text()'),
        separator=' ; ', strip_unicode=[u'\u2022'])

    review['TestDateText'] = date_format(review['TestDateText'], '%b %d, %Y')

    yield product
    yield review
def parse_product(self, response):
    """Yield the product on the page, then one USER review per opinion block.

    Opinion blocks that contain an "Opinia z serwisu Ceneo.pl" link are
    excluded by the selector.
    """
    manufacturer_xpath = '//div[@class="manufacturer"]//span[not(text()="brak")]/text()'
    opinions_xpath = '//div[@class="opinion"][not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]'

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//h1[@itemprop="itemreviewed"]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="productPhotoGallery"]/div/img/@src'))
    product['ProductManufacturer'] = self.extract(
        response.xpath(manufacturer_xpath))
    yield product

    for opinion in response.xpath(opinions_xpath):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        posted = self.extract(opinion.xpath('.//span[@class="date"]/text()'))
        item['TestDateText'] = date_format(posted, "%Y-%m-%d")
        item['SourceTestRating'] = self.extract(
            opinion.xpath('.//span[@class="points"]/text()'))
        item['Author'] = self.extract_all(
            opinion.xpath('.//*[@class="profileName"]//text()'))
        item['TestSummary'] = self.extract_all(
            opinion.xpath('.//div[@class="text"]//text()'))
        # Pros and cons entries are joined with "; ".
        item['TestPros'] = self.extract_all(
            opinion.xpath('.//ul[@class="pluses"]//span/text()'), '; ')
        item['TestCons'] = self.extract_all(
            opinion.xpath('.//ul[@class="minuses"]//span/text()'), '; ')
        yield item
def parse_reviews(response):
    """Yield USER reviews for the current product from the response body.

    Review fragments are cut out of the body by regex (from ``TagDimensions``
    to ``ModerationStatus``). Only fragments whose ``ProductId`` equals the
    ``source_internal_id`` of ``response.meta['product']`` are used.

    Date and rating are now set only when present, like the other optional
    fields. Previously a fragment without ``SubmissionTime`` or ``Rating``
    raised IndexError and aborted the whole generator, dropping every
    following review.
    """
    fragments = re.findall(
        r'TagDimensions(((?!(TagDimensions|SyndicationSource)).)+)ModerationStatus',
        response.body)
    for item in fragments:
        review = item[0]
        product = response.meta['product']

        sii = re.findall(r'"ProductId":"([\d-]+)', review)
        if product['source_internal_id'] not in sii:
            # This fragment belongs to another product.
            continue

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = re.findall(r'"SubmissionTime":"([\d-]+)', review)
        if date:
            user_review['TestDateText'] = date_format(date[0], "%Y-%m-%d")
        rate = re.findall(r'"Rating":([\d])', review)
        if rate:
            user_review['SourceTestRating'] = rate[0]
        author = re.findall(r'"UserNickname":"([^"]+)', review)
        if author:
            user_review['Author'] = author[0]
        title = re.findall(r'"Title":"([^"]+)', review)
        if title:
            user_review['TestTitle'] = title[0]
        summary = re.findall(r'"ReviewText":"([^"]+)', review)
        if summary:
            user_review['TestSummary'] = summary[0]
        yield user_review
def parse_product(self, response):
    """Yield product, user reviews and a request for the expert review.

    Nothing is produced when the page has no ``<section>`` holding review
    articles. The category name is fixed to 'Cell Phones'. When an expert
    article link exists, a request for it is yielded with the product in its
    meta and ``self.parse_review`` as callback.
    """
    review_section = response.xpath(
        '//section[article[contains(@class,"review")]]')
    if not review_section:
        return

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = 'Cell Phones'
    product['ProductName'] = self.extract(
        response.xpath('//meta[@itemprop="name"]/@content'))
    image = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['PicURL'] = get_full_url(response, image)
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    yield product

    for article in review_section.xpath('./article[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        posted = self.extract(article.xpath('.//span[@class="time"]/text()'))
        item['TestDateText'] = date_format(posted, '')
        item['SourceTestRating'] = self.extract(
            article.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(
            article.xpath('.//span[@itemprop="author"]/text()'))
        item['TestPros'] = self.extract_all(
            article.xpath('.//div[contains(@class,"positives")]/text()'), '; ')
        item['TestCons'] = self.extract_all(
            article.xpath('.//div[contains(@class,"negatives")]/text()'), '; ')
        yield item

    expert_href = self.extract(review_section.xpath(
        './article[contains(@class,"expert")]/div/a/@href'))
    if expert_href:
        follow_up = Request(url=get_full_url(response, expert_href),
                            callback=self.parse_review)
        follow_up.meta['product'] = product
        yield follow_up
def parse_product(self, response):
    """Yield the product on the page, then one USER review per ``<article>``.

    The review date is the first 10-character run of digits and slashes in
    the date element, parsed as dd/mm/yyyy. It is left unset when no such
    run is found.
    """
    brand_xpath = '//span[text()="Marca"]/parent::li/span[@class="value"]/text()'

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="images"]/a/img/@src'))
    product['ProductManufacturer'] = self.extract(response.xpath(brand_xpath))
    product['source_internal_id'] = self.extract(
        response.xpath('//input[@id="prodId"]/@value'))
    yield product

    for article in response.xpath('//article[@itemscope]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item['TestUrl'] = product['TestUrl']

        date_text = self.extract(article.xpath('.//div[@class="date"]/text()'))
        found_dates = re.findall(r'[\d/]{10}', date_text)
        if found_dates:
            item['TestDateText'] = date_format(found_dates[0], "%d/%m/%Y")

        item['SourceTestRating'] = self.extract(
            article.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(article.xpath('.//h2/a/text()'))
        item['TestTitle'] = self.extract(article.xpath('.//h3/a/text()'))
        item['TestSummary'] = self.extract_all(
            article.xpath('.//p[@itemprop="reviewBody"]/text()'))
        item['TestPros'] = self.extract_all(
            article.xpath('.//div[@class="pro"]//li/text()'), '; ')
        item['TestCons'] = self.extract_all(
            article.xpath('.//div[@class="con"]//li/text()'), '; ')
        yield item
def parse_product(self, response):
    """Yield the product on the page, then one USER review per review entry.

    Review entries are the ``<li itemprop="review">`` elements. Their dates
    are parsed as dd/mm/yyyy.
    """
    picture_xpath = '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src'
    brand_xpath = '//td[text()="Constructeur"]/following-sibling::td/text()'

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    relative_picture = self.extract(response.xpath(picture_xpath))
    product['PicURL'] = get_full_url(response, relative_picture)
    product['ProductManufacturer'] = self.extract(response.xpath(brand_xpath))
    yield product

    for entry in response.xpath('//li[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        published = self.extract(
            entry.xpath('.//span[@itemprop="datePublished"]/text()'))
        item['TestDateText'] = date_format(published, '%d/%m/%Y')
        item['SourceTestRating'] = self.extract(
            entry.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(
            entry.xpath('.//span[@itemprop="author"]/text()'))
        item['TestTitle'] = self.extract(
            entry.xpath('.//div[@itemprop="name"]/text()'))
        item['TestSummary'] = self.extract_all(
            entry.xpath('.//blockquote/text()'))
        yield item
def level_4(self, response):
    """Yield USER reviews from a JSONP review response.

    The callback wrapper (``bv_1111_60234(...)``) is removed from the body,
    the JSON is decoded and every entry of
    ``BatchedResults -> q0 -> Results`` becomes a ``ReviewItem``. Product
    name and URL come from ``response.meta``.

    Each entry is handled on its own: a malformed entry is logged and
    skipped instead of silently ending the whole loop. The ``yield`` sits
    outside the ``try`` block because the previous bare ``except:`` also
    caught ``GeneratorExit``.
    """
    import logging

    log = logging.getLogger(__name__)
    pname = response.meta["ProductName"]
    test_url = response.meta["TestUrl"]

    json_string = response.body.replace('bv_1111_60234', '').strip('()')
    data = json.loads(json_string)
    # A null result list is treated as "no reviews".
    results = data['BatchedResults']['q0']['Results'] or []

    for item in results:
        try:
            review = ReviewItem()
            review['DBaseCategoryName'] = "USER"
            review['ProductName'] = pname
            review['TestUrl'] = test_url
            review['source_internal_id'] = item['ProductId']
            review['TestDateText'] = item['SubmissionTime']
            if review['TestDateText']:
                review['TestDateText'] = date_format(
                    review['TestDateText'], '')
            review['SourceTestRating'] = item['Rating']
            review['SourceTestScale'] = '5'
            review['Author'] = item['UserNickname']
            review['TestTitle'] = item['Title']
            review['TestSummary'] = item['ReviewText']
            review['TestPros'] = item['Pros']
            review['TestCons'] = item['Cons']
        except Exception:
            # Best effort: keep going with the remaining entries.
            log.debug("Skipping malformed review entry", exc_info=True)
            continue
        yield review
def parse_reviews(self, response):
    """Yield the product, then one USER review per ``gh_box`` list entry.

    ``category`` and ``product_id`` are read from ``response.meta``. The
    picture URL is built from the product id.
    """
    meta = response.meta
    category = meta['category']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//span[@class="fn"]/text()'))
    product_id = meta['product_id']
    product['PicURL'] = 'http://geizhals.at/p/' + product_id + '.jpg'
    product['source_internal_id'] = product_id
    yield product

    for box in response.xpath('//li[contains(@class,"gh_box")]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        userbox_text = self.extract(
            box.xpath('.//div[@class="userbox"]/text()'))
        # Trim the characters "a", "m" and space from both ends.
        posted = userbox_text.strip('am ')
        item['TestDateText'] = date_format(posted, "%d.%m.%Y %H:%M")
        item['SourceTestRating'] = self.extract(
            box.xpath('.//span[@itemprop="rating"]/text()'))
        item['Author'] = self.extract(
            box.xpath('.//span[contains(@class,"nick")]/text()'))
        item['TestTitle'] = self.extract(box.xpath('.//h3//text()'))
        item['TestSummary'] = self.extract_all(
            box.xpath('.//div[@itemprop="description"]//text()'))
        item['source_internal_id'] = product['source_internal_id']
        yield item
def parse_reviews(self, response):
    """Yield one USER review per ``caja-comentarios`` block.

    Product name, URL and internal id come from ``response.meta['product']``.

    Every ``current-rating`` entry of the form "x/5" adds ``x`` to the rating
    and 5 to the scale, so the stored rating and scale are sums over all
    sub-ratings. Entries that do not contain a parsable "x/5" value are now
    ignored instead of raising IndexError or ValueError.
    """
    for review in response.xpath('//div[@class="caja-comentarios"]'):
        product = response.meta['product']

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = self.extract(review.xpath('./p/text()[2]'))
        user_review['TestDateText'] = date_format(date, '%d-%m-%Y')

        rates = self.extract_list(
            review.xpath('.//li[@class="current-rating"]'))
        scale = 0
        rating = 0
        for rate in rates:
            rate_match = re.findall(r'([\d.]+)/5', rate)
            if not rate_match:
                continue
            try:
                value = float(rate_match[0])
            except ValueError:
                # e.g. ".." matches the pattern but is not a number
                continue
            rating += value
            scale += 5
        user_review['SourceTestRating'] = str(rating)
        user_review['SourceTestScale'] = str(scale)

        user_review['Author'] = self.extract(
            review.xpath('.//span[contains(@class,"nick")]/text()'))
        user_review['TestSummary'] = self.extract(
            review.xpath('.//div[@class="caja"]/text()[1]'))
        user_review['TestPros'] = self.extract(review.xpath(
            './/strong[contains(text(),"Ventajas")]/following-sibling::text()[1]'))
        user_review['TestCons'] = self.extract(review.xpath(
            './/strong[contains(text(),"Desventajas")]/following-sibling::text()[1]'))
        yield user_review
def parse_review(self, response):
    """Build the PRO review and return it, or a request for its verdict page.

    When the page links to a "Fazit" page, a follow-up request is returned
    with the partially filled review in its meta and
    ``self.get_test_verdict`` as callback. Otherwise
    ``self.extract_test_verdict`` is called on this page and the review is
    returned.
    """
    og_title_xpath = '//meta[@property="og:title"]/@content'
    review = self.init_item_by_xpaths(response, 'review', {
        'ProductName': og_title_xpath,
        'TestTitle': og_title_xpath,
        'Author': '(//a[@class="username"])[1]/text()',
        "TestDateText": '(//div[@class="pane-content"])[2]/text()',
        'TestSummary': '//meta[@property="og:description"]/@content',
    })

    review["ProductName"] = review["ProductName"].replace("Test:", '')
    review["source_internal_id"] = self.get_source_internal_id(response)
    review["TestDateText"] = date_format(review["TestDateText"], "%d %b %Y")
    review['DBaseCategoryName'] = 'PRO'

    verdict_href = self.extract(
        response.xpath("//a[contains(text(),'Fazit')]/@href"))
    if not verdict_href:
        self.extract_test_verdict(response, review)
        return review

    return response.follow(
        verdict_href,
        callback=self.get_test_verdict,
        meta={'review': review}
    )
def parse_reviews(self, response):
    """Yield the product, then one USER review per ``opinion-row`` entry.

    The category path is read from ``response.meta['category']``.
    """
    category = response.meta['category']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/a/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    product['source_internal_id'] = self.extract(
        response.xpath('//@data-product-id'))
    yield product

    for row in response.xpath('//li[@class="opinion-row"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['source_internal_id'] = product['source_internal_id']
        published = self.extract(
            row.xpath('.//meta[@itemprop="datePublished"]/@content'))
        item['TestDateText'] = date_format(published, "%Y %m %d")
        item['SourceTestRating'] = self.extract(
            row.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(row.xpath('.//h4/text()'))
        item['TestTitle'] = self.extract(
            row.xpath('.//div[contains(@class,"grade-text")]/text()'))
        item['TestSummary'] = self.extract_all(
            row.xpath('.//div[@itemprop="description"]/text()'))
        yield item
def parse_review(self, response):
    """Yield the product and PRO review of an article newer than the last run.

    The product name is the entry title up to " Review". Nothing is yielded
    when the review date is older than ``self.stored_last_date``.

    Cleanup compared to the previous version: the ``'TestDateText'`` key was
    listed twice in the review XPath mapping (same value), and
    ``product_name`` was initialised to a value that was always overwritten.
    """
    product_xpaths = {
        'OriginalCategoryName': '//div[@class="entry-crumbs"]/'
                                'span[last()-1]/a/text()',
        'PicURL': '//meta[@property="og:image"]/@content',
        'source_internal_id': 'substring-after(//article/@id, "-")',
        'TestUrl': '//meta[@property="og:url"]/@content',
    }
    review_xpaths = {
        # Date part of the ISO timestamp, i.e. everything before the "T".
        'TestDateText': 'substring-before(//time/@datetime, "T")',
        'TestPros': '//p[contains(., "Pros")]/following-sibling::p[1]'
                    '/text()',
        'TestCons': '//p[contains(., "Cons")]/following-sibling::p[1]'
                    '/text()',
        'TestSummary': '//meta[@property="og:description"]/@content',
        'TestVerdict': '//div[@class="td-review-summary-content"]/text()|'
                       '//h3[last()]/following-sibling::p[1]/text()',
        'Author': '//div[@class="td-post-author-name"]/a/text()',
        'TestTitle': '//meta[@property="og:title"]/@content',
        'source_internal_id': 'substring-after(//article/@id, "-")',
        'TestUrl': '//meta[@property="og:url"]/@content',
    }

    product = self.init_item_by_xpaths(response, 'product', product_xpaths)
    review = self.init_item_by_xpaths(response, 'review', review_xpaths)

    product_name_xpath = '//h1[@class="entry-title"]/text()'
    p_name = self.extract(response.xpath(product_name_xpath))
    # A title without " Review" is returned whole by split().
    product_name = p_name.split(' Review')[0]
    product['ProductName'] = product_name
    review['ProductName'] = product_name

    rating_xpath = '//div[@class="td-review-final-score"]/text()'
    rating = self.extract(response.xpath(rating_xpath))
    SCALE = 5
    if rating:
        review['SourceTestRating'] = rating
        review['SourceTestScale'] = SCALE

    review['DBaseCategoryName'] = 'PRO'

    test_day = review['TestDateText']
    date_str = date_format(test_day, '%Y-%m-%d')
    date_time = datetime.strptime(date_str, '%Y-%m-%d')
    # Skip articles older than the last stored date.
    if date_time < self.stored_last_date:
        return

    yield product
    yield review
def parse_items(self, response):
    """Yield the review, an optional price id item and the product.

    The product name is the manufacturer followed by the model text.  The
    review date is the third space-separated token of the text under
    ``div.offers/small``.  Items are only yielded when that date is newer
    than ``self.stored_last_date``; the price item is only yielded when a
    price was found on the page.
    """
    product_xpaths = {
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductManufacturer": "//tr[@class='marke-hersteller']/td/a/text()"
    }
    review_xpaths = {
        "TestSummary": "//div[@id='review_body']/div[1]/p/text()",
        "TestVerdict": "(//div[@id='review_body']/div/p/text())[last()]",
        "TestTitle": "(//title/text())[1]",
        "Author": "//span/meta[@itemprop='author']/@content",
        "TestPros": "//div[@class='list-advantages']/ul/li/div/text()",
        "TestCons": "//div[@class='list-disadvantages']/ul/li/div/text()",
        "SourceTestRating": "//span/meta[@itemprop='ratingValue']/@content"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    productname = self.extract(
        response.xpath("//tr[@class='modell']/td/span/text()"))
    productmanu = product['ProductManufacturer']
    review['ProductName'] = productmanu + " " + productname
    product['ProductName'] = review['ProductName']

    source_internal_id = self.extract(
        response.xpath("//div/meta[@itemprop='productID']/@content"))
    review['source_internal_id'] = source_internal_id
    product['source_internal_id'] = source_internal_id

    # Fall back to the first lazily loaded image when og:image is missing.
    if not product['PicURL']:
        product['PicURL'] = self.extract(
            response.xpath("(//div/a/img/@data-src)[1]"))

    if review['SourceTestRating']:
        review['SourceTestScale'] = "5"
    review["DBaseCategoryName"] = "PRO"

    review_date = self.extract(
        response.xpath("//div[@class='offers']/small/text()"))
    date = str(review_date).split(" ")[2]
    review['TestDateText'] = date_format(date, '%d.%m.%Y')

    # Keep the price as text: encoding it to bytes and calling str() on the
    # result yields "b'...'" on Python 3 and corrupts the stored value.
    price = self.extract(response.xpath("//div[@class='price']/text()"))
    product_id = None
    if price:
        product_id = ProductIdItem()
        product_id['ID_kind'] = 'price'
        product_id['ID_value'] = price.split(' ')[0]
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = product['source_internal_id']

    # strptime below relies on date_format having produced '%Y-%m-%d'.
    published = datetime.strptime(review['TestDateText'], "%Y-%m-%d")
    if published > self.stored_last_date:
        yield review
        # Pages without a price have no id item to emit.
        if product_id is not None:
            yield product_id
        yield product
def parse_review(self, selector, url):
    """Yield the product and PRO review scraped from ``selector``.

    :param selector: object exposing ``xpath()`` for the review page.
    :param url: stored as ``TestUrl`` on both items.

    The product name is the part of the review title that precedes
    "Review" (matched case-insensitively); when the title does not match,
    the whole title is used.  The image and the summary each fall back to
    their Open Graph meta tag when the primary XPath yields nothing.
    """
    image_xpath = "//img[@itemprop='image']/@src"
    image_alt_xpath = "//meta[@property='og:image']/@content"
    manufacturer_xpath = "(//span[@class='taxName' and text()='Manufacturer']/following-sibling::span[@class='taxContent'])[1]//text()"
    title_xpath = "//span[@itemprop='itemReviewed']/text()"
    summary_xpath = "//*[@itemprop='description']//text()"
    summary_alt_xpath = "//meta[@property='og:description']/@content"
    author_xpath = "//span[@itemprop='author']//text()"
    date_xpath = "//meta[@itemprop='datePublished']/@content"
    pros_xpath = "//div[@class='positive-wrapper']/text()"
    cons_xpath = "//div[@class='negative-wrapper']/text()"
    rating_value_xpath = "//*[@itemprop='ratingValue']/text()"
    rating_scale_xpath = "//*[@itemprop='bestRating']/text()"

    review = ReviewItem()
    review["TestTitle"] = self.extract_all(selector.xpath(title_xpath))
    review["TestSummary"] = self.extract_all(selector.xpath(summary_xpath))
    review["Author"] = self.extract(selector.xpath(author_xpath))
    review["TestDateText"] = self.extract(selector.xpath(date_xpath))
    review["TestPros"] = self.extract_all(selector.xpath(pros_xpath), ' ; ')
    review["TestCons"] = self.extract_all(selector.xpath(cons_xpath), ' ; ')
    review["SourceTestRating"] = self.extract(
        selector.xpath(rating_value_xpath))
    review["SourceTestScale"] = self.extract(
        selector.xpath(rating_scale_xpath))
    review["DBaseCategoryName"] = "PRO"
    review["TestUrl"] = url
    review["TestDateText"] = date_format(review["TestDateText"], "%b %d,%Y")

    product = ProductItem()
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    product_name_re = r"(.+)\sReview"
    name_match = re.search(product_name_re, review["TestTitle"],
                           re.IGNORECASE)
    if name_match:
        product["ProductName"] = name_match.group(1)
    else:
        product["ProductName"] = review["TestTitle"]
    review["ProductName"] = product["ProductName"]

    product["TestUrl"] = url
    product["PicURL"] = self.extract(selector.xpath(image_xpath))
    product["ProductManufacturer"] = self.extract(
        selector.xpath(manufacturer_xpath))

    if not product["PicURL"]:
        product["PicURL"] = self.extract(selector.xpath(image_alt_xpath))
    if not review["TestSummary"]:
        review["TestSummary"] = self.extract(
            selector.xpath(summary_alt_xpath))

    yield product
    yield review
def parse_review(self, response):
    """Yield USER reviews from one review page and follow the next page.

    The product comes from ``response.meta['product']``.  Processing stops
    for good (no further reviews, no pagination) as soon as a review is
    dated before the latest stored user review for this product, or a
    review has neither pros nor cons.
    """
    product = response.meta['product']
    review_nodes = response.xpath('//article[contains(@id,"review_")]')
    if not review_nodes:
        return

    # From observation, at least currys.co.uk uses a different format to
    # present review rating, so an XPath supplied through meta wins over
    # the default one.
    rating_xpath = (response.meta.get('rating_xpath', '')
                    or './/reevoo-score/@data-score')

    newest_stored = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])

    for node in review_nodes:
        entry = ReviewItem()

        raw_date = self.extract(
            node.xpath('.//span[contains(@class, "date_publish")]/text()'))
        if raw_date:
            entry['TestDateText'] = date_format(raw_date, '')
            parsed = dateparser.parse(
                entry['TestDateText'], date_formats=['%Y-%m-%d'])
            if parsed < newest_stored:
                return

        entry['DBaseCategoryName'] = "USER"
        entry['ProductName'] = product['ProductName']
        entry['TestUrl'] = product['TestUrl']
        entry['SourceTestRating'] = self.extract(node.xpath(rating_xpath))
        entry['Author'] = self.extract(
            node.xpath('.//h4[@class="attribution-name"]/text()'))
        entry['TestPros'] = self.extract_all(
            node.xpath('.//dd[@class="pros"]/text()'))
        entry['TestCons'] = self.extract_all(
            node.xpath('.//dd[@class="cons"]/text()'))
        entry['source_internal_id'] = product['source_internal_id']

        # All reviews after first empty review are empty
        if not (entry['TestPros'] or entry['TestCons']):
            return
        yield entry

    next_href = self.extract(response.xpath("(//*[@rel='next']/@href)[1]"))
    if not next_href:
        return
    yield Request(get_full_url(response, next_href),
                  callback=self.parse_review,
                  meta=response.meta)
def parse_items(self, response):
    """Yield the PRO review and product for a 'Review' or 'Hardware' page.

    The source-internal id is the fifth ``/``-separated piece of the URL.
    The product name is the headline without a trailing "review" word.
    When a publication date is present and is older than
    ``self.stored_last_date`` nothing is yielded.  Pages whose category
    chunk is neither 'Review' nor 'Hardware' yield nothing either.
    """
    review_xpaths = {
        "TestTitle": "//h1/text()",
        "TestVerdict": "//p[@class='game-verdict']/text()",
        "TestPros": "//div[@class='sub-box'][1]/ul/li/text()",
        "TestCons": "//div[@class='sub-box'][2]/ul/li/text()",
        "TestSummary": "//meta[@name='description']/@content"
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    product = ProductItem()

    internal_source_id = str(response.url).split("/")[4]
    review['source_internal_id'] = internal_source_id
    product['source_internal_id'] = internal_source_id

    headline = self.extract(
        response.xpath('//h1[@itemprop="name headline"]//text()'))
    # str.strip('review') would remove any of the characters r/e/v/i/w from
    # both ends (e.g. "wipeout review" -> "peout "), so the trailing word is
    # removed explicitly.  The headline stays text: encoding it to bytes and
    # calling str() on the result yields "b'...'" on Python 3.
    product_name = headline.strip()
    suffix = 'review'
    if product_name[-len(suffix):].lower() == suffix:
        head = product_name[:-len(suffix)]
        # Only cut a whole word, so e.g. "... preview" is left untouched.
        if not head or head[-1].isspace():
            product_name = head.rstrip()
    review['ProductName'] = product_name
    product['ProductName'] = product_name

    source_test_rating = self.extract(
        response.xpath(
            "//span[@class='score no-graphic score-short']/text()"))
    if source_test_rating:
        review['SourceTestRating'] = source_test_rating
        review['SourceTestScale'] = '10'

    product['TestUrl'] = response.url

    date_str = self.extract(response.xpath("//time/@datetime"))
    if date_str:
        # Keep only the date part in front of the "T" separator.
        date_str = str(date_str).split("T")[0]
        date_time = date_format(date_str, "%Y-%m-%d")
        date_time_to_compare = datetime.strptime(date_time, '%Y-%m-%d')
        if self.stored_last_date > date_time_to_compare:
            return
        review['TestDateText'] = date_time

    review['DBaseCategoryName'] = 'PRO'

    picture_src = self.extract(
        response.xpath(
            "//img[@class='TODO image-class block-image-ads']/@src"))
    product['PicURL'] = get_full_url(response.url, picture_src)

    cat = self.extract(
        response.xpath("//a[@class='chunk category']/text()"))
    if cat in ('Review', 'Hardware'):
        yield review
        yield product
def level_2(self, response):
    """Yield the product, then the PRO review or a request for its verdict.

    The product is always yielded first.  When the last chapter link is
    present, a follow-up request to it is yielded with the review in
    ``meta['review']`` and ``self.get_test_verdict`` as callback; otherwise
    ``self.extract_test_verdict`` is called on this response and the review
    is yielded directly.
    """
    product_xpaths = {"PicURL": '//meta[@property="og:image"]/@content'}
    product = self.init_item_by_xpaths(response, "product", product_xpaths)

    # Looked up once and shared by both items (same response, same result
    # assumed — NOTE(review): confirm the two helpers are side-effect free).
    product_name = self.get_product_name(response)
    source_internal_id = self.get_source_internal_id(response)
    product['ProductName'] = product_name
    product['source_internal_id'] = source_internal_id

    original_category_name_xpath = "(//img[@alt='Themen']/ancestor::div/"\
                                   "following-sibling::div)[1]/a/text()"
    original_category_name = self.extract_all(
        response.xpath(original_category_name_xpath), " | ")
    if original_category_name:
        product["OriginalCategoryName"] = original_category_name

    # "TestDateText" is listed once, with the 'Publikationsdatum' lookup.
    # A second entry for the same key in a dict literal is discarded by
    # Python, so only one expression can ever be in effect.
    review_xpaths = {
        'source_internal_id': "substring-after(//script["
                              "@type='text/javascript']/text(),'print/')",
        "TestDateText": "(//img[@alt='Publikationsdatum']/ancestor::div"
                        "/following-sibling::div)[1]/text()",
        "TestSummary": '//meta[@property="og:description"]/@content',
        "Author": "//meta[@itemprop='creator accountablePerson']/@content",
        "TestTitle": '//meta[@property="og:title"]/@content',
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    review['ProductName'] = product_name
    review['source_internal_id'] = source_internal_id

    yield product

    test_date = review["TestDateText"]
    if test_date:
        test_date = test_date.strip()
        review["TestDateText"] = date_format(test_date, "%d. %B %Y", ["de"])

    review["DBaseCategoryName"] = "PRO"

    verdict_url_xpath = "//div[@class='kapitel '][last()]/a/@href"
    verdict_page = self.extract(response.xpath(verdict_url_xpath))
    if verdict_page:
        yield response.follow(verdict_page,
                              callback=self.get_test_verdict,
                              meta={'review': review})
    else:
        self.extract_test_verdict(response, review)
        yield review