def parse_review(self, response):
    """Build a professional ("PRO") review item from a review page.

    The item is seeded from the page's JSON-LD: a ``Review`` entry is
    preferred, a ``NewsArticle`` entry is the fallback, and an empty
    ``ReviewItem`` is used when neither is found. Page-level data is then
    layered on top: URL, product name, source id, pros and cons.

    :param response: the downloaded review page.
    :return: the populated review item.
    """
    review_json_ld = extruct_helper.extract_json_ld(response.text, "Review")
    article_json_ld = extruct_helper.extract_json_ld(
        response.text, "NewsArticle")

    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld)
    elif article_json_ld:
        review = extruct_helper.review_item_from_article_json_ld(
            article_json_ld)
    else:
        review = ReviewItem()

    review['DBaseCategoryName'] = 'PRO'

    if not review.get('TestUrl', ''):
        review['TestUrl'] = response.url

    # The product data block on the page always replaces whatever product
    # name the JSON-LD supplied; get_product_name() is the last resort.
    review['ProductName'] = self.extract(
        response.xpath(
            "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
    if not review.get('ProductName', ''):
        review['ProductName'] = self.get_product_name(response)

    # The fifth "/"-separated piece of the URL is used as the source id.
    # Read it from response.url directly (not from the response's string
    # form) and leave the field unset when the URL is too short, instead
    # of raising IndexError and losing the whole review.
    url_parts = response.url.split("/")
    if len(url_parts) > 4:
        review['source_internal_id'] = url_parts[4]

    review['TestPros'] = self.extract(
        response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
    review['TestCons'] = self.extract(
        response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))

    return review
def parse_review(self, node, response):
    """Turn one result ``node`` into a professional ("PRO") review item.

    ``node`` is read with ``.get``: the top-level ``title``,
    ``description`` and ``url`` keys, plus a nested ``meta`` mapping
    (``id``, ``review_date``, ``expert_evaluation_float``, ``conclusion``,
    ``reviewer``). ``response`` is accepted but not used here.

    :return: the populated ``ReviewItem``.
    """
    meta = node.get('meta', {})

    review = ReviewItem()
    review['ProductName'] = node.get('title', '')
    review['source_internal_id'] = meta.get('id', '')
    review['TestDateText'] = meta.get('review_date', '')
    review['TestSummary'] = node.get('description', '')
    # The review title simply mirrors the product name.
    review['TestTitle'] = review.get('ProductName')
    review['TestUrl'] = node.get('url', '')

    review['SourceTestRating'] = meta.get('expert_evaluation_float', '')
    # Source ratings are on a scale of 10; only record the scale when
    # there is a rating to go with it.
    if review.get('SourceTestRating'):
        review['SourceTestScale'] = 10

    review['source_id'] = self.spider_conf['source_id']
    review['DBaseCategoryName'] = 'PRO'

    # Optional fields: set on the item only when the meta value is truthy.
    optional_fields = (
        ('TestVerdict', 'conclusion'),
        ('Author', 'reviewer'),
    )
    for item_key, meta_key in optional_fields:
        value = meta.get(meta_key, '')
        if value:
            review[item_key] = value

    return review
def level_4(self, response):
    """Scrape a product page: category, product, pagination and reviews.

    Yields, in this order:

    1. the ``CategoryItem`` built from the breadcrumbs (nothing at all is
       yielded when ``should_skip_category`` is truthy for it);
    2. the product item;
    3. a ``Request`` for the next page of reviews, handled by this same
       callback, when a "next" paging link is present;
    4. one user ("USER") ``ReviewItem`` per entry in the review list.

    :param response: the downloaded product page.
    """
    original_url = response.url

    category_leaf_xpath = "//ol[@class='breadcrumbs']//ol/li[last()]//a//span//text()"
    category_path_xpath = "//ol[@class='breadcrumbs']//span//text()"
    product_id_xpath = "//div[@class='overviewHeaderTitle']//h1/a/@href"
    product_name_xpath = "//div[@class='overviewHeaderTitle']//h1/a//text()"
    next_page_xpath = "//div[contains(@class,'paging-footer')]//a[contains(@class,'next')]/@href"

    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    numeric_id_re = re.compile(r"((?<=/)\d+(?=/))", re.IGNORECASE)

    def absolute_url(link):
        # Protocol-relative links get an explicit https scheme; other
        # root-relative links are resolved against the page URL. The two
        # cases are exclusive so the first result is not overwritten.
        if link.startswith("//"):
            return "https:" + link
        if link.startswith("/"):
            return get_full_url(original_url, link)
        return link

    def numeric_id(value):
        # Reduce an href to the first run of digits enclosed by slashes;
        # values without such a run are returned unchanged.
        found = numeric_id_re.search(value)
        return found.group(1) if found else value

    # --- category -------------------------------------------------------
    category = CategoryItem()
    category['category_url'] = original_url
    category['category_leaf'] = self.extract(response.xpath(category_leaf_xpath))
    category['category_path'] = self.extract_all(response.xpath(category_path_xpath), ' | ')

    if self.should_skip_category(category):
        return

    yield category

    # --- product --------------------------------------------------------
    product_xpaths = {
        "source_internal_id": product_id_xpath,
        "ProductName": product_name_xpath,
        "OriginalCategoryName": "//ol[@class='breadcrumbs']//span//text()",
        "PicURL": "//div[@class='headerContent']//img/@src",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['TestUrl'] = original_url

    picurl = product.get("PicURL", "")
    if picurl:
        product["PicURL"] = absolute_url(picurl)

    # Make sure the manufacturer key exists, even if only as "".
    if product.get("ProductManufacturer", "") == "":
        product["ProductManufacturer"] = ""

    # Prefer the joined breadcrumb path over the raw XPath extraction.
    try:
        product["OriginalCategoryName"] = category['category_path']
    except KeyError:
        pass

    product_id = product.get("source_internal_id", "")
    if product_id:
        product["source_internal_id"] = numeric_id(product_id)

    yield product

    # --- pagination -----------------------------------------------------
    next_page_href = self.extract(response.xpath(next_page_xpath))
    if next_page_href:
        yield Request(get_full_url(original_url, next_page_href),
                      callback=self.level_4)

    # --- user reviews ---------------------------------------------------
    # These two XPaths are absolute, so they give the same result for
    # every review container; evaluate them once for the whole page.
    page_product_id = self.extract(response.xpath(product_id_xpath))
    page_product_name = self.extract(response.xpath(product_name_xpath))

    # (item field, XPath relative to the review container), in the order
    # the fields are assigned.
    review_field_xpaths = (
        ('SourceTestRating', ".//div[@class='reviewAverageRating']//meter/@value"),
        ('TestDateText', ".//span[@class='writeDate']//time//text()"),
        ('TestPros', ".//div[@class='pros']//ul/li//text()"),
        ('TestCons', ".//div[@class='cons']//ul/li//text()"),
        ('TestSummary', ".//div[contains(@class,'reviewText')]//p[count(br)=0]/text() | .//div[contains(@class,'reviewText')]//br[position()=1]/preceding-sibling::text()[1]"),
        ('Author', ".//div[@class='reviewWriter']/strong//text()"),
        ('TestTitle', ".//div[@class='reviewContent']/h3/a//text()"),
    )

    containers = response.xpath("//ul[@class='reviewList']/li[@class='review']")
    for review_container in containers:
        review = ReviewItem()
        review['source_internal_id'] = page_product_id
        review['ProductName'] = page_product_name
        for field_name, field_xpath in review_field_xpaths:
            review[field_name] = self.extract(review_container.xpath(field_xpath))
        review['TestUrl'] = original_url

        # Use the (already normalised) product values when the product
        # item has them. If 'ProductName' is missing, neither field is
        # copied and the page-level values above are kept.
        try:
            review['ProductName'] = product['ProductName']
            review['source_internal_id'] = product['source_internal_id']
        except KeyError:
            pass

        awpic_link = review.get("AwardPic", "")
        if awpic_link:
            review["AwardPic"] = absolute_url(awpic_link)

        review_id = review.get("source_internal_id", "")
        if review_id:
            review["source_internal_id"] = numeric_id(review_id)

        review["SourceTestScale"] = "5"

        if review["TestDateText"]:
            review["TestDateText"] = date_format(review["TestDateText"], "%d %B %Y", ["nl"])

        review["DBaseCategoryName"] = "USER"

        yield review