def enrich_data(self, item_loader, response):
    """Load the subject page's fields and collect its follow-up links.

    Adds ``id`` (extracted from ``response.url``), ``title``, ``info``,
    ``score``, ``recommendations`` and ``description`` to ``item_loader``.

    Args:
        item_loader: Loader that receives the fields; it is also attached to
            every returned tuple.
        response: Response of the subject page.

    Returns:
        A list of ``(name, item_loader, {"url": ...})`` tuples in the order
        celebrities, related_pics, comments, question_list, review_list.
        A link that is missing from the page contributes no tuple.  Every
        URL is resolved with ``response.urljoin`` so relative hrefs become
        absolute while absolute hrefs are kept as they are.
    """
    self.logger.debug("Start to enrich_data. ")

    item_loader.add_value("id", response.url, re=r"subject/(\d+)/")
    item_loader.add_xpath("title", '//h1/span/text()')
    item_loader.add_xpath("info", '//div[@id="info"]')
    item_loader.add_xpath("score", '//strong[@class="ll rating_num"]/text()')
    item_loader.add_xpath(
        "recommendations",
        '//div[@class="recommendations-bd"]/dl/dd/a/text()')
    item_loader.add_xpath("description",
                          '//div[@id="link-report"]/span/text()')

    nodes = []

    # NOTE(review): xpath_exchange is defined outside this block; it is
    # assumed to return a falsy value when the selector matches nothing.
    celebrities_url = xpath_exchange(
        response.xpath('//div[@id="celebrities"]/h2/span/a/@href'))
    if celebrities_url:
        nodes.append(("celebrities", item_loader, {
            "url": response.urljoin(celebrities_url)
        }))

    # Only the second-to-last link of the header is followed; with fewer
    # than two links nothing is queued.
    related_pic_urls = response.xpath(
        '//div[@id="related-pic"]/h2/span[@class="pl"]/a/@href').extract()
    related_pic_url = (
        related_pic_urls[-2] if len(related_pic_urls) >= 2 else "")
    if related_pic_url:
        nodes.append(("related_pics", item_loader, {
            "url": response.urljoin(related_pic_url)
        }))

    comments_url = xpath_exchange(
        response.xpath('//div[@id="comments-section"]//h2/span/a/@href'))
    if comments_url:
        nodes.append(("comments", item_loader, {
            "url": response.urljoin(comments_url)
        }))

    question_list_url = xpath_exchange(
        response.xpath('//div[@id="askmatrix"]/div/h2/span/a/@href'))
    if question_list_url:
        nodes.append(("question_list", item_loader, {
            "url": response.urljoin(question_list_url)
        }))

    review_list_url = xpath_exchange(
        response.xpath(
            '//section[@class="reviews mod movie-content"]//h2/span/a/@href'
        ))
    if review_list_url:
        # urljoin drops the page URL's query string itself and also copes
        # with absolute or root-relative hrefs, which plain string
        # concatenation onto the page URL would mangle.
        nodes.append(("review_list", item_loader, {
            "url": response.urljoin(review_list_url)
        }))

    return nodes
def enrich_review_list(self, item_loader, response):
    """Collect the next review-list page and the reviews linked from this one.

    Args:
        item_loader: Loader attached to the next-page tuple.
        response: Response of a review-list page.

    Returns:
        A list of tuples.  If a ``//span[@class="next"]/a`` link exists, the
        first entry is ``("review_list", item_loader, {"url": ...})`` with
        the href resolved against the response.  It is followed by one
        ``("reviews", <new CustomLoader for a ReviewItem>, {"url": ...})``
        entry per review item found on the page.
    """
    self.logger.debug("Start to enrich_review_list. ")

    pending = []

    next_href = xpath_exchange(
        response.xpath('//span[@class="next"]/a/@href'))
    if next_href:
        pending.append(
            ("review_list", item_loader,
             {"url": response.urljoin(next_href)}))

    review_rows = response.xpath(
        '//div[@class="review-list"]/div/div[@class="main review-item"]')
    # Each review gets its own fresh loader; the href is passed on as found.
    pending.extend(
        ("reviews",
         CustomLoader(item=ReviewItem()),
         {"url": xpath_exchange(row.xpath('header/h3/a/@href'))})
        for row in review_rows
    )
    return pending
def enrich_question_list(self, item_loader, response):
    """Collect the next question-list page and the questions on this one.

    Args:
        item_loader: Loader attached to the next-page tuple.
        response: Response of a question-list page.

    Returns:
        A list of tuples.  If a ``//span[@class="next"]/a`` link exists, the
        first entry is ``("question_list", item_loader, {"url": ...})`` with
        the href resolved against the response.  It is followed by one
        ``("questions", <new CustomLoader for a QuestionItem>, {"url": ...})``
        entry per question item found on the page.
    """
    self.logger.debug("Start to enrich_question_list. ")

    pending = []

    next_href = xpath_exchange(
        response.xpath('//span[@class="next"]/a/@href'))
    if next_href:
        pending.append(
            ("question_list", item_loader,
             {"url": response.urljoin(next_href)}))

    question_rows = response.xpath(
        '//div[@class="questions"]/div[@class="item"]')
    # Each question gets its own fresh loader; the href is passed on as found.
    pending.extend(
        ("questions",
         CustomLoader(item=QuestionItem()),
         {"url": xpath_exchange(row.xpath('h3/a/@href'))})
        for row in question_rows
    )
    return pending
def enrich_related_pics(self, item_loader, response):
    """Group the page's image URLs by section heading and load them.

    For every ``div.mod`` section under ``div.article`` the key is the
    result of ``re_search(r"(\\w+)", ...)`` applied to the section's ``h2``
    text, and the value is the list of ``img/@src`` values in that section.
    A later section with the same key replaces the earlier one.  The
    resulting dict is added to ``item_loader`` as ``related_pics``.

    Args:
        item_loader: Loader that receives the ``related_pics`` value.
        response: Response of the related-pictures page.
    """
    self.logger.debug("Start to enrich_related_pics. ")

    pics_by_heading = {}
    for section in response.xpath(
            '//div[@class="article"]/div[@class="mod"]'):
        image_urls = section.xpath(
            'div[@class="bd"]/ul/li/a/img/@src').extract()
        heading = xpath_exchange(section.xpath('div[@class="hd"]/h2/text()'))
        pics_by_heading[re_search(r"(\w+)", heading)] = image_urls

    item_loader.add_value("related_pics", pics_by_heading)
def enrich_reviews(self, item_loader, response):
    """Load a single review page: its own fields plus the comments under it.

    Adds ``title``, ``content``, ``author``, ``datetime``, ``score``,
    ``upvotes`` and ``downvotes`` via xpath, then adds ``comments`` as a
    list of dicts with the keys ``author``, ``datetime``, ``content``,
    ``reply-quote`` and ``reply`` (one dict per comment item).

    Args:
        item_loader: Loader that receives all values.
        response: Response of the review page.
    """
    self.logger.debug("Start to enrich_reviews. ")

    header = '//div[@class="article"]/div/div/header'
    vote_buttons = '//div[@class="main-ft"]/div/div'

    item_loader.add_xpath("title", '//h1/span/text()')
    item_loader.add_xpath("content", '//div[@id="link-report"]/div')
    item_loader.add_xpath("author", header + '/a/span/text()')
    item_loader.add_xpath(
        "datetime", header + '/span[@class="main-meta"]/text()')
    item_loader.add_xpath(
        "score",
        header + '/span[contains(@class, "rating")]/@class',
        re=r"allstar(\d+)")
    item_loader.add_xpath(
        "upvotes", vote_buttons + '/button[1]/text()', re=r"(\d+)")
    item_loader.add_xpath(
        "downvotes", vote_buttons + '/button[2]/text()', re=r"(\d+)")

    # (dict key, xpath relative to one comment item); order fixes key order.
    comment_fields = (
        ("author", 'div//div[@class="header"]/a/text()'),
        ("datetime", 'div//div[@class="header"]/span/text()'),
        ("content", 'div/p/text()'),
        ("reply-quote",
         'div/div[@class="reply-quote"]/span[@class="all"]/text()'),
        ("reply", 'div/div[@class="reply-quote"]/span/a/text()'),
    )

    parsed_comments = []
    for node in response.xpath(
            '//div[@id="comments"]/div[@class="comment-item"]'):
        parsed_comments.append({
            key: xpath_exchange(node.xpath(path))
            for key, path in comment_fields
        })

    item_loader.add_value("comments", parsed_comments)
def enrich_celebrities(self, item_loader, response):
    """Load the cast/crew listing as a list of dicts under ``celebrities``.

    Each ``div.list-wrapper`` group supplies a ``position`` (its ``h2``
    text); each ``li`` inside it yields one dict with ``position``,
    ``name`` and ``representative`` (list of linked work titles).  A
    ``role`` key is present only when the entry has role text, holding the
    result of ``re_search(r"饰 (.*)", ...)`` on that text.

    Args:
        item_loader: Loader that receives the ``celebrities`` value.
        response: Response of the celebrities page.
    """
    self.logger.debug("Start to enrich_celebrities. ")

    people = []
    groups = response.xpath(
        '//div[@id="celebrities"]/div[@class="list-wrapper"]')
    for group in groups:
        heading = xpath_exchange(group.xpath("h2/text()"))
        for entry in group.xpath('ul/li'):
            person = {}
            role_text = xpath_exchange(
                entry.xpath('div/span[@class="role"]/text()'))
            if role_text:
                person["role"] = re_search(r"饰 (.*)", role_text)
            # Keyword order keeps the keys in position/name/representative
            # order after the optional role.
            person.update(
                position=heading,
                name=xpath_exchange(
                    entry.xpath('div/span[@class="name"]/a/text()')),
                representative=entry.xpath(
                    'div/span[@class="works"]/a/text()').extract(),
            )
            people.append(person)

    item_loader.add_value("celebrities", people)
def enrich_comments(self, item_loader, response):
    """Load one page of short comments and point at the next page, if any.

    Adds ``comments`` to ``item_loader`` as a list of dicts with the keys
    ``author``, ``upvotes``, ``score``, ``datetime`` and ``content``.
    ``score`` is the number captured from an ``allstarNN`` rating class
    divided by 5, or 0 divided by 5 when ``re_search`` yields a falsy value.

    Args:
        item_loader: Loader that receives the comments; also attached to the
            next-page tuple.
        response: Response of a comments page.

    Returns:
        ``[("comments", item_loader, {"url": ...})]`` with the paginator's
        ``next`` href resolved against the response, or ``None`` when there
        is no such link.
    """
    self.logger.debug("Start to enrich_comments. ")

    parsed = []
    for node in response.xpath(
            '//div[@id="comments"]/div[@class="comment-item"]'):
        author = xpath_exchange(node.xpath('div/a/@title'))
        upvotes = xpath_exchange(
            node.xpath('div/h3/span/span[@class="votes"]/text()'))
        rating_class = xpath_exchange(
            node.xpath(
                'div/h3/span[@class="comment-info"]/span[contains(@class, "rating")]/@class'
            ))
        stars = re_search(r"allstar(\d+)", rating_class)
        score = int(stars or 0) / 5
        # The trailing space inside "comment-time " is part of the matched
        # class attribute value and must be kept.
        posted_at = xpath_exchange(
            node.xpath('div/h3/span/span[@class="comment-time "]/@title'))
        body = xpath_exchange(node.xpath('div/p/text()'))
        parsed.append({
            "author": author,
            "upvotes": upvotes,
            "score": score,
            "datetime": posted_at,
            "content": body,
        })

    item_loader.add_value("comments", parsed)

    next_href = xpath_exchange(
        response.xpath('//div[@id="paginator"]/a[@class="next"]/@href'))
    if not next_href:
        return None
    return [("comments", item_loader, {"url": response.urljoin(next_href)})]