def parse_question(self, response):
    """Extract a question item from a question page and request its answers.

    Two page layouts are supported. The layout is chosen by whether the
    marker string ``'QusetionHeader-title'`` occurs in the response body, and
    each layout has its own set of CSS/XPath selectors. The numeric question
    id is parsed from ``response.url``.

    NOTE(review): several selectors (and the layout marker itself) spell
    ``Qusetion``/``qusetion``. This looks like a typo for ``Question``;
    confirm against the live markup before changing them, since they are
    runtime strings.

    Args:
        response: The downloaded question page; ``response.text`` and
            ``response.url`` are read, and it is handed to the item loader.

    Yields:
        A request for the question's answers (callback
        ``self.parse_answer``), followed by the loaded question item.
        Nothing is yielded when the URL carries no question id.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # Raw string avoids the invalid "\d" escape; the dot is escaped so that
    # only a literal "zhihu.com" host matches.
    match_obj = re.match(r'(.*zhihu\.com/question/(\d+))(/|$).*', response.url)
    if not match_obj:
        # Previously this path fell through and crashed with an unbound
        # ``question_id``; skip the page instead.
        logging.getLogger(__name__).warning(
            'Skipping %s: no question id found in URL', response.url)
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
    if 'QusetionHeader-title' in response.text:
        # Layout identified by the header marker.
        item_loader.add_css('title', 'h1.QusetionHeader-title::text')
        item_loader.add_css('content', '.QusetionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num',
                            '.QusetionHeader-actions button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_css('topics', '.NumberBoard-topics .Popover div::text')
    else:
        # Fallback layout. The title text may sit in either an <a> or a
        # <span> under the heading, hence the XPath union.
        item_loader.add_xpath(
            'title',
            '//*[@id="zh-qusetion-title"]/h2/a/text() |'
            ' //*[@id="zh-qusetion-title"]/h2/span/text()')
        item_loader.add_css('content', '#zh-qusetion-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '#zh-question-answer-num::text')
        item_loader.add_css(
            'comments_num',
            '#zh-question-meta-wrap a[name="addcomment"]::text')
        # The follower count is looked up in two alternative places.
        item_loader.add_xpath(
            'watch_user_num',
            '//*[@id="zh-question-side-header-wrap"]/text()|'
            '//*[@class="zh-question-followers-sidebar"]/div/a/strong/text()')
        item_loader.add_css('topics', '.zm-tag-editor-labels a::text')
    question_item = item_loader.load_item()

    # The meaning of (20, 0) depends on ``start_answer_url``; presumably page
    # size and offset -- TODO confirm.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Handle a question page: extract the question item and request answers.

    Only pages whose body contains the marker ``"QuestionHeader-title"`` are
    parsed. The numeric question id is taken from ``response.url``.

    Args:
        response: The downloaded question page; ``response.text`` and
            ``response.url`` are read, and it is handed to the item loader.

    Yields:
        A request for the question's answers (callback
        ``self.parse_answer``), followed by the loaded question item.
        Nothing is yielded when the marker is absent or the URL carries no
        question id.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    logger = logging.getLogger(__name__)

    if "QuestionHeader-title" not in response.text:
        # No selectors exist for other layouts; bail out rather than hit
        # unbound locals further down.
        logger.debug("Skipping %s: question header marker not found", response.url)
        return

    # Raw string avoids the invalid "\d" escape; the dot is escaped so that
    # only a literal "zhihu.com" host matches.
    match_obj = re.match(r"(.*zhihu\.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        logger.warning("Skipping %s: no question id found in URL", response.url)
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    # The meaning of (20, 0) depends on ``start_answer_url``; presumably page
    # size and offset -- TODO confirm.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.header,
        callback=self.parse_answer,
    )
    yield question_item
def parse_question(self, response):
    """Build the question item for this page, emit it, then request answers.

    The question id is read from ``response.meta`` (defaulting to 0 when the
    key is missing); title, detail and tags are scraped with CSS selectors.

    Yields:
        The loaded question item, followed by a request for the question's
        answers (callback ``self.parse_answers``).
    """
    loader = ArticleItemLoader(item=ZhiHuQuestionItem(), response=response)
    question_id = response.meta.get('question_id', 0)

    loader.add_css('title', "h1.QuestionHeader-title::text")
    loader.add_value("question_id", question_id)
    loader.add_css("question_detail", ".QuestionHeader-detail")
    loader.add_css("tags", ".Tag-content .Popover div::text")
    # Follower-count extraction is currently disabled:
    # loader.add_css("follow_nums", ".QuestionFollowStatus .NumberBoard-itemValue")

    yield loader.load_item()

    answers_url = self.query_next_answer_url.format(question_id, 20, 0)
    yield Request(
        answers_url,
        headers=self.headers,
        callback=self.parse_answers,
    )
def parse_question(self, response):
    """Extract a question item from a question page and request its answers.

    The question id is the run of digits at the end of ``response.url``; a
    single trailing slash is tolerated. The remaining fields are scraped
    with CSS selectors.

    Args:
        response: The downloaded question page; ``response.url`` is read and
            the response is handed to the item loader.

    Yields:
        The loaded question item, followed by a request for the question's
        answers (callback ``self.parse_answers``). Nothing is yielded when
        the URL does not end in a numeric id.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # The old pattern '.*?(\d+)$|/' had a stray "|/" alternative: a URL with
    # a trailing slash produced no match (AttributeError on ``.group``), and
    # a URL starting with "/" matched with ``group(1) is None``.
    match_obj = re.match(r'.*?(\d+)/?$', response.url)
    if not match_obj:
        logging.getLogger(__name__).warning(
            'Skipping %s: no question id found in URL', response.url)
        return
    question_id = match_obj.group(1)

    item_loader = ZhiHuQuestionItemLoader(item=ZhiHuQuestionItem(),
                                          response=response)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.QuestionTopic ::text')
    item_loader.add_css('comment_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('content', '.QuestionHeader-detail')
    item_loader.add_css('answers_num', '.List-headerText span::text')
    item_loader.add_value('question_id', question_id)

    # The first two NumberBoard values are stored as watch_num and click_num,
    # in that order. zip() stops at the shorter sequence, so a page showing
    # fewer than two counters no longer raises IndexError.
    board_values = response.css(
        'strong.NumberBoard-itemValue::attr("title")').extract()
    for field_name, value in zip(('watch_num', 'click_num'), board_values):
        item_loader.add_value(field_name, value)

    zhihu_question_item = item_loader.load_item()
    yield zhihu_question_item
    yield scrapy.Request(self.answer_api_url.format(question_id),
                         headers=self.headers,
                         callback=self.parse_answers)