def parse_question(self, response):
    """Extract a question item from a Zhihu question page.

    Two page layouts are handled: the new layout (detected by the
    ``QuestionHeader-title`` marker in the page source) and the legacy
    layout. They differ only in the CSS selectors used to locate each field.

    Args:
        response: The response for the question page; ``response.url`` and
            ``response.text`` are read, and the response is handed to the
            item loader for CSS extraction.

    Yields:
        The item produced by ``ItemLoader.load_item()``. Nothing is yielded
        when the numeric question id cannot be parsed from ``response.url``.
    """
    # Both layouts share the same URL scheme, so the id is parsed once up
    # front. Without an id there is nothing useful to load, so bail out
    # instead of failing later on an unbound ``question_id``.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))

    if "QuestionHeader-title" in response.text:
        # New page layout.
        selectors = {
            "title": "h1.QuestionHeader-title::text",
            "content": ".QuestionHeader-detail",
            "answer_num": ".List-headerText span::text",
            # Class selector: the previous value had no leading dot and the
            # wrong capitalisation, so it could never match an element.
            # NOTE(review): confirm the class name against the live markup.
            "comments_num": ".QuestionHeader-actions button::text",
            "watch_user_num": ".NumberBoard-value::text",
            "topics": ".QuestionHeader-topics .Popover::text",
        }
    else:
        # Legacy page layout.
        selectors = {
            "title": ".zh-question-title h2 a::text",
            "content": "#zh-question-detail",
            "answer_num": ".List-headerText span::text",
            "comments_num": "#zh-question-meta-wrap a[name='addcomment']::text",
            "watch_user_num": "#zh-question-side-header-wrap::text",
            "topics": ".zm-tag-editor-labels a::text",
        }

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", selectors["title"])
    item_loader.add_css("content", selectors["content"])
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    for field in ("answer_num", "comments_num", "watch_user_num", "topics"):
        item_loader.add_css(field, selectors[field])

    # Hand the item back to the caller; previously it was built and then
    # silently discarded.
    yield item_loader.load_item()