Пример #1
0
    def parse_question(self, response):
        """Build a question item from a Zhihu question page and queue its answers.

        The question id is taken from ``response.url``. The page is scraped
        with one of two selector sets: the ``QuestionHeader-*`` selectors when
        the HTML contains the ``QuestionHeader-title`` marker, and the
        ``zh-question-*`` selectors otherwise.

        Yields:
            A ``scrapy.Request`` for the question's answers (handled by
            ``self.parse_answer``), followed by the loaded question item.
            Nothing is yielded when the URL carries no question id.
        """
        # Resolve the id before doing any scraping: both the item and the
        # answers request depend on it, so without it there is nothing to emit.
        match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*',
                             response.url)
        if match_obj is None:
            # NOTE(review): relies on the standard scrapy.Spider ``logger``.
            self.logger.warning('No question id found in URL %s, skipping',
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
        if 'QuestionHeader-title' in response.text:
            item_loader.add_css('title', 'h1.QuestionHeader-title::text')
            item_loader.add_css('content', '.QuestionHeader-detail')
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', question_id)
            item_loader.add_css('answer_num', '.List-headerText span::text')
            item_loader.add_css('comments_num',
                                '.QuestionHeader-actions button::text')
            item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
            item_loader.add_css('topics',
                                '.NumberBoard-topics .Popover div::text')
        else:
            # The title text may sit in either an <a> or a <span> under the
            # heading, so both paths are queried.
            item_loader.add_xpath(
                'title', '//*[@id="zh-question-title"]/h2/a/text() |'
                ' //*[@id="zh-question-title"]/h2/span/text()')
            item_loader.add_css('content', '#zh-question-detail')
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', question_id)
            item_loader.add_css('answer_num', '#zh-question-answer-num::text')
            item_loader.add_css(
                'comments_num',
                '#zh-question-meta-wrap a[name="addcomment"]::text')
            item_loader.add_xpath(
                'watch_user_num',
                '//*[@id="zh-question-side-header-wrap"]/text()|'
                '//*[@class="zh-question-followers-sidebar"]/div/a/strong/text()'
            )
            item_loader.add_css('topics', '.zm-tag-editor-labels a::text')

        question_item = item_loader.load_item()

        # 20 and 0 fill the remaining placeholders of start_answer_url
        # (presumably page size and offset -- confirm against the template).
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Пример #2
0
 def parse_question(self, response):
     """Handle a question page: extract the question item from it.

     The question id is parsed from ``response.url``; the remaining fields
     are scraped with the ``QuestionHeader-*`` selectors, which only apply
     when the HTML contains the ``QuestionHeader-title`` marker.

     Yields:
         A ``scrapy.Request`` for the question's answers (handled by
         ``self.parse_answer``), followed by the loaded question item.
         Nothing is yielded when the URL carries no question id or the
         page lacks the expected header marker.
     """
     # NOTE(review): the warnings rely on the standard scrapy.Spider
     # ``logger`` attribute.
     match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                          response.url)
     if match_obj is None:
         self.logger.warning("No question id found in URL %s, skipping",
                             response.url)
         return
     if "QuestionHeader-title" not in response.text:
         # Without the marker none of the selectors below apply, so no
         # item can be built for this page.
         self.logger.warning("Unrecognised question page layout at %s, "
                             "skipping", response.url)
         return
     question_id = int(match_obj.group(2))

     item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
     item_loader.add_css("title", "h1.QuestionHeader-title::text")
     item_loader.add_css("content", ".QuestionHeader-detail")
     item_loader.add_value("url", response.url)
     item_loader.add_value("zhihu_id", question_id)
     item_loader.add_css("answer_num", ".List-headerText span::text")
     item_loader.add_css("comments_num",
                         ".QuestionHeader-Comment button::text")
     item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
     item_loader.add_css("topics",
                         ".QuestionHeader-topics .Popover div::text")
     question_item = item_loader.load_item()

     # 20 and 0 fill the remaining placeholders of start_answer_url
     # (presumably page size and offset -- confirm against the template).
     yield scrapy.Request(
         self.start_answer_url.format(question_id, 20, 0),
         headers=self.header,
         callback=self.parse_answer,
     )
     yield question_item
Пример #3
0
 def parse_question(self, response):
     """Emit the question item for a page, then request its answers.

     The question id comes from ``response.meta`` (0 when the key is
     missing); title, detail and tags are scraped from the page with CSS
     selectors.

     Yields:
         The loaded question item first, then a ``Request`` for the
         answers, handled by ``self.parse_answers``.
     """
     loader = ArticleItemLoader(
         item=ZhiHuQuestionItem(),
         response=response,
     )
     question_id = response.meta.get('question_id', 0)

     loader.add_css('title', 'h1.QuestionHeader-title::text')
     loader.add_value('question_id', question_id)
     loader.add_css('question_detail', '.QuestionHeader-detail')
     loader.add_css('tags', '.Tag-content .Popover div::text')
     # follow_nums is not collected at the moment; the selector once used
     # for it was ".QuestionFollowStatus .NumberBoard-itemValue".
     question_item = loader.load_item()
     yield question_item

     # 20 and 0 fill the remaining placeholders of the URL template
     # (presumably page size and offset -- confirm against the template).
     answers_url = self.query_next_answer_url.format(question_id, 20, 0)
     yield Request(
         answers_url,
         headers=self.headers,
         callback=self.parse_answers,
     )
Пример #4
0
    def parse_question(self, response):
        """Build a question item from a question page and queue its answers.

        The question id is the run of digits that ends ``response.url``; a
        single trailing slash after the digits is tolerated. The other fields
        are scraped from the page with CSS selectors.

        Yields:
            The loaded question item, followed by a ``scrapy.Request`` for
            the answers API (handled by ``self.parse_answers``). Nothing is
            yielded when the URL does not end in a numeric id.
        """
        # Resolve the id first: the item and the answers request both need
        # it, so a URL without one leaves nothing to emit.
        match_obj = re.search(r'(\d+)/?$', response.url)
        if match_obj is None:
            # NOTE(review): relies on the standard scrapy.Spider ``logger``.
            self.logger.warning('No question id found in URL %s, skipping',
                                response.url)
            return
        question_id = match_obj.group(1)

        item_loader = ZhiHuQuestionItemLoader(item=ZhiHuQuestionItem(),
                                              response=response)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('topics', '.QuestionTopic ::text')
        item_loader.add_css('comment_num',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_css('answers_num', '.List-headerText span::text')
        item_loader.add_value('question_id', question_id)

        # The first two number-board values are used as watch_num and
        # click_num, in that order. zip() stops at the shorter sequence, so
        # a page exposing fewer values just leaves the later field unset
        # instead of raising IndexError.
        board_values = response.css(
            'strong.NumberBoard-itemValue::attr("title")').extract()
        for field_name, value in zip(('watch_num', 'click_num'),
                                     board_values):
            item_loader.add_value(field_name, value)

        zhihu_question_item = item_loader.load_item()

        yield zhihu_question_item
        yield scrapy.Request(self.answer_api_url.format(question_id),
                             headers=self.headers,
                             callback=self.parse_answers)