Example No. 1
  def parse_question(self, response):
    # Handle the question page and extract the concrete question item from it
    question_id = int(re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url).group(2))
    if "QuestionHeader-title" in response.text:
      item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
      item_loader.add_css("title", "h1.QuestionHeader-title::text")
      item_loader.add_css("content", ".QuestionHeader-detail")
      item_loader.add_value("url", response.url)
      item_loader.add_value("zhihu_id", question_id)
      item_loader.add_css("answer_num", ".List-headerText span::text")
      item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
      item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
      item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

      question_item = item_loader.load_item()
    else:
      # Item extraction for the old-style page layout
      item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
      # item_loader.add_css("title", ".zh-question-title h2 a::text")
      item_loader.add_xpath("title",
                            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
      item_loader.add_css("content", "#zh-question-detail")
      item_loader.add_value("url", response.url)
      item_loader.add_value("zhihu_id", question_id)
      item_loader.add_css("answer_num", "#zh-question-answer-num::text")
      item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
      # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
      item_loader.add_xpath("watch_user_num",
                            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
      item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

      question_item = item_loader.load_item()
    yield question_item
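
The regular expression above (reused by most of the examples below) is what pulls the numeric question id out of the page URL. A minimal standalone check of what it matches, with illustrative sample URLs:

    import re

    QUESTION_URL_RE = re.compile(r"(.*zhihu.com/question/(\d+))(/|$).*")

    for url in ("https://www.zhihu.com/question/12345678",
                "https://www.zhihu.com/question/12345678/answer/111"):
        match_obj = QUESTION_URL_RE.match(url)
        if match_obj:
            # group(2) is the run of digits right after /question/
            print(int(match_obj.group(2)))  # 12345678 for both URLs
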
Example No. 2
    def parse_question(self, response):
        """
            处理question页面,从页面中提取出具体的question item
            这里其实也可以提取到url,但是为了逻辑更清晰,不加
        """
        question_id = ''
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        if "QuestionHeader-title" in response.text:
            # New page layout
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # Old page layout
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        # Request the answers
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)

        # Whatever is yielded: an Item is handed to the pipelines; a Request gets its page downloaded and passed to the callback
        yield question_item
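
Every example formats self.start_answer_url with the question id, a page size and an offset before requesting parse_answer, but the attribute itself is not shown in this listing. A plausible sketch of how it and the request headers might be declared on the spider (class name, URL query parameters and header values are illustrative assumptions):

    import scrapy

    class ZhihuSpider(scrapy.Spider):
        name = "zhihu"
        # assumed URL template: Zhihu's JSON answers API, filled in via
        # str.format(question_id, limit, offset) as in the examples above
        start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                            "?limit={1}&offset={2}")
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": "https://www.zhihu.com",
        }
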
Example No. 3
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item

        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(
                    match_obj.group(2))  # question_id is an int column in the database, so convert it before storing

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content",
                                ".QuestionHeader-detail")  # 取html内容,不用伪类选择器
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num",
                                ".NumberBoard-value::text")  # return a list
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text"
                                )  # descendant search: a space finds the div at any depth below .Popover
            # item_loader.add_xpath("topics", "//*[@id='root']/div/main/div/meta[3]")    # 提取的整个标签, 可以考虑以后用正则表达式处理
            question_item = item_loader.load_item()

        else:
            # Item extraction for the old Zhihu page layout -- in practice every page should be the new layout by now
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(
                    match_obj.group(2))  # question_id is an int column in the database, so convert it before storing
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_css("content",
                                "#zh-question-detail")  # #zh中#表示取的id
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_css(
                "watch_user_num",
                "#zh-question-side-header-wrap::text")  # return a list
            item_loader.add_css("topics",
                                ".zm-tag-editor-labels a::text")  # 子带元素中寻找元素

            question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item  # comment this line out when debugging the answer parsing
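
Several comments in these examples point out that add_css/add_value collect lists and that zhihu_id must end up as an int; whether a single value is actually stored depends on the processors defined in items.py, which this listing does not include. A minimal sketch of what such definitions could look like, assuming a recent Scrapy where the processors live in itemloaders.processors (field set and processors are illustrative):

    import scrapy
    from itemloaders.processors import Join, TakeFirst
    from scrapy.loader import ItemLoader

    class ZhihuQuestionItem(scrapy.Item):
        # field names mirror the loaders above; the processors are illustrative
        zhihu_id = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        url = scrapy.Field()
        answer_num = scrapy.Field()
        comments_num = scrapy.Field()
        watch_user_num = scrapy.Field()
        topics = scrapy.Field(output_processor=Join(","))

    class ZhihuQuestionItemLoader(ItemLoader):
        # collapse the collected lists to their first value by default
        default_output_processor = TakeFirst()
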
Example No. 4
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item
        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                "Questionheader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover::text")

            question_item = item_loader.load_item()

        else:
            # Old Zhihu page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id=zh-question-side-header-wrap]/text()|//*[@class=zh-question-followers-sidebar]/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Example No. 5
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item

        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                "Questionheader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover::text")

            question_item = item_loader.load_item()

        else:
            # Old Zhihu page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_css("watch_user_num",
                                "#zh-question-side-header-wrap::text")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
        yield question_item
Example No. 6
    def parse_question(self, response):
        """
        提取知乎问题相应字段
        """
        question_id = int(response.meta.get("question_id", ""))
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", ".QuestionHeader-content h1::text")
        item_loader.add_css("content", ".QuestionHeader-detail span::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css(
            "watcher_num",
            ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-value::text"
        )
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css(
            "click_num",
            ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-value::text"
        )
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()

        # Pass in the question id, the number of answers per request, and the offset of the first request
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
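
Example No. 6 reads question_id from response.meta instead of re-parsing the URL, which only works if the request that scheduled the question page carried the id along. A minimal sketch of such a caller, assuming a parse() method shaped roughly like the following (the link-extraction details are illustrative):

    def parse(self, response):
        # assumed caller: schedule each question page and carry the id in meta
        # (re is assumed to be imported at module level)
        for href in response.css("a[href*='/question/']::attr(href)").getall():
            match_obj = re.match(r".*?/question/(\d+)", href)
            if match_obj:
                yield response.follow(href,
                                      meta={"question_id": int(match_obj.group(1))},
                                      callback=self.parse_question)
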
Example No. 7
    def parse_question(self, response):
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        question_id = ''
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css(
            "watch_user_num",
            ".QuestionFollowStatus-counts button div strong::text")
        item_loader.add_css(
            "follow_user_num",
            ".QuestionFollowStatus-counts div div strong::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             cookies=self.cookies,
                             headers=zhihuLogin.headers,
                             callback=self.parse_answer)
        yield question_item
Example No. 8
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item
        question_id = response.meta.get('question_id')
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        item_loader.add_xpath(
            'title',
            "//div[@class='QuestionHeader']//h1[@class='QuestionHeader-title']/text()"
        )
        # item_loader.add_xpath('content', "//div[@class='QuestionHeader-detail']")
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_xpath('answer_num',
                              "//h4[@class='List-headerText']/span/text()")
        item_loader.add_xpath(
            'comments_num',
            "//div[@class='QuestionHeader-Comment']/button/text()")
        item_loader.add_xpath('watch_user_num',
                              "//div[@class='NumberBoard-value']/text()")
        item_loader.add_xpath('topics',
                              "//a[@class='TopicLink']/div/div/text()")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 0),
                             callback=self.parse_answer)
        yield question_item
Example No. 9
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item
        if "QuestionHeader-title" in response.text:
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Example No. 10
 def parse_question(self, response):
     match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                          response.url)
     if match_obj:
         question_id = int(match_obj.group(2))
         item_loader = ZhihuQuestionItemLoader(item=ZhihuQuestionItem(),
                                               response=response)
         item_loader.add_css("title", "h1.QuestionHeader-title::text")
         item_loader.add_css("content", ".QuestionHeader-detail")
         item_loader.add_value('url', response.url)
         item_loader.add_value("id", question_id)
         item_loader.add_css('answer_num', ".List-headerText span::text")
         item_loader.add_css('comments_num',
                             ".QuestionHeader-Comment button::text")
         item_loader.add_css('follow_num', ".NumberBoard-itemValue::text")
         item_loader.add_css('view_num',
                             ".NumberBoard-itemValue:nth-child(2)::text")
         item_loader.add_css('topics',
                             ".QuestionHeader-topics .Popover div::text")
         item_loader.add_value(
             'crawl_time',
             datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))
         question_item = item_loader.load_item()
         yield scrapy.Request(
             self.start_answer_url.format(question_id, 20, 0),
             meta={'cookiejar': response.meta['cookiejar']},
             headers=self.headers,
             callback=self.parse_answer)
         yield question_item
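
Example No. 10 stamps the item with crawl_time via datetime.datetime.now().strftime(SQL_DATETIME_FORMAT); that constant is defined elsewhere in its project (typically settings.py) and is not part of this listing. A likely definition, shown only as an assumption:

    # assumed settings.py constant matching the strftime() call above
    SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
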
Example No. 11
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item
        # Note: further URLs could be extracted and followed here as well (see self.parse()); omitted for brevity
        question_id = int(response.meta.get("question_id", ""))
        if "QuestionHeader-title" in response.text:
            # New page layout
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            # take the HTML, not just the text
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            # answer count, extract the text
            item_loader.add_css("answer_num", ".List-headerText span::text")
            # comment count, extract the text (selector slightly adjusted)
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            # follower count and view count extracted together (selector slightly adjusted)
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
            # topics: in CSS a space means descendant (">" means direct child); note there is another descendant div below .Popover
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # Old page layout, omitted
            raise NotImplementedError
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
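
The comments in Example No. 11 distinguish the descendant combinator (a space) from the child combinator (">"). A self-contained check of that difference with a stripped-down stand-in for the markup (the HTML below is illustrative, not real Zhihu markup):

    from scrapy.selector import Selector

    html = ('<div class="QuestionHeader-topics">'
            '<div class="Popover"><div class="wrap"><div>Python</div></div></div>'
            '</div>')
    sel = Selector(text=html)
    # descendant combinator: the inner div is found no matter how deep it sits
    print(sel.css(".QuestionHeader-topics .Popover div::text").get())    # 'Python'
    # child combinator: only the direct child of .Popover matches, and it has no text of its own
    print(sel.css(".QuestionHeader-topics .Popover > div::text").get())  # None
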
Example No. 12
    def parse_question(self, response):
        """
        1. 获取问题详情
        """

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        question_id = re.match("(.*zhihu.com/question/?(\d+)).*",
                               response.url).group(2)
        item_loader.add_value('question_id', question_id)
        item_loader.add_value('url', response.url)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css(
            'content', '.QuestionRichText.QuestionRichText--expandable span')
        item_loader.add_css(
            'topics', '.QuestionHeader-topics .Tag-content .Popover div::text')
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css(
            'follow_num',
            '.QuestionFollowStatus-counts .NumberBoard-itemValue::text')
        item_loader.add_css(
            'browse_num',
            '.QuestionFollowStatus-counts .NumberBoard-itemValue::text')
        item_loader.add_css('comment_num',
                            ".QuestionHeader-Comment button::text")

        question_item = item_loader.load_item()

        yield question_item
        yield scrapy.Request(self.answer_start_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
Example No. 13
    def parse_question(self, response):
        # Handle the question page and extract the concrete question item  # the item itself is defined in items.py
        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)  # match zhihu_id with a regex
            if match_obj:
                question_id = int(match_obj.group(2))  # zhihu_id is defined as an int column in the database

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)  #通过re匹配zhihu_id  #也可以用meta获得zhihu_id
            item_loader.add_css("answer_num", ".List-headerText span::text")
            #item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            #item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")

            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")  #没加>,只有空格,是指在后代节点中寻找  #item_loader.add_css("topics", ".QuestionHeader-topics > .Popover div::text")  #加>,是指在子节点中寻找

            question_item = item_loader.load_item()
        else:
            # Item extraction for the old-style page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)  # match zhihu_id with a regex
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text") #title可能放在a标签里,也可能放在span标签里
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")  #xpath有或
            item_loader.add_css("content", "#zh-question-detail")  # #取ID  .取class
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
            
        #pass

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)  # request 20 answers  # starting from offset 0
        yield question_item  # sent on to pipelines.py
Example No. 14
    def parse_questions(self, response):
        """
        解析某个 ‘问题’ 页面,提取数据,收集到item
        :param response: 
        :return: 
        """
        zhihu_question_item = ZhihuQuestionItem()

        # Extract the wanted fields from the response
        url = response.url
        zhihu_id = response.meta.get('zhihu_id')
        topic = ','.join(
            response.css(
                '.QuestionHeader-topics .Popover div::text').extract())
        title = response.css(
            '.QuestionHeader h1.QuestionHeader-title::text').extract_first('')
        content = response.css('.QuestionRichText span::text').extract_first(
            '')
        answer_num = int(
            response.css('.List-headerText span::text').extract_first(0))
        comments_num = int(
            response.css('.QuestionHeader-Comment button::text').re('(\d+)')
            [0])
        watch_user_num = int(
            response.css('.NumberBoard-value::text').re('(\d+)')[0])
        click_num = int(
            response.css('.NumberBoard-value::text').re('(\d+)')[1])

        # Fill the extracted values into the item
        zhihu_question_item['url'] = url
        zhihu_question_item['zhihu_id'] = zhihu_id
        zhihu_question_item['topic'] = topic
        zhihu_question_item['title'] = title
        zhihu_question_item['content'] = content
        zhihu_question_item['answer_num'] = answer_num
        zhihu_question_item['comments_num'] = comments_num
        zhihu_question_item['watch_user_num'] = watch_user_num
        zhihu_question_item['click_num'] = click_num

        # The question's answers still need to be parsed; trigger the initial answers request here with offset 0
        yield scrapy.Request(url=self.start_answers_url.format(
            zhihu_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answers)

        # yield the item so that it is passed on to the pipeline
        yield zhihu_question_item
        # Open question: several spiders all send items to the pipeline -- how are they told apart?

        # Strictly speaking, a full-site crawl should extract every URL on each page and handle it as in self.parse; omitted here to keep the code simple and readable, can be added at the end
Example No. 15
 def parse_question(self, response):
     question_id = response.meta.get('question_id')
     item_loader = ItemLoader(item=ZhihuQuestionItem(),response=response)
     item_loader.add_css('title','h1.QuestionHeader-title::text')
     item_loader.add_css('content','.QuestionHeader-detail')
     item_loader.add_value('url',response.url)
     item_loader.add_value('zhihu_id', question_id)
     item_loader.add_css('answer_num','.List-headerText span::text')
     item_loader.add_css('comments_num','.QuestionHeaderActions .QuestionHeader-Comment button::text')
     item_loader.add_css('watch_user_num','.NumberBoard-itemValue::text')
     item_loader.add_css('topics','.QuestionHeader-topics .Popover div::text')
     question_item = item_loader.load_item()
     yield scrapy.Request(url=self.start_answer_url.format(question_id,20,0),headers=self.header,callback=self.parse_answer)
     yield question_item
Example No. 16
    def parse_question(self, response):
        item_loader = ZhihuItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', '.QuestionHeader-main h1::text')
        item_loader.add_css('content', '.QuestionRichText span::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', response.meta.get('question_id'))
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment .Button::text')
        item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
        item_loader.add_css('watch_user_num', '.QuestionFollowStatus .Button .NumberBoard-value::text')
        item_loader.add_css('click_num', '.QuestionFollowStatus .NumberBoard-item .NumberBoard-value::text')
        item_loader.add_value('crawl_time', datetime.now())
        question_item = item_loader.load_item()

        yield question_item
Example No. 17
    def parse_question(self, response):
        '''
            parse question
        '''
        if response.status in self.handle_httpstatus_list:
            self.failed_urls.append(response.url)
            # Stats collection: when the response status is 403/404/500, increment the failed_url counter
            self.crawler.stats.inc_value("failed_url")

        question_item = ZhihuQuestionItem()
        question_id = int(response.meta.get("question_id"))
        title = response.css('.QuestionHeader-title::text').extract_first('')
        question_url = response.url
        topics = response.css(
            'meta[itemprop="keywords"]::attr(content)').extract()
        topics = '/'.join(topics)
        content = response.css(
            '.QuestionRichText--collapsed div span::text').extract_first('')
        answer_nums = response.css(
            '.List-headerText span::text').extract_first('')
        answer_nums = extract_nums(answer_nums)
        comment_nums = response.css(
            '.QuestionHeader-Comment button::text').extract_first('')
        comment_nums = extract_nums(comment_nums)
        watch_user_nums = response.css(
            '.NumberBoard-itemValue::text').extract_first('')
        watch_user_nums = extract_nums(watch_user_nums)
        click_nums = response.css('.NumberBoard-itemValue::text').extract()[1]
        click_nums = extract_nums(click_nums)
        crawl_time = datetime.datetime.now()

        question_item["question_id"] = question_id
        question_item["topics"] = topics
        question_item["question_url"] = question_url
        question_item["title"] = title
        question_item["content"] = content
        question_item["answer_nums"] = answer_nums
        question_item["comment_nums"] = comment_nums
        question_item["watch_user_nums"] = watch_user_nums
        question_item["click_nums"] = click_nums
        question_item["crawl_time"] = crawl_time

        yield question_item
        yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0),
                             callback=self.parse_answer)
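
Example No. 17 relies on an extract_nums() helper defined elsewhere in that project; judging by how it is applied to the answer, comment and watcher counters, it presumably strips the digits out of text such as "1,234 个回答". A minimal sketch of such a helper (an assumption, not the project's actual code):

    import re

    def extract_nums(text):
        # pull the first run of digits (commas allowed) out of the text, default to 0
        match_obj = re.search(r"(\d[\d,]*)", text or "")
        return int(match_obj.group(1).replace(",", "")) if match_obj else 0

    print(extract_nums("1,234 个回答"))  # 1234
    print(extract_nums(""))             # 0
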
Example No. 18
    def parse_question(self, response):
        zhihu_id = response.meta.get('question_id', '')

        # Load the item with an ItemLoader
        item_loader = ItemLoader(ZhihuQuestionItem(), response=response)
        item_loader.add_value('zhihu_id', zhihu_id)
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('title', '.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        # if 'answer' in response.url:
        #     item_loader.add_css('answer_num', '.Question-mainColumn a.QuestionMainAction::text')
        # else:
        #     item_loader.add_css('answer_num', '.QuestionAnswers-answers .List-headerText span::text')
        # The value appears in two different layouts; XPath's "|" (or) operator covers both
        item_loader.add_xpath(
            'answer_num',
            '//*[@class="List-headerText"]/span/text()|//a[@class="QuestionMainAction"]/text()'
        )
        item_loader.add_css('comments_num',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css(
            'watch_user_num',
            '.QuestionFollowStatus .NumberBoard-itemValue::text')
        item_loader.add_css(
            'click_num', '.QuestionFollowStatus .NumberBoard-itemValue::text')

        # Build question_item from the rules defined above
        question_item = item_loader.load_item()

        # If the question has answers, send a request to fetch them
        # If there are none, set the answer count to 0
        if 'answer_num' in question_item:
            # The question has answers: build the answers API request and parse them
            yield scrapy.Request(self.answer_url.format(zhihu_id, 0),
                                 headers=self.header,
                                 callback=self.parse_answer)
        else:
            question_item['answer_num'] = ['0']

        # Scrapy inspects whatever is yielded here: an item is sent to the pipelines automatically; a request has its page downloaded and handed to the callback
        yield question_item
Example No. 19
    def parse_question(self, response):
        zhihu_id = response.meta.get("question_id", "")
        question_item_loader = ArticleItemLoader(item=ZhihuQuestionItem(),
                                                 response=response)
        question_item_loader.add_css("title", "h1.QuestionHeader-title::text")
        question_item_loader.add_css("content", ".QuestionHeader-detail")
        question_item_loader.add_value("url", response.url)
        question_item_loader.add_value("zhihu_id", zhihu_id)
        question_item_loader.add_css("answer_num",
                                     ".List-headerText span::text")
        question_item_loader.add_css("comments_num",
                                     ".QuestionHeader-Comment button::text")
        question_item_loader.add_css("watch_user_num",
                                     ".NumberBoard-itemValue::text")
        question_item_loader.add_css(
            "topics", ".QuestionHeader-topics .Popover div::text")
        question_item = question_item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)

        yield question_item
Example No. 20
    def parse_question(self, response):

        # Optional: further <a> tags on the question page could also be extracted and followed
        # Handle the question page and extract the concrete question item
        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = str(match_obj.group(2))

            item_loader = XiaojianrenItemLoader(item=ZhihuQuestionItem(),
                                                response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content",
                                '.QuestionRichText--collapsed div span::text')
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-Comment button::text")
            # this pulls out the whole column of values at once
            item_loader.add_css("watch_user_num",
                                ".NumberBoard-itemValue::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # Item extraction for the old-style page (the old layout seems gone by now; kept as a safety net)
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            question_id = ''
            if match_obj:
                question_id = str(match_obj.group(2))

            item_loader = XiaojianrenItemLoader(item=ZhihuQuestionItem(),
                                                response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
            question_item.get("title")
        # Fire the request to the backend answers API
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             meta={
                                 "title": question_item.get("title"),
                                 "question_url": question_item.get("url")
                             },
                             callback=self.parse_answer)
        yield question_item