示例#1
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        Pages containing the ``QuestionHeader-title`` marker are parsed with
        the new-layout selectors; all other pages use the old-layout ones.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        # Both layouts need the id, so extract it once up front.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built;
            # this used to surface later as an UnboundLocalError.
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num",
                                ".NumberBoard-itemValue::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")
        else:
            # Old page layout.
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

        # The 20 and 0 are presumably page size and offset -- confirm against
        # the start_answer_url template.
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#2
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        Pages containing the ``QuestionHeader-title`` marker are parsed with
        the new-layout selectors; all other pages use the old-layout ones.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        # The previous pattern used "/d+" (a literal slash and "d") instead of
        # "\d+", so it could never capture the numeric id.
        match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*',
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built.
            self.logger.warning('No question id found in URL: %s',
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        if 'QuestionHeader-title' in response.text:
            # New page layout.
            item_loader.add_css('title', 'h1.QuestionHeader-title::text')
            item_loader.add_css('content', '.QuestionHeader-detail')
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', question_id)
            item_loader.add_css('answer_num', '.List-headerText span::text')
            item_loader.add_css('comment_nums',
                                '.QuestionHeader-actions button::text')
            item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
            item_loader.add_css('topics',
                                '.QuestionHeader-topics .Popover div::text')
        else:
            # Old page layout.
            item_loader.add_xpath(
                'title',
                '//*[@id="zh-question-title"]/h2/span/text()|//*[@id="zh-question-title"]/h2/a/text()'
            )
            item_loader.add_css('content', '#zh-question-detail')
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', question_id)
            item_loader.add_css('answer_num', '#zh-question-answer-num::text')
            item_loader.add_css(
                'comment_nums',
                '#zh-question-meta-wrap a[name="addcomment"]::text')
            # The id value must be quoted (unquoted it is parsed as a child
            # element name), and the class name is spelled "followers".
            item_loader.add_xpath(
                'watch_user_num',
                '//*[@id="zh-question-side-header-wrap"]/text()|//*[@class="zh-question-followers-sidebar"]/div/a/strong/text()'
            )
            item_loader.add_css('topics', '.zm-tag-editor-labels a::text')

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.header,
                             callback=self.parse_answer)
        yield question_item
示例#3
0
    def parse_question(self, response):
        """Build the question item for either page layout and request answers.

        The question id is read from ``response.meta['question_id']``. The
        layout is chosen by looking for ``QuestionHeader-title`` in the body.

        :param response: question page response.
        :return: generator yielding the answer request, then the item.
        """
        question_id = int(response.meta.get('question_id'))

        uses_new_layout = "QuestionHeader-title" in response.text
        if uses_new_layout:
            print("This is the new version of Zhihu")
        else:
            print("This is the old version of Zhihu")

        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        # Both layouts record the URL and the id before anything else.
        loader.add_value('url', response.url)
        loader.add_value('zhihu_id', question_id)

        if uses_new_layout:
            new_layout_css = (
                ('title', "h1.QuestionHeader-title::text"),
                ('content', "div.QuestionHeader-detail"),
                ('answer_num', "h4.List-headerText span::text"),
                ('comments_num', "div.QuestionHeader-Comment button::text"),
                ('watch_user_num', "div.NumberBoard-value::text"),
                ('topics', "div.QuestionHeader-topics .Popover div::text"),
            )
            for field_name, selector in new_layout_css:
                loader.add_css(field_name, selector)
        else:
            loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            loader.add_css("content", "#zh-question-detail")
            loader.add_css("answer_num", "#zh-question-answer-num::text")
            loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            loader.add_css("topics", ".zm-tag-editor-labels a::text")

        loaded_item = loader.load_item()

        answers_url = self.start_answer_url.format(question_id, 20, 0)
        yield scrapy.Request(answers_url,
                             headers=self.header,
                             callback=self.parse_answer)
        yield loaded_item
示例#4
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        Pages containing the ``QuestionHeader-title`` marker are parsed with
        the new-layout selectors; all other pages use the old-layout ones.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        # Both layouts need the id, so extract it once up front.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built;
            # this used to surface later as an UnboundLocalError.
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        # The loader is given an item instance to populate.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")
        else:
            # Old page layout. The old-page elements are addressed by id
            # (like #zh-question-detail below), so the title uses the id too.
            item_loader.add_css("title", "#zh-question-title h2 a::text")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            # Selector ids corrected: "meta-wrap" (was "meat-wrap") and
            # "header-wrap" (was "header-warp").
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_css("watch_user_num",
                                "#zh-question-side-header-wrap::text")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#5
0
 def parse_question(self, response):
     """Extract the question item from a new-layout question page.

     Only pages containing the ``QuestionHeader-title`` marker are parsed.

     :param response: response whose URL should contain
         ``zhihu.com/question/<digits>``.
     :return: generator yielding the loaded ``ZhihuQuestionItem``; yields
         nothing when the page is not the new layout or the URL carries no
         question id.
     """
     if "QuestionHeader-title" not in response.text:
         # No item is built for other layouts; yielding one used to raise
         # UnboundLocalError here.
         self.logger.warning("Unsupported question page layout: %s",
                             response.url)
         return

     match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                          response.url)
     if match_obj is None:
         self.logger.warning("No question id found in URL: %s",
                             response.url)
         return
     question_id = int(match_obj.group(2))

     # New page layout.
     item_loader = ItemLoader(item=ZhihuQuestionItem(),
                              response=response)
     item_loader.add_css("title", "h1.QuestionHeader-title::text")
     item_loader.add_css("content", ".QuestionHeader-detail")
     item_loader.add_value("url", response.url)
     item_loader.add_value("zhihu_id", question_id)
     item_loader.add_css("answer_num", ".List-headerText span::text")
     item_loader.add_css("comments_num",
                         ".QuestionHeader-Comment button::text")
     item_loader.add_css("topics",
                         ".QuestionHeader-topics .Popover div::text")
     question_item = item_loader.load_item()

     # NOTE: the follow-up answer request (self.start_answer_url ->
     # self.parse_answer) is currently disabled; only the item is emitted.
     # A yielded request would be routed to the downloader, a yielded item
     # to the pipelines.
     yield question_item
示例#6
0
    def parse_question(self, response):
        """Load the question item from the page, then request its answers.

        The id comes from ``response.meta["question_id"]``. Before the answer
        request is created, ``self.headers["Referer"]`` is set to the
        question URL.

        :param response: question page response.
        :return: generator yielding the answer request, then the item.
        """
        question_id = int(response.meta.get("question_id", ""))

        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # (loader method, field, selector or value), applied in this order.
        extraction_plan = (
            (loader.add_css, "title", "h1.QuestionHeader-title::text"),
            (loader.add_css, "content", "div.QuestionHeader-detail"),
            (loader.add_value, "url", response.url),
            (loader.add_value, "zhihu_id", question_id),
            (loader.add_css, "answer_num", ".List-headerText span::text"),
            (loader.add_css, "comments_num",
             ".QuestionHeaderActions button::text"),
            (loader.add_css, "watch_user_num",
             "strong.NumberBoard-itemValue::attr(title)"),
            (loader.add_css, "topics",
             ".QuestionHeader-topics .Popover div::text"),
        )
        for populate, field_name, source in extraction_plan:
            populate(field_name, source)
        loader.add_value("crawl_time", datetime.datetime.now())

        loaded_item = loader.load_item()

        referer = "https://www.zhihu.com/question/" + str(question_id)
        self.headers["Referer"] = referer
        answers_url = self.start_answer_url.format(question_id, 0, 20)
        yield scrapy.Request(answers_url,
                             headers=self.headers,
                             callback=self.parse_answer)
        yield loaded_item
示例#7
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        The question id is read from ``response.meta['question_id']``. The
        item is only built for pages containing the ``QuestionHeader-title``
        marker; the answer request is issued either way.

        :type response: HtmlResponse
        :param response: question page response.
        :return: generator yielding the answer request and, when the page
            could be parsed, the loaded ``ZhihuQuestionItem``.
        """
        question_id = int(response.meta.get('question_id'))

        # Stays None for unrecognised layouts so the final yield can be
        # skipped instead of raising UnboundLocalError.
        question_item = None
        if "QuestionHeader-title" in response.text:
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css('title', 'h1.QuestionHeader-title::text')
            item_loader.add_css('content', '.QuestionHeader-detail')
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', question_id)
            item_loader.add_xpath(
                'answer_num',
                '//*[@id="root"]/div/main/div/div[2]/div[1]/div[1]/a/text()|//*[@id="QuestionAnswers-answers"]/div/div/div[1]/h4/span/text()'
            )
            item_loader.add_css('comments_num',
                                '.QuestionHeaderActions button::text')
            item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
            item_loader.add_css('topics',
                                '.QuestionHeader-topics .Popover div::text')
            question_item = item_loader.load_item()

        # Request the answers for this question.
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             callback=self.parse_answer,
                             headers=self.headers)
        if question_item is not None:
            yield question_item
示例#8
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        The question id is read from ``response.meta['zhihu_id']`` and falls
        back to an empty string when absent.

        :param response: question page response.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``.
        """
        # Looked up once; used for both the item and the answer URL.
        zhihu_id = response.meta.get('zhihu_id', '')

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', zhihu_id)
        item_loader.add_css('answer_num', 'h4.List-headerText span::text')
        item_loader.add_css('comments_num',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num',
                            'strong.NumberBoard-itemValue::text')
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        # The item used to be loaded and then dropped; emit it so the
        # extraction above is not wasted.
        yield question_item
示例#9
0
    def parse_question(self, response):
        """Extract the question item from a new-layout question page.

        Pages without the ``QuestionHeader-title`` marker are reported on
        stdout and skipped.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding the loaded ``ZhihuQuestionItem``; yields
            nothing for old-layout pages or URLs without a question id.
        """
        if "QuestionHeader-title" not in response.text:
            print("old version , please switch to latest version")
            # No item exists on this path; falling through to the yield used
            # to raise UnboundLocalError.
            return

        print("latest version")
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover::text")

        question_item = item_loader.load_item()
        print("question_item: ")
        print(question_item)

        yield question_item
示例#10
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        When no content could be extracted, the literal string ``"None"`` is
        stored as the content.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built;
            # this used to surface later as an UnboundLocalError.
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail span::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
        if question_item.get("content") is None:
            # Nothing matched the content selector: add a placeholder and
            # reload so the field is present on the item.
            item_loader.add_value("content", "None")
            question_item = item_loader.load_item()

        print("sth")
        # Fetch the answers; at most 20 can be extracted per request.
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        # Yielded items are routed to the pipeline.
        yield question_item
示例#11
0
    def parse_detail(self, response):
        """Extract the question item from a question page and request answers.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_urls`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*',
                             response.url)
        if match_obj is None:
            # re.match returns None on a miss; calling .group on it used to
            # raise AttributeError.
            self.logger.warning('No question id found in URL: %s',
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        # Stores the whole detail element rather than only its text.
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css(
            'comments_num',
            '.QuestionHeader-Comment button.Button--plain::text')
        item_loader.add_css(
            'watch_user_num',
            '.NumberBoard-itemInner strong.NumberBoard-itemValue::attr("title")'
        )
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_urls.format(question_id, 3, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#12
0
    def parse_question(self, response):
        """Load the question item from the page, then request its answers.

        The id is taken as-is from ``response.meta["request_id"]``.

        :param response: question page response.
        :return: generator yielding the answer request, then the item.
        """
        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        # Fields scraped from the page, in the order they are registered.
        css_fields = (
            ("title", "h1.QuestionHeader-title::text"),
            ('content', ".QuestionHeader-detail div span::text"),
            ("answer_num", ".List-headerText span::text"),
            ("comments_num", ".QuestionHeader-Comment button::text"),
            ("watch_user_num",
             ".QuestionFollowStatus-counts button div strong::text"),
            ("click_num", ".QuestionFollowStatus-counts div div strong::text"),
            ("topics", ".QuestionHeader-topics .Popover div::text"),
        )
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        # Fields that do not come from selectors.
        loader.add_value("url", response.url)
        request_id = response.meta.get("request_id")
        loader.add_value("zhihu_id", request_id)
        loader.add_value("crawl_time", datetime.datetime.now())

        loaded_item = loader.load_item()

        answers_url = self.start_answer_url.format(request_id, 20, 1)
        yield scrapy.Request(answers_url,
                             headers=self.headers,
                             callback=self.parse_answer)
        yield loaded_item
示例#13
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built;
            # this used to surface later as an UnboundLocalError.
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css(
            "title",
            ".QuestionHeader .QuestionHeader-content .QuestionHeader-main h1.QuestionHeader-title::text"
        )
        item_loader.add_css("content", "div.QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "h4.List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        # Requests are routed to the downloader.
        yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        # Items are routed to the pipeline.
        yield question_item
示例#14
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        The question id is parsed from the URL and falls back to ``0`` when
        the URL does not contain ``zhihu.com/question/<digits>``.

        :param response: question page response.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``.
        """
        # Raw string: "\d" in a plain literal is an invalid escape sequence
        # that newer Python versions warn about.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        question_id = int(match_obj.group(2)) if match_obj else 0

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#15
0
    def parse_question(self, response):
        """Load the question item from the page, then request its answers.

        The id comes from ``response.meta['question_id']`` and defaults to an
        empty string.

        :param response: question page response.
        :return: generator yielding the answer request, then the item.
        """
        meta = response.meta
        question_id = meta.get('question_id', '')

        loader = ArticleItemLoader(item=ZhihuQuestionItem(),
                                   response=response)

        loader.add_css("title", "h1.QuestionHeader-title::text")
        loader.add_css("content", ".QuestionHeader-detail")
        loader.add_value("url", response.url)
        loader.add_value("zhihu_id", question_id)

        # The remaining fields are all plain CSS extractions.
        remaining_css = {
            "answer_num": ".List-headerText span::text",
            "comments_num": ".QuestionHeader-Comment span::text",
            "watch_user_num":
                ".NumberBoard:first-child .NumberBoard-itemValue::text",
            "click_num":
                ".NumberBoard:last-child .NumberBoard-itemValue::text",
            "topics": ".QuestionHeader-topics .Popover div::text",
        }
        for field_name, selector in remaining_css.items():
            loader.add_css(field_name, selector)

        loaded_item = loader.load_item()

        # Request the answers API.
        answers_request = scrapy.Request(
            self.start_answer_url.format(question_id, 5, 0),
            headers=self.headers,
            callback=self.parse_answer)
        yield answers_request

        yield loaded_item
示例#16
0
    def parse_question(self, response):
        """
        Handle a question page: load the question item and request answers.

        This callback could also keep following every URL the way ``parse``
        does.

        The same selectors are applied whether or not the URL contains
        "answer", so no layout distinction is made here.
        """
        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        meta = response.meta

        loader.add_css("question_title",
                       ".QuestionHeader .QuestionHeader-title::text")
        # An XPath is used for the content because the CSS selector
        # ".QuestionHeader-detail .RichText::text" may miss part of it.
        detail_text_xpath = (
            "//div[@class='QuestionHeader-detail']//span/text() "
            "| //div[@class='QuestionHeader-detail']//span//*/text()")
        loader.add_xpath("question_content", detail_text_xpath)
        loader.add_value("question_url", response.url)
        loader.add_value("question_id", int(meta.get("question_id", "")))
        loader.add_css("answer_num", ".QuestionMainAction::text")
        loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        # Both counters are read with the same selector.
        for counter_field in ("attentioned_num", "scanned_num"):
            loader.add_css(counter_field, ".NumberBoard-value::text")
        loader.add_css("question_topics", ".Tag-content .Popover div::text")
        loader.add_value("crawl_time", datetime.datetime.now())

        loaded_item = loader.load_item()

        answers_url = self.start_answer_url.format(
            meta.get("question_id"), 0, 20)

        # dont_filter=True is set for testing.
        yield scrapy.Request(answers_url,
                             headers=self.headers,
                             callback=self.parse_answer,
                             dont_filter=True)
        yield loaded_item
示例#17
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        # The id is needed unconditionally below, so it is no longer parsed
        # only when the new-layout marker happens to be present (which left
        # it unbound for every other page).
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_xpath(
            "comments_num",
            "//*[@class='QuestionHeader-Comment']/button/text()")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@class='QuestionHeader-follow-status']/div/div/button/div/strong/text()"
        )
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 3, 0),
                             callback=self.parse_answer)
        yield question_item
示例#18
0
    def parse_question(self, response):
        """Extract the question item from a question page and request answers.

        :param response: response whose URL should contain
            ``zhihu.com/question/<digits>``.
        :return: generator yielding a ``scrapy.Request`` built from
            ``self.start_answer_url`` (callback ``self.parse_answer``) and
            then the loaded ``ZhihuQuestionItem``; yields nothing when the
            URL carries no question id.
        """
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj is None:
            # Without an id neither the item nor the answer URL can be built;
            # this used to surface later as an UnboundLocalError.
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # Title.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        # Question body.
        item_loader.add_css("content", ".QuestionHeader-detail")
        # Question URL.
        item_loader.add_value("url", response.url)
        # Question id.
        item_loader.add_value("zhihu_id", question_id)
        # Number of answers.
        item_loader.add_xpath("answer_num",
                              "//h4[@class='List-headerText']/span//text()")
        # Number of comments.
        item_loader.add_css("comment_num",
                            ".QuestionHeader-Comment button::text")
        # Follower count and view count share this selector.
        item_loader.add_xpath(
            "watch_user_num",
            "//strong[@class='NumberBoard-itemValue']/text()")
        # Topics the question belongs to.
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(
            str(question_id), 20, 0),
                             callback=self.parse_answer)
        yield question_item
示例#19
0
文件: zhihu.py 项目: abcd567/spiders
    def parse_question(self, response):
        """
        Load the question item from a question page, then request its answers.

        The id comes from ``response.meta['question_id']`` and defaults to an
        empty string.
        """
        question_id = response.meta.get('question_id', '')
        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        # (loader method, field, selector or value), applied in this order.
        extraction_plan = (
            (loader.add_css, 'title',
             'h1.QuestionHeader-title:nth-child(2)::text'),
            (loader.add_css, 'content', '.QuestionHeader-detail'),
            (loader.add_value, 'url', response.url),
            (loader.add_value, 'zhihu_id', question_id),
            (loader.add_css, 'answer_num',
             '.List-headerText > span:nth-child(1)::text'),
            (loader.add_css, 'comments_num',
             '.QuestionHeader-Comment > button:nth-child(1)::text'),
            (loader.add_css, 'watch_user_num', '.NumberBoard-itemValue::text'),
            (loader.add_css, 'topics',
             '.QuestionHeader-topics .Popover > div::text'),
        )
        for populate, field_name, source in extraction_plan:
            populate(field_name, source)

        loaded_item = loader.load_item()

        answers_url = self.start_answer_urls.format(question_id, 0)
        yield scrapy.Request(url=answers_url,
                             headers=self.headers,
                             callback=self.parse_answer)

        yield loaded_item
示例#20
0
    def parse_question(self, response):
        """
        Parse a question detail page and extract the question item.

        Author's setup note: go to the directory containing chrome.exe and
        start Chrome manually with
        ``chrome.exe --remote-debugging-port=9222``.

        Only the new page layout (recognised by the ``QuestionHeader-title``
        marker in the body) is handled; handling of the old layout was
        dropped, so such pages are skipped.

        :param response: response for a question page
        :return: generator yielding the answers request, then the item
        """
        if "QuestionHeader-title" not in response.text:
            # Old layout: nothing is extracted. Falling through used to hit
            # unbound ``question_id``/``question_item`` (UnboundLocalError).
            self.logger.warning("Unsupported question page layout: %s",
                                response.url)
            return

        # Regex capturing the question URL and the question id.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if not match_obj:
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".ContentItem-actions button::text")
        item_loader.add_css("watch_user_num",
                            ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             callback=self.parse_answer)
        # Hand the question item on for storage.
        yield question_item
示例#21
0
    def parse_question(self, response):
        """Extract a question item and request the question's answers.

        The question id is parsed from the URL and falls back to ``0`` when
        the URL does not match. The ``watch_user_num`` selector can return
        several values: the first is kept as ``watch_user_num`` and the
        second is stored as ``click_num``.

        :param response: response for a ``zhihu.com/question/<id>`` page.
        """
        # Raw string: ``\d`` is not a valid escape in a normal string literal.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+)).*", response.url)
        question_id = int(match_obj.group(2)) if match_obj else 0

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", ".QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("question_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText>span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment>button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        # Split the combined counter values. The field is absent when the
        # selector matched nothing, so guard instead of indexing blindly
        # (which raised KeyError/IndexError on such pages).
        counts = question_item.get("watch_user_num") or []
        if len(counts) > 1:
            question_item["click_num"] = counts[1]
        if counts:
            question_item["watch_user_num"] = counts[0]

        yield scrapy.Request(url=self.answer_start_url.format(
            question_id, 5, 0),
                             headers=self.headers,
                             cookies=self.cookies1,
                             callback=self.parse_answer)
        yield question_item
示例#22
0
 def parse_question(self, response):
     """Extract a question item from a question page.

     Pages containing the ``QuestionMainAction`` marker (treated by the
     author as the old layout) only provide title, content, url and id;
     all other pages additionally provide the counters and topics.

     Yields the request for the question's answers first, then the item
     (which is passed on to the pipelines).

     :param response: response for a ``zhihu.com/question/<id>`` page.
     """
     # Extract the question id from the URL; identical for both layouts.
     match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                          response.url)
     if not match_obj:
         # Previously this path failed later with UnboundLocalError.
         self.logger.warning("No question id found in URL: %s",
                             response.url)
         return
     question_id = int(match_obj.group(2))

     item_loader = ItemLoader(item=ZhihuQuestionItem(),
                              response=response)
     item_loader.add_css("title", "h1.QuestionHeader-title::text")
     item_loader.add_css("content", ".QuestionHeader-detail")
     # url and id are literal values, not selectors, so they must go
     # through add_value (add_css would try to parse them as CSS).
     item_loader.add_value("url", response.url)
     item_loader.add_value("zhihu_id", question_id)

     if "QuestionMainAction" not in response.text:
         # New layout: counters and topics are available as well.
         item_loader.add_css("answer_num", ".List-headerText span::text")
         item_loader.add_css("comments_num",
                             ".QuestionHeader-Comment button::text")
         item_loader.add_css("watch_user_num",
                             ".NumberBoard-itemValue::text")
         item_loader.add_css("topics",
                             ".QuestionHeader-topics .Popover div::text")

     question_item = item_loader.load_item()

     # Request the answers belonging to this question.
     yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                          headers=self.headers,
                          callback=self.parse_answer)
     # Pass the question item on to pipelines.py.
     yield question_item
示例#23
0
    def parse_question(self, response):
        """Extract a question item from a question page.

        Yields the loaded item first and then the request for the question's
        answers. Nothing is yielded when the URL carries no question id.
        URLs could also be followed from here, the same way ``parse`` does.

        :param response: response for a ``zhihu.com/question/<id>`` page.
        """
        # Matches a bare question id or one followed by e.g. ``/answer``.
        # Raw string: ``\d`` is not a valid escape in a normal string literal.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if not match_obj:
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        # title and topics both load the same attribute, which holds a
        # serialized dict. Author's post-processing sketch (``d`` is the
        # attribute value):
        #   json.loads(d).get('title')                       # title
        #   tp_l = json.loads(d)['topics']
        #   ",".join([d.get('name') for d in tp_l])          # topics
        item_loader.add_css(
            "title",
            "div[data-zop-question]::attr(data-zop-question)")
        item_loader.add_css(
            "topics",
            "div[data-zop-question]::attr(data-zop-question)")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        # The number still has to be pulled out with a regex, e.g.
        #   re.match(".*?([,\d]+).*", d).group(1).replace(",", "")  # int
        item_loader.add_xpath(
            "answer_num",
            "//*[@class='List-headerText']/span/text()|//*[@class='QuestionMainAction']/text()"
        )
        # Post-processing sketch: re.match(".*?(\d+).*", d).group(1)  # int
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment .Button::text")
        # Post-processing sketch: d.replace(",", "")
        item_loader.add_css("watch_user_num",
                            ".QuestionFollowStatus-counts strong::text")
        # Same selector as watch_user_num; per the author, the first
        # extracted value must not be the one used for click_num.
        item_loader.add_css(
            "click_num",
            ".QuestionFollowStatus-counts strong::text")
        question_item = item_loader.load_item()

        yield question_item
        yield scrapy.Request(
            url=self.start_answer_url.format(question_id, 0, 3),
            headers=self.headers,
            callback=self.parse_answer,
        )
示例#24
0
    def parse_question(self, response):
        """Extract a question item from a question page (new or old layout).

        The layout is chosen by looking for the ``QuestionHeader-title``
        marker in the response body. The question id is read from
        ``response.meta``. Yields the request for the question's answers
        first, then the loaded item.

        :param response: response for a question page.
        """
        question_id = response.meta.get("question_id")
        item_load = ItemLoader(item=ZhihuQuestionItem(), response=response)

        if "QuestionHeader-title" in response.text:
            # New Zhihu layout. The title/content selectors need the leading
            # '.' to be class selectors; without it they are tag selectors
            # and match nothing.
            item_load.add_css("title", '.QuestionHeader-title::text')
            item_load.add_css("content", '.QuestionHeader-detail')
            item_load.add_value("url", response.url)
            item_load.add_value("zhihu_id", question_id)
            item_load.add_css("answer_num", '.List-headerText span::text')
            item_load.add_css("comments_num", '.QuestionHeader-actions button::text')
            item_load.add_css("watch_user_num", '.NumberBoard-value::text')
            item_load.add_css("topics", '.QuestionHeader-topics .Popover div::text')
        else:
            # Old layout.
            item_load.add_value("zhihu_id", question_id)
            item_load.add_value("url", response.url)
            # The title may sit in either an <a> or a <span>; a CSS selector
            # cannot express that alternative, hence the XPath union.
            item_load.add_xpath("title",
                                '//*[@id="zh-question-title"]/h2/a/text()|//*[@id="zh-question-title"]/h2/span/text()')
            item_load.add_css("content", '#zh-question-detail')
            item_load.add_css("answer_num", '#zh-question-answer-num::text')
            item_load.add_css("comments_num", '#zh-question-meta-wrap a[name="addcomment"]::text')
            # A separate add_css on '#zh-question-side-header-wrap::text' used
            # to precede this call; the XPath union already covers that node,
            # so it only produced duplicate values and was dropped.
            item_load.add_xpath("watch_user_num",
                                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_load.add_css("topics", '.zm-tag-editor-labels a::text')

        question_item = item_load.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#25
0
 def parse_question(self,response):
     """Extract a question item and request the question's answers.

     The question id is read from ``response.meta`` (empty string when
     absent). Yields the answers request first, then the loaded item.

     :param response: response for a question page.
     """
     question_id = response.meta.get('question_id', '')
     item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
     item_loader.add_css('title', '.QuestionHeader h1.QuestionHeader-title::text')
     item_loader.add_css('topics', '.QuestionTopic .Popover  div::text')
     item_loader.add_css('content', '.QuestionHeader-detail span::text')
     item_loader.add_value('url', response.url)
     item_loader.add_value('zhihu_id', question_id)
     item_loader.add_css('answer_num', '.List-headerText span::text ')
     item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
     item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
     question_item = item_loader.load_item()
     yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                          headers=self.headers,
                          callback=self.parse_answer)
     # The stray ``scrapy.FormRequest`` expression and ``pass`` that used to
     # follow the last yield had no effect and were removed.
     yield question_item
示例#26
0
    def parse_question(self, response):
        """Extract a question item from a new-layout question page.

        Pages without the ``QuestionHeader-title`` marker, or whose URL
        carries no question id, are skipped. Otherwise yields the request
        for the question's answers first, then the loaded item.

        :param response: response for a ``zhihu.com/question/<id>`` page.
        """
        if 'QuestionHeader-title' not in response.text:
            # Only the new layout is handled. Falling through used to hit
            # unbound ``question_id``/``question_item`` (UnboundLocalError).
            self.logger.warning('Unsupported question page layout: %s',
                                response.url)
            return

        # Raw string: ``\d`` is not a valid escape in a normal string literal.
        match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*',
                             response.url)
        if not match_obj:
            self.logger.warning('No question id found in URL: %s',
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        item_loader.add_xpath(
            'title', '//main//h1[@class="QuestionHeader-title"]//text()')
        item_loader.add_xpath('content',
                              '//div[@class="QuestionHeader-detail"]')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_xpath('answer_num',
                              '//div[@class="List"]/div/h4/span/text()')
        item_loader.add_xpath(
            'comments_num',
            '//div[@class="QuestionHeader-Comment"]//button//text()')
        item_loader.add_xpath(
            'watch_user_num',
            '//strong[@class="NumberBoard-itemValue"]//text()')
        item_loader.add_xpath(
            'topics',
            '//div[@class="QuestionHeader-topics"]//div[@class="Popover"]//text()'
        )

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#27
0
    def parse_question(self, response):
        """Load a question item from the page and request its answers.

        CSS-derived fields are registered first, followed by the computed
        values (crawl date, url, md5 of the url, and the id taken from
        ``response.meta``). The answers request is yielded before the item.
        """
        loader = mArticleItemLoader(item=ZhihuQuestionItem(), response=response)

        css_fields = (
            ("title", ".QuestionHeader-title::text"),
            ("detail", ".QuestionHeader-detail"),
            ("comment_nums", ".QuestionHeader-Comment button::text"),
            ("attention_nums",
             ".Button.NumberBoard-item .NumberBoard-value::text"),
            ("watch_nums", "div.NumberBoard-item > div:nth-child(2)::text"),
            ("tags", ".Popover div::text"),
        )
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        page_url = response.url
        question_id = response.meta.get("id")
        loader.add_value("crawl_time", datetime.now().strftime("%Y/%m/%d"))
        loader.add_value("url", page_url)
        loader.add_value("object_id", get_md5(page_url))
        loader.add_value("question_id", question_id)
        item = loader.load_item()

        answers_request = scrapy.Request(
            url=self.start_answer_url.format(question_id),
            headers=self.headers,
            callback=self.parse_answer,
        )
        yield answers_request
        yield item
示例#28
0
    def parse(self, response):
        """Unfinished stub: print the response body and start loading a question item.

        NOTE(review): most ``add_xpath`` calls below are placeholders with
        missing arguments. Presumably ``CustnomItemLoader`` follows Scrapy's
        ``ItemLoader.add_xpath(field_name, xpath)`` signature, in which case
        the first incomplete call raises ``TypeError`` -- confirm and fill in
        the selectors. Within this body the item is never loaded or yielded.
        """
        # Debug output of the raw page body.
        print(response.text)
        item_loader = CustnomItemLoader(item=ZhihuQuestionItem(),response=response)
        # Field name given, but the XPath expression is still missing.
        item_loader.add_xpath('zhihu_id',)
        # Placeholders: neither field name nor XPath filled in yet.
        item_loader.add_xpath()
        item_loader.add_xpath()
        # The only complete call: selects the whole <h1> title element.
        item_loader.add_xpath('title','//h1[@class="QuestionHeader-title"]')

        # More placeholders for fields still to be defined.
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
        item_loader.add_xpath()
示例#29
0
    def parse_question(self, response):
        """Extract a question item from a new-layout question page.

        Pages without the ``QuestionHeader-title`` marker (old layout), or
        whose URL carries no question id, are skipped. Otherwise yields the
        request for the question's answers first, then the loaded item.

        :param response: response for a ``zhihu.com/question/<id>`` page.
        """
        if "QuestionHeader-title" not in response.text:
            # Old layout: extraction is not implemented. Falling through used
            # to hit unbound ``question_id``/``question_item``.
            self.logger.warning("Unsupported question page layout: %s",
                                response.url)
            return

        # Raw string: ``\d`` is not a valid escape in a normal string literal.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if not match_obj:
            self.logger.warning("No question id found in URL: %s",
                                response.url)
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        item_loader.add_xpath(
            "title",
            '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[1]/h1/text()|//*[@id="root"]/div/main/article/div[1]/div[3]/div[1]/div/h1/text()'
        )
        # First alternative now ends in ``text()``; the bare ``text`` step
        # selected child elements named <text> rather than text nodes.
        item_loader.add_xpath(
            "content",
            '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[1]/div[2]/div/div/div/span/p/text()|//*[@id="root"]/div/main/article/div[1]/div[3]/div[1]/p/text()'
        )
        # The URL is a literal value, not an XPath expression.
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_xpath(
            "comments_num",
            '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[2]/div/div/div[2]/div[1]/button/text()|//*[@id="root"]/div/main/article/div[1]/div[4]/button[1]/text()'
        )
        # NOTE(review): selects the <strong> element itself rather than its
        # text -- confirm the item's processors expect markup here.
        item_loader.add_xpath(
            "watch_user_num",
            '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[2]/div/div/div/button/div/strong'
        )
        # This selector is CSS (class selectors plus ``::text``), so it has
        # to be registered with add_css; it is not valid XPath.
        item_loader.add_css("topics",
                            '.QuestionHeader-topics .Popover::text')
        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 0, 20),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
示例#30
0
 def parse_question(self, response):
     """Extract a question item from a question page.

     The question id is read from ``response.meta`` and converted to int;
     ``int('')`` raises ``ValueError`` when the key is missing. Yields the
     request for the question's answers first, then the loaded item.

     :param response: response for a question page.
     """
     question_id = int(response.meta.get('question_id', ''))
     item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
     item_loader.add_css('title', 'h1.QuestionHeader-title::text')
     # A comma was missing between field name and selector, so the two
     # literals were concatenated into a single argument.
     item_loader.add_css('content', 'div.QuestionHeader-detail')
     item_loader.add_value('url', response.url)
     item_loader.add_css('answer_num', 'h4.List-headerText span::text')
     item_loader.add_css('comment_num',
                         '.QuestionHeader-Comment button::text')
     item_loader.add_value('question_id', question_id)
     # Same missing comma here, plus a stray '"' inside the class selector.
     item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
     item_loader.add_css('topics',
                         '.QuestionHeader-topics .Popover div::text')
     question_item = item_loader.load_item()
     # NOTE(review): the attribute is spelled ``header`` (singular) here --
     # verify it matches the attribute defined on the spider class.
     yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0),
                          headers=self.header,
                          callback=self.parse_answer)
     yield question_item