Пример #1
0
    def parse_answer(self, response):
        """Parse the JSON answer listing for a question.

        Yields one ZhihuAnswerItem per answer on this page, then a
        Request for the next page while the API reports more pages.
        """
        payload = json.loads(response.text)
        paging = payload['paging']

        # Build one item per answer record.
        for entry in payload['data']:
            item = ZhihuAnswerItem()
            item['zhihu_id'] = entry['id']
            item['url'] = entry['url']
            item['question_id'] = entry['question']['id']
            # Anonymous authors carry no 'id' key.
            item['author_id'] = entry['author'].get('id')
            item['content'] = entry.get('content')
            item['praise_num'] = entry['voteup_count']
            item['comments_num'] = entry['comment_count']
            item['create_time'] = entry['created_time']
            item['update_time'] = entry['updated_time']
            item['crawl_time'] = datetime.datetime.now()
            yield item

        # Schedule the next page unless this was the last one.
        if not paging['is_end']:
            yield scrapy.Request(paging['next'],
                                 headers=self.headers,
                                 callback=self.parse_answer)
Пример #2
0
    def parse_answer(self, response):
        """Parse one page of a question's answers (JSON API response).

        Yields a ZhihuAnswerItem per answer, then a Request for the next
        page while the API reports more pages.
        """
        answer_json = json.loads(response.text)
        is_end = answer_json["paging"]["is_end"]
        next_url = answer_json["paging"]["next"]

        # Extract the concrete fields of every answer.
        for answer in answer_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["answer_id"] = int(answer['id'])
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # Anonymous authors have no "id" key; keep the original
            # placeholder strings used by this spider.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else "匿名作者"
            answer_item["content"] = answer[
                "content"] if "content" in answer else "内容为空"
            answer_item["praise_num"] = int(answer["voteup_count"])
            answer_item["comment_num"] = int(answer["comment_count"])
            answer_item["update_time"] = time.strftime(
                SQL_DATETIME_FORMAT, time.localtime(answer["updated_time"]))
            answer_item["create_time"] = time.strftime(
                SQL_DATETIME_FORMAT, time.localtime(answer["created_time"]))
            answer_item["crawl_time"] = datetime.datetime.now().strftime(
                SQL_DATETIME_FORMAT)

            yield answer_item

        if not is_end:
            # BUG FIX: without an explicit callback the next page was routed
            # to the spider's default parse(), not back to this method.
            yield scrapy.Request(next_url, callback=self.parse_answer)
Пример #3
0
 def parse_answer(self, response):
     """Parse a question's answer listing (JSON) and yield answer items.

     Follows the paging "next" url until "is_end" is true.
     """
     ans_json = json.loads(response.text)
     is_end = ans_json['paging']['is_end']
     next_url = ans_json['paging']['next']
     # Extract the fields of each answer on this page.
     # (Removed unused local that read paging['totals'].)
     for answer1 in ans_json['data']:
         answer_item = ZhihuAnswerItem()
         answer_item['zhihu_id'] = answer1['id']
         answer_item['url'] = answer1['url']
         answer_item['question_id'] = answer1['question']['id']
         # Anonymous authors carry no 'id' key.
         answer_item['author_id'] = answer1['author'][
             'id'] if 'id' in answer1['author'] else None
         answer_item['content'] = answer1[
             'content'] if 'content' in answer1 else None
         # NOTE(review): 'parise_num' / 'acrawl_time' look like misspellings,
         # but they must match the field names declared on ZhihuAnswerItem —
         # confirm against the item definition before renaming.
         answer_item['parise_num'] = answer1['voteup_count']
         answer_item['comment_num'] = answer1['comment_count']
         answer_item['create_time'] = answer1['created_time']
         answer_item['update_time'] = answer1['updated_time']
         answer_item['acrawl_time'] = datetime.datetime.now()
         yield answer_item
     if not is_end:
         yield scrapy.Request(next_url,
                              headers=self.header,
                              callback=self.parse_answer)
Пример #4
0
    def parse_answer(self, response):
        """Yield a ZhihuAnswerItem for every answer on this JSON page,
        then request the next page while more pages remain."""
        payload = json.loads(response.text)
        paging = payload["paging"]
        last_page = paging["is_end"]
        next_page = paging["next"]

        # One item per answer record.
        for record in payload["data"]:
            item = ZhihuAnswerItem()
            item["zhihu_id"] = record["id"]
            item["url"] = record["url"]
            item["question_id"] = record["question"]["id"]
            # Anonymous authors have no "id" key.
            item["author_id"] = record["author"].get("id")
            item["content"] = record.get("content")
            item["praise_num"] = record["voteup_count"]
            item["comments_num"] = record["comment_count"]
            item["create_time"] = record["created_time"]
            item["update_time"] = record["updated_time"]
            yield item

        # Not the last page: schedule the next one.
        if not last_page:
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Пример #5
0
    def parse_answer(self, response):
        """
        Parse one page of answer data returned by the Zhihu answers API.

        :param response: JSON response for a question's answer listing
        :return: generator of ZhihuAnswerItem, plus a Request for the
                 next page while the listing is not exhausted
        """
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]  # is this the last page?
        next_url = ans_json["paging"]["next"]  # url of the next answer page
        # (Removed unused local that read paging["totals"].)

        # Extract the concrete fields of every answer.
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None  # anonymous user
            answer_item["content"] = answer[
                "content"] if "content" in answer else None  # may be absent
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()  # now

            yield answer_item  # handed to the pipelines asynchronously

        if not is_end:  # more pages remain
            yield scrapy.Request(next_url,
                                 callback=self.parse_answer)
Пример #6
0
    def parse_answer(self, response):
        """Parse the answers of a question (JSON API page).

        Yields one ZhihuAnswerItem per answer, then the request for the
        next page while the API reports more pages.
        """
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # Extract the concrete fields of each answer.
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # Anonymous authors have no "id" key.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None
            answer_item[
                "content"] = answer["content"] if "content" in answer else None
            # NOTE(review): 'parise_num' looks misspelled but must match the
            # field declared on ZhihuAnswerItem.
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Пример #7
0
 def parse_answer(self, response):
     """Parse a page of a question's answers and yield answer items.

     Uses the current page url as Referer on the next-page request.
     """
     aheader = {"Referer": response.url}
     ans_json = json.loads(response.text)
     is_end = ans_json['paging']['is_end']
     next_url = ans_json['paging']['next']
     # (Removed unused local that read paging['totals'] and the leftover
     # debug print of every field.)
     # Extract the concrete fields of each answer.
     for answer in ans_json["data"]:
         answer_item = ZhihuAnswerItem()
         answer_item["zhihu_id"] = answer["id"]
         answer_item["url"] = answer["url"]
         answer_item["question_id"] = answer["question"]["id"]
         # Anonymous authors have no "id" key.
         answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
         answer_item["content"] = answer["content"] if "content" in answer else None
         answer_item["praise_num"] = answer["voteup_count"]
         answer_item["comments_num"] = answer["comment_count"]
         answer_item["created_time"] = answer["created_time"]
         answer_item["updated_time"] = answer["updated_time"]
         answer_item["crawl_time"] = datetime.datetime.now()
         yield answer_item
     if not is_end:
         # BUG FIX: the yield was wrapped in a pointless `for i in range(1)`
         # loop; yield the next-page request directly.
         yield scrapy.Request(next_url, headers=aheader, callback=self.parse_answer)
Пример #8
0
    def parse_answer(self, response):
        """Parse the answer-list JSON for a question.

        Yields one ZhihuAnswerItem per answer, then a Request for the
        next page while more pages remain.
        """
        response_data = json.loads(response.text)
        is_end = response_data['paging']['is_end']
        next_page = response_data['paging']['next']

        for data in response_data['data']:
            answer_item = ZhihuAnswerItem()
            answer_item['zhihu_id'] = data['id']
            answer_item['url'] = data['url']
            answer_item['question_id'] = data['question']['id']
            # BUG FIX: anonymous authors have no 'id' key at all, so indexing
            # directly raised KeyError; fall back to 0 as before for a
            # missing/falsy id.
            answer_item['author_id'] = data['author'].get('id') or 0
            # BUG FIX: 'content' may be absent from the payload.
            answer_item['content'] = data.get('content')
            answer_item['parise_num'] = data['voteup_count']
            answer_item['comments_num'] = data['comment_count']
            answer_item['create_time'] = datetime.datetime.fromtimestamp(
                int(data['created_time']))
            answer_item['update_time'] = datetime.datetime.fromtimestamp(
                int(data['updated_time']))
            answer_item['crawl_time'] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield Request(next_page,
                          headers=self.header,
                          cookies=self.requests_cookies,
                          callback=self.parse_answer)
Пример #9
0
    def parse_answer(self, response):
        """
        Extract answers from the JSON answer listing of a question.

        Yields one ZhihuAnswerItem per answer, then a Request for the
        next page of answers while the listing is not exhausted.
        """
        ans_json = json.loads(response.text)
        is_end = ans_json['paging']['is_end']
        next_url = ans_json['paging']['next']
        # (Removed unused local that read paging['totals'].)

        # Extract the answer fields.
        for answer in ans_json['data']:
            answer_item = ZhihuAnswerItem()
            answer_item['zhihu_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            # BUG FIX: anonymous authors have no 'id' key, which made the
            # direct lookup raise KeyError; default to None instead.
            answer_item['author_id'] = answer['author'].get('id')
            answer_item[
                'content'] = answer['content'] if 'content' in answer else None
            answer_item['praise_num'] = answer['voteup_count']
            answer_item['comments_num'] = answer['comment_count']
            answer_item['create_time'] = answer['created_time']
            answer_item['update_time'] = answer['updated_time']
            answer_item['crawl_time'] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield scrapy.Request(url=next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Пример #10
0
    def parse_answer(self, response):
        """Parse the JSON answer listing and yield one item per answer.

        Follows pagination until the API reports the last page.
        """
        answer_json = json.loads(response.text)
        is_end = answer_json["paging"]["is_end"]
        answer_next = answer_json["paging"]["next"]
        # (Removed unused local that read paging["totals"].)

        # Extract the concrete fields of each answer.
        for answers in answer_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answers["id"]
            answer_item["url"] = response.url
            answer_item["question_id"] = answers["question"]["id"]
            # Anonymous authors have no "id" key.
            answer_item["author_id"] = answers["author"][
                "id"] if "id" in answers["author"] else None
            # BUG FIX: the original tested `"content" in answers["content"]`
            # (substring test inside the content itself, KeyError when the
            # key is missing); test for the key on the answer dict.
            answer_item["content"] = answers[
                "content"] if "content" in answers else None
            answer_item["parise"] = answers["voteup_count"]
            answer_item["comments_num"] = answers["comment_count"]
            answer_item["create_time"] = answers["created_time"]
            answer_item["update_time"] = answers["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item

        # Not the last page yet.
        if not is_end:
            # BUG FIX: the Request was constructed but never yielded, so
            # pagination never happened.
            yield scrapy.Request(answer_next,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Пример #11
0
    def parse_answer(self, response):
        """Parse one page of the answer listing (limit answers per page).

        Yields a ZhihuAnswerItem per answer, then a Request for the next
        page while is_end is false.
        """
        # BUG FIX: json.load() expects a file-like object; for a string
        # payload json.loads() must be used.
        answer_json = json.loads(response.text)

        # Is this the last page of the listing?
        is_end = answer_json["paging"]["is_end"]
        next_url = answer_json["paging"]["next"]
        # (Removed unused local that read paging["totals"].)

        for answer in answer_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # Anonymous commenters have no "id" field.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None
            # "content" may be missing; "excerpt" is always present.
            answer_item["content"] = answer[
                "content"] if "content" in answer else answer["excerpt"]
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.now()
            # Yield hands the item to the pipelines for further processing.
            yield answer_item

        # is_end false: more answer pages can be requested.
        if not is_end:
            yield scrapy.Request(next_url,
                                 headers=headers,
                                 callback=self.parse_answer)
Пример #12
0
    def parse_answer(self, response):
        """Parse the answers of a question from the JSON API response.

        Yields one ZhihuAnswerItem per answer, then the next-page request
        while more pages remain.
        """
        ans_json = json.loads(response.text)
        is_end = ans_json['paging']['is_end']
        next_url = ans_json['paging']['next']

        # Extract the answer fields.
        for answer in ans_json['data']:
            # BUG FIX: the item was created once outside the loop, so the
            # same instance was mutated and yielded for every answer; create
            # a fresh item per answer.
            answer_item = ZhihuAnswerItem()
            answer_item['answer_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            answer_item['author_id'] = answer['author']['id'] if answer['author']['id'] != '0' else 'anonymous'
            answer_item['content'] = answer['content'] if 'content' in answer else answer['excerpt']
            answer_item['praise_num'] = answer['voteup_count']
            answer_item['comments_num'] = answer['comment_count']
            answer_item['create_time'] = answer['created_time']
            answer_item['update_time'] = answer['updated_time']
            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, dont_filter=True, callback=self.parse_answer)
Пример #13
0
    def parse_answer(self, response):
        """Load one page of answers via an ItemLoader and follow pagination.

        Yields one loaded ZhihuAnswerItem per answer, then the next-page
        request while the API reports more pages.
        """
        data = json.loads(response.text)

        for answer in data['data']:
            itemloader = mArticleItemLoader(item=ZhihuAnswerItem(), response=response)
            itemloader.add_value('object_id', get_md5(answer['url']))
            itemloader.add_value('answer_id', answer['id'])
            itemloader.add_value('question_id', answer['question']['id'])
            itemloader.add_value('voteup_count', answer['voteup_count'])
            itemloader.add_value('author_id', answer['author']['id'])
            itemloader.add_value('content', answer['content'])
            itemloader.add_value('comment_count', answer['comment_count'])
            itemloader.add_value('url', answer['url'])
            itemloader.add_value('updated_time', datetime.fromtimestamp(answer['updated_time']).strftime("%Y/%m/%d"))
            itemloader.add_value('crawl_time', datetime.now().strftime("%Y/%m/%d"))
            # BUG FIX: created_time was computed from answer['updated_time']
            # (copy-paste error); use the creation timestamp.
            itemloader.add_value('created_time', datetime.fromtimestamp(answer['created_time']).strftime("%Y/%m/%d"))

            item = itemloader.load_item()
            yield item

        # Schedule the next page of answers if this is not the last one.
        if not data['paging']['is_end']:
            next_url = data['paging']['next']
            yield scrapy.Request(url=next_url, headers=self.headers, callback=self.parse_answer)
Пример #14
0
 def parse_answer(self, response):
     """Parse one page of a question's answers (JSON) and yield items.

     Follows pagination via paging['next'] until paging['is_end'].
     """
     answer_json = json.loads(response.text)
     for answer_data in answer_json["data"]:
         answer_item = ZhihuAnswerItem()
         answer_item["answer_id"] = answer_data["id"]
         answer_item["url"] = answer_data["url"]
         answer_item["question_id"] = answer_data["question"]["id"]
         answer_item["question_title"] = answer_data["question"]["title"]
         # Anonymous authors have no "id" key.
         answer_item["author_id"] = answer_data["author"][
             "id"] if "id" in answer_data["author"] else None
         answer_item["content"] = answer_data[
             "content"] if "content" in answer_data else None
         answer_item["praise_num"] = answer_data["voteup_count"]
         answer_item["comments_num"] = answer_data["comment_count"]
         answer_item["create_time"] = answer_data["created_time"]
         answer_item["update_time"] = answer_data["updated_time"]
         answer_item["crawl_time"] = datetime.datetime.now()
         # BUG FIX: the yield was commented out, so fully-built items were
         # discarded and nothing ever reached the pipelines.
         yield answer_item
     if not answer_json["paging"]['is_end']:
         yield scrapy.Request(url=answer_json["paging"]['next'],
                              headers=self.headers,
                              cookies=self.cookies1,
                              callback=self.parse_answer)
Пример #15
0
    def parse_answer(self, response):
        """
        Extract Zhihu answer items from one page of the answer-list JSON.

        Yields one ZhihuAnswerItem per answer, then a Request for the
        next page while the listing is not exhausted.
        """
        answer_json = json.loads(response.text)
        is_end = answer_json["paging"]["is_end"]
        next_url = answer_json["paging"]["next"]
        # (Removed unused local that read paging["totals"].)

        # Extract the concrete fields of each answer.
        for answer in answer_json["data"]:
            # BUG FIX: a single item instance was created outside the loop
            # and mutated for every answer; create a fresh item per answer.
            answer_item = ZhihuAnswerItem()
            # BUG FIX: question_id/answer_id were crossed — answer["id"] is
            # the answer's own id, answer["question"]["id"] the question's.
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["answer_url"] = answer["url"]
            answer_item["answer_id"] = answer["id"]
            # Anonymous authors have no "id"/"name" keys.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None
            answer_item["author_name"] = answer["author"][
                "name"] if "name" in answer["author"] else "unknown"
            answer_item["author_gender"] = answer["author"][
                "gender"] if "gender" in answer["author"] else -1
            # "content" may be missing; fall back to the excerpt.
            answer_item["answer_content"] = answer[
                "content"] if "content" in answer else answer["excerpt"]
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer,
                                 dont_filter=True)
Пример #16
0
    def parse_answer(self, response):
        """Emit one ZhihuAnswerItem per answer on this JSON page and keep
        paginating until the API reports the last page."""
        payload = json.loads(response.text)
        paging = payload["paging"]

        # Build one item per answer record.
        for record in payload["data"]:
            item = ZhihuAnswerItem()
            item["table_name"] = "zhihu_answer"
            item["zhihu_id"] = record["id"]
            item["url"] = record["url"]
            item["question_id"] = record["question"]["id"]
            item["author_id"] = record["author"]["id"]
            item["content"] = record["content"]
            item["praise_num"] = record["voteup_count"]
            item["comments_num"] = record["comment_count"]
            item["create_time"] = datetime.fromtimestamp(record["created_time"])
            item["update_time"] = datetime.fromtimestamp(record["updated_time"])
            item["crawl_time"] = datetime.now()
            yield item

        # More answers remain: schedule the next page.
        if not paging["is_end"]:
            yield scrapy.Request(paging["next"], headers=self.headers, callback=self.parse_answer)
Пример #17
0
    def parse_answer(self, response):
        """Parse a question's answer-list JSON.

        Yields one ZhihuAnswerItem per answer, then the request for the
        next page while more pages remain.
        """
        asw_json = json.loads(response.text)
        is_end = asw_json['paging']['is_end']
        next_url = asw_json['paging']['next']
        for answer in asw_json['data']:
            answer_item = ZhihuAnswerItem()
            answer_item['zhihu_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            # BUG FIX: the original tested `id in answer['author']` — the
            # builtin id() function is never a dict key, so author_id was
            # always None.  Test for the string key instead.
            answer_item['author_id'] = answer['author'].get('id')
            answer_item['content'] = answer['content']
            answer_item['comment_num'] = answer['comment_count']
            answer_item['parise_num'] = answer['voteup_count']
            answer_item['create_time'] = answer['created_time']
            answer_item['update_time'] = answer['updated_time']
            answer_item['crawl_time'] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)