def parse_answer(self, response):
    """Parse one JSON page of answers, yield items, and follow pagination."""
    payload = json.loads(response.text)
    paging = payload['paging']
    last_page = paging['is_end']
    follow_url = paging['next']
    # Build one item per answer record on this page.
    for entry in payload['data']:
        item = ZhihuAnswerItem()
        item['zhihu_id'] = entry['id']
        item['url'] = entry['url']
        item['question_id'] = entry['question']['id']
        # Anonymous authors carry no 'id' key; .get() falls back to None.
        item['author_id'] = entry['author'].get('id')
        item['content'] = entry.get('content')
        item['praise_num'] = entry['voteup_count']
        item['comments_num'] = entry['comment_count']
        item['create_time'] = entry['created_time']
        item['update_time'] = entry['updated_time']
        item['crawl_time'] = datetime.datetime.now()
        yield item
    # Keep requesting pages until the API flags the last one.
    if not last_page:
        yield scrapy.Request(follow_url, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Handle a question's answers (JSON feed) and yield one item per answer."""
    body = json.loads(response.text)
    paging = body["paging"]
    last_page = paging["is_end"]
    next_page = paging["next"]

    def _fmt(ts):
        # Epoch seconds -> local-time string in the SQL datetime layout.
        return time.strftime(SQL_DATETIME_FORMAT, time.localtime(ts))

    # Extract the persisted fields from every answer record.
    for record in body["data"]:
        item = ZhihuAnswerItem()
        item["answer_id"] = int(record['id'])
        item["url"] = record["url"]
        item["question_id"] = record["question"]["id"]
        # Anonymous authors expose no "id" field.
        item["author_id"] = record["author"].get("id", "匿名作者")
        item["content"] = record.get("content", "内容为空")
        item["praise_num"] = int(record["voteup_count"])
        item["comment_num"] = int(record["comment_count"])
        item["update_time"] = _fmt(record["updated_time"])
        item["create_time"] = _fmt(record["created_time"])
        item["crawl_time"] = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        yield item
    if not last_page:
        yield scrapy.Request(next_page)
def parse_answer(self, response):
    """Process a question's answer listing (JSON API response)."""
    doc = json.loads(response.text)
    is_last = doc['paging']['is_end']
    total_count = doc['paging']['totals']  # total number of answers (not used downstream)
    follow_url = doc['paging']['next']
    # Pull the fields we persist out of every answer record.
    for record in doc['data']:
        item = ZhihuAnswerItem()
        item['zhihu_id'] = record['id']
        item['url'] = record['url']
        item['question_id'] = record['question']['id']
        item['author_id'] = record['author'].get('id')  # None for anonymous authors
        item['content'] = record.get('content')
        item['parise_num'] = record['voteup_count']
        item['comment_num'] = record['comment_count']
        item['create_time'] = record['created_time']
        item['update_time'] = record['updated_time']
        item['acrawl_time'] = datetime.datetime.now()
        yield item
    if not is_last:
        yield scrapy.Request(follow_url, headers=self.header,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """The answer listing comes back as JSON; decode it and emit items."""
    listing = json.loads(response.text)
    # Whether this is the final page, and where the following page lives.
    end_of_pages = listing["paging"]["is_end"]
    following = listing["paging"]["next"]
    for record in listing["data"]:
        item = ZhihuAnswerItem()
        item["zhihu_id"] = record["id"]
        item["url"] = record["url"]
        item["question_id"] = record["question"]["id"]
        # Anonymous authors have no "id" key in the author sub-object.
        item["author_id"] = record["author"].get("id")
        item["content"] = record.get("content")
        item["praise_num"] = record["voteup_count"]
        item["comments_num"] = record["comment_count"]
        item["create_time"] = record["created_time"]
        item["update_time"] = record["updated_time"]
        # Hand the item to the pipelines.
        yield item
    # If this is not the last page, schedule the next one.
    if not end_of_pages:
        yield scrapy.Request(following, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Parse the JSON answer feed of a question.

    :param response: scrapy Response whose body is the answers-API JSON
    :return: generator of answer items and follow-up page requests
    """
    feed = json.loads(response.text)
    reached_end = feed["paging"]["is_end"]     # True on the final page
    answers_total = feed["paging"]["totals"]   # overall answer count (informational)
    page_after = feed["paging"]["next"]        # URL of the following page
    for record in feed["data"]:
        item = ZhihuAnswerItem()
        item["zhihu_id"] = record["id"]
        item["url"] = record["url"]
        item["question_id"] = record["question"]["id"]
        item["author_id"] = record["author"].get("id")  # anonymous users carry no id
        item["content"] = record.get("content")
        item["parise_num"] = record["voteup_count"]
        item["comments_num"] = record["comment_count"]
        item["create_time"] = record["created_time"]
        item["update_time"] = record["updated_time"]
        item["crawl_time"] = datetime.datetime.now()
        yield item  # items are processed asynchronously by the pipelines
    if not reached_end:
        yield scrapy.Request(page_after, callback=self.parse_answer)
def parse_answer(self, response):
    """Process the answers of a question; the endpoint returns JSON."""
    decoded = json.loads(response.text)
    no_more = decoded["paging"]["is_end"]
    upcoming = decoded["paging"]["next"]
    # Map each answer record onto an item.
    for entry in decoded["data"]:
        record = ZhihuAnswerItem()
        record["zhihu_id"] = entry["id"]
        record["url"] = entry["url"]
        record["question_id"] = entry["question"]["id"]
        # None when the author is anonymous (no "id" key present).
        record["author_id"] = entry["author"].get("id")
        record["content"] = entry.get("content")
        record["parise_num"] = entry["voteup_count"]
        record["comments_num"] = entry["comment_count"]
        record["create_time"] = entry["created_time"]
        record["update_time"] = entry["updated_time"]
        record["crawl_time"] = datetime.datetime.now()
        yield record
    if not no_more:
        yield scrapy.Request(upcoming, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Parse a question's answer JSON; yield items and the next-page request.

    The follow-up request carries a Referer header built from the current URL.
    """
    ans_json = json.loads(response.text)
    is_end = ans_json['paging']['is_end']
    next_url = ans_json['paging']['next']
    # Extract the persisted fields from every answer record.
    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # Anonymous authors have no "id" key.
        answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item["content"] = answer["content"] if "content" in answer else None
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["created_time"] = answer["created_time"]
        answer_item["updated_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        # Removed the debug print() of every field; items go to the pipelines.
        yield answer_item
    # Follow pagination. The old `for i in range(1):` wrapper was a no-op
    # single-pass loop and has been removed.
    if not is_end:
        yield scrapy.Request(next_url, headers={"Referer": response.url},
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Parse one JSON page of answers and schedule the next page request."""
    response_data = json.loads(response.text)
    is_end = response_data['paging']['is_end']
    next_page = response_data['paging']['next']
    for data in response_data['data']:
        answer_item = ZhihuAnswerItem()
        answer_item['zhihu_id'] = data['id']
        answer_item['url'] = data['url']
        answer_item['question_id'] = data['question']['id']
        # Bug fix: anonymous authors have no 'id' key at all, so indexing
        # data['author']['id'] raised KeyError. Keep the original 0
        # fallback for missing/falsy ids.
        answer_item['author_id'] = data['author'].get('id') or 0
        # 'content' can be absent on some answer records; avoid a KeyError.
        answer_item['content'] = data.get('content')
        answer_item['parise_num'] = data['voteup_count']
        answer_item['comments_num'] = data['comment_count']
        # API timestamps are epoch seconds; store them as datetimes.
        answer_item['create_time'] = datetime.datetime.fromtimestamp(
            int(data['created_time']))
        answer_item['update_time'] = datetime.datetime.fromtimestamp(
            int(data['updated_time']))
        answer_item['crawl_time'] = datetime.datetime.now()
        yield answer_item
    if not is_end:
        yield Request(next_page,
                      headers=self.header,
                      cookies=self.requests_cookies,
                      callback=self.parse_answer)
def parse_answer(self, response):
    """Extract answers from the JSON listing and follow pagination."""
    ans_json = json.loads(response.text)
    is_end = ans_json['paging']['is_end']
    totals_answer = ans_json['paging']['totals']  # total answer count (informational)
    next_url = ans_json['paging']['next']
    # Extract the answer fields.
    for answer in ans_json['data']:
        answer_item = ZhihuAnswerItem()
        answer_item['zhihu_id'] = answer['id']
        answer_item['url'] = answer['url']
        answer_item['question_id'] = answer['question']['id']
        # Bug fix: anonymous authors lack the 'id' key, which made the
        # unguarded lookup raise KeyError; guard it like 'content' below.
        answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
        answer_item['content'] = answer['content'] if 'content' in answer else None
        answer_item['praise_num'] = answer['voteup_count']
        answer_item['comments_num'] = answer['comment_count']
        answer_item['create_time'] = answer['created_time']
        answer_item['update_time'] = answer['updated_time']
        answer_item['crawl_time'] = datetime.datetime.now()
        yield answer_item
    if not is_end:
        yield scrapy.Request(url=next_url, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Handle the answers API (JSON body): yield items, then the next page."""
    answer_json = json.loads(response.text)
    is_end = answer_json["paging"]["is_end"]
    answer_totals = answer_json["paging"]["totals"]  # total answers (informational)
    answer_next = answer_json["paging"]["next"]
    # Extract the specific fields of each answer.
    for answers in answer_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answers["id"]
        answer_item["url"] = response.url
        answer_item["question_id"] = answers["question"]["id"]
        answer_item["author_id"] = answers["author"][
            "id"] if "id" in answers["author"] else None
        # Bug fix: membership must be tested against the answer dict itself.
        # The original `"content" in answers["content"]` raised KeyError when
        # the key was missing and did a substring check when it was present.
        answer_item["content"] = answers[
            "content"] if "content" in answers else None
        answer_item["parise"] = answers["voteup_count"]
        answer_item["comments_num"] = answers["comment_count"]
        answer_item["create_time"] = answers["created_time"]
        answer_item["update_time"] = answers["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item
    # Bug fix: the pagination Request was constructed but never yielded, so
    # the spider stopped after the first page. It must be yielded.
    if not is_end:
        yield scrapy.Request(answer_next, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Extract the `limit` answers contained in one JSON page and paginate."""
    # Bug fix: json.load() expects a file-like object; response.text is a
    # string, so json.loads() is required here.
    answer_json = json.loads(response.text)
    # Whether this page is the last one.
    is_end = answer_json["paging"]["is_end"]
    # Total number of answers (informational only).
    totals = answer_json["paging"]["totals"]
    next_url = answer_json["paging"]["next"]
    for answer in answer_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # A user commenting anonymously has no "id" field.
        answer_item["author_id"] = answer["author"][
            "id"] if "id" in answer["author"] else None
        # answer["content"] may be absent, but answer["excerpt"] always exists.
        answer_item["content"] = answer[
            "content"] if "content" in answer else answer["excerpt"]
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.now()
        # Yielded items are handed to the pipelines for further processing.
        yield answer_item
    # is_end == False means another page of answers can still be requested.
    if not is_end:
        yield scrapy.Request(next_url, headers=headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Process a question's answers (JSON feed) and follow pagination."""
    ans_json = json.loads(response.text)
    is_end = ans_json['paging']['is_end']
    next_url = ans_json['paging']['next']
    for answer in ans_json['data']:
        # Bug fix: the item must be created per answer. The original built a
        # single ZhihuAnswerItem before the loop and yielded the same mutable
        # instance repeatedly; with scrapy's asynchronous pipelines every
        # yielded reference ends up holding the last answer's data.
        answer_item = ZhihuAnswerItem()
        answer_item['answer_id'] = answer['id']
        answer_item['url'] = answer['url']
        answer_item['question_id'] = answer['question']['id']
        # '0' marks an anonymous author in this API response.
        answer_item['author_id'] = answer['author']['id'] if answer['author']['id'] != '0' else 'anonymous'
        # Fall back to the excerpt when the full content is absent.
        answer_item['content'] = answer['content'] if 'content' in answer else answer['excerpt']
        answer_item['praise_num'] = answer['voteup_count']
        answer_item['comments_num'] = answer['comment_count']
        answer_item['create_time'] = answer['created_time']
        answer_item['update_time'] = answer['updated_time']
        yield answer_item
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers,
                             dont_filter=True, callback=self.parse_answer)
def parse_answer(self, response):
    """Load answer items from the JSON listing via an ItemLoader."""
    data = json.loads(response.text)
    for answer in data['data']:
        itemloader = mArticleItemLoader(item=ZhihuAnswerItem(),
                                        response=response)
        itemloader.add_value('object_id', get_md5(answer['url']))
        itemloader.add_value('answer_id', answer['id'])
        itemloader.add_value('question_id', answer['question']['id'])
        itemloader.add_value('voteup_count', answer['voteup_count'])
        itemloader.add_value('author_id', answer['author']['id'])
        itemloader.add_value('content', answer['content'])
        itemloader.add_value('comment_count', answer['comment_count'])
        itemloader.add_value('url', answer['url'])
        itemloader.add_value('updated_time',
                             datetime.fromtimestamp(answer['updated_time']).strftime("%Y/%m/%d"))
        itemloader.add_value('crawl_time', datetime.now().strftime("%Y/%m/%d"))
        # Bug fix: created_time was derived from answer['updated_time'],
        # stamping every answer with its edit date instead of its creation
        # date; it must come from answer['created_time'].
        itemloader.add_value('created_time',
                             datetime.fromtimestamp(answer['created_time']).strftime("%Y/%m/%d"))
        item = itemloader.load_item()
        yield item
    # Follow the paginated listing until the API reports the last page.
    if not data['paging']['is_end']:
        next_url = data['paging']['next']
        yield scrapy.Request(url=next_url, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Parse a JSON page of answers; yield items and the next-page request."""
    answer_json = json.loads(response.text)
    for answer_data in answer_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["answer_id"] = answer_data["id"]
        answer_item["url"] = answer_data["url"]
        answer_item["question_id"] = answer_data["question"]["id"]
        answer_item["question_title"] = answer_data["question"]["title"]
        # Anonymous authors carry no "id" key.
        answer_item["author_id"] = answer_data["author"][
            "id"] if "id" in answer_data["author"] else None
        answer_item["content"] = answer_data[
            "content"] if "content" in answer_data else None
        answer_item["praise_num"] = answer_data["voteup_count"]
        answer_item["comments_num"] = answer_data["comment_count"]
        answer_item["create_time"] = answer_data["created_time"]
        answer_item["update_time"] = answer_data["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        # Bug fix: the yield appeared commented out, so fully-built items
        # were discarded and nothing reached the pipelines.
        yield answer_item
    if not answer_json["paging"]['is_end']:
        yield scrapy.Request(url=answer_json["paging"]['next'],
                             headers=self.headers,
                             cookies=self.cookies1,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Extract Zhihu answer items from the JSON listing and paginate."""
    answer_json = json.loads(response.text)
    is_end = answer_json["paging"]["is_end"]
    total_answers = answer_json["paging"]["totals"]  # total count (informational)
    next_url = answer_json["paging"]["next"]
    # Extract the specific fields of each answer.
    for answer in answer_json["data"]:
        # Bug fix: create a fresh item per answer. The original built one
        # instance before the loop, so every yielded reference was mutated
        # to hold the last answer's data.
        answer_item = ZhihuAnswerItem()
        # Bug fix: the ids were transposed — answer["id"] is the answer id
        # and answer["question"]["id"] is the question id.
        answer_item["answer_id"] = answer["id"]
        answer_item["question_id"] = answer["question"]["id"]
        answer_item["answer_url"] = answer["url"]
        # Anonymous authors omit id/name/gender; keep the original fallbacks.
        answer_item["author_id"] = answer["author"][
            "id"] if "id" in answer["author"] else None
        answer_item["author_name"] = answer["author"][
            "name"] if "name" in answer["author"] else "unknown"
        answer_item["author_gender"] = answer["author"][
            "gender"] if "gender" in answer["author"] else -1
        answer_item["answer_content"] = answer[
            "content"] if "content" in answer else answer["excerpt"]
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers,
                             callback=self.parse_answer, dont_filter=True)
def parse_answer(self, response):
    """Parse one page of the answers API and keep following pagination."""
    answer_json = json.loads(response.text)
    is_end = answer_json["paging"]["is_end"]
    next_url = answer_json["paging"]["next"]
    # Parse each answer record.
    for answer in answer_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["table_name"] = "zhihu_answer"
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # Robustness fix: anonymous authors have no "id" key and some
        # records lack "content"; direct indexing raised KeyError. Sibling
        # parsers in this file guard both lookups the same way.
        answer_item["author_id"] = answer["author"].get("id")
        answer_item["content"] = answer.get("content")
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        # API timestamps are epoch seconds.
        answer_item["create_time"] = datetime.fromtimestamp(answer["created_time"])
        answer_item["update_time"] = datetime.fromtimestamp(answer["updated_time"])
        answer_item["crawl_time"] = datetime.now()
        yield answer_item
    # More answers remain — keep extracting.
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers,
                             callback=self.parse_answer)
def parse_answer(self, response):
    """Parse a JSON page of answers for a question and follow the next page."""
    asw_json = json.loads(response.text)
    is_end = asw_json['paging']['is_end']
    next_url = asw_json['paging']['next']
    for answer in asw_json['data']:
        answer_item = ZhihuAnswerItem()
        answer_item['zhihu_id'] = answer['id']
        answer_item['url'] = answer['url']
        answer_item['question_id'] = answer['question']['id']
        # Bug fix: the original tested `id in answer['author']`, comparing
        # keys against the builtin function `id` instead of the string 'id',
        # so the check was always False and author_id was always None.
        answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
        # Robustness fix: 'content' can be absent on some records; the
        # unguarded lookup raised KeyError.
        answer_item['content'] = answer.get('content')
        answer_item['comment_num'] = answer['comment_count']
        answer_item['parise_num'] = answer['voteup_count']
        answer_item['create_time'] = answer['created_time']
        answer_item['update_time'] = answer['updated_time']
        answer_item['crawl_time'] = datetime.datetime.now()
        yield answer_item
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers,
                             callback=self.parse_answer)