def parse_answer(self, response):
    ans_json = json.loads(response.text)
    # A question can have too many answers; only take the first five,
    # otherwise crawling questions slows down too much.
    # is_end = ans_json["paging"]["is_end"]
    # next_url = ans_json["paging"]["next"]
    # headers = {
    #     "HOST": "www.zhihu.com",
    #     "Referer": "https://www.zhizhu.com",
    #     'User-Agent': "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0"
    # }
    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["title"] = answer["question"]["title"]
        answer_item["answer_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        answer_item["author_id"] = answer["author"]["id"]
        answer_item["content"] = remove_tags(answer["content"])
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(answer["created_time"]))
        answer_item["update_time"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(answer["updated_time"]))
        answer_item["crawl_time"] = str(datetime.datetime.now())
        yield answer_item
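# A minimal sketch of the ZhihuAnswerItem these callbacks populate, using
# Scrapy's Item/Field API. The field names below are assumptions matching the
# snippet above, not a canonical schema; the other spiders in this collection
# use different field sets (e.g. zhihu_id, vote_num, excerpt).
import scrapy


class ZhihuAnswerItem(scrapy.Item):
    title = scrapy.Field()
    answer_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    praise_num = scrapy.Field()
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()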
def parse_ans(self, response):
    selector = Selector(response)
    print response.url
    username = response.url.split('/')[-2]
    try:
        for record in selector.xpath(r"id('zh-profile-answer-list')/div"):
            ask_title = ''.join(record.xpath(r"h2/a/text()").extract())
            url = host + ''.join(record.xpath("h2/a/@href").extract())  # answer_url
            ask_url = url.split('/answer')[0]
            agree_num = ''.join(record.xpath('div/div[2]/a/text()').extract())
            summary = ''.join(record.xpath(r"div/div[4]/div/text()").extract()).replace("\n", "").strip()  # TODO
            content = ''.join(record.xpath(r"div/div[4]/textarea/text()").extract()).replace("\n", "").strip()
            # Either '添加评论' (add a comment) or e.g. '3 条评论' (3 comments)
            comment_num = ''.join(record.xpath(r"div/div[5]/div/a[2]/text()").extract())
            comment_num = comment_num.split(' ')[0]  # keep only the number
            if comment_num.startswith(u'\n添加评论'):
                comment_num = '0'
            yield ZhihuAnswerItem(_id=url, username=username, url=url, ask_title=ask_title,
                                  ask_url=ask_url, agree_num=agree_num, summary=summary,
                                  content=content, comment_num=comment_num)
    except Exception, e:
        open('error_pages/answers_' + response.url.split('/')[-2] + '.html', 'w').write(response.body)
        print '=' * 10 + str(e)
def parse_answer(self, response):
    # Process the answers of a question.
    ans_json = json.loads(response.text)
    is_end = ans_json["paging"]["is_end"]
    next_url = ans_json["paging"]["next"]

    # Extract the fields of each answer.
    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item["content"] = answer["content"] if "content" in answer else None
        answer_item["parise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item

    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def parse_ans_and_que(self, response):
    # Parse answer and question data, and build the author's profile URL to visit next.
    response_json = json.loads(response.text)

    if response_json["paging"]["is_start"]:
        # Only take the question data from the first record on the API's first page.
        question_item = ZhihuQuestionItem()
        data = response_json["data"][0]
        question_item['question_id'] = data["question"]["id"]
        question_item['question_url'] = response.meta.get("question_url", "")
        question_item['question'] = data["question"]["title"]
        question_item['content'] = response.meta.get("content", "")
        question_item['comments_num'] = response.meta.get("comments_num", "")
        question_item['followers_num'] = response.meta.get("followers_num", "")
        question_item['watch_user_num'] = response.meta.get("watch_user_num", "")
        question_item['question_topics'] = response.meta.get("question_topics", "")
        question_item['question_created_time'] = data["question"]["created"]
        question_item['question_updated_time'] = data["question"]["updated_time"]
        question_item['answer_num'] = response.meta.get("answer_num", "")
        yield question_item

    # Get the author's url_token, build the user URL, and parse the answer data.
    for answer_data in response_json["data"]:  # user_url.request
        answer_item = ZhihuAnswerItem()
        author_id = answer_data["author"]["url_token"]
        if author_id is not None:
            author_url = "https://www.zhihu.com/people/" + author_id + "/answers"
            yield Request(url=author_url, callback=self.parse_user, meta={"author_id": author_id})
        else:
            continue
        answer_item['question'] = answer_data["question"]["title"]
        answer_item['question_id'] = answer_data["question"]["id"]
        answer_item['answer_id'] = answer_data["id"]
        answer_item['answer_url'] = answer_data["url"]
        answer_item['content'] = answer_data["content"]
        answer_item['author_url_token'] = author_id
        answer_item['vote_num'] = answer_data["voteup_count"]
        answer_item['comments_num'] = answer_data["comment_count"]
        answer_item['answer_created_time'] = answer_data["created_time"]
        answer_item['answer_updated_time'] = answer_data["updated_time"]
        yield answer_item

    # If the API has a next page, request it as well.
    if not response_json["paging"]["is_end"]:
        next_url = response_json["paging"]["next"]
        if next_url is not None:
            yield Request(url=next_url, callback=self.parse_ans_and_que)
def parse_answer(self, response):
    """Parse the answers fetched for the requested range."""
    answers = json.loads(response.text)
    for ans in answers['data']:
        item = ZhihuAnswerItem()
        item['question_id'] = re.match(
            r'http://www.zhihu.com/api/v4/questions/(\d+)',
            ans['question']['url']).group(1)
        item['author'] = ans['author']['name']
        item['ans_url'] = ans['url']
        item['comment_count'] = ans['comment_count']
        item['upvote_count'] = ans['voteup_count']
        item['excerpt'] = ans['excerpt']
        yield item
def parse_answer(self, response):
    """Parse the answers fetched for the requested range."""
    text = response.text
    answers = json.loads(text)
    for ans in answers['data']:
        item = ZhihuAnswerItem()
        item['answer_id'] = ans['id']
        item['question_id'] = ans['question']['id']
        item['author'] = ans['author']['name']
        # e.g. https://www.zhihu.com/question/266730428/answer/314851560
        item['ans_url'] = ('https://www.zhihu.com/question/' + str(item['question_id'])
                           + '/answer/' + str(item['answer_id']))
        item['comment_count'] = ans['comment_count']
        item['upvote_count'] = ans['voteup_count']
        item['excerpt'] = ans['excerpt']
        if item['upvote_count'] > self.setting['MIN_UPVOTE_COUNT']:
            # Only keep the full content for sufficiently upvoted answers.
            item['content'] = self.parse_content(ans['content'])
            # item['content'] = ans['content']  # alternative: keep the raw HTML unparsed
        yield item
def parse(self, response):
    result = json.loads(response.text)
    is_end = result.get('paging').get('is_end')
    next_url = result.get('paging').get('next')
    answers = result.get('data')
    for answer in answers:
        answer_item = ZhihuAnswerItem()
        answer_item['id'] = answer.get('id')
        answer_item['question'] = answer.get('question').get('title')
        headline = answer.get('author').get('headline')
        headline = '(' + headline + ')' if len(headline) > 0 else ''
        answer_item['author'] = answer.get('author').get('name') + headline
        answer_item['answer'] = answer.get('content')
        answer_item['voteup_count'] = answer.get('voteup_count')
        timestamp = answer.get('updated_time')
        answer_item['create_at'] = time.strftime('%Y-%m-%d', time.localtime(timestamp))
        yield answer_item
    if not is_end:
        yield Request(next_url, callback=self.parse)
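# A hedged sketch of how a JSON callback like parse() above is typically seeded:
# an initial request to Zhihu's v4 answers endpoint (the same URL pattern appears
# in the question_id regex of an earlier snippet). The question id and the
# simplified query string are illustrative assumptions; the production spiders
# usually also send a long `include=...` field selector and custom headers.
import json
import scrapy


class ZhihuAnswerSpider(scrapy.Spider):
    name = "zhihu_answers"
    start_question_id = 266730428  # hypothetical id, borrowed from an example URL above

    def start_requests(self):
        url = ("https://www.zhihu.com/api/v4/questions/%d/answers"
               "?limit=20&offset=0" % self.start_question_id)
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Stand-in for the JSON-handling callback shown in the snippet above.
        for answer in json.loads(response.text).get('data', []):
            yield {'id': answer.get('id'), 'voteup_count': answer.get('voteup_count')}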
        for record in selector.xpath(r"id('zh-profile-answer-list')/div"):
            ask_title = ''.join(record.xpath(r"h2/a/text()").extract())
            url = host + ''.join(record.xpath("h2/a/@href").extract())  # answer_url
            ask_url = url.split('/answer')[0]
            agree_num = ''.join(record.xpath('div/div[2]/a/text()').extract())
            summary = ''.join(record.xpath(r"div/div[4]/div/text()").extract()).replace("\n", "").strip()  # TODO
            content = ''.join(record.xpath(r"div/div[4]/textarea/text()").extract()).replace("\n", "").strip()
            # Either '添加评论' (add a comment) or e.g. '3 条评论' (3 comments)
            comment_num = record.xpath(r"div/div[5]/div/a[2]/text()")[1].extract()
            comment_num = comment_num.split(' ')[0]  # keep only the number
            if comment_num.startswith(u'添加评论'):
                comment_num = '0'
            yield ZhihuAnswerItem(_id=url, username=username, url=url, ask_title=ask_title,
                                  ask_url=ask_url, agree_num=agree_num, summary=summary,
                                  content=content, comment_num=comment_num)
    except Exception, e:
        open('error_pages/answers_' + response.url.split('/')[-2] + '.html', 'w').write(response.body)
        print '=' * 10 + str(e)
elif typeinfo.startswith('asks'):
    username = response.url.split('/')[-2]
    try:
        for record in selector.xpath(r"id('zh-profile-ask-list')/div"):
            view_num = record.xpath(r'span/div[1]/text()')[0].extract()
            title = record.xpath(r"div/h2/a/text()")[0].extract()
            answer_num = record.xpath(r"div/div/span[1]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n', '')
            follower_num = record.xpath(r"div/div/span[2]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n', '')
            url = host + record.xpath(r"div/h2/a/@href")[0].extract()