def parse_answer(self, response):
    """Parse a page of answers from the Zhihu answer API.

    Yields one AnswerItem per answer, followed by a MyImageItem carrying
    the lazy-loaded image URLs scraped from the answer's HTML body.
    """
    payload = json.loads(response.text)
    # Each API call returns several answers at once (5 by default).
    for answer in payload['data']:
        item = AnswerItem()
        item['answer_id'] = str(answer['id'])  # answer id
        item['question_id'] = str(answer['question']['id'])
        item['question_title'] = answer['question']['title']
        item['author_url_token'] = answer['author']['url_token']
        item['author_name'] = answer['author']['name']
        item['voteup_count'] = str(answer['voteup_count'])
        item['comment_count'] = str(answer["comment_count"])
        item['content'] = answer['content']
        yield item

        # Extract image URLs (data-original = full-size source of the
        # lazy-loaded <img> tags) from the answer's HTML content.
        body_html = etree.HTML(answer['content'])
        image_item = MyImageItem()
        image_item['question_answer_id'] = (
            str(answer['question']['id']) + "/" + str(answer['id'])
        )
        image_item['image_urls'] = body_html.xpath("//img/@data-original")
        yield image_item
def parse_answer(self, response):
    """Parse one page of answers from the Zhihu answer API.

    Yields an AnswerItem per answer, then follows pagination via the
    'next' URL until the API reports the final page.
    """
    json_data = json.loads(response.text)
    is_end = json_data['paging']['is_end']
    next_url = json_data['paging']['next']
    for answer in json_data['data']:
        answer_item = AnswerItem()
        answer_item['zhihu_id'] = answer['id']
        answer_item['question_id'] = answer['question']['id']
        answer_item['url'] = response.url
        # Anonymous authors carry no 'id'; fall back to a blank marker
        # (original behavior preserved).
        answer_item['author_id'] = answer['author'][
            'id'] if 'id' in answer['author'] else ' '
        answer_item[
            'content'] = answer['content'] if 'content' in answer else ' '
        answer_item['praise_num'] = answer['voteup_count']
        answer_item['comment_num'] = answer['comment_count']
        answer_item['creat_time'] = answer['created_time']
        answer_item['update_time'] = answer['updated_time']
        answer_item['crawl_time'] = datetime.datetime.now()
        yield answer_item
    # BUG FIX: 'is_end' is a JSON boolean (True/False), not the string
    # 'false'. The old check `is_end == 'false'` was always False, so the
    # spider never advanced past the first page of answers.
    if not is_end:
        yield scrapy.Request(next_url, callback=self.parse_answer)
def parse_answer_fromApi(self, response):
    """Parse answers from the Zhihu answer API and follow pagination.

    Each answer becomes an AnswerItem; missing fields fall back to
    sentinel values so one malformed answer does not abort the page.
    """
    self.humanCheckIssue(response.url)
    # FIX: parenthesized print works on both Python 2 and 3 for a single
    # argument; the old bare `print` statement was Python-2-only.
    print('parse_answer ' + response.url)
    jsonResponse = json.loads(response.body)
    for eachAnswer in jsonResponse['data']:
        item = AnswerItem()
        # FIX: all bare `except:` clauses narrowed to `except Exception:`
        # so KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            item['answerId'] = str(eachAnswer['id'])
        except Exception:
            # NOTE(review): the item is still yielded with no 'answerId'
            # here — original behavior preserved, only logged.
            LogManager.writeToLog('parse_answer_fromApi ' + response.url + ':get answerId error')
        try:
            item['questionId'] = str(eachAnswer['question']['id'])
        except Exception:
            item['questionId'] = str(self.getElementError)
        try:
            item['answerAuthorId'] = eachAnswer['author']['url_token']
            # Empty url_token means an anonymous author.
            if item['answerAuthorId'] == '':
                item['answerAuthorId'] = u'匿名用户'
        except Exception:
            item['answerAuthorId'] = u'匿名用户'
        try:
            item['likeNumber'] = str(eachAnswer['voteup_count'])
        except Exception:
            item['likeNumber'] = self.getElementError
        try:
            item['answerCommentNumber'] = str(eachAnswer['comment_count'])
        except Exception:
            item['answerCommentNumber'] = u'0'
        yield item
        # Per-item throttle to stay under the rate limit.
        time.sleep(1)
    # Follow the next page while the reported total exceeds the offset
    # embedded in the 'next' URL's query string.
    if int(jsonResponse['paging']['totals']) > int(re.findall(r"\d+\.?\d*", jsonResponse['paging']['next'].split('&')[-2])[0]):
        yield Request(url=jsonResponse['paging']['next'],
                      headers=self.headers,
                      cookies=self.cookies,
                      callback=self.parse_answer_fromApi)
def parse_answer(self, response):
    """Parse a single answer's HTML page into an AnswerItem.

    Field extraction is best-effort: each field falls back to a sentinel
    value (and/or logs) instead of failing the whole item.
    """
    self.humanCheckIssue(response.url)
    # FIX: parenthesized print works on both Python 2 and 3 for a single
    # argument; the old bare `print` statement was Python-2-only.
    print('parse_answer ' + response.url)
    item = AnswerItem()
    # FIX: removed unused local `x = response.url.split('/')`; all bare
    # `except:` clauses narrowed to `except Exception:`.
    try:
        # URL layout: .../question/<qid>/answer/<aid>
        item['answerId'] = response.url.split('/')[6]
    except Exception:
        LogManager.writeToLog('parse_answer ' + response.url + ':get answerId error')
    try:
        item['questionId'] = response.url.split('/')[-3]
    except Exception:
        item['questionId'] = self.getElementError
        LogManager.writeToLog('parse_answer ' + response.url + ':get questionId error')
    try:
        # Author id is the last path segment of the profile link; a
        # missing link means an anonymous author.
        item['answerAuthorId'] = response.xpath('//div[@class="QuestionAnswer-content"]//a[@class="UserLink-link"]/@href').extract_first().split('/')[-1]
    except Exception:
        item['answerAuthorId'] = u'匿名用户'
    try:
        item['likeNumber'] = response.xpath('//div[@class="QuestionAnswer-content"]//button[@class="Button VoteButton VoteButton--up"]/text()').extract_first()
    except Exception:
        item['likeNumber'] = self.getElementError
        LogManager.writeToLog('parse_answer ' + response.url + ':get likeNumber error')
    try:
        # Comment count is the first number inside the button label.
        item['answerCommentNumber'] = re.findall(r"\d+\.?\d*", response.xpath('//div[@class="QuestionAnswer-content"]//button[@class="Button ContentItem-action Button--plain"]/text()').extract_first())[0]
    except Exception:
        item['answerCommentNumber'] = u'0'
    yield item
    # Throttle between pages to stay under the rate limit.
    time.sleep(1)
def parse_question(self, response):
    """Convert each answer in the API page into an AnswerItem."""
    page = json.loads(response.text)
    for entry in page['data']:
        author = entry['author']
        record = AnswerItem()
        record['id'] = entry['id']
        record['author_id'] = author['id']
        record['author_name'] = author['name']
        record['author_gender'] = author['gender']
        # Only the excerpt (summary) is stored, not the full body.
        record['content'] = entry['excerpt']
        record['voteup_count'] = entry['voteup_count']
        yield record
def extract_answers(self, response):
    """Yield an AnswerItem per answer, including lazy-loaded image URLs."""
    payload = json.loads(response.text)
    if not payload:
        return
    for entry in payload['data']:
        # Parse the answer body HTML to collect full-size image sources.
        body = Selector(text=entry['content'])
        matches = body.xpath(
            '//figure//img[@class="origin_image zh-lightbox-thumb lazy"]/@data-original'
        )
        item = AnswerItem()
        item['answer_id'] = entry['id']
        item['content'] = entry['content']
        item['imgs'] = matches.extract()
        yield item
def get_data(self, response):
    """Extract question title/content pairs from a Zhihu feed response.

    Yields one AnswerItem per entry that has both a question title and a
    target payload; malformed entries are skipped silently (best-effort,
    as in the original).
    """
    result = json.loads(response.text)
    data_list = result.get('data')
    if not data_list:
        return
    print("=====$$$$$$$$$$$$$$$$$$$$$$$$$$$===={}".format(
        len(data_list)))
    for entry in data_list:  # iterate the feed entries
        target = entry.get('target')  # the question/answer payload
        if not target:  # skip entries with no payload
            continue
        try:
            titles = target.get('question').get('title')  # question title
        except AttributeError:
            # 'question' key absent (None.get raises) — skip this entry,
            # matching the original silent-skip behavior but without a
            # bare `except: pass` that hid every error type.
            continue
        if titles:
            # BUG FIX: the original created ONE AnswerItem before the loop
            # and mutated it on every iteration, so all yielded items
            # aliased the same object (later entries overwrote earlier
            # ones in any consumer that buffers items). A fresh item is
            # now created per yield.
            answer_item = AnswerItem()
            answer_item['content'] = target.get('content')
            answer_item['title'] = titles
            yield answer_item
def parse_answer(self, response):
    """Parse one page of a user's answers and paginate 20 at a time.

    Yields an AnswerItem per answer; when the API reports more pages,
    requests the next page with the offset advanced by 20.
    """
    json_results = json.loads(response.text)
    for result in json_results['data']:
        item = AnswerItem()
        item['answer_user_id'] = response.meta['answer_user_id']
        item['answer_id'] = result['id']
        # NOTE(review): 'cretated_time' (sic) must match the field name
        # declared on AnswerItem — renaming it here would break the item.
        item['cretated_time'] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(result['created_time']))
        item['updated_time'] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(result['updated_time']))
        item['voteup_count'] = result['voteup_count']
        item['comment_count'] = result['comment_count']
        item['content'] = result['content']
        yield item
    # FIX: `not x` instead of `x == False`; raw string for the regex so
    # `\d` is not an invalid escape (SyntaxWarning on modern Python).
    if not json_results['paging']['is_end']:
        offset = response.meta['offset'] + 20
        # Rebuild the URL with the new offset value.
        next_page = re.findall(r'(.*offset=)\d+', response.url)[0]
        yield Request(next_page + str(offset),
                      callback=self.parse_answer,
                      meta={
                          'answer_user_id': response.meta['answer_user_id'],
                          'offset': offset
                      })