예제 #1
0
    def parse_answer(self, response):
        """Turn one JSON answers response into answer items and image items.

        For every entry in the response's ``data`` list two items are
        yielded, in this order:

        * an ``AnswerItem`` holding the answer/question ids (as strings),
          the question title, author details, vote/comment counts (as
          strings) and the raw answer content;
        * a ``MyImageItem`` whose ``question_answer_id`` is
          ``"<question id>/<answer id>"`` and whose ``image_urls`` are the
          ``data-original`` attributes of every ``<img>`` in the content.
        """
        payload = json.loads(response.text)

        # A single response carries several answers (5 by default), so every
        # entry has to be visited.
        for answer in payload['data']:
            answer_item = AnswerItem()

            answer_id = str(answer['id'])
            answer_item['answer_id'] = answer_id

            question = answer['question']
            question_id = str(question['id'])
            answer_item['question_id'] = question_id
            answer_item['question_title'] = question['title']

            author = answer['author']
            answer_item['author_url_token'] = author['url_token']
            answer_item['author_name'] = author['name']

            answer_item['voteup_count'] = str(answer['voteup_count'])
            answer_item['comment_count'] = str(answer["comment_count"])

            html = answer['content']
            answer_item['content'] = html
            yield answer_item

            # The answer body is HTML; parse it to collect the image links.
            tree = etree.HTML(html)
            image_item = MyImageItem()
            image_item['question_answer_id'] = question_id + "/" + answer_id
            image_item['image_urls'] = tree.xpath("//img/@data-original")
            yield image_item
예제 #2
0
    def parse_answer(self, response):
        """Parse one page of the answers API and follow its pagination.

        Yields an ``AnswerItem`` for every entry in the response's ``data``
        list. Afterwards, if ``paging.is_end`` says more pages exist and
        ``paging.next`` is set, yields a ``scrapy.Request`` for the next
        page that is handled by this same callback.

        Raises:
            KeyError: if ``paging``, ``data`` or one of the mandatory answer
                fields is missing from the response.
        """
        json_data = json.loads(response.text)
        paging = json_data['paging']
        is_end = paging['is_end']
        next_url = paging['next']

        for answer in json_data['data']:
            answer_item = AnswerItem()
            answer_item['zhihu_id'] = answer['id']
            answer_item['question_id'] = answer['question']['id']
            answer_item['url'] = response.url
            # The author id and the content are optional; a single space is
            # stored as the placeholder when they are absent.
            answer_item['author_id'] = answer['author'].get('id', ' ')
            answer_item['content'] = answer.get('content', ' ')
            answer_item['praise_num'] = answer['voteup_count']
            answer_item['comment_num'] = answer['comment_count']
            answer_item['creat_time'] = answer['created_time']
            answer_item['update_time'] = answer['updated_time']
            answer_item['crawl_time'] = datetime.datetime.now()

            yield answer_item

        # json.loads() decodes JSON ``false`` to the bool ``False``, so the
        # former comparison against the string 'false' never matched and the
        # next page was never requested. The string form is still accepted
        # in case a caller feeds pre-stringified data.
        has_more = is_end is False or is_end == 'false'
        if has_more and next_url:
            yield scrapy.Request(next_url, callback=self.parse_answer)
예제 #3
0
    def parse_answer_fromApi(self, response):
        """Parse a JSON answers API page into items and request the next page.

        Each entry of the response's ``data`` list becomes one ``AnswerItem``.
        Every field is extracted on a best-effort basis: a failure is either
        logged or replaced by a fallback value, and the item is yielded
        regardless. After the items, the next page is requested while
        ``paging.totals`` exceeds the number found in the second-to-last
        query parameter of ``paging.next``.
        """
        self.humanCheckIssue(response.url)
        # Parenthesised so the statement is valid on both Python 2 and 3;
        # the printed text is unchanged.
        print('parse_answer     ' + response.url)
        jsonResponse = json.loads(response.body)

        for eachAnswer in jsonResponse['data']:
            item = AnswerItem()
            # ``except Exception`` (rather than a bare ``except``) keeps the
            # best-effort behaviour without swallowing KeyboardInterrupt or
            # SystemExit.
            try:
                item['answerId'] = str(eachAnswer['id'])
            except Exception:
                # No fallback value: the item is yielded without an answerId.
                LogManager.writeToLog('parse_answer_fromApi    ' + response.url + ':get answerId error')
            try:
                item['questionId'] = str(eachAnswer['question']['id'])
            except Exception:
                item['questionId'] = str(self.getElementError)
            try:
                item['answerAuthorId'] = eachAnswer['author']['url_token']
                # An empty url_token is stored as u'匿名用户' ("anonymous user").
                if item['answerAuthorId'] == '':
                    item['answerAuthorId'] = u'匿名用户'
            except Exception:
                item['answerAuthorId'] = u'匿名用户'
            try:
                item['likeNumber'] = str(eachAnswer['voteup_count'])
            except Exception:
                item['likeNumber'] = self.getElementError
            try:
                item['answerCommentNumber'] = str(eachAnswer['comment_count'])
            except Exception:
                item['answerCommentNumber'] = u'0'
            yield item

        # NOTE(review): time.sleep() blocks the calling thread. If this runs
        # inside Scrapy, the DOWNLOAD_DELAY setting is the non-blocking way to
        # throttle requests - confirm before changing.
        time.sleep(1)

        paging = jsonResponse['paging']
        total = int(paging['totals'])
        nextUrl = paging['next']
        # The second-to-last '&'-separated parameter of the next-page URL
        # holds the position reached so far (presumably the offset - verify
        # against the API's URL layout).
        reached = int(re.findall(r"\d+\.?\d*", nextUrl.split('&')[-2])[0])
        if total > reached:
            yield Request(url=nextUrl, headers=self.headers,
                          cookies=self.cookies, callback=self.parse_answer_fromApi)
예제 #4
0
    def parse_answer(self, response):
        """Scrape a single answer HTML page into one ``AnswerItem``.

        The answer and question ids come from the page URL; the author,
        like count and comment count come from XPath queries below the
        ``QuestionAnswer-content`` div. Each field is extracted on a
        best-effort basis: on failure a fallback value is stored and/or the
        problem is logged, and the item is yielded regardless.
        """
        self.humanCheckIssue(response.url)
        # Parenthesised so the statement is valid on both Python 2 and 3;
        # the printed text is unchanged.
        print('parse_answer     ' + response.url)
        item = AnswerItem()
        # ``except Exception`` (rather than a bare ``except``) keeps the
        # best-effort behaviour without swallowing KeyboardInterrupt or
        # SystemExit.
        try:
            # Seventh '/'-separated piece of the URL.
            item['answerId'] = response.url.split('/')[6]
        except Exception:
            # No fallback value: the item is yielded without an answerId.
            LogManager.writeToLog('parse_answer    ' + response.url + ':get answerId error')
        try:
            # Third-from-last '/'-separated piece of the URL.
            item['questionId'] = response.url.split('/')[-3]
        except Exception:
            item['questionId'] = self.getElementError
            LogManager.writeToLog('parse_answer    ' + response.url + ':get questionId error')
        try:
            # Last path segment of the author's profile link. When no link is
            # found extract_first() returns None, .split() fails and the
            # fallback u'匿名用户' ("anonymous user") is used.
            item['answerAuthorId'] = response.xpath('//div[@class="QuestionAnswer-content"]//a[@class="UserLink-link"]/@href').extract_first().split('/')[-1]
        except Exception:
            item['answerAuthorId'] = u'匿名用户'
        try:
            item['likeNumber'] = response.xpath('//div[@class="QuestionAnswer-content"]//button[@class="Button VoteButton VoteButton--up"]/text()').extract_first()
        except Exception:
            item['likeNumber'] = self.getElementError
            LogManager.writeToLog('parse_answer    ' + response.url + ':get likeNumber error')
        try:
            # First number appearing in the comment button's text.
            item['answerCommentNumber'] = re.findall(r"\d+\.?\d*", response.xpath('//div[@class="QuestionAnswer-content"]//button[@class="Button ContentItem-action Button--plain"]/text()').extract_first())[0]
        except Exception:
            item['answerCommentNumber'] = u'0'
        yield item

        # NOTE(review): time.sleep() blocks the calling thread. If this runs
        # inside Scrapy, the DOWNLOAD_DELAY setting is the non-blocking way to
        # throttle requests - confirm before changing.
        time.sleep(1)
예제 #5
0
    def parse_question(self, response):
        """Yield one ``AnswerItem`` per entry in the JSON response's ``data``.

        Each item carries the answer id, the author's id, name and gender,
        the answer excerpt (stored under ``content``) and the upvote count.
        """
        for entry in json.loads(response.text)['data']:
            item = AnswerItem()
            item['id'] = entry['id']

            author = entry['author']
            item['author_id'] = author['id']
            item['author_name'] = author['name']
            item['author_gender'] = author['gender']

            # Only the excerpt, not the full answer body, is kept as content.
            item['content'] = entry['excerpt']
            item['voteup_count'] = entry['voteup_count']
            yield item
예제 #6
0
    def extract_answers(self, response):
        """Yield an ``AnswerItem`` for each answer in the JSON response.

        Every item holds the answer id, the raw HTML content and ``imgs``:
        the ``data-original`` URLs of the lazily loaded original-size images
        found inside ``<figure>`` elements of that content. Nothing is
        yielded when the decoded response is empty.
        """
        payload = json.loads(response.text)

        if payload:
            for entry in payload['data']:
                answer_item = AnswerItem()
                answer_item['answer_id'] = entry['id']

                html = entry['content']
                answer_item['content'] = html

                # Query the answer HTML for the full-size image links.
                image_nodes = Selector(text=html).xpath(
                    '//figure//img[@class="origin_image zh-lightbox-thumb lazy"]/@data-original'
                )
                answer_item['imgs'] = image_nodes.extract()

                yield answer_item
예제 #7
0
 def get_data(self, response):
     """Yield an ``AnswerItem`` (content + question title) per usable entry.

     The JSON response's ``data`` list is walked; for each entry the
     ``target`` object is inspected and an item is produced only when it has
     a non-empty ``question.title``. Entries without a target, without a
     question, or without a title are skipped silently.
     """
     result = json.loads(response.text)
     data_list = result.get('data')
     if not data_list:
         return

     print("=====$$$$$$$$$$$$$$$$$$$$$$$$$$$===={}".format(
         len(data_list)))
     for entry in data_list:
         target = entry.get('target')
         if not target:
             continue

         # Only the lookup is guarded. The yield used to sit inside a bare
         # ``except: pass``, which also caught the GeneratorExit raised by
         # generator.close() and made the generator keep yielding.
         try:
             titles = target.get('question').get('title')  # question title
         except (AttributeError, TypeError):
             # Malformed entry (e.g. no question object): skip it.
             continue
         if not titles:
             continue

         # A fresh item per entry: one shared instance used to be mutated and
         # re-yielded, so previously yielded items changed under the consumer.
         answer_item = AnswerItem()
         answer_item['content'] = target.get('content')  # answer content
         answer_item['title'] = titles
         yield answer_item
예제 #8
0
 def parse_answer(self, response):
     """Parse one page of a user's answers and request the following page.

     Yields an ``AnswerItem`` per entry of the response's ``data`` list, with
     the creation/update timestamps rendered as local-time
     ``YYYY-MM-DD HH:MM:SS`` strings. While ``paging.is_end`` is ``False``
     the next page is requested by rewriting the ``offset=`` value of the
     current URL to ``response.meta['offset'] + 20``.

     Expects ``response.meta`` to contain ``answer_user_id`` and ``offset``;
     both are forwarded to the follow-up request.
     """
     time_format = "%Y-%m-%d %H:%M:%S"
     json_results = json.loads(response.text)
     for result in json_results['data']:
         item = AnswerItem()
         item['answer_user_id'] = response.meta['answer_user_id']
         item['answer_id'] = result['id']
         # NOTE(review): the key is spelled 'cretated_time'; it has to match
         # the field declared on AnswerItem (not visible here), so it is left
         # untouched.
         item['cretated_time'] = time.strftime(
             time_format, time.localtime(result['created_time']))
         item['updated_time'] = time.strftime(
             time_format, time.localtime(result['updated_time']))
         item['voteup_count'] = result['voteup_count']
         item['comment_count'] = result['comment_count']
         item['content'] = result['content']
         yield item

     # JSON ``false`` is decoded to the bool False, so test identity.
     if json_results['paging']['is_end'] is False:
         offset = response.meta['offset'] + 20
         # Raw string: '\d' in a plain literal is an invalid escape sequence
         # and triggers a warning on current Python versions. The group keeps
         # everything up to and including "offset=".
         next_page = re.findall(r'(.*offset=)\d+', response.url)[0]
         yield Request(next_page + str(offset),
                       callback=self.parse_answer,
                       meta={
                           'answer_user_id':
                           response.meta['answer_user_id'],
                           'offset': offset
                       })