def dealWithPage(page_num):
    page = urlparse.urljoin(url_prefix, str(page_num))

    question_index_filename = 'q_index'
    try:
        question = Question(page)
    except:
        print 'Question Get Error.'
        return
    f = codecs.open(question_index_filename + '.txt', 'a')
    f.write(str(page_num) + '\r\n')
    f.close()
    question_folder = 'Zhihu/Question_' + str(page_num)
    question_filename = 'q'
    if (not os.path.exists(question_folder)):
        os.makedirs(question_folder)
    f = codecs.open(os.path.join(question_folder, question_filename + '.txt'),
                    'w')
    f.write(question.get_title() + '\r\n|||\r\n')
    f.write(question.get_detail() + '\r\n|||\r\n')
    for topic in question.get_topics():
        f.write(topic + '\t')
    f.write('\r\n|||\r\n')
    f.write(str(question.get_visit_times()) + '\r\n|||\r\n')
    f.write(str(question.get_followers_num()) + '\r\n|||\r\n')
    f.write(str(question.get_answers_num()) + '\r\n|||\r\n')
    f.close()

    answers = question.get_all_answers()
    for answer in answers:
        ansURL = answer.answer_url
        ans = Answer(ansURL)
        answer_folder = os.path.join(question_folder, 'Answer')
        answer_filename = str(ansURL.split('/')[-1])
        answer_index_filename = 'a_index'
        f = codecs.open(answer_index_filename + '.txt', 'a')
        f.write(str(page_num) + '|||' + str(answer_filename) + '\r\n')
        f.close()
        if (not os.path.exists(answer_folder)):
            os.makedirs(answer_folder)
        f = codecs.open(os.path.join(answer_folder, answer_filename + '.txt'),
                        'w')
        try:
            f.write(ans.get_content().find('body').get_text().strip().encode(
                "gbk", 'ignore') + '\r\n|||\r\n')
            f.write(ans.get_author().get_user_id() + '\r\n|||\r\n')
            f.write(str(ans.get_upvote()) + '\r\n|||\r\n')
        except:
            print 'TimeOut Occurred.'
            f.close()
            f = codecs.open(
                os.path.join(answer_folder, answer_filename + '.txt'), 'w')
            f.write('None')
            f.close()
            continue
        f.close()
Exemplo n.º 2
0
# -*- coding: utf-8 -*-
from zhihu import Question
from zhihu import Answer
from zhihu import User
from zhihu import Collection
import requests

url = "https://www.zhihu.com/question/19550321/answer/14240492"
ans = Answer(url)
print ans.get_content().find('body').get_text()
input()