def dealWithPage(page_num):
    """Crawl one question page and dump it and its answers to text files.

    The page URL is the module-level ``url_prefix`` joined with
    ``str(page_num)``.  When the question can be constructed the function:

    * appends ``page_num`` to ``q_index.txt``;
    * writes title, detail, tab-separated topics, visit count, follower
      count and answer count to ``Zhihu/Question_<page_num>/q.txt``; every
      field is followed by the separator CRLF + ``|||`` + CRLF;
    * for each answer, appends ``<page_num>|||<answer id>`` to
      ``a_index.txt`` and writes the answer body text (GBK encoded,
      unencodable characters dropped), the author's user id and the upvote
      count to ``Zhihu/Question_<page_num>/Answer/<answer id>.txt``.
      If any of those three fields cannot be produced or written, the
      answer file is rewritten to contain just ``None``.

    Args:
        page_num: Identifier appended to ``url_prefix``; also used in the
            folder name and in both index files.

    Returns:
        None.  If ``Question(page)`` raises, a message is printed and the
        page is skipped without touching any file.
    """
    field_end = '\r\n|||\r\n'

    page = urlparse.urljoin(url_prefix, str(page_num))
    try:
        question = Question(page)
    except Exception:
        # Best effort: skip pages that cannot be loaded.  Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit stop the crawl.
        print('Question Get Error.')
        return

    # ``with`` guarantees the handle is closed even if a write fails.
    with codecs.open('q_index.txt', 'a') as index_file:
        index_file.write(str(page_num) + '\r\n')

    question_folder = 'Zhihu/Question_' + str(page_num)
    if not os.path.exists(question_folder):
        os.makedirs(question_folder)

    # NOTE(review): the question fields are written without an explicit
    # encoding while the answer body below is GBK encoded -- confirm what
    # string type the Question getters return.
    with codecs.open(os.path.join(question_folder, 'q.txt'), 'w') as out:
        out.write(question.get_title() + field_end)
        out.write(question.get_detail() + field_end)
        for topic in question.get_topics():
            out.write(topic + '\t')
        out.write(field_end)
        out.write(str(question.get_visit_times()) + field_end)
        out.write(str(question.get_followers_num()) + field_end)
        out.write(str(question.get_answers_num()) + field_end)

    # Loop-invariant, so computed once.
    answer_folder = os.path.join(question_folder, 'Answer')

    for answer in question.get_all_answers():
        answer_url = answer.answer_url
        ans = Answer(answer_url)
        # The last URL path segment names the answer file.
        answer_id = str(answer_url.split('/')[-1])

        with codecs.open('a_index.txt', 'a') as index_file:
            index_file.write(str(page_num) + '|||' + answer_id + '\r\n')

        # Created lazily inside the loop so that a question without
        # answers does not get an empty ``Answer`` folder.
        if not os.path.exists(answer_folder):
            os.makedirs(answer_folder)

        answer_path = os.path.join(answer_folder, answer_id + '.txt')
        failed = False
        with codecs.open(answer_path, 'w') as out:
            try:
                # Three separate writes on purpose: the body is an encoded
                # byte string and must not be concatenated with the other
                # fields.
                body_text = ans.get_content().find('body').get_text().strip()
                out.write(body_text.encode("gbk", 'ignore') + field_end)
                out.write(ans.get_author().get_user_id() + field_end)
                out.write(str(ans.get_upvote()) + field_end)
            except Exception:
                # The message says "TimeOut" but any failure while reading
                # or writing the answer ends up here.
                print('TimeOut Occurred.')
                failed = True

        if failed:
            # Replace whatever was partially written with the marker.
            with codecs.open(answer_path, 'w') as out:
                out.write('None')
def question(self, url):
    """Collect the headline data of the question located at ``url``.

    ``self`` is not used by this method.

    Returns:
        A 6-tuple ``(title, detail, answers_num, followers_num, topics,
        top_answers)``; ``top_answers`` is whatever
        ``Question.get_top_i_answers(10)`` returns.
    """
    q = Question(url)
    # Tuple items are evaluated left to right, so the getters are called
    # in the same order as the fields are listed.
    return (
        q.get_title(),             # title
        q.get_detail(),            # description
        q.get_answers_num(),       # number of answers
        q.get_followers_num(),     # number of followers
        q.get_topics(),            # topics the question belongs to
        q.get_top_i_answers(10),   # top ten answers
    )
def question_test(url):
    """Demo: fetch every piece of data of one question and print it.

    All getters are called before anything is printed.  Sample values
    noted alongside the original code: 2441 answers, 26910 followers;
    ``top_answer`` prints as an ``Answer`` instance, ``top_answers`` and
    ``answers`` print as generator objects.
    """
    q = Question(url)

    # Evaluated left to right: title, detailed description, answer count,
    # follower count, topics, visit count, top-ranked answer, top ten
    # answers, all answers.
    (title, detail, answers_num, followers_num, topics,
     visit_times, top_answer, top_answers, answers) = (
        q.get_title(),
        q.get_detail(),
        q.get_answers_num(),
        q.get_followers_num(),
        q.get_topics(),
        q.get_visit_times(),
        q.get_top_answer(),
        q.get_top_i_answers(10),
        q.get_all_answers(),
    )

    # One value per line: title, description, answer count, follower count.
    for value in (title, detail, answers_num, followers_num):
        print(value)

    # Python 2 print statement: the trailing comma keeps all topics (and
    # the visit count printed next) on one space-separated line.
    for name in topics:
        print(name),

    # Visit count, then the top answer, the top-ten result and the
    # all-answers result.
    for value in (visit_times, top_answer, top_answers, answers):
        print(value)
def question(self, url):
    """Return the summary fields of the question found at ``url``.

    ``self`` is not used.  The single top answer and the full answer list
    are not fetched here; only the top ten answers are requested.

    Returns:
        ``(title, detail, answers_num, followers_num, topics,
        top_answers)`` as a tuple.
    """
    source = Question(url)

    # Getter calls happen in field order (left-to-right evaluation).
    summary = (
        source.get_title(),            # question title
        source.get_detail(),           # detailed description
        source.get_answers_num(),      # how many answers it has
        source.get_followers_num(),    # how many people follow it
        source.get_topics(),           # topics it is filed under
        source.get_top_i_answers(10),  # the ten top-ranked answers
    )
    return summary
# -*- coding: utf-8 -*-
# Usage example: read every available field of one question and print
# its title and description.

from zhihu import Question

url = "http://www.zhihu.com/question/24269892"
question = Question(url)

# Title of the question.
title = question.get_title()
# Detailed description of the question.
detail = question.get_detail()
# Number of answers.
answers_num = question.get_answers_num()
# Number of people following the question.
followers_num = question.get_followers_num()
# Topics the question belongs to.
topics = question.get_topics()
# Number of times the question has been viewed.
visit_times = question.get_visit_times()
# The top-ranked answer.
top_answer = question.get_top_answer()
# The top-ranked answers; this script asks for the first 100.
top_answers = question.get_top_i_answers(100)
# All answers.
answers = question.get_all_answers()

print("Title: ")
print(title)
# Output: the question title ("How wonderful can reality be?")
print("Detail: ")
print(detail)
# Output:
# This question is the counterpart of "How cruel can reality be?" (link: How cruel can reality be?)