def question_test(self, url):
    """Fetch every answer of a question and bulk-insert them into ANSWER.

    Args:
        url: URL of the question page; handed to ``Question`` and stored
            in the QURL column of every inserted row.

    Returns:
        ``self.cursor.rowcount`` after the ``executemany`` call, or 0 when
        the question yielded no answers (in which case nothing is inserted
        and "no answer" is printed).
    """
    question = Question(url)
    title = question.get_title()
    detail = question.get_detail()
    topics = question.get_topics()

    # get_all_answers() is consumed once here; each answer is converted with
    # to_txt(), whose result is indexed by key below.
    answers_list = [answer.to_txt() for answer in question.get_all_answers()]
    if not answers_list:
        print("no answer")
        return 0

    # json.dumps escapes non-ASCII characters as \uXXXX; the decode/encode
    # round trip (Python 2 str methods) turns them back into UTF-8 bytes.
    topics = json.dumps(topics).decode('unicode-escape').encode('utf8')
    time_now = int(time.time())
    p_str = (
        'INSERT IGNORE INTO ANSWER (QURL, TITLE, DETAIL, TOPICS, AURL, USERID, '
        'CONTENT, UPVOTE, USERURL, ADD_TIME, LAST_VISIT) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s,%s, %s, %s,%s)'
    )
    # One parameter tuple per answer; ADD_TIME is "now", LAST_VISIT starts at 0.
    rows = [
        (
            url,
            title,
            detail,
            topics,
            answer["aurl"],
            answer["userid"],
            answer["content"],
            answer["upvote"],
            answer["userurl"],
            time_now,
            0,
        )
        for answer in answers_list
    ]
    self.cursor.executemany(p_str, rows)
    return self.cursor.rowcount
def dealWithPage(page_num):
    """Dump one question page and all of its answers to text files.

    Files written (fields are separated by the marker ``\\r\\n|||\\r\\n``):
      * ``q_index.txt``  - one appended line holding ``page_num``.
      * ``Zhihu/Question_<page_num>/q.txt`` - title, detail, tab-separated
        topics, visit count, follower count and answer count.
      * ``a_index.txt``  - one appended ``<page_num>|||<answer id>`` line
        per answer.
      * ``Zhihu/Question_<page_num>/Answer/<answer id>.txt`` - answer text
        (encoded as GBK, unencodable characters dropped), author id and
        upvote count; or the single word ``None`` when fetching or writing
        the answer failed.

    Args:
        page_num: value appended to the module-level ``url_prefix`` to form
            the question URL; also used in the output paths.

    Returns:
        None. Returns early, after printing a message, when the
        ``Question`` object cannot be constructed.
    """
    page = urlparse.urljoin(url_prefix, str(page_num))
    try:
        question = Question(page)
    except Exception:
        # Best effort: a page that cannot be loaded is skipped, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Question Get Error.')
        return

    with codecs.open('q_index' + '.txt', 'a') as f:
        f.write(str(page_num) + '\r\n')

    question_folder = 'Zhihu/Question_' + str(page_num)
    if not os.path.exists(question_folder):
        os.makedirs(question_folder)

    # ``with`` guarantees the handle is closed even if a getter raises.
    with codecs.open(os.path.join(question_folder, 'q' + '.txt'), 'w') as f:
        f.write(question.get_title() + '\r\n|||\r\n')
        f.write(question.get_detail() + '\r\n|||\r\n')
        for topic in question.get_topics():
            f.write(topic + '\t')
        f.write('\r\n|||\r\n')
        f.write(str(question.get_visit_times()) + '\r\n|||\r\n')
        f.write(str(question.get_followers_num()) + '\r\n|||\r\n')
        f.write(str(question.get_answers_num()) + '\r\n|||\r\n')

    answer_folder = os.path.join(question_folder, 'Answer')
    for answer in question.get_all_answers():
        ansURL = answer.answer_url
        ans = Answer(ansURL)
        # The last URL segment is used as the answer's file name.
        answer_filename = str(ansURL.split('/')[-1])

        with codecs.open('a_index' + '.txt', 'a') as f:
            f.write(str(page_num) + '|||' + str(answer_filename) + '\r\n')

        # Created lazily so the folder only appears when there is an answer.
        if not os.path.exists(answer_folder):
            os.makedirs(answer_folder)
        answer_path = os.path.join(answer_folder, answer_filename + '.txt')

        saved = True
        with codecs.open(answer_path, 'w') as f:
            try:
                # Written piece by piece (not concatenated) so byte strings
                # and unicode values are never mixed in one expression.
                f.write(ans.get_content().find('body').get_text().strip().encode(
                    "gbk", 'ignore') + '\r\n|||\r\n')
                f.write(ans.get_author().get_user_id() + '\r\n|||\r\n')
                f.write(str(ans.get_upvote()) + '\r\n|||\r\n')
            except Exception:
                print('TimeOut Occurred.')
                saved = False

        if not saved:
            # Replace whatever was partially written with a placeholder.
            with codecs.open(answer_path, 'w') as f:
                f.write('None')
def question(self, url):
    """Return the headline data of the question at ``url``.

    Args:
        url: URL of the question page, handed to ``Question``.

    Returns:
        A 6-tuple ``(title, detail, answers_num, followers_num, topics,
        top_answers)`` where ``top_answers`` is the result of
        ``get_top_i_answers(10)``.
    """
    q = Question(url)
    # Tuple elements are evaluated left to right, i.e. in the same order
    # the getters were called before.
    return (
        q.get_title(),            # title
        q.get_detail(),           # description
        q.get_answers_num(),      # number of answers
        q.get_followers_num(),    # number of followers
        q.get_topics(),           # topics the question belongs to
        q.get_top_i_answers(10),  # first ten answers
    )
def question_test(url):
    """Fetch everything ``Question`` exposes for ``url`` and print it.

    All values are fetched first and printed afterwards, in this order:
    title, detail, number of answers, number of followers, the topics (on
    one line), visit count, the top answer, the first ten answers and all
    answers.  According to the sample output recorded with this code, the
    last three print as an ``Answer`` instance and two generator objects.

    Args:
        url: URL of the question page, handed to ``Question``.
    """
    q = Question(url)

    # Fetch phase: nothing is printed until every getter has returned.
    title = q.get_title()
    detail = q.get_detail()
    answers_num = q.get_answers_num()
    followers_num = q.get_followers_num()
    topics = q.get_topics()
    visit_times = q.get_visit_times()
    top_answer = q.get_top_answer()
    top_answers = q.get_top_i_answers(10)
    answers = q.get_all_answers()

    # Print phase.
    for value in (title, detail, answers_num, followers_num):
        print(value)
    for topic in topics:
        # Trailing comma (Python 2 print statement): stay on the same line,
        # so the topics and the visit count below share one output line.
        print(topic),
    for value in (visit_times, top_answer, top_answers, answers):
        print(value)
def question(self, url):
    """Collect the headline data of a question page.

    Args:
        url: URL of the question page, handed to ``Question``.

    Returns:
        A 6-tuple ``(title, detail, answers_num, followers_num, topics,
        top_answers)``.  ``top_answers`` is whatever
        ``get_top_i_answers(10)`` returns; the sample output recorded with
        this code shows a generator object.
    """
    page = Question(url)

    heading = page.get_title()                 # question title
    description = page.get_detail()            # detailed description
    answer_count = page.get_answers_num()      # number of answers
    follower_count = page.get_followers_num()  # number of followers
    topic_list = page.get_topics()             # topics of the question
    first_ten = page.get_top_i_answers(10)     # first ten answers

    return (
        heading,
        description,
        answer_count,
        follower_count,
        topic_list,
        first_ten,
    )
def question_test(url): print "looking "+url question = Question(url) # 获取该问题的标题 if question.isInvalid(): # print 'invalid '+url return title = question.get_title() # 获取该问题的详细描述 detail = question.get_detail() # 获取回答个数 answers_num = question.get_answers_num() # 获取关注该问题的人数 # followers_num = question.get_followers_num() # 获取该问题所属话题 topics = question.get_topics() # 获取该问题被浏览次数 visit_times = question.get_visit_times() # 获取排名第一的回答 top_answer = question.get_top_answer() # 获取排名前十的十个回答 top_answers = question.get_top_i_answers(10) # 获取所有回答 answers = question.get_all_answers() for answer in top_answers: read=file("folder/a.txt","a+"); if answer.get_upvote()>1000: print answer.get_upvote(),answer.answer_url read.write("\r\n"+url+"\r\n"+title+"--"+answer.get_author().get_user_id()+"\r\n"+"%d"%answer.get_upvote()+"\r\n") read.close()
def __init__(self, question_id):
    """Remember the question id and load that question's topics.

    Args:
        question_id: id appended to the module-level ``prefix_question``
            to form the URL handed to ``Question``.
    """
    self.question_id = question_id
    self.topics = Question(prefix_question + question_id).get_topics()
from zhihu import User, UserDetail, Question

# Demo script: exercise the UserDetail getters for one profile, list the
# user's topics, then print the detail and topics of one question.
user = UserDetail("https://www.zhihu.com/people/lan-jiang-26")
t = user.get_followees()
user.get_followees_num()

# Print the result of each getter, one per line, in this order.
for getter in (
    user.get_data_id,
    user.get_gender,
    user.get_followees_num,
    user.get_followers_num,
    user.get_agree_num,
    user.get_thanks_num,
    user.get_asks_num,
    user.get_answers_num,
    user.get_collections_num,
    user.get_profile_vote_num,
    user.get_profile_thank_num,
    user.get_profile_fav_num,
    user.get_followers,
):
    print(getter())

# Name and id of every topic returned for the user.
t = user.get_topics()
for tt in t:
    print(tt.name)
    print(tt.id)

question = Question(url='https://www.zhihu.com/question/32046716')
print(question.get_detail())
print(question.get_topics())
# -*- coding: utf-8 -*-
from zhihu import Question

# Demo script: fetch the data Question exposes for one page and print the
# title and the detailed description.
url = "http://www.zhihu.com/question/24269892"
question = Question(url)

title = question.get_title()                   # question title
detail = question.get_detail()                 # detailed description
answers_num = question.get_answers_num()       # number of answers
followers_num = question.get_followers_num()   # number of followers
topics = question.get_topics()                 # topics of the question
visit_times = question.get_visit_times()       # number of views
top_answer = question.get_top_answer()         # highest-ranked answer
top_answers = question.get_top_i_answers(100)  # first 100 answers
answers = question.get_all_answers()           # all answers

# Each label is followed by its value on the next line.  The author's notes
# record the title "How beautiful can reality be?" and, for the detail, a
# text describing the question as the counterpart of "How cruel can reality
# be?" as sample output.
for line in ("Title: ", title, "Detail: ", detail):
    print(line)
def download_answers(urls, topic_id, log_fname): answer_file_name = "Answers_text_" + topic_id + ".txt" # Open a file to store all answers if not os.path.exists(os.path.join(os.getcwd(), answer_file_name)): out_answer_text = open(answer_file_name, "w") out_answer_text.close() for i in range(0, len(urls)): url = urls[i] print url question = Question('http://www.zhihu.com' + url) # 获取该问题的标题 title = question.get_title() # 获取该问题的详细描述 detail = question.get_detail() # 获取回答个数 answers_num = question.get_answers_num() # 获取关注该问题的人数 # try: # followers_num = question.get_followers_num() # except: # followers_num = 0 # 获取该问题所属话题 topics = question.get_topics() # 获取该问题被浏览次数 # visit_times = question.get_visit_times() # 获取排名第一的回答 # top_answer = question.get_top_answer() # 获取排名前十的十个回答 # top_answers = question.get_top_i_answers(10) # 获取所有回答 answers = question.get_all_answers() print "============" print "问题: \n", title print "问题描述: \n", detail print "Number of Answers: ", answers_num # print followers_num for topic in topics: print topic, # print visit_times # print top_answer # print top_answers print answers out_answer_text = open(answer_file_name, "a") this_question_answers_text = [] for answer in answers: author = answer.get_author() answer_cleaned = answer.text_one_line() print answer_cleaned this_question_answers_text.append(answer_cleaned) # answer.to_txt(title, detail, answers_num, followers_num, topics, visit_times, author) # t_sleep = random.uniform(2,4) # Stop after scraping one page. # print termcolor.colored(' '.join(['Sleep Time: ', str(t_sleep)]), "white") # time.sleep(t_sleep) # print "\n" out_answer_text.write("".join(this_question_answers_text)) # Modify log file after successfully write the answers log_file = open(log_fname, "w") log_file.write("\n".join(urls[i:])) print termcolor.colored("Log modified.", "white") # Stop after scraping one page. 
print "\n--------------\n--------------" t_sleep = random.uniform(1, 2) print termcolor.colored(' '.join(['Sleep Time: ', str(t_sleep)]), "red") time.sleep(t_sleep) out_answer_text.close()