def getLatestBestAnserwerAndSave():
    """Save the first 20 best answers of topic 19550867 as articles.

    Authentication uses the token cached in ``token.pkl`` when that file
    exists; otherwise an interactive terminal login is performed and the
    resulting token is written to ``token.pkl``.

    Each best answer is wrapped with ``ansItem``, converted with
    ``ansItem2artical`` and persisted through the result's ``save()``.
    """
    token_path = 'token.pkl'
    limit = 20

    client = ZhihuClient()
    if not os.path.isfile(token_path):
        client.login_in_terminal()
        client.save_token(token_path)
    else:
        client.load_token(token_path)

    java_topic = client.topic(19550867)
    best_answers = java_topic.best_answers
    for saved, answer in enumerate(best_answers, start=1):
        ansItem2artical(ansItem(answer)).save()
        # Stop right after the limit-th answer has been saved.
        if saved == limit:
            break
# print('关注人数', topic.follower_count) # print('关注人', topic.followers) # print('关注人数', topic.followers_count) # print('话题ID', topic.id) # print('介绍', topic.introduction) # print('名称', topic.name) # print('父话题数', topic.parent_count) # print('父话题详情', topic.parents) # print('已回答问题数', topic.question_count) # print('已回答问题个数', topic.questions_count) # print('未回答问题数', topic.unanswered_count) # print('未回答问题', topic.unanswered_questions) # ==================查询话题下所有未回答问题================== tid = 19668865 topic = client.topic(tid) question_line = [] for question in topic.unanswered_questions: entry_start_time = time.time() allow_delete = question.allow_delete answer_count = question.answer_count answers = question.answers comment_count = question.comment_count comments = question.comments detail = question.detail excerpt = question.excerpt follower_count = question.follower_count followers = question.followers id = question.id redirection = question.redirection status = question.status
items2['thanks count'] = str(answer.thanks_count) items2['updated time'] = answer_ut answer_numbers += 1 items['answer' + str(a)] = items2 a += 1 # print('------------------------') f.write(json.dumps(items, indent=2, ensure_ascii=False)) return answer_numbers # # Main topic answer_numbers_all = 0 topic_id = 21239580 #新型冠状肺炎的话题id #topic_id = 21238418 #新型冠状病毒的话题id topic = client.topic(topic_id) topic_children = topic.children answer_numbers_all += save_answer(topic, answer_numbers_all) print('answer_numbers_all: ', answer_numbers_all) for topic_child in topic_children: answer_numbers_all += save_answer(topic_child, answer_numbers_all) print('answer_numbers_all: ', answer_numbers_all) print('answer numbers :', answer_numbers_all)
TOKEN_FILE = 'token.pkl' client = ZhihuClient() if os.path.isfile(TOKEN_FILE): client.load_token(TOKEN_FILE) else: try: client.login('email_or_phone', 'password') except NeedCaptchaException: with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('email_or_phone', 'password', captcha) client.save_token(TOKEN_FILE) topic = client.topic(int(topic_id)) print(topic.name) #日志设置 logging.basicConfig(level=logging.ERROR, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='zhi.log', filemode='w') if os.path.exists('知乎-{}.xlsx'.format(file_name)): queue = pickle.load(open("queue.pkl", "rb")) wb = load_workbook('知乎-{}.xlsx'.format(file_name)) sheet = wb.active data_rows = sheet.max_row - 1 print("上次进度已加载!")
import os

from zhihu_oauth import ZhihuClient

# File in which the login token is cached between runs.
TOKEN_FILE = 'token.pkl'

# Log in: prompt in the terminal only when no cached token exists.
client = ZhihuClient()
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Exercise the Topic class.
topic = client.topic(19551275)
print(topic.followers_count)
print(topic.best_answers_count)
for fol in topic.followers:
    print(fol.id, fol.name)

# Original author's notes:
# - No topic-followers interface is provided; only the number of followers
#   can be obtained, not each individual follower.
# - The best answerers cannot be fetched either:
#     print(type(topic.best_answerers))
#     for answerer in topic.best_answerers:
#         print(answerer.id)
#   The error reported was:
#     return People(data['id'], data, self._session)
#     KeyError: u'id'
#   Cause unknown.
class Crawler:
    """Crawl Zhihu users, answers, questions and topics into SQLite.

    The instance owns one SQLite connection (``self.con``), one cursor
    (``self.cursor``) and one logged-in ``ZhihuClient`` (``self.zhclient``).
    The ``*info`` methods store one entity per call; the relation methods
    (``topic_questions``, ``topic_users``, ...) store the links between
    entities.  ``justdoit`` drives them from the ids already in the database.
    """

    # Tables for which justdoit() has a same-named handler method.
    _HANDLER_TABLES = frozenset((
        "userinfo", "answerinfo", "questioninfo", "topicinfo",
        "question_answers", "question_topics", "question_users",
        "topic_questions", "topic_users", "user_users", "user_topics",
    ))

    def __init__(self, dbname, email, key):
        """Open the database *dbname* and log in to Zhihu.

        :param dbname: path of the SQLite database file.
        :param email: account name passed to ``ZhihuClient.login``.
        :param key: password passed to ``ZhihuClient.login``.

        When the login requires a captcha a message is printed and the
        client is left as it is; no exception is raised.
        """
        self.con = sqlite3.connect(dbname)
        self.cursor = self.con.cursor()
        self.zhclient = ZhihuClient()
        try:
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            print("需要输入验证码,账号 %s 可能已失效" %(email))

    def __del__(self):
        """Close the database connection if one was opened."""
        # __init__ may have failed before self.con was assigned.
        con = getattr(self, "con", None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.con.commit()

    def createindextables(self):
        """Create all tables and their indexes, then commit."""
        statements = (
            'create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)',
            'create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)',
            'create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)',
            'create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)',
            'create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)',
            'create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)',
            'create table question_users(question_id,question_title text,user_id,user_name text,record_time text)',
            'create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)',
            # record_time is required here: user_users() inserts it.
            'create table user_users(user_id,user_follower_id,record_time text)',
            'create table question_topics(question_id,topic_id,topic_name text,record_time text)',
            'create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)',
            'create index userinfoidx on userinfo(id)',
            'create index answerinfoidx on answerinfo(id)',
            'create index questioninfoidx on questioninfo(id)',
            'create index topicinfoidx on topicinfo(id)',
            'create index topic_questionsidx on topic_questions(topic_id,question_id)',
            'create index topic_usersidx on topic_users(topic_id,user_id)',
            'create index question_usersidx on question_users(question_id,user_id)',
            'create index question_answersidx on question_answers(question_id,answer_id)',
            'create index user_usersidx on user_users(user_id,user_follower_id)',
            'create index question_topicsidx on question_topics(question_id,topic_id)',
            'create index user_topicsidx on user_topics(user_id,topic_id)',
        )
        for statement in statements:
            self.cursor.execute(statement)
        self.dbcommit()

    # NOTE: an earlier, disabled multiprocessing variant of justdoit()
    # (``crawl_data``) was removed; it only existed as commented-out code.

    def justdoit(self, table1, field1, table2, field2):
        """Crawl every id found in *table1* that is still missing in *table2*.

        The distinct values of ``table1.field1`` minus the distinct values of
        ``table2.field2`` form the work set.  Each remaining id is passed to
        the method of this class named like *table2*.  An unknown *table2*
        does nothing.

        Table and field names are interpolated into the SQL text (identifiers
        cannot be bound as parameters), so they must come from trusted code.

        :return: always ``None``.
        """
        set2 = set(self.cursor.execute(
            "select DISTINCT {} from {}".format(field2, table2)).fetchall())
        set1 = set(self.cursor.execute(
            "select DISTINCT {} from {}".format(field1, table1)).fetchall())
        work_set = set1 - set2

        if table2 not in self._HANDLER_TABLES:
            return None
        handler = getattr(self, table2)
        for row in work_set:
            # fetchall() yields 1-tuples; the id is the only element.
            handler(row[0])
            if table2 == "answerinfo":
                # Short pause after each answer request.
                time.sleep(0.1)
        return None

    def topic_questions(self, topic_id):
        """Store topic -> question links for the best answers of a topic.

        Unlike the sibling relation methods, ``GetDataErrorException`` is
        re-raised after the message is printed.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            ques_set = set()
            for hot_ques in shield(topic.best_answers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("topic_questions", "topic_id", "question_id", topic.id, hot_ques.question.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if hot_ques.question.id in ques_set:
                    continue
                ques_set.add(hot_ques.question.id)
                values = (topic.id, topic.name, hot_ques.question.id, hot_ques.question.title, record_time)
                self.cursor.execute("insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", hot_ques.question.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def topic_users(self, topic_id, start_at=0):
        """Store topic -> follower links.

        :param start_at: forwarded to ``shield`` as ``start_at``.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(topic.followers, start_at=start_at, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("topic_users", "topic_id", "user_id", topic.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (topic.id, topic.name, follower.id, follower.name, record_time)
                self.cursor.execute("insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", topic.name, follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_users(self, question_id):
        """Store question -> follower links."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(question.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("question_users", "question_id", "user_id", question.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (question.id, question.title, follower.id, follower.name, record_time)
                self.cursor.execute(
                    "insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", follower.name, question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_answers(self, question_id):
        """Store question -> answer links."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            answer_set = set()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if answer.id in answer_set:
                    continue
                answer_set.add(answer.id)
                values = (question.id, question.title, answer.id, answer.author.id, record_time)
                self.cursor.execute("insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            # The message used to name the wrong exception type.
            print("Pass the ZhihuWarning")

    def user_users(self, user_id):
        """Store user -> follower links.

        Per the original author's note, the Zhihu API returns at most 5020
        followers for a single user.
        """
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(people.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("user_users", "user_id", "user_follower_id", people.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (people.id, follower.id, record_time)
                self.cursor.execute("insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)", values)
                self.dbcommit()
                print("正在处理", follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_topics(self, question_id):
        """Store question -> topic links."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            topic_set = set()
            for topic in shield(question.topics):
                status = self.isdupicaterel("question_topics", "question_id", "topic_id", question.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in topic_set:
                    continue
                topic_set.add(topic.id)
                values = (question.id, topic.id, topic.name, record_time)
                self.cursor.execute("insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", topic.name, question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    def user_topics(self, user_id):
        """Store user -> followed-topic links."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            topic_set = set()
            for topic in shield(people.following_topics):
                status = self.isdupicaterel("user_topics", "user_id", "topic_id", people.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in topic_set:
                    continue
                topic_set.add(topic.id)
                values = (people.id, people.name, topic.id, topic.name, record_time)
                self.cursor.execute(
                    "insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", people.name, topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    def isdupicateid(self, table, id):
        """Return the rowid of the row in *table* whose ``id`` equals *id*.

        :return: the rowid, or ``None`` when no such row exists.
        """
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        # Fetch before committing, the same order isdupicaterel() uses.
        res = cur.fetchone()
        self.dbcommit()
        return None if res is None else res[0]

    def isdupicaterel(self, table, field1, field2, id1, id2):
        """Return the rowid of the row where *field1* = *id1* and *field2* = *id2*.

        :return: the rowid, or ``None`` when no such row exists.
        """
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table, field1, field2), (id1, id2))
        res = cur.fetchone()
        self.dbcommit()
        return None if res is None else res[0]

    def userinfo(self, user_id):
        """Fetch one user's profile and store it in ``userinfo``.

        Nothing is fetched when the id is already stored; the existing rowid
        is printed instead.
        """
        try:
            status = self.isdupicateid("userinfo", user_id)
            if status is not None:
                print("重复,rowid", status)
                return
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            # Multi-valued profile fields are flattened into "|"-joined text.
            address = "|".join([location.name for location in people.locations])
            school_name = "|".join([education.school.name for education in people.educations if "school" in education])
            job = "|".join([employment.job.name for employment in people.employments if "job" in employment])
            company = "|".join([employment.company.name for employment in people.employments if "company" in employment])
            business = people.business.name if people.business else None
            # Badge-derived fields.
            badge = people.badge
            identity = badge.identity if badge.has_identity else None
            if badge.is_best_answerer:
                best_topics = "".join([topic.name for topic in badge.topics])
            else:
                best_topics = None
            if badge.is_organization:
                is_organization = 1
                org_name = badge.org_name
                org_home_page = badge.org_home_page
                org_industry = badge.org_industry
            else:
                is_organization = 0
                org_name = None
                org_home_page = None
                org_industry = None
            values = (
                people.id, people.name, people.headline, people.gender,
                address, business, school_name, job, company,
                people.answer_count, people.question_count,
                people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count,
                people.following_question_count,
                people.following_topic_count, people.collected_count,
                identity, best_topics, is_organization, org_name,
                org_home_page, org_industry, record_time)
            self.cursor.execute(
                "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", people.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def answerinfo(self, answer_id):
        """Fetch one answer and store it in ``answerinfo``.

        :return: ``("重复,rowid", rowid)`` when the answer is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status is not None:
                return ("重复,rowid", status)
            answer = self.zhclient.answer(answer_id)
            record_time = self.logtime()
            values = (answer.id, answer.content, answer.author.id, answer.voteup_count, answer.thanks_count, answer.comment_count, answer.created_time, answer.updated_time, record_time)
            self.cursor.execute("insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", answer.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # Drop the invalid answer from question_answers right away so it
            # is not crawled again and again after switching accounts.
            self.cursor.execute("delete from question_answers where answer_id = ?", (answer_id,))
            # Commit now; the delete must not wait for an unrelated commit.
            self.dbcommit()
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def questioninfo(self, question_id):
        """Fetch one question and store it in ``questioninfo``.

        :return: ``("重复,rowid", rowid)`` when the question is already
            stored, otherwise ``None``.
        """
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status is not None:
                return ("重复,rowid", status)
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            values = (question.id, question.title, question.follower_count, question.answer_count, question.created_time, question.updated_time, record_time)
            self.cursor.execute("insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def topicinfo(self, topic_id):
        """Fetch one topic and store it in ``topicinfo``.

        :return: ``("重复,rowid", rowid)`` when the topic is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status is not None:
                return ("重复,rowid", status)
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            values = (topic.id, topic.name, topic.best_answer_count, topic.follower_count, topic.question_count, record_time)
            self.cursor.execute("insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", topic.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def logtime(self):
        """Return today's local date formatted as ``YYYY-MM-DD``."""
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def add_counts(self, filepath="logincounts.txt"):
        """Read login accounts from *filepath*.

        Every non-blank line has the form ``<count>----<key>``.

        :return: a list of ``{"count": ..., "key": ...}`` dicts in file order.
        :raises ValueError: when a non-blank line has no ``----`` separator.
        """
        counts = []
        # The context manager closes the file even when a line is malformed.
        with open(filepath) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines, e.g. a trailing newline.
                    continue
                count, key = line.split("----", 1)
                counts.append({"count": count, "key": key.strip("\n")})
        return counts

    def get_proxy(self):
        """Ask the local proxy pool for a proxy.

        :return: the response text on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached.
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.exceptions.ConnectionError:
            # requests raises its own ConnectionError, which is not a
            # subclass of the builtin one.
            return None
        if response.status_code == 200:
            return response.text
        return None
class ZhiHu(object):
    """Small helper that stores Zhihu questions in ``zhihu.db`` and exports them."""

    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """Log in to Zhihu, then open the ``zhihu.db`` database."""
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """Create ``self.client`` and authenticate it.

        The cached token in ``TOKEN_FILE`` is loaded when the file exists;
        otherwise a terminal login is performed and the token is saved.

        :return: None
        """
        self.client = ZhihuClient()
        if not os.path.isfile(self.TOKEN_FILE):
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)
        else:
            self.client.load_token(self.TOKEN_FILE)

    def save_quesions(self, topic_id):
        """Store the unanswered questions of a topic.

        Questions with fewer than 10 answers are skipped.

        :param topic_id: id of the topic to read.
        :return: None
        """
        topic = self.client.topic(topic_id)
        print(topic)
        upsert_sql = 'replace into questions values(?,?,?,?,?,?)'
        for question in topic.unanswered_questions:
            if question.answer_count >= 10:
                row = [
                    question.id,
                    question.title,
                    question.follower_count,
                    question.answer_count,
                    question.comment_count,
                    topic_id,
                ]
                print(row)
                ok = self.db.update(upsert_sql, args=row)
                print('insert success!' if ok else 'insert error!')

    def save_answer_info(self, question_id):
        """Print a summary of the first answer of a question and save it.

        Only the first answer is handled: the loop stops after one iteration.

        :param question_id: id of the question to read.
        :return: None
        """
        question = self.client.question(question_id)
        print(question.title)
        for answer in question.answers:
            print(answer.comment_count, answer.excerpt, answer.question,
                  answer.thanks_count, answer.voteup_count)
            answer.save()
            break

    def to_md(self, topic, file_name):
        """Write up to 1000 stored questions of *topic* to a Markdown file.

        Questions are ordered by follower count, highest first, and numbered
        from 1.

        :param topic: topic id used to filter the ``questions`` table.
        :param file_name: path of the UTF-8 file to (over)write.
        """
        # NOTE(review): the topic id is interpolated into the SQL text; switch
        # to a bound parameter if EasySqlite.query supports one.
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        records = self.db.query(sql)
        template = "%s. [%s](https://www.zhihu.com/question/%s) 关注数:%s 回答数:%s 评论数:%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as out:
            for number, item in enumerate(records, start=1):
                fields = (number, item['title'], item['id'],
                          item['follower_count'], item['answer_count'],
                          item['comment_count'])
                out.write(template % fields)
# File in which the login token is cached between runs.
TOKEN_FILE = 'token.cache'
TOP_SIZE = 50

# Login: reuse the cached token when present, otherwise prompt in the
# terminal and cache the new token.
client = ZhihuClient()
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# The topmost root topic
root_topic = client.topic(19776749)
# The list that stores the top hot topics
hot_topics = list()
# Whether hot_topics is full
hot_topics_full = False
# The least-followed topic in hot_topics
# NOTE(review): "fewest" is the original author's wording; the metric is not visible here.
last_topic = {}
# The file to output to
file_name = 'result'
# The number of topics that have been searched
search_count = 0
# Output once every this many searches
output_time = 500
# How many logs to reserve
log_num = 100
# Tree depth at which to restart after the program is paused or interrupted
def login():
    """Log in to Zhihu and save every answer of a topic's unanswered questions.

    The cached token in ``token.pkl`` is used when the file exists; otherwise
    a terminal login is performed and the token is saved.

    For each unanswered question of topic 19560072 the id, title and answer
    count are printed, and each answer is saved under
    ``Data\\Gene\\<question id>#<title>`` with the file name
    ``<author id>#<author name>``.  The summed answer count is printed last.
    """
    token_path = 'token.pkl'
    client = ZhihuClient()
    if not os.path.isfile(token_path):
        client.login_in_terminal()
        client.save_token(token_path)
    else:
        client.load_token(token_path)

    # A disabled sample that printed the logged-in user's profile and recent
    # activity used to sit here as a no-op string literal.

    # Topic 19560072: genetically modified organisms.
    # Other topics tried: 19578906 (climate change), 19551296 (online games).
    topic = client.topic(19560072)

    total_answers = 0
    for question in topic.unanswered_questions:
        print(question.id)
        print(question.title)
        print(question.answer_count)
        total_answers += question.answer_count
        for answer in question.answers:
            print(answer.author.id, answer.author.name)
            target_dir = 'Data\\Gene\\' + str(question.id) + '#' + question.title
            target_name = str(answer.author.id) + '#' + answer.author.name
            answer.save(target_dir, target_name)
    print("总共有{0}个回答".format(total_answers))
#     print ("author: {0}".format(item.author.name))
#     counter-=1

# a1 = client.answer(143216281)
# #https://www.zhihu.com/question/20251786/answer/143216281
# print (a1.author.answer_count)
# # author's profile and influence.
# print ("name: {0}".format(a1.author.name))
# print ("collected_count: {0}".format(a1.author.collected_count))
# print ("favorited_count: {0}".format(a1.author.favorited_count))
# print ("follower_count: {0}".format(a1.author.follower_count))
# print ("voteup_count: {0}".format(a1.author.voteup_count))
# #print ("is_best_answerer: {0}".format(a1.author.is_best_answerer))

# Export the unanswered questions of one topic to a CSV file.
output_file = "./question.csv"
headers = ["Qid", "Followers", "Created_time", "Answer_count"]

# https://www.zhihu.com/topic/20019119/top-answers
topic1 = client.topic(20019119)
questions = topic1.unanswered_questions

# One row per question, in the same column order as `headers`.
rows = [
    (v._id, v.follower_count, v.created_time, v.answer_count)
    for v in questions
]

# newline='' lets the csv module control line endings; without it every row
# is followed by a blank line on Windows.
# NOTE: the file is opened in append mode, so the header row is written again
# on every run.
with open(output_file, 'a', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)
from zhihu_oauth import ZhihuClient from getUser import getUser import MySQLdb TOKEN_FILE = 'token.pkl' # login client = ZhihuClient() if os.path.isfile(TOKEN_FILE): client.load_token(TOKEN_FILE) else: client.login_in_terminal() client.save_token(TOKEN_FILE) topic = client.topic() if __name__ == '__main__': conn = MySQLdb.connect(host='127.0.0.1', user='******', passwd='root', port=3306, charset='utf8') cur = conn.cursor() u_table = 'topic_' + str(topic.name) cre_utable = 'create table IF NOT EXISTS %s (uid VARCHAR (50) PRIMARY KEY ,name VARCHAR (20),gender VARCHAR (10),headline VARCHAR (400),description VARCHAR (1000),que_count INT ,ans_count INT ,art_count INT ,column_ INT ,column_fol_sum INT ,collection INT ,coll_ans_sum INT ,coll_fol_sum INT ,voteup INT ,thanks INT ,collected INT ,shared INT ,art_vote_sum INT ,following INT ,follower INT ,fol_column INT ,fol_topic INT ,fol_topic_name MEDIUMTEXT,fol_ques INT ,location VARCHAR (200),business VARCHAR (50),school VARCHAR (200),major VARCHAR (200),company VARCHAR (200),job VARCHAR (200), avatar VARCHAR (10),avatar_url VARCHAR (100),weibo VARCHAR (10),weibo_name VARCHAR (50),weibo_url VARCHAR (50), give_ans_vote INT, give_art_vote INT, topic_name VARCHAR (200))' % u_table ins_utable = 'insert into ' + u_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' try: # cur.execute('set interactive_timeout=96*3600') # cur.execute('CREATE DATABASE IF NOT EXISTS zhihu DEFAULT CHARSET utf8 COLLATE utf8_unicode_ci')