def crawling(id):  # id is the question id
    """Download every image embedded in the answers of one Zhihu question.

    Images are written to a directory named after the question title, one
    file per image, named ``<author>_<running index>.jpg``. Files that
    already exist are not downloaded again, so the function can be re-run
    to resume an interrupted crawl.

    :param id: numeric id of the Zhihu question to crawl.
    """
    def _path_safe(name):
        # A path separator inside a title / author name would be read as a
        # directory boundary and make mkdir / urlretrieve fail.
        for sep in {'/', os.sep}:
            name = name.replace(sep, '_')
        return name

    client = ZhihuClient()  # log in
    client.load_token('token.pkl')  # load the saved token file
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)

    path = _path_safe(question.title)
    if not os.path.exists(path):
        os.mkdir(path)

    # Compiled once: the pattern does not depend on the answer being scanned.
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')

    index = 1  # running image number across all answers
    for i, answer in enumerate(question.answers):
        content = answer.content  # answer body (HTML)
        author_name = _path_safe(answer.author.name)
        # findall yields (url, extension) tuples because of the two groups.
        for img_url, _extension in img_pattern.findall(content):
            # NOTE(review): files are always saved as .jpg, even for png
            # sources; kept so that the "already downloaded" check below
            # keeps matching files from earlier runs.
            image_name = author_name + '_' + str(index) + '.jpg'
            target = os.path.join(path, image_name)
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / question.answer_count * 100))
            index += 1
        # i is 0-based: report the number of answers actually finished so
        # the last message reads 100% instead of stopping one answer short.
        finished = i + 1
        print('第%d个答案爬取完成,当前总进度%.2f%%' %
              (finished, finished / question.answer_count * 100))
except NeedCaptchaException: # 保存验证码并提示输入,重新登录 print u'登录失败,需要输入验证码' with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = raw_input(u'please input captcha:') login_result = client.login(test_email, test_password, captcha) print 'login result => ' print login_result client.save_token(token_file) print 'save token success' # question response_file_uri = './question_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构 question_id = 35005800 question = client.question(question_id) data = question.pure_data response_json = json.dumps(data) response_file = open(response_file_uri, 'w+') response_file.write(response_json) print u"数据保存完成" response_file_uri = './people_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构 people_id = '404-Page-Not-found' people = client.people(people_id) for i in people.answers: data = i.pure_data response_json = json.dumps(data) response_file = open(response_file_uri, 'w+') response_file.write(response_json) print u"数据保存完成"
# Restore the saved login session and fetch the logged-in user.
client.load_token('token.pkl')
me = client.me()

# Example of reading a single answer (kept for reference):
# answer = client.answer(94150403)
# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)
# for voter in answer.voters:
#     print(voter.name, voter.headline)

# Questions whose answers are saved to disk.
question_number = [20787350]
# Highest answer index that is still saved (indices 0..666 inclusive).
ANSWER_LIMIT = 666

for q in question_number:
    question = client.question(q)
    print(question.title)
    index = 0
    for answer in question.answers:
        if index > ANSWER_LIMIT:
            break
        author = answer.author
        print(author.name, answer.voteup_count)
        # Saved into a directory named after the question title.
        answer.save(question.title)
        index += 1
class Crawler:
    """Crawl Zhihu entities and their relations into a SQLite database.

    The ``*info`` methods store one row per user / answer / question /
    topic; the relation methods (``topic_questions``, ``user_users``, ...)
    store one row per pair. ``justdoit`` drives them: it finds ids that are
    referenced in one table but not yet present in another and crawls them.
    """

    # Tables that justdoit() may crawl into; each one is also the name of
    # the method that fills it.
    _CRAWL_TARGETS = frozenset((
        "userinfo", "answerinfo", "questioninfo", "topicinfo",
        "question_answers", "question_topics", "question_users",
        "topic_questions", "topic_users", "user_users", "user_topics",
    ))

    def __init__(self, dbname, email, key):
        """Open the database *dbname* and log in to Zhihu.

        :param dbname: path of the SQLite database file.
        :param email: Zhihu account name.
        :param key: Zhihu account password.
        """
        self.con = sqlite3.connect(dbname)
        self.cursor = self.con.cursor()
        self.zhclient = ZhihuClient()
        try:
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            # Best effort: the crawler is still constructed, but the
            # account is probably unusable.
            print("需要输入验证码,账号 %s 可能已失效" % (email))

    def __del__(self):
        # self.con is missing when sqlite3.connect() itself failed.
        con = getattr(self, "con", None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.con.commit()

    def createindextables(self):
        """Create all tables and indexes, then commit.

        Raises ``sqlite3.OperationalError`` if a table already exists.
        """
        statements = (
            'create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)',
            'create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)',
            'create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)',
            'create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)',
            'create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)',
            'create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)',
            'create table question_users(question_id,question_title text,user_id,user_name text,record_time text)',
            'create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)',
            # record_time is required here: user_users() inserts it, and
            # without the column every insert fails.
            'create table user_users(user_id,user_follower_id,record_time text)',
            'create table question_topics(question_id,topic_id,topic_name text,record_time text)',
            'create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)',
            'create index userinfoidx on userinfo(id)',
            'create index answerinfoidx on answerinfo(id)',
            'create index questioninfoidx on questioninfo(id)',
            'create index topicinfoidx on topicinfo(id)',
            'create index topic_questionsidx on topic_questions(topic_id,question_id)',
            'create index topic_usersidx on topic_users(topic_id,user_id)',
            'create index question_usersidx on question_users(question_id,user_id)',
            'create index question_answersidx on question_answers(question_id,answer_id)',
            'create index user_usersidx on user_users(user_id,user_follower_id)',
            'create index question_topicsidx on question_topics(question_id,topic_id)',
            'create index user_topicsidx on user_topics(user_id,topic_id)',
        )
        for statement in statements:
            self.cursor.execute(statement)
        self.dbcommit()

    def justdoit(self, table1, field1, table2, field2):
        """Crawl every id found in ``table1.field1`` but not in ``table2.field2``.

        *table2* selects the method used to crawl each missing id; an
        unknown *table2* does nothing.

        :param table1: table that references the ids.
        :param field1: column of *table1* holding the ids.
        :param table2: table to fill; also the name of the crawl method.
        :param field2: column of *table2* holding the already-known ids.
        :return: always ``None``.
        """
        # Table and column names cannot be bound as SQL parameters, so they
        # are formatted in; callers must pass trusted identifiers only.
        known = set(self.cursor.execute(
            "select DISTINCT {} from {}".format(field2, table2)).fetchall())
        referenced = set(self.cursor.execute(
            "select DISTINCT {} from {}".format(field1, table1)).fetchall())
        work_set = referenced - known

        if table2 not in self._CRAWL_TARGETS:
            return None
        handler = getattr(self, table2)
        throttle = table2 == "answerinfo"
        for row in work_set:
            handler(row[0])  # fetchall() yields one-column tuples
            if throttle:
                time.sleep(0.1)  # answers are the only throttled target
        return None

    def _is_new_relation(self, table, field1, field2, id1, id2, seen):
        """Return True if the (id1, id2) pair is neither stored in *table*
        nor already handled in this run; records id2 in *seen* when new."""
        if self.isdupicaterel(table, field1, field2, id1, id2) is not None:
            print("已存在,正在跳过")
            return False
        if id2 in seen:
            return False
        seen.add(id2)
        return True

    def _insert_row(self, table, columns, values):
        """Insert one row (*values* in *columns* order) into *table* and commit."""
        sql = "insert into {}({}) VALUES ({})".format(
            table, ",".join(columns), ",".join("?" * len(columns)))
        self.cursor.execute(sql, values)
        self.dbcommit()

    # topic -> (best-answer) question relation
    def topic_questions(self, topic_id):
        """Store the questions behind the best answers of topic *topic_id*.

        Unlike the other relation methods, ``GetDataErrorException`` is
        re-raised after being reported.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen = set()
            for hot_ques in shield(topic.best_answers, action=SHIELD_ACTION.PASS):
                question_id = hot_ques.question.id
                if not self._is_new_relation("topic_questions", "topic_id", "question_id",
                                             topic.id, question_id, seen):
                    continue
                self._insert_row(
                    "topic_questions",
                    ("topic_id", "topic_name", "question_id", "question_title", "record_time"),
                    (topic.id, topic.name, question_id, hot_ques.question.title, record_time))
                print("正在处理", question_id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # NOTE(review): every sibling method swallows this exception;
            # the re-raise is kept because callers may rely on it - confirm.
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # topic -> follower relation
    def topic_users(self, topic_id, start_at=0):
        """Store the followers of topic *topic_id*.

        :param start_at: forwarded to ``shield`` as ``start_at``.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen = set()
            for follower in shield(topic.followers, start_at=start_at, action=SHIELD_ACTION.PASS):
                if not self._is_new_relation("topic_users", "topic_id", "user_id",
                                             topic.id, follower.id, seen):
                    continue
                self._insert_row(
                    "topic_users",
                    ("topic_id", "topic_name", "user_id", "user_name", "record_time"),
                    (topic.id, topic.name, follower.id, follower.name, record_time))
                print("正在处理", topic.name, follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # question -> follower relation
    def question_users(self, question_id):
        """Store the followers of question *question_id*."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen = set()
            for follower in shield(question.followers, action=SHIELD_ACTION.PASS):
                if not self._is_new_relation("question_users", "question_id", "user_id",
                                             question.id, follower.id, seen):
                    continue
                self._insert_row(
                    "question_users",
                    ("question_id", "question_title", "user_id", "user_name", "record_time"),
                    (question.id, question.title, follower.id, follower.name, record_time))
                print("正在处理", follower.name, question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # question -> answer relation
    def question_answers(self, question_id):
        """Store the answers of question *question_id*."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen = set()
            for answer in shield(question.answers):
                if not self._is_new_relation("question_answers", "question_id", "answer_id",
                                             question.id, answer.id, seen):
                    continue
                author_id = answer.author.id
                self._insert_row(
                    "question_answers",
                    ("question_id", "question_title", "answer_id", "author_id", "record_time"),
                    (question.id, question.title, answer.id, author_id, record_time))
                print("正在处理", question.id, question.title, answer.id, author_id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            # The message used to name the wrong exception.
            print("Pass the ZhihuWarning")

    # user -> follower relation; the original author notes that the Zhihu
    # API returns at most 5020 followers per user.
    def user_users(self, user_id):
        """Store the followers of user *user_id*."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen = set()
            for follower in shield(people.followers, action=SHIELD_ACTION.PASS):
                if not self._is_new_relation("user_users", "user_id", "user_follower_id",
                                             people.id, follower.id, seen):
                    continue
                self._insert_row(
                    "user_users",
                    ("user_id", "user_follower_id", "record_time"),
                    (people.id, follower.id, record_time))
                print("正在处理", follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # question -> topic relation
    def question_topics(self, question_id):
        """Store the topics of question *question_id*."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen = set()
            for topic in shield(question.topics):
                if not self._is_new_relation("question_topics", "question_id", "topic_id",
                                             question.id, topic.id, seen):
                    continue
                self._insert_row(
                    "question_topics",
                    ("question_id", "topic_id", "topic_name", "record_time"),
                    (question.id, topic.id, topic.name, record_time))
                print("正在处理", topic.name, question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # user -> followed-topic relation
    def user_topics(self, user_id):
        """Store the topics followed by user *user_id*."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen = set()
            for topic in shield(people.following_topics):
                if not self._is_new_relation("user_topics", "user_id", "topic_id",
                                             people.id, topic.id, seen):
                    continue
                self._insert_row(
                    "user_topics",
                    ("user_id", "user_name", "topic_id", "topic_name", "record_time"),
                    (people.id, people.name, topic.id, topic.name, record_time))
                print("正在处理", people.name, topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # duplicate detection
    def isdupicateid(self, table, id):
        """Return the rowid of the row of *table* whose ``id`` equals *id*,
        or ``None`` when there is no such row."""
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        res = cur.fetchone()  # read the result before committing
        self.dbcommit()
        return None if res is None else res[0]

    def isdupicaterel(self, table, field1, field2, id1, id2):
        """Return the rowid of the row of *table* with ``field1 == id1`` and
        ``field2 == id2``, or ``None`` when there is no such row."""
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table, field1, field2),
            (id1, id2))
        res = cur.fetchone()
        self.dbcommit()
        return None if res is None else res[0]

    # user profile
    def userinfo(self, user_id):
        """Fetch and store the profile of user *user_id* unless already stored."""
        try:
            status = self.isdupicateid("userinfo", user_id)
            if status is not None:
                print("重复,rowid", status)
                return
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            # Multi-valued fields are flattened into "|"-separated strings.
            address = "|".join(location.name for location in people.locations)
            school_name = "|".join(education.school.name for education in people.educations
                                   if "school" in education)
            job = "|".join(employment.job.name for employment in people.employments
                           if "job" in employment)
            company = "|".join(employment.company.name for employment in people.employments
                               if "company" in employment)
            business = people.business.name if people.business else None

            # badge information
            badge = people.badge
            identity = badge.identity if badge.has_identity else None
            if badge.is_best_answerer:
                best_topics = "".join(topic.name for topic in badge.topics)
            else:
                best_topics = None
            if badge.is_organization:
                is_organization = 1
                org_name = badge.org_name
                org_home_page = badge.org_home_page
                org_industry = badge.org_industry
            else:
                is_organization = 0
                org_name = org_home_page = org_industry = None

            values = (
                people.id, people.name, people.headline, people.gender,
                address, business, school_name, job, company,
                people.answer_count, people.question_count, people.voteup_count,
                people.thanked_count, people.following_count, people.follower_count,
                people.following_question_count, people.following_topic_count,
                people.collected_count,
                identity, best_topics, is_organization, org_name, org_home_page,
                org_industry, record_time)
            self.cursor.execute(
                "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", people.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # answer details
    def answerinfo(self, answer_id):
        """Fetch and store answer *answer_id* unless already stored.

        :return: ``("重复,rowid", rowid)`` when the answer is already
            stored, otherwise ``None``.
        """
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status is not None:
                return ("重复,rowid", status)
            answer = self.zhclient.answer(answer_id)
            record_time = self.logtime()
            self._insert_row(
                "answerinfo",
                ("id", "content", "author_id", "voteup_count", "thanks_count",
                 "comment_count", "created_time", "updated_time", "record_time"),
                (answer.id, answer.content, answer.author.id, answer.voteup_count,
                 answer.thanks_count, answer.comment_count, answer.created_time,
                 answer.updated_time, record_time))
            print("正在处理", answer.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # Drop the invalid answer from question_answers right away so it
            # is not re-crawled over and over after switching accounts.
            self.cursor.execute("delete from question_answers where answer_id = ?", (answer_id,))
            # Commit here: otherwise the delete is lost if nothing else is
            # written before the connection closes.
            self.dbcommit()
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # question details
    def questioninfo(self, question_id):
        """Fetch and store question *question_id* unless already stored.

        :return: ``("重复,rowid", rowid)`` when the question is already
            stored, otherwise ``None``.
        """
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status is not None:
                return ("重复,rowid", status)
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            self._insert_row(
                "questioninfo",
                ("id", "title", "follower_count", "answer_count",
                 "created_time", "updated_time", "record_time"),
                (question.id, question.title, question.follower_count,
                 question.answer_count, question.created_time,
                 question.updated_time, record_time))
            print("正在处理", question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # topic details
    def topicinfo(self, topic_id):
        """Fetch and store topic *topic_id* unless already stored.

        :return: ``("重复,rowid", rowid)`` when the topic is already
            stored, otherwise ``None``.
        """
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status is not None:
                return ("重复,rowid", status)
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            self._insert_row(
                "topicinfo",
                ("id", "title", "best_answer_count", "follower_count",
                 "question_count", "record_time"),
                (topic.id, topic.name, topic.best_answer_count,
                 topic.follower_count, topic.question_count, record_time))
            print("正在处理", topic.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # timestamp
    def logtime(self):
        """Return today's local date formatted as ``YYYY-MM-DD``."""
        fmt = '%Y-%m-%d'  # date display format
        return time.strftime(fmt, time.localtime(time.time()))

    def add_counts(self, filepath="logincounts.txt"):
        """Read login accounts from *filepath*.

        Each non-blank line has the form ``<account>----<key>``. Only the
        first ``----`` separates the two parts, so a key may itself contain
        ``----``.

        :return: list of ``{"count": account, "key": key}`` dicts, in file
            order.
        :raises ValueError: if a non-blank line has no ``----`` separator.
        """
        counts = []
        with open(filepath) as handle:  # closed even if a line is malformed
            for line in handle:
                line = line.strip("\n")
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                account, key = line.split("----", 1)
                counts.append({"count": account, "key": key})
        return counts

    def get_proxy(self):
        """Ask the local proxy pool for a proxy.

        :return: the response body on HTTP 200; ``None`` on any other
            status or when the pool cannot be reached.
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError, which is not a
            # subclass of the builtin one; catch both.
            return None
        if response.status_code == 200:
            return response.text
        return None
import pymysql
from zhihu_oauth import ZhihuClient
from getAnswer import getAnswer
from getUser import getUser

# Log in: reuse the saved token when there is one, otherwise log in
# interactively and save the token for the next run.
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# Create the question object; the argument is the question id.
question = client.question(67079761)

if __name__ == '__main__':
    connection = pymysql.connect(host='localhost', user='******', password='******', port=3306, db='zhihu_live', charset='utf8mb4')
    # One pair of tables per question, named "<question id>_user" and
    # "<question id>_ans".
    u_table = str(question.id) + '_user'
    a_table = str(question.id) + '_ans'
    # User table: 39 columns, matched by the 39 placeholders of ins_utable.
    cre_utable = 'create table IF NOT EXISTS %s (uid VARCHAR (50),name VARCHAR (20),gender VARCHAR (10),headline VARCHAR (400),description VARCHAR (1000),que_count INT ,ans_count INT ,art_count INT ,column_ INT ,column_fol_sum INT ,collection INT ,coll_ans_sum INT ,coll_fol_sum INT ,voteup INT ,thanks INT ,collected INT ,shared INT ,art_vote_sum INT ,following INT ,follower INT ,fol_column INT ,fol_topic INT ,fol_topic_name MEDIUMTEXT,fol_ques INT ,location VARCHAR (200),business VARCHAR (50),school VARCHAR (200),major VARCHAR (200),company VARCHAR (200),job VARCHAR (200), avatar VARCHAR (10),avatar_url VARCHAR (100),weibo VARCHAR (10),weibo_name VARCHAR (50),weibo_url VARCHAR (50), give_ans_vote INT, give_art_vote INT, ans_id INT ,que_title VARCHAR (200))' % u_table
    # The table name is concatenated because identifiers cannot be bound as
    # query parameters; the %s markers are left for the values.
    ins_utable = 'insert into ' + u_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    # Answer table: 13 columns, matched by the 13 placeholders of ins_atable.
    cre_atable = 'create TABLE IF NOT EXISTS %s (ans_ques VARCHAR (200),que_id INT ,ans_id INT ,ans_auth VARCHAR (20),ans_cont MEDIUMTEXT ,ans_vote INT ,ans_than INT ,ans_comm INT ,com_perm VARCHAR (20) ,cre_timestamp VARCHAR (30),upd_timestamp VARCHAR (30),cre_time VARCHAR (30),upd_time VARCHAR (30))' % a_table
    ins_atable = 'insert into ' + a_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
# Collect the names of everyone who answered a fixed list of questions and
# append them, one per line, to name.txt.
# for _ in range(1000):
#     num = random.randint(20000000, 39999999)
for num in [
        40023941, 36582119, 23434853, 37027323, 39124944, 22345285, 26992616,
        28066166, 41035200, 21396519, 35947787, 36851579, 21372989, 47955389,
        37236484, 19861023, 25877081, 27063206, 29166103, 23246914, 38540397,
        36543921, 32158092, 41207814, 41404094, 36734444, 31819473, 29336768,
        32171411, 37184080, 20468104, 36238122, 36573907, 23415802, 30605806,
        37737298, 37059032, 48837193, 48296279, 41053015, 22978737, 22621327,
        42082026, 30470093, 41038770, 21155222, 28489148, 32081129, 32369239,
        30830614, 29213441, 41113819, 36770197, 48831736, 35990525, 48779414,
        22364486, 33032798, 29604768, 21900376, 26500277
]:
    n = 0  # number of answers seen for this question
    try:
        question = client.question(num)
        # question = client.from_url('https://www.zhihu.com/question/35166763')
        print(question.title)
        with open("name.txt", 'a', encoding='utf-8') as f:
            for answer in question.answers:
                n += 1
                try:
                    author_name = answer.author.name
                    # Skip anonymous and reset accounts.
                    if author_name not in ("匿名用户", "[已重置]"):
                        print(author_name)
                        f.write(author_name + '\n')
                except Exception:
                    # Best effort: one unreadable answer must not stop the
                    # question. Exception (not a bare except) so that Ctrl-C
                    # and SystemExit still end the crawl.
                    pass
        print(n)
    except Exception:
        # The question could not be read; move on to the next id.
        print("空")
35367500, 35210878, 35134422, 35062190, 35012924, 35004585, 34893663, 34370944, 34225657, 33488763, 33259890, 32207070, 31592568, 31365240, 31337752, 30966406, 30158223, 29735498, 29582607, 29550579, 29525971, 29519716, 29518811, 29511036, 29508808, 29448162, 27255630, 25951351, 23863606, 19930380 ] qid_run = qid_remaining[200:] rows = [] fail_qid = [] counter = 1 for qid in qid_run: try: ans = client.question(int(qid)).answers for v in ans: rows.append((v._id, qid, v.voteup_count, v.comment_count)) print("success {0}".format(counter)) counter += 1 except: # if crawl failed, append qid to list fail_qid[] fail_qid.append(qid) print("fail {0}".format(counter)) counter += 1 print(fail_qid) continue output_file = "./answers.csv" headers = ["aid", "qid", "voteup_count", "comment_count"] with open(output_file, 'a') as f:
# Log in: prefer the saved token, fall back to an interactive login.
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)
    print('login success!')

# # Answer information
# answer = client.answer(94150403)
#
# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)
#
# for voter in answer.voters:
#     print(voter.name, voter.headline)

question = client.question(35166763)
print(question.title)

# Save the first 10 answers of the question, then stop.
count = 0
for count, answer in enumerate(question.answers, start=1):
    answer.save(r'Data\Answers\\' + question.title)
    if count == 10:
        break
class ZhiHu(object):
    """Store Zhihu questions in a local database and export them as Markdown."""

    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """Log in to Zhihu, then open the ``zhihu.db`` database."""
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """Create ``self.client`` and authenticate it.

        A saved token file is loaded when present; otherwise the user logs
        in through the terminal and the new token is saved.
        """
        self.client = ZhihuClient()
        if not os.path.isfile(self.TOKEN_FILE):
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)
            return
        self.client.load_token(self.TOKEN_FILE)

    def save_quesions(self, topic_id):
        """Save the unanswered questions of a topic that have 10+ answers.

        :param topic_id: id of the topic whose questions are stored.
        """
        topic = self.client.topic(topic_id)
        print(topic)
        sql_tmp = 'replace into questions values(?,?,?,?,?,?)'
        for question in topic.unanswered_questions:
            if question.answer_count >= 10:
                row = [
                    question.id,
                    question.title,
                    question.follower_count,
                    question.answer_count,
                    question.comment_count,
                    topic_id,
                ]
                print(row)
                stored = self.db.update(sql_tmp, args=row)
                print('insert success!' if stored else 'insert error!')

    def save_answer_info(self, question_id):
        """Print a summary of the first answer of a question and save it.

        :param question_id: id of the question to inspect.
        """
        question = self.client.question(question_id)
        print(question.title)
        # Only the first answer is handled.
        first = next(iter(question.answers), None)
        if first is not None:
            print(first.comment_count, first.excerpt, first.question,
                  first.thanks_count, first.voteup_count)
            first.save()

    def to_md(self, topic, file_name):
        """Write the 1000 most-followed stored questions of *topic* to a file.

        Each question becomes one numbered line with a link and its
        follower / answer / comment counts.

        :param topic: topic id used to select rows from ``questions``.
        :param file_name: path of the file to (over)write, UTF-8 encoded.
        """
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        rows = self.db.query(sql)
        line_tmp = "%s. [%s](https://www.zhihu.com/question/%s) 关注数:%s 回答数:%s 评论数:%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as f:
            for number, item in enumerate(rows, start=1):
                fields = (number, item['title'], item['id'], item['follower_count'],
                          item['answer_count'], item['comment_count'])
                f.write(line_tmp % fields)
# Work out which questions have not been downloaded yet: every entry in the
# download directory is named "<question id>#<title>".
path = r"D:\Kuangyichen\Repository_py3\Zhihu\Data\Gene"
download = []
for i in os.listdir(path):
    download.append(int(i.split('#')[0]))
out = set(questions).difference(download)
print(len(out))
for i in out:
    print(i)

# Log in, reusing the saved token when available.
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Read the ids of the questions that are still left, one id per line.
path = r'D:\Kuangyichen\Repository_py3\Zhihu\Data\lefted'
questions = []
with open(path, 'r', encoding='UTF8') as Reader:
    for line in Reader:
        questions.append(int(line))

# Save every answer of every remaining question, one directory per question
# and one file per author.
for q in questions:
    question_t = client.question(q)
    print("{}start".format(q))
    for answer in question_t.answers:
        print(answer.author.id, answer.author.name)
        target_dir = 'Data\\Gene\\' + str(question_t.id) + '#' + question_t.title
        target_name = str(answer.author.id) + '#' + answer.author.name
        answer.save(target_dir, target_name)
    print("{}end".format(q))
client = ZhihuClient() #登录部分 try: client.login(ZHIHU_ID, ZHIHU_KEY) except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login(ZHIHU_ID, ZHIHU_KEY, captcha) the_question = client.question(QUESTION_ID) print(the_question.title) a = 0 with open('all_answers.txt', 'w') as f_txt: f_txt.write("昵称&用户关注数&用户粉丝数&用户回答数&用户获赞数&用户获得感谢数&用户性别&用户学校&用户学院&回答内容(去标点符号)&回答日期&最后修改日期&赞数&感谢数&评论数&是否允许评论&是否被建议修改&回答可信度指数&情感分析积极性&情感分析消极性&情感倾向&回答内容") for the_answer in the_question.answers: the_author = the_answer.author author_name = the_author.name
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # timestamp taken at start-up

# ======================== Login ========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ======================== Query the question ========================
qid = 48217184
question = client.question(qid)

# Printed one by one, in this order, each attribute being read only right
# before it is printed.
for label, attr in (
        ('允许删除', 'allow_delete'),
        ('答案数', 'answer_count'),
        ('答案', 'answers'),
        ('评论数', 'comment_count'),
        ('评论', 'comments'),
        ('细节', 'detail'),
        ('摘录', 'excerpt'),
        ('关注数', 'follower_count'),
        ('关注人', 'followers'),
        ('问题ID', 'id'),
        ('重定向', 'redirection'),
        ('状态', 'status'),
        ('建议修改', 'suggest_edit'),
        ('标题', 'title'),
        ('话题', 'topics'),
):
    print(label, getattr(question, attr))

# The update time is passed through localtime() and shown as a local date.
updated_at = time.localtime(question.updated_time)
print('更新时间', time.strftime("%Y-%m-%d %H:%M:%S", updated_at))
TOKEN_FILE = 'token.pkl' client = ZhihuClient() if os.path.isfile(TOKEN_FILE): client.load_token(TOKEN_FILE) else: try: client.login('email_or_phone', 'password') except NeedCaptchaException: with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('email_or_phone', 'password', captcha) client.save_token(TOKEN_FILE) question = client.question(int(question_id)) print(question.title) wb = Workbook() sheet = wb.active sheet.title = "知乎" item_name = [ 'time_now', 'content', 'author', 'gender', 'loc', 'business', 'company', 'job', 'created_time', 'updated_time', 'voteup_count', 'comment_count', 'thanks_count' ] for j, title in enumerate(item_name): sheet.cell(row=1, column=j + 1).value = title num = 0 for answer in question.answers: num += 1 item_data = [datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')]
# @Email : [email protected] # @File : save_images.py ''' @Description:保存知乎某个问题下所有答案的图片 ''' from __future__ import print_function # 使用python3的print方法 from zhihu_oauth import ZhihuClient import re import os import urllib.request client = ZhihuClient() # 登录 client.load_token('token.pkl') # 加载token文件 id = 24400664 # https://www.zhihu.com/question/24400664(长得好看是一种怎么样的体验) question = client.question(id) print(u"问题:", question.title) print(u"回答数量:", question.answer_count) # 建立存放图片的文件夹 os.mkdir(question.title + u"(图片)") path = question.title + u"(图片)" index = 1 # 图片序号 for answer in question.answers: content = answer.content # 回答内容 re_compile = re.compile( r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>') img_lists = re.findall(re_compile, content) if (img_lists): for img in img_lists: img_url = img[0] # 图片url urllib.request.urlretrieve(img_url, path + u"/%d.jpg" % index)
Zhihu = dbClient['Zhihu'] ZhihuData = Zhihu[str(questionID)] if ZhihuData.find(): ZhihuData.remove({}) # 登陆知乎账号 client = ZhihuClient() try: client.login(account, passwd) except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login(account, passwd, captcha) # 创建问题对象 question = client.question(questionID) # 读取问题下所有的回答并保存起来 print(question.title) count = 0 for answer in question.answers: count+=1 try: data = { 'title':question.title, 'author':answer.author.name, 'description':answer.author.description, 'content':answer.content, 'voteup':answer.voteup_count, 'thanks':answer.thanks_count } print("正在保存第%s个回答" %count)
# tableNum = mysql.cur.execute("SELECT concat('DROP TABLE IF EXISTS ', table_name, ';') ""FROM information_schema.tables WHERE table_schema = 'urldata';") # if tableNum == 0: # print("该数据库下所有表删除完毕\n---------------------------------------") # else: # print("删除失败") # # 在数据库(urldata)中新建表 try: mysql.cur.execute( "create table answer(answer_id int(4),author_name varchar(40),author_healine varchar(40),agree_num int(4),comment_num int(4),thanks_count int(4), url varchar(100))" ) mysql.cur.execute( "create table comments(currentanswer_id int(4),commentID int(4), commentpersonName varchar(40), words varchar(300))" ) # 爬取知乎热榜第一的所有回答//5G发放牌照 question = client.question(328058110) for answer in question.answers: try: mysql.cur.execute( "insert into answer values(%d,'%s','%s',%d,%d,%d,'%s')" % (answer.id, answer.author.name, answer.author.headline, answer.voteup_count, answer.comment_count, answer.thanks_count, answer._build_url())) except BaseException: print("a answer is nelected") question = client.question(328058110) for answer in question.answers: print(answer.pure_data) # 爬取知乎热榜所有评论 question = client.question(328058110) for answer in question.answers:
def time2str(timeStamp):
    """Format a Unix timestamp as local time, ``YYYY-MM-DD HH:MM:SS``."""
    timeArray = time.localtime(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", timeArray)


def ucps2str(ucpstr):
    '''Convert unicode code point (in hex) ascii string to unicode string.

    The input is read in groups of four hex digits, each group being one
    code point; trailing characters that do not fill a group are ignored.
    '''
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr covers the whole Unicode range
    # Floor division: "/" yields a float on Python 3, which range() rejects.
    group_count = len(ucpstr) // 4
    return ''.join(
        to_char(int(ucpstr[i * 4:i * 4 + 4], 16)) for i in range(group_count))


# Question: "Which purchases make you wish you had bought them sooner,
# because of how much they improved your quality of life and happiness?"
question = client.question(20840874)

# The answers generator of a question gives access to each answer's author
# (answer.author), and through it to the author's profile.
# for answer in question.answers:
#     print(answer.author.name+ str(answer.author.answer_count) + ' ' + str(answer.voteup_count))
#     print(answer.author.id)  # why does the id look like this? 10fc5f92b8f7f7cd1a058d10a0f36ce0

# for answer in question.answers:
#     # print('id',ucps2str(answer.author.id),answer.author.id.decode('hex'),binascii.unhexlify(answer.author.id))
#     print('id', answer.author.i)
#     print('uid',answer.author.uid)
#     print('name', answer.author.name)
#     print('gender', answer.author.gender)
#     print('headline', answer.author.headline)
#     print('description', answer.author.description)
#     print('\n')