def zhihu_login():
    """Return a ZhihuClient that is logged in.

    If TOKEN_FILE exists the saved token is loaded; otherwise the client
    logs in with the built-in account and writes the token to TOKEN_FILE.
    The logged-in user's name is printed before the client is returned.
    """
    client = ZhihuClient()
    if not os.path.isfile(TOKEN_FILE):
        # No cached token yet: do a password login and cache the token.
        client.login('*****@*****.**', 'a4906639')
        client.save_token(TOKEN_FILE)
    else:
        client.load_token(TOKEN_FILE)
    current_user = client.me()
    print(current_user.name)
    return client
def main():
    """Log in with a user-typed captcha, then fetch data and write the outputs."""
    # login
    client = ZhihuClient()
    log_in(client)
    Image('./a.gif')
    captcha_text = input('please input captcha:')
    client.login('account', 'psw', captcha_text)

    # Crawl, then write both output sets.
    get_data(client)
    write_hk_student_info()
    write_modules()
def login(username, password):
    """Log in to Zhihu and save the session token to ``token.pkl``.

    ``ZhihuClient.login`` reports failure through its ``(success, reason)``
    return value rather than by raising, so the result is inspected before
    anything is printed.

    :param username: e-mail address or phone number of the account.
    :param password: account password.
    """
    from zhihu_oauth import ZhihuClient
    from zhihu_oauth.exception import NeedCaptchaException

    client = ZhihuClient()
    try:
        success, reason = client.login(username, password)
    except NeedCaptchaException:
        # A captcha is required: save the image, ask the user to type it,
        # then log in again with the captcha text.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('please input captcha:')
        success, reason = client.login(username, password, captcha)

    if success:
        print(u"登陆成功!")
    else:
        print(u"login failed: %s" % (reason,))

    # Save the token. This call is left unconditional so that whatever
    # save_token does on a client that is not logged in still reaches the
    # caller exactly as before.
    client.save_token('token.pkl')
def login(account, password):
    """Return a ZhihuClient, restoring a saved token or logging in afresh.

    The token is loaded from TOKEN_FILE when that file exists. Otherwise a
    password login is attempted (with a captcha prompt if required) and the
    token is written to ``./token/token.pkl``.

    The client is always returned, even if login fails: failures are
    reported on stdout instead of being raised. Unlike a ``return`` inside
    ``finally``, this does not swallow ``KeyboardInterrupt``/``SystemExit``.

    :param account: e-mail address or phone number of the account.
    :param password: account password.
    :return: the (possibly not logged-in) ZhihuClient instance.
    """
    import os

    client = ZhihuClient()
    try:
        try:
            client.load_token(TOKEN_FILE)
        except FileNotFoundError:
            try:
                client.login(account, password)
            except NeedCaptchaException:
                # Save the captcha image, prompt for it, then log in again.
                os.makedirs('./captcha', exist_ok=True)
                with open('./captcha/a.gif', 'wb') as f:
                    f.write(client.get_captcha())
                captcha = input('please input captcha:')
                client.login(account, password, captcha)
            # NOTE(review): the token is loaded from TOKEN_FILE but saved to
            # './token/token.pkl' -- confirm both refer to the same path.
            os.makedirs('./token', exist_ok=True)
            client.save_token('./token/token.pkl')
    except Exception as exc:
        # Best effort, as before: report the problem but still hand back
        # the client object.
        print('login failed: %r' % (exc,))
    return client
def main():
    """Log in, then crawl the answers of every user id listed in USER_CSV_PATH.

    Each user's answers are cut into chunks of 60 and handed to
    ``save_to_csv_a`` after every ``flush_every`` users.
    """
    client = ZhihuClient()
    try:
        client.login_in_terminal(username=email_or_phone, password=password)
        # Keep the session so later runs can reuse it.
        client.save_token(TOKEN_FILE)
    except NeedCaptchaException:
        # Save the captcha image, prompt for it, then log in again.
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        captcha = input('请输入验证码: ')
        client.login(email_or_phone, password, captcha)

    pending_chunks = []
    users_since_flush = 0
    flush_every = 1

    with open(USER_CSV_PATH) as user_file:
        for raw_line in user_file:
            crawl_id = raw_line.strip('\n')
            crawler = MyCrawler(crawl_id, client)
            print('------>>>| 待爬取的用户的知乎id为: ', crawl_id)

            answers = crawler.crawling_answer(crawl_id)
            print('该用户爬取完毕'.center(60, '*'))

            if len(answers) % 60 != 0:
                print('无用的输出!')
            else:
                # Length is an exact multiple of 60: split into 60-item chunks.
                for start in range(0, len(answers), 60):
                    pending_chunks.append(answers[start:start + 60])

            users_since_flush += 1
            if users_since_flush == flush_every:
                save_to_csv_a(pending_chunks, client)
                pending_chunks = []
                users_since_flush = 0

    print('全部用户采集完毕'.center(40, '*'))
def main():
    """Log in, then crawl the answers of every user id in USER_TRY_CSV_PATH.

    Each user's answers are cut into chunks of 60 and passed to
    ``save_to_csv_a`` after every ``flush_every`` users, with a random
    1-3 second pause between users.
    """
    client = ZhihuClient()
    try:
        client.login('*****@*****.**', 'durant')
    except NeedCaptchaException:
        print("Login Error")
        # Save the captcha image, prompt for it, then log in again.
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login('*****@*****.**', 'durant', captcha)

    flush_every = 1
    users_since_flush = 0
    pending_chunks = []

    with open(USER_TRY_CSV_PATH) as user_file:
        for raw_line in user_file:
            craw_id = raw_line.strip("\n")
            crawler = MyCrawler(craw_id, client)
            print(craw_id)

            answers = crawler.crawling_answer(craw_id)
            if len(answers) % 60 != 0:
                print("Invalid Output")
            else:
                # Length is an exact multiple of 60: split into 60-item chunks.
                for start in range(0, len(answers), 60):
                    pending_chunks.append(answers[start:start + 60])

            # Pause between users.
            pause_seconds = random.randint(1, 3)
            time.sleep(pause_seconds)

            users_since_flush += 1
            if users_since_flush == flush_every:
                save_to_csv_a(pending_chunks)
                pending_chunks = []
                users_since_flush = 0
class Crawler:
    """Crawl Zhihu users, answers, questions and topics into a SQLite database.

    The ``*info`` methods store one entity row each; the relation methods
    (``topic_questions``, ``topic_users``, ...) store the links between
    entities. ``justdoit`` drives a crawl step by finding ids that appear in
    one table but not yet in another and calling the method named after the
    target table for each of them.
    """

    # Tables that justdoit can fill; each one has a method of the same name.
    _CRAWL_TARGETS = frozenset([
        "userinfo", "answerinfo", "questioninfo", "topicinfo",
        "question_answers", "question_topics", "question_users",
        "topic_questions", "topic_users", "user_users", "user_topics",
    ])

    def __init__(self, dbname, email, key):
        """Open the database ``dbname`` and log in to Zhihu.

        :param dbname: path of the SQLite database file.
        :param email: Zhihu account name.
        :param key: Zhihu account password.

        If Zhihu asks for a captcha, a message is printed and the client is
        left not logged in.
        """
        self.con = sqlite3.connect(dbname)
        self.cursor = self.con.cursor()
        self.zhclient = ZhihuClient()
        try:
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            print("需要输入验证码,账号 %s 可能已失效" %(email))

    def __del__(self):
        # __init__ may have failed before the connection was opened.
        con = getattr(self, "con", None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.con.commit()

    def createindextables(self):
        """Create all tables and indexes, then commit."""
        statements = (
            'create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)',
            'create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)',
            'create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)',
            'create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)',
            'create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)',
            'create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)',
            'create table question_users(question_id,question_title text,user_id,user_name text,record_time text)',
            'create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)',
            # record_time is required here: user_users() inserts it.
            'create table user_users(user_id,user_follower_id,record_time text)',
            'create table question_topics(question_id,topic_id,topic_name text,record_time text)',
            'create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)',
            'create index userinfoidx on userinfo(id)',
            'create index answerinfoidx on answerinfo(id)',
            'create index questioninfoidx on questioninfo(id)',
            'create index topicinfoidx on topicinfo(id)',
            'create index topic_questionsidx on topic_questions(topic_id,question_id)',
            'create index topic_usersidx on topic_users(topic_id,user_id)',
            'create index question_usersidx on question_users(question_id,user_id)',
            'create index question_answersidx on question_answers(question_id,answer_id)',
            'create index user_usersidx on user_users(user_id,user_follower_id)',
            'create index question_topicsidx on question_topics(question_id,topic_id)',
            'create index user_topicsidx on user_topics(user_id,topic_id)',
        )
        for statement in statements:
            self.cursor.execute(statement)
        self.dbcommit()

    def justdoit(self, table1, field1, table2, field2):
        """Crawl every id present in ``table1.field1`` but not in ``table2.field2``.

        The method named ``table2`` is called once per missing id. An
        unknown ``table2`` results in no work being done.

        :return: always ``None``.
        """
        # Table and column names cannot be bound as SQL parameters; they are
        # expected to be trusted identifiers supplied by the program itself.
        set2 = set(self.cursor.execute("select DISTINCT {} from {}".format(field2, table2)).fetchall())
        set1 = set(self.cursor.execute("select DISTINCT {} from {}".format(field1, table1)).fetchall())
        work_set = set1 - set2

        if table2 not in self._CRAWL_TARGETS:
            return None
        handler = getattr(self, table2)
        for row in work_set:
            handler(row[0])  # rows are 1-tuples
            if table2 == "answerinfo":
                time.sleep(0.1)  # small pause between answer fetches
        return None

    # Topic -> (best) question relation
    def topic_questions(self, topic_id):
        """Store the questions behind the best answers of topic ``topic_id``.

        :raises GetDataErrorException: re-raised after being reported
            (unlike the other relation methods, which swallow it).
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            for hot_ques in shield(topic.best_answers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("topic_questions", "topic_id", "question_id", topic.id, hot_ques.question.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (topic.id, topic.name, hot_ques.question.id, hot_ques.question.title, record_time)
                self.cursor.execute("insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", hot_ques.question.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Topic -> follower relation
    def topic_users(self, topic_id, start_at=0):
        """Store the followers of topic ``topic_id``.

        :param start_at: forwarded to ``shield`` as ``start_at``.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            for follower in shield(topic.followers, start_at=start_at, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("topic_users", "topic_id", "user_id", topic.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (topic.id, topic.name, follower.id, follower.name, record_time)
                self.cursor.execute("insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", topic.name, follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Question -> follower relation
    def question_users(self, question_id):
        """Store the followers of question ``question_id``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            for follower in shield(question.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("question_users", "question_id", "user_id", question.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (question.id, question.title, follower.id, follower.name, record_time)
                self.cursor.execute("insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", follower.name, question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Question -> answer relation
    def question_answers(self, question_id):
        """Store the answers of question ``question_id``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (question.id, question.title, answer.id, answer.author.id, record_time)
                self.cursor.execute("insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            print("Pass the ZhihuWarning")

    # User -> follower relation. Per the original author's note, the Zhihu
    # API returns at most 5020 followers per user.
    def user_users(self, user_id):
        """Store the followers of user ``user_id``."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            for follower in shield(people.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("user_users", "user_id", "user_follower_id", people.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (people.id, follower.id, record_time)
                self.cursor.execute("insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)", values)
                self.dbcommit()
                print("正在处理", follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Question -> topic relation
    def question_topics(self, question_id):
        """Store the topics attached to question ``question_id``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            for topic in shield(question.topics):
                status = self.isdupicaterel("question_topics", "question_id", "topic_id", question.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (question.id, topic.id, topic.name, record_time)
                self.cursor.execute("insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", topic.name, question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # User -> followed topic relation
    def user_topics(self, user_id):
        """Store the topics followed by user ``user_id``."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            for topic in shield(people.following_topics):
                status = self.isdupicaterel("user_topics", "user_id", "topic_id", people.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                values = (people.id, people.name, topic.id, topic.name, record_time)
                self.cursor.execute("insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", people.name, topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # Duplicate detection
    def isdupicateid(self, table, id):
        """Return the rowid of the row in ``table`` whose ``id`` matches, or None."""
        # The row is fetched straight after the SELECT; a read needs no
        # commit, and committing before fetching can invalidate the cursor.
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        res = cur.fetchone()
        return None if res is None else res[0]

    def isdupicaterel(self, table, field1, field2, id1, id2):
        """Return the rowid of the ``(field1=id1, field2=id2)`` row in ``table``, or None."""
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table, field1, field2), (id1, id2))
        res = cur.fetchone()
        return None if res is None else res[0]

    # User profile
    def userinfo(self, user_id):
        """Fetch the profile of ``user_id`` and store it unless already present."""
        try:
            status = self.isdupicateid("userinfo", user_id)
            if status is not None:
                print("重复,rowid", status)
                return
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            address = "|".join([location.name for location in people.locations])
            school_name = "|".join([education.school.name for education in people.educations if "school" in education])
            job = "|".join([employment.job.name for employment in people.employments if "job" in employment])
            company = "|".join([employment.company.name for employment in people.employments if "company" in employment])
            business = people.business.name if people.business else None

            # Badge details
            badge = people.badge
            identity = badge.identity if badge.has_identity else None
            # NOTE(review): unlike the "|"-joined fields above, topic names are
            # concatenated with no separator -- confirm this is intended.
            best_topics = "".join([topic.name for topic in badge.topics]) if badge.is_best_answerer else None
            if badge.is_organization:
                is_organization = 1
                org_name = badge.org_name
                org_home_page = badge.org_home_page
                org_industry = badge.org_industry
            else:
                is_organization = 0
                org_name = org_home_page = org_industry = None

            values = (
                people.id, people.name, people.headline, people.gender,
                address, business, school_name, job, company,
                people.answer_count, people.question_count,
                people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count,
                people.following_question_count, people.following_topic_count,
                people.collected_count,
                identity, best_topics, is_organization,
                org_name, org_home_page, org_industry, record_time)
            self.cursor.execute(
                "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", people.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Answer details
    def answerinfo(self, answer_id):
        """Fetch answer ``answer_id`` and store it unless already present.

        :return: ``("重复,rowid", rowid)`` when the answer is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status is not None:
                return ("重复,rowid", status)
            answer = self.zhclient.answer(answer_id)
            record_time = self.logtime()
            values = (answer.id, answer.content, answer.author.id, answer.voteup_count, answer.thanks_count, answer.comment_count, answer.created_time, answer.updated_time, record_time)
            self.cursor.execute("insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", answer.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # Drop the unreachable answer from question_answers right away so
            # it is not retried over and over after switching accounts.
            self.cursor.execute("delete from question_answers where answer_id = ?", (answer_id,))
            self.dbcommit()
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Question details
    def questioninfo(self, question_id):
        """Fetch question ``question_id`` and store it unless already present.

        :return: ``("重复,rowid", rowid)`` when the question is already
            stored, otherwise ``None``.
        """
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status is not None:
                return ("重复,rowid", status)
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            values = (question.id, question.title, question.follower_count, question.answer_count, question.created_time, question.updated_time, record_time)
            self.cursor.execute("insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Topic details
    def topicinfo(self, topic_id):
        """Fetch topic ``topic_id`` and store it unless already present.

        :return: ``("重复,rowid", rowid)`` when the topic is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status is not None:
                return ("重复,rowid", status)
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            values = (topic.id, topic.name, topic.best_answer_count, topic.follower_count, topic.question_count, record_time)
            self.cursor.execute("insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)", values)
            self.dbcommit()
            print("正在处理", topic.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    # Timestamp
    def logtime(self):
        """Return today's local date formatted as ``YYYY-MM-DD``."""
        fmt = '%Y-%m-%d'
        return time.strftime(fmt, time.localtime(time.time()))

    def add_counts(self, filepath="logincounts.txt"):
        """Read accounts from ``filepath``, one ``<count>----<key>`` pair per line.

        Blank lines are skipped. Only the first ``----`` splits a line, so
        the key itself may contain that sequence.

        :return: list of ``{"count": ..., "key": ...}`` dicts in file order.
        """
        counts = []
        with open(filepath) as account_file:
            for line in account_file:
                if not line.strip():
                    continue
                count, key = line.split("----", 1)
                counts.append({"count": count, "key": key.strip("\n")})
        return counts

    def get_proxy(self):
        """Return the body of a GET to the local proxy-pool URL.

        :return: the response text on HTTP 200; ``None`` on any other
            status or when the connection fails.
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError class, which is not the
            # builtin one, so both are caught.
            return None
        if response.status_code == 200:
            return response.text
        return None
#-*- coding:utf-8 -*-
# Log in to Zhihu, open an article by URL and write the name of every
# comment author to a text file, one per line.
from __future__ import print_function
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()
try:
    # When logging in with a phone number, prefix it with +86.
    client.login('邮箱或电话', '密码')
except NeedCaptchaException:
    # Save the captcha image, prompt for the text shown in the gif and
    # log in again with it.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('邮箱或电话', '密码', captcha)

# Any Zhihu link works here; the object type is detected automatically.
article = client.from_url('https://zhuanlan.zhihu.com/p/25671089')
print(article.author.name)
print(article.voteup_count)

# Build the output in one pass instead of repeated string concatenation.
result = ''.join(comment.author.name + '\n' for comment in article.comments)

# Raw string: the backslashes are path separators, not escape sequences.
# The context manager closes the file even if the write fails.
with open(r'E:\GitHouse\zhihu-oauth\jilu4.txt', 'w') as f:
    f.write(result)
#coding=utf-8 from __future__ import print_function # 使用python3的print方法 from zhihu_oauth import ZhihuClient from zhihu_oauth.exception import NeedCaptchaException client = ZhihuClient() try: client.login('account', 'password') except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = raw_input('please input captcha:') client.login('account', 'password', captcha) client.save_token('token.pkl') # 保存token #有了token之后,下次登录就可以直接加载token文件了 # client.load_token('filename') from zhihu_oauth import ZhihuClient import re import os import urllib client = ZhihuClient() # 登录 # 加载token文件 client.load_token('token.pkl')
# Log in to Zhihu (with a captcha prompt if required) and save the token.
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Defined once so the captcha retry uses the same account as the first
# attempt (it previously retried with different placeholder values).
EMAIL = '#Email#'
PASSWORD = '#Password#'

client = ZhihuClient()

try:
    client.login(EMAIL, PASSWORD)
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(EMAIL, PASSWORD, captcha)

client.save_token('token.pkl')
str_sentiment = '正向' out = {} out['confidence'] = str(confidence) out['positive'] = str(positive_prob) out['negative'] = str(negative_prob) out['sentiment'] = str_sentiment return out client = ZhihuClient() #登录部分 try: client.login(ZHIHU_ID, ZHIHU_KEY) except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login(ZHIHU_ID, ZHIHU_KEY, captcha) the_question = client.question(QUESTION_ID) print(the_question.title)
import os import requests from pyquery import PyQuery as pq from requests import RequestException from zhihu_oauth import ZhihuClient from zhihu_oauth.exception import NeedCaptchaException import time import csv today = time.strftime('%Y-%m-%d', time.localtime(time.time())) now = time.strftime('%H', time.localtime(time.time())) total_detail = [] client = ZhihuClient() try: client.login('#', '#') # 使用手机号登录需要在手机号前加 +86 前缀 except NeedCaptchaException: # 保存验证码并提示输入,输入gif所示验证码进行登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('#', '#', captcha) def get_links(Num): url = 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A' + Num + '%2C"type"%3A"day"%7D' header = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8',
from zhihu_oauth.exception import NeedCaptchaException
from bs4 import BeautifulSoup
import json
import random

# HTTP session used by the rest of the script.
s = requests.session()
s.keep_alive = True


# In[2]:

# Log in with ZhihuClient and save the token.
client = ZhihuClient()
user = '******'
pwd = '961204yy'

try:
    client.login(user, pwd)
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(user, pwd, captcha)

client.save_token('token.kpl')

# Alternative kept for reference (load the token if it exists, otherwise
# log in from the terminal):
# TOKEN_FILE = 'token.pkl'
#
# if os.path.isfile(TOKEN_FILE):
#     client.load_token(TOKEN_FILE)
# else:
#     client.login_in_terminal()
#     client.save_token(TOKEN_FILE)
from openpyxl import Workbook from openpyxl import load_workbook from zhihu_oauth import ZhihuClient from zhihu_oauth.exception import NeedCaptchaException topic_id = input('请输入话题id号,获取所有答案:') file_name = input('请输入保存文件名,不带后缀:') TOKEN_FILE = 'token.pkl' client = ZhihuClient() if os.path.isfile(TOKEN_FILE): client.load_token(TOKEN_FILE) else: try: client.login('email_or_phone', 'password') except NeedCaptchaException: with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('email_or_phone', 'password', captcha) client.save_token(TOKEN_FILE) topic = client.topic(int(topic_id)) print(topic.name) #日志设置 logging.basicConfig(level=logging.ERROR, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='zhi.log',
text_readline = [] # 初始化按行存储数据列表,不接受结尾换行符 with open(txt_file_path) as fin: for line in fin: text_readline.append((line).replace('\n', '')) print(text_readline) for i in range(len(text_readline)): print(text_readline[i]) # ================读取账号和密码================ account = text_readline[0] passward = text_readline[1] client = ZhihuClient() try: client.login(account, passward) except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login(account, passward, captcha) # 必须在 client 已经处于登录状态时才能使用 client.save_token('/Users/alicewish/我的坚果云/token.pkl') # ================运行时间计时================ run_time = time.time() - start_time if run_time < 60: # 两位小数的秒 print("耗时:{:.2f}秒".format(run_time)) elif run_time < 3600: # 分秒取整
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2018/1/15 17:40 # @Author : glacier # @Site : # @File : zhihu_new.py # @Software: PyCharm Edu from zhihu_oauth import ZhihuClient from zhihu_oauth.exception import NeedCaptchaException import pymysql client = ZhihuClient() try: client.login('13776390465', '14715912300.mm') # 必须在 client 已经处于登录状态时才能使用 client.save_token('token.pkl') except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('13776390465', '14715912300.mm', captcha) client.load_token('token.pkl') me = client.me() import traceback client = ZhihuClient()
#! /usr/bin/env python3
# coding: utf-8
# Log in to Zhihu (with a captcha prompt if required) and save the token.

# Placeholder values ...
username = '******'
pwd = 'zhihu password'

# ... which are overridden by the values actually used below.
username = '******'
pwd = '!fGP+GT5dSK*'

#-----------------------------

from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

try:
    client.login(username, pwd)
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(username, pwd, captcha)

# Save the token.
client.save_token('token.pk1')
### Crawls question titles from zhihu.com using the zhihu_oauth module.
from zhihu_oauth import ZhihuClient
# GetDataErrorException is imported by name: the bare module name
# `zhihu_oauth` is never bound here, so the previous
# `except zhihu_oauth.exception.GetDataErrorException` raised NameError
# as soon as a lookup failed.
from zhihu_oauth.exception import GetDataErrorException, NeedCaptchaException

client = ZhihuClient()

try:
    client.login(zhihu_account, zhihu_password)
except NeedCaptchaException:
    ### Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(zhihu_account, zhihu_password, captcha)

### Question ids on zhihu.com are not always continuous, so every id in
### the range is tried and the ones that fail are skipped.
q = []
for question_id in range(281736391, 311982210):  # avoids shadowing builtin id()
    question = client.question(question_id)
    try:
        q.append(question.title)
    except GetDataErrorException:
        continue

### The original author reports crawling 140,000+ questions this way.
# Log in to Zhihu (with a captcha prompt if required) and save the token.
import pandas as pd
from zhihu_oauth import ZhihuClient

client = ZhihuClient()

from zhihu_oauth.exception import NeedCaptchaException

try:
    client.login('*****@*****.**', 'justbemyself1998')
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('*****@*****.**', 'justbemyself1998', captcha)

client.save_token("token.pkl")
# coding=utf-8 from __future__ import unicode_literals, print_function import os from zhihu_oauth import ZhihuClient TOKEN_FILE = 'token.pkl' client = ZhihuClient() if os.path.isfile(TOKEN_FILE): client.load_token(TOKEN_FILE) else: client.login('*****@*****.**', 'Zhihu2Ebook') client.save_token(TOKEN_FILE)
# Python 2 script: load the saved Zhihu token, or log in and save one.
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

test_email = '*****@*****.**'
test_password = '******'
token_file = './token.pkl'

if not os.path.lexists(token_file):
    try:
        login_result = client.login(test_email, test_password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for it, then log in again.
        print(u'登录失败,需要输入验证码')
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print('login result => ')
    print(login_result)
    client.save_token(token_file)
    print('save token success')
else:
    client.load_token(token_file)
    print('load token success')

# question
# The JSON is written into a web page; in Chrome, press F12 and open the
# Preview tab to see the rendered JSON structure.
response_file_uri = './question_response.html'
question_id = 35005800
# Log in to Zhihu (with a captcha prompt if required) and save the token.
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

try:
    client.login('account', 'pwd')
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('account', 'pwd', captcha)
    print("Exception")

client.save_token('token.pkl')
#!/usr/bin/env python # coding: utf-8 import os from zhihu_oauth import ZhihuClient, ActType, People from zhihu_oauth.exception import NeedCaptchaException from zhihu_oauth.helpers import ts2str, act2str token = './XXX.pk1' client = ZhihuClient() try: if os.path.exists(token): client.load_token(token) else: client.login('username', 'passwd') except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = raw_input('please input captcha:') client.login('username', 'passwd', captcha) client.save_token(token) def dump_activities(pid): person = client.people(pid) filter_types = { ActType.COLLECT_ANSWER, ActType.COLLECT_ANSWER, ActType.COLLECT_ARTICLE, ActType.CREATE_ANSWER,
from requests import RequestException from zhihu_oauth import ZhihuClient from zhihu_oauth.exception import NeedCaptchaException import time import csv from smtplib import SMTP_SSL from email.header import Header from email.mime.text import MIMEText from email.mime.multipart import MIMEMultipart today = time.strftime('%Y-%m-%d', time.localtime(time.time())) now = time.strftime('%H', time.localtime(time.time())) total_detail = [] client = ZhihuClient() try: client.login('ID', 'password') # 使用手机号登录需要在手机号前加 +86 前缀 except NeedCaptchaException: # 保存验证码并提示输入,输入gif所示验证码进行登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('ID', 'password', captcha) def get_links(Num): url = 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A' + Num + '%2C"type"%3A"day"%7D' header = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8',
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  7 18:02:01 2017

@author: roger

Log in to Zhihu, print the login result and save the token.
"""

from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Defined once so the captcha retry uses the same account as the first
# attempt (it previously retried with placeholder credentials).
EMAIL = '*****@*****.**'
PASSWORD = 'qi142857'

client = ZhihuClient()

try:
    (state, reason) = client.login(EMAIL, PASSWORD)
except NeedCaptchaException:
    # Save the captcha image, prompt for it, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    (state, reason) = client.login(EMAIL, PASSWORD, captcha)

# Printed on both paths so the outcome of the retry is visible too.
print(state, reason)

client.save_token('./token.pkl')
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Script: log in to Zhihu once and cache the session token in token.pkl.

# NOTE(review): credentials are committed in plain text; move them to an
# environment variable or an untracked config file before sharing this file.
ACCOUNT = '*****@*****.**'
PASSWORD = '449137973zazazzh'

client = ZhihuClient()

try:
    success, reason = client.login(ACCOUNT, PASSWORD)
except NeedCaptchaException:
    # Save the captcha image, ask the user to type it in, then log in again
    # with the SAME account (the retry previously used placeholder strings).
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    success, reason = client.login(ACCOUNT, PASSWORD, captcha)

# Save the token after either login path; previously the captcha path never
# saved it because save_token() lived inside the ``try`` body.
if success:
    client.save_token('token.pkl')
else:
    print(reason)
class Crawl:
    """Crawl Zhihu Live data through a ``ZhihuClient`` session.

    Lives are stored as ``MyLive`` records and their messages as
    ``LiveContent`` records (both defined elsewhere in the project;
    presumably mongoengine documents -- confirm against the models module).
    """

    def __init__(self):
        self.client = ZhihuClient()

    def login(self, username, password):
        """Authenticate ``self.client``, reusing a cached token when present.

        The token is cached per user at ``app/Resource/<username>.token``.
        If Zhihu asks for a captcha, the image is written to ``a.gif`` and
        the user is prompted for its text on stdin.
        """
        token_path = 'app/Resource/' + username + '.token'
        if os.path.isfile(token_path):
            self.client.load_token(token_path)
            return

        try:
            self.client.login(username, password)
        except NeedCaptchaException:
            # Save the captcha image, prompt for its text, then log in again.
            with open('a.gif', 'wb') as f:
                f.write(self.client.get_captcha())
            captcha = input('please input captcha:')
            self.client.login(username, password, captcha)
        self.client.save_token(token_path)

    def get_live_list(self):
        """Return the ``lives`` collection of the logged-in account."""
        return self.client.me().lives

    @staticmethod
    def save_live_list(livedata):
        """Persist one live object as a new ``MyLive`` record."""
        new_live = MyLive(
            live_id=livedata.id,
            title=livedata.title,
            speaker=livedata.speaker.name,
            speaker_description=livedata.speaker.description,
            live_description=livedata.description,
            seats_count=livedata.seat_taken,
            price=livedata.fee,
        )
        new_live.save()

    def live_list_work(self):
        """Store every live of the account that is not stored yet."""
        for live in self.get_live_list():
            exist = MyLive.objects(live_id=live.id)
            if not exist:
                self.save_live_list(live)

    def get_live_content(self, live_id, before_id=''):
        """Fetch one page of messages for ``live_id`` and return the parsed JSON.

        ``before_id`` is substituted into ``LIVECONTENT_URL`` together with
        ``live_id``; an empty string is used for the first request.
        """
        res = self.client._session.get(
            LIVECONTENT_URL.format(live_id, before_id))
        return json.loads(res.content)

    def save_live_content_image(self, id, url):
        """Download ``url`` and write it to ``app/Resource/<id>.png``."""
        content = self.client._session.get(url).content
        file = 'app/Resource/' + str(id) + '.png'
        with open(file, 'wb') as f:
            f.write(content)

    @staticmethod
    def save_live_content(live_id, livedata):
        """Persist the messages in ``livedata['data']`` as ``LiveContent`` records.

        Messages whose ``id`` is already stored are skipped, so calling this
        twice with the same page does not create duplicates. ``live_id`` is
        stored in the ``live_title`` field of each record.
        """
        for r in livedata['data']:
            exist = LiveContent.objects(message_id=r['id'])
            if exist:
                continue

            # Only audio and image messages carry a media URL.
            if r['type'] == 'audio':
                url = r['audio']['url']
            elif r['type'] == 'image':
                url = r['image']['full']['url']
            else:
                url = ''

            content = r['text'] if 'text' in r else ''
            reply = ','.join(r['replies']) if 'replies' in r else ''

            new_live_content = LiveContent(
                message_id=int(r['id']),
                sender=r['sender']['member']['name'],
                type=r['type'],
                content=content,
                url=url,
                reply=reply,
                likes=r['likes']['count'],
                created_at=datetime.fromtimestamp((r['created_at'])),
                live_title=live_id,
            )
            new_live_content.save()

    def live_content_work(self, id):
        """Download and store every message of the stored live ``id``.

        ``id`` is the primary key of the ``MyLive`` record. Pages are fetched
        until the response reports ``unload_count`` of 0, then the images of
        the live's image messages are downloaded.
        """
        live = MyLive.objects(id=id).first()

        # Fetch details with Zhihu's own live ID.
        data = self.get_live_content(live.live_id)
        while True:
            # Store under the database ID. Saving before the exit test means
            # the last page (the one reporting unload_count == 0) is kept
            # too; it used to be fetched and then dropped.
            self.save_live_content(live.id, data)
            # An empty page has no message to page from; stop instead of
            # raising IndexError on data['data'][0].
            if data['unload_count'] <= 0 or not data['data']:
                break
            data = self.get_live_content(live.live_id, data['data'][0]['id'])
        print('success')

        image_contents = LiveContent.objects(live_title=live.id, type='image')
        for item in image_contents:
            self.save_live_content_image(item.id, item.url)
wait = WebDriverWait(browser, 10) seachAddress = [ 'https://www.zhihu.com/explore', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A5%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A10%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A15%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A20%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A25%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A30%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A35%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A40%2C"type"%3A"day"%7D', 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A45%2C"type"%3A"day"%7D', ] client = ZhihuClient() try: client.login('', '') # 使用手机号登录需要在手机号前加 +86 前缀 except NeedCaptchaException: # 保存验证码并提示输入,输入gif所示验证码进行登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login('', '', captcha) def get_links(url): try: browser.get(url) wait.until( EC.element_to_be_clickable( (By.CSS_SELECTOR, 'div:nth-child(1) > h2 > a')))
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Script: log in to Zhihu and cache the session token in token.pkl.
# Fill in the e-mail address / phone number and its matching password.
email = ""
password = ""

client = ZhihuClient()

try:
    client.login(email, password)
except NeedCaptchaException:
    # Write the captcha image to disk, ask the user for its text, then
    # repeat the login with the captcha attached.
    with open('a.gif', 'wb') as captcha_file:
        image_bytes = client.get_captcha()
        captcha_file.write(image_bytes)
    captcha = input('please input captcha:')
    client.login(email, password, captcha)

client.save_token('token.pkl')
import time import random # 账户信息以及问题ID account = 'account' passwd = 'passwd' questionID = 264747923 # 建立数据库 dbClient = pymongo.MongoClient(host='localhost', port=27017) Zhihu = dbClient['Zhihu'] ZhihuData = Zhihu[str(questionID)] if ZhihuData.find(): ZhihuData.remove({}) # 登陆知乎账号 client = ZhihuClient() try: client.login(account, passwd) except NeedCaptchaException: # 保存验证码并提示输入,重新登录 with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = input('please input captcha:') client.login(account, passwd, captcha) # 创建问题对象 question = client.question(questionID) # 读取问题下所有的回答并保存起来 print(question.title) count = 0 for answer in question.answers: count+=1 try: data = {
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Script: open a Zhihu session, answering a captcha challenge if one is raised.
client = ZhihuClient()

try:
    client.login('*****@*****.**', '110119rick')
except NeedCaptchaException:
    # Write the captcha image to disk, ask the user for its text, then
    # repeat the login with the captcha attached.
    with open('a.gif', 'wb') as captcha_file:
        image_bytes = client.get_captcha()
        captcha_file.write(image_bytes)
    captcha = input('please input captcha:')
    client.login('*****@*****.**', '110119rick', captcha)