def download(cid):
    """Download every article of a Zhihu column into a local directory.

    A directory named after the column title is created in the current
    working directory, together with an ``images`` sub-directory.  Each
    article is converted by ``dealArticle``, its pictures are fetched by
    ``downloadImg`` and the picture host URLs inside the content are
    rewritten to the relative ``./images/`` path before the article is
    written to ``<column title>/<article title>``.

    :param cid: column identifier handed to ``ZhihuClient.column``.
    """
    client = ZhihuClient()
    # NOTE(review): the extension is "pk1" (digit one), not "pkl" - confirm
    # that this matches the token file that is actually on disk.
    client.load_token('token.pk1')  # log in with the saved token
    column = client.column(cid)
    column_dir = column.title
    images_dir = os.path.join(column_dir, 'images')
    # The dots are escaped so only the literal pic<N>.zhimg.com host matches.
    regex = re.compile(r"https://pic\d\.zhimg\.com/", re.IGNORECASE)

    # images_dir lives inside column_dir, so one makedirs call creates both.
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    for index, article in enumerate(column.articles):
        # Convert the article into a dict with 'title' and 'content'.
        article_f = dealArticle(article)
        # Fetch the pictures referenced by the content.
        downloadImg(article_f['content'], images_dir)
        # Point the picture URLs at the local copies.
        article_f['content'] = regex.sub('./images/', article_f['content'])
        # Write the article to disk.
        with open(os.path.join(column_dir, article_f['title']), 'w+') as f:
            f.write(article_f['content'])
        print('[%s] download %s success!' % (str(index), article.title))
def parse_author(command):
    """Build the ``SingleTask`` describing how to crawl one author.

    The author id is extracted from *command* with ``Match.author``, the
    saved login token is loaded, and the author's hashed id (as reported by
    the API) is used to fill in the SQL statements stored on the task.

    The process exits with a non-zero status when the token file is missing
    or the saved login has expired.

    :param command: text matched by ``Match.author``; the match must
        expose an ``author_id`` group.
    :return: the populated ``SingleTask``.
    """
    result = Match.author(command)
    author_id = result.group('author_id')

    task = SingleTask()
    task.kind = 'author'
    task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
    task.book.kind = 'author'

    client = ZhihuClient()
    try:
        client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
    except IOError:
        print(u"没有找到登录信息文件,请先登录")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)
    except NeedLoginException:
        print(u"登录信息过期,请重新登录")
        sys.exit(1)

    people_oauth = client.people(author_id)
    # The value is discarded on purpose; the original author references
    # zhihu-oauth issue #4 as the reason for touching this attribute first.
    _ = people_oauth.follower_count
    author_id_hash = people_oauth.id

    # NOTE(review): the statements are assembled by string formatting because
    # they are stored as plain text on the task; author_id_hash comes from
    # the API response - confirm it can never contain a double quote.
    task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
    # Adjacent literals are used instead of a backslash continuation so that
    # no stray backslash or indentation ends up inside the query text.
    task.book.sql.question = (
        'select * from Question where question_id in '
        '(select question_id from Answer where author_id = "{}")'
    ).format(author_id_hash)
    task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
    return task
def crawling(id):
    """Download the pictures embedded in every answer of one question.

    A directory named after the question title is created in the current
    working directory.  Pictures are saved as ``<author>_<n>.jpg`` where
    ``n`` is a counter running over all pictures of the question; files
    that already exist are not downloaded again.

    :param id: question id handed to ``ZhihuClient.question``.
    """
    client = ZhihuClient()  # log in
    client.load_token('token.pkl')  # load the token file
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)

    path = question.title
    if not os.path.exists(path):
        os.mkdir(path)

    # Compiled once instead of once per answer.
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    total = question.answer_count
    index = 1  # running picture number
    for i, answer in enumerate(question.answers):
        content = answer.content  # answer body (HTML)
        anther = answer.author.name
        for img in img_pattern.findall(content):
            img_url = img[0]  # group 0 is the URL, group 1 the extension
            # NOTE(review): the file is always named *.jpg even when the URL
            # ends in .png; kept so already-downloaded files are recognised.
            image_name = anther + '_' + str(index) + '.jpg'
            target = path + '/' + image_name
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                # i answers are fully processed at this point.
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / total * 100))
            index += 1
        # Report 1-based so the first answer is "1" and the last one 100%.
        done = i + 1
        print('第%d个答案爬取完成,当前总进度%.2f%%' % (done, done / total * 100))
def Analyse(self):
    """Run the helper stored in ``self.aa`` through its whole pipeline.

    A fresh ``ZhihuClient`` is passed to ``denglu`` and ``findquestion``;
    the result of ``Analyse_question`` is then fed to ``Analyse_answer``.
    A marker line is printed after each stage.
    """
    zhihu_client = ZhihuClient()
    self.aa.denglu(zhihu_client)  # presumably the login step ("denglu")
    self.aa.findquestion(zhihu_client)
    print '------------find', '\n'

    questions = self.aa.Analyse_question(self.aa.dic_name)
    print '----------q', '\n'

    # Marked by the original author as the slowest stage.
    self.aa.Analyse_answer(questions)
    print '-------------a', '\n'
class TopicTree:
    """Recursively dump a Zhihu topic and all of its descendants.

    Every visited topic is appended to ``out_path`` as one JSON line holding
    its id, its name and the id/name of each direct child.
    """

    # Created once when the class is defined and reached through
    # ``self.client`` by every method.
    client = ZhihuClient()

    def __init__(self):
        pass

    def login(self):
        """Load the saved token, or log in and save a new one."""
        if not os.path.isfile(TOKEN_FILE):
            self.client.login_in_terminal(username='******', password=self.getpass())
            self.client.save_token(TOKEN_FILE)
            return
        self.client.load_token(TOKEN_FILE)

    def login_next(self):
        """Log in again without touching the token file."""
        secret = self.getpass()
        self.client.login_in_terminal(username='******', password=secret)

    def test(self):
        """Print the name of the logged-in account."""
        print('name', self.client.me().name)

    def get_topic(self, uid):
        """Write topic *uid* to the output file, then descend into its children.

        When a child raises, the traceback is printed and the child is
        retried through ``run`` (which logs in again first).
        """
        topic = self.client.topic(uid)
        record = {
            'id': uid,
            'name': topic.name,
            'children': [
                {'id': child._id, 'name': child.name}
                for child in topic.children
            ],
        }

        with open(out_path, 'a', encoding='utf-8') as fw:
            json.dump(record, fw, ensure_ascii=False, sort_keys=True)
            fw.write('\n')

        for child in record['children']:
            cid = child['id']
            try:
                self.get_topic(cid)
                # Random pause of up to one second between children.
                time.sleep(random.random())
            except Exception:
                traceback.print_exc()
                print('ban: ', cid)
                self.run(cid)

    def run(self, uid):
        """Log in again and crawl the tree rooted at *uid*."""
        self.login_next()
        self.get_topic(uid)

    def getpass(self):
        """Return the stripped content of the hard-coded password file."""
        with open('/mnt/home/baoqiang/password.txt', 'r') as f:
            secret = f.read()
        return secret.strip()
def start(self):
    """Log in interactively and save the token to ``Path.ZHIHUTOKEN``.

    When ``NeedLoginException`` is raised a hint is printed and the process
    exits.
    """
    try:
        zhihu = ZhihuClient()
        zhihu.login_in_terminal()
        zhihu.save_token(Path.ZHIHUTOKEN)
    except NeedLoginException:
        print(u"Oops, please try again.")
        sys.exit()
def LoginZhihuClient(token_name):
    """Log in to Zhihu and return the ``me`` object of the account.

    The token stored in *token_name* is reused when that file exists;
    otherwise an interactive terminal login is performed and the new token
    is written to that file.

    :param token_name: path of the token file.  A falsy value selects the
        historical default ``'liuximing.pkl'``.
    :return: the object returned by ``ZhihuClient.me()``.
    """
    # The parameter used to be ignored in favour of a hard-coded file name;
    # it is honoured now, with the old name kept as the fallback.
    token_file = token_name or 'liuximing.pkl'
    client = ZhihuClient()
    if os.path.isfile(token_file):
        client.load_token(token_file)
    else:
        client.login_in_terminal()
        client.save_token(token_file)
    return client.me()
def prepare(self):
    """Create the data directory, restore ``name_map`` and build a client.

    ``self.name_map`` is unpickled from ``self.friends_file`` when that file
    exists and starts out as an empty dict otherwise.  A new ``ZhihuClient``
    is stored in ``self.client``.
    """
    data_dir = self.data_path
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    if not os.path.isfile(self.friends_file):
        self.name_map = dict()
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe while friends_file is written by this program itself.
        with open(self.friends_file, "rb") as cache:
            self.name_map = pickle.load(cache)

    self.client = ZhihuClient()
def __init__(self, dbname,email,key):
    """Open the SQLite database and log in to Zhihu.

    :param dbname: database path handed to ``sqlite3.connect``.
    :param email: account name used for the login.
    :param key: password used for the login.

    When the login asks for a captcha a message is printed and the client
    is left as it is.
    """
    connection = sqlite3.connect(dbname)
    self.con = connection
    self.cursor = connection.cursor()

    self.zhclient = ZhihuClient()
    try:
        self.zhclient.login(email, key)
    except NeedCaptchaException:
        message = "需要输入验证码,账号 %s 可能已失效" % email
        print(message)
def main():
    """Log in with a manually typed captcha, then collect and write the data."""
    # --- login ---
    zhihu = ZhihuClient()
    log_in(zhihu)
    # The returned object is discarded.
    # NOTE(review): presumably this displays the captcha image - confirm.
    Image('./a.gif')
    typed_captcha = input('please input captcha:')
    zhihu.login('account', 'psw', typed_captcha)

    # --- crawl and export ---
    get_data(zhihu)
    write_hk_student_info()
    write_modules()
def login_zhihu(self):
    """
    Log in to Zhihu and keep the client in ``self.client``.

    The token in ``self.TOKEN_FILE`` is reused when the file exists;
    otherwise a terminal login is performed and the token is saved there.

    :return: None
    """
    self.client = ZhihuClient()
    token_path = self.TOKEN_FILE
    if not os.path.isfile(token_path):
        self.client.login_in_terminal()
        self.client.save_token(self.TOKEN_FILE)
        return
    self.client.load_token(self.TOKEN_FILE)
def zhihu_login():
    """Return a logged-in ``ZhihuClient`` and print the account name.

    The token in ``TOKEN_FILE`` is reused when the file exists; otherwise
    the account is logged in with the credentials below and the token is
    saved.
    """
    zhihu = ZhihuClient()
    if not os.path.isfile(TOKEN_FILE):
        # NOTE(review): credentials are hard-coded in source; consider
        # moving them to configuration.
        zhihu.login('*****@*****.**', 'a4906639')
        zhihu.save_token(TOKEN_FILE)
    else:
        zhihu.load_token(TOKEN_FILE)

    print(zhihu.me().name)
    return zhihu
def get_client(self, reset_=0):
    """Return a logged-in ``ZhihuClient``.

    :param reset_: any value other than ``0`` forces a fresh terminal login
        even when a token file exists.
    :return: the authenticated client.

    A fresh login writes the new token to ``TOKEN_FILE``.  Without a forced
    reset the saved token is loaded when the file exists.
    """
    client = ZhihuClient()
    if reset_ != 0 or not os.path.isfile(TOKEN_FILE):
        # One login only: the token written here is not read back, and a
        # forced reset can no longer fall through to a second prompt.
        client.login_in_terminal()
        client.save_token(TOKEN_FILE)
    else:
        client.load_token(TOKEN_FILE)
    return client
def login(self):
    """Return a ``ZhihuClient`` authenticated through ``token.pkl``.

    The saved token is loaded when the file exists; otherwise a terminal
    login is performed and the token is written to the file.
    """
    token_path = 'token.pkl'
    zhihu = ZhihuClient()
    if not os.path.isfile(token_path):
        zhihu.login_in_terminal()
        zhihu.save_token(token_path)
        return zhihu
    zhihu.load_token(token_path)
    return zhihu
def zhihu_login():
    r"""
    Log in to Zhihu.

    The token in ``TOKEN_FILE_NAME`` is loaded when the file exists;
    otherwise a terminal login is performed and the token is saved.

    :return: the client after logging in
    """
    zhihu = ZhihuClient()
    have_token = os.path.isfile(TOKEN_FILE_NAME)
    if have_token:
        zhihu.load_token(TOKEN_FILE_NAME)
        return zhihu
    zhihu.login_in_terminal()
    zhihu.save_token(TOKEN_FILE_NAME)
    return zhihu
def login(username, password):
    """Log in to Zhihu, save the token to ``token.pkl`` and return the client.

    When the server asks for a captcha the image is written to ``a.gif``,
    the user is asked to type it, and the login is repeated with it.

    :param username: account name.
    :param password: account password.
    :return: the ``ZhihuClient`` used for the login, so callers can use it
        directly instead of re-loading the token from disk (earlier
        versions returned ``None``).
    """
    from zhihu_oauth import ZhihuClient
    from zhihu_oauth.exception import NeedCaptchaException

    client = ZhihuClient()
    try:
        client.login(username, password)
    except NeedCaptchaException:
        # Captcha required: save the picture, ask for the text, retry.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login(username, password, captcha)
    # Printed after either login path; it used to be skipped when a captcha
    # was needed.
    print(u"登陆成功!")
    client.save_token('token.pkl')  # persist the token
    return client
def login(account, password):
    """Return a ``ZhihuClient`` authenticated by token or by password.

    The token in ``TOKEN_FILE`` is tried first.  When that file does not
    exist the account is logged in with *account* / *password*, handling a
    captcha request by saving the image to ``./captcha/a.gif`` and asking
    the user to type it; the new token is then written to
    ``./token/token.pkl``.

    :param account: account name.
    :param password: account password.
    :return: the client.
    :raises Exception: any error other than the handled "token file
        missing" / "captcha needed" cases now propagates.  The former
        ``finally: return client`` silently discarded such errors and
        handed back a client that might not be logged in.
    """
    client = ZhihuClient()
    try:
        client.load_token(TOKEN_FILE)
    except FileNotFoundError:
        try:
            client.login(account, password)
        except NeedCaptchaException:
            # Save the captcha, ask for its text and log in again.
            with open('./captcha/a.gif', 'wb') as f:
                f.write(client.get_captcha())
            captcha = input('please input captcha:')
            client.login(account, password, captcha)
        # NOTE(review): the token is saved to a literal path but loaded from
        # TOKEN_FILE - confirm both name the same file.
        client.save_token('./token/token.pkl')
    return client
def setUp(self):
    """Create ``self.client`` from ``test/token.pkl`` or skip the test.

    When there is no ``test`` directory but a ``token.pkl`` in the current
    directory, the working directory is moved one level up first.  The
    test is skipped when the token file is missing or when loading it
    raises ``ValueError``.
    """
    super(ZhihuClientClassTest, self).setUp()

    if not os.path.isdir('test'):
        if os.path.isfile('token.pkl'):
            os.chdir('..')

    token_path = 'test/token.pkl'
    if not os.path.isfile(token_path):
        print('\nno token file, skip all tests.')
        self.skipTest('no token file.')

    self.client = ZhihuClient()
    try:
        self.client.load_token(token_path)
    except ValueError:
        print('\ntoken version not math python version, skip all tests.')
        self.skipTest('token version not math python version.')
def main():
    """Crawl the answers of every user id listed in ``USER_CSV_PATH``.

    The account given by the module-level ``email_or_phone`` / ``password``
    is logged in (with a captcha prompt when required) and the session
    token is saved to ``TOKEN_FILE``.  Each line of the CSV file is treated
    as one user id.  The crawled rows are cut into batches of 60 and handed
    to ``save_to_csv_a`` after every ``max_lines`` users.
    """
    client = ZhihuClient()
    try:
        client.login_in_terminal(username=email_or_phone, password=password)
    except NeedCaptchaException:
        # Save the captcha image, ask for its text and log in again.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('请输入验证码: ')
        client.login(email_or_phone, password, captcha)
    # Save the session for later runs.  This used to happen only when no
    # captcha was requested.
    client.save_token(TOKEN_FILE)

    data_out_list_a = []
    line_saved = 0
    max_lines = 1  # number of users collected before each save

    with open(USER_CSV_PATH) as user_file:
        for line in user_file:
            crawl_id = line.strip('\n')
            if not crawl_id:
                # A blank line (typically the last one) is not a user id.
                continue
            my_crawl = MyCrawler(crawl_id, client)
            print('------>>>| 待爬取的用户的知乎id为: ', crawl_id)
            data_a = my_crawl.crawling_answer(crawl_id)
            print('该用户爬取完毕'.center(60, '*'))

            # Only results made of whole groups of 60 items are accepted.
            if len(data_a) % 60 == 0:
                for start in range(0, len(data_a), 60):
                    data_out_list_a.append(data_a[start:start + 60])
            else:
                print('无用的输出!')

            line_saved += 1
            if line_saved == max_lines:
                save_to_csv_a(data_out_list_a, client)
                data_out_list_a = []
                line_saved = 0

    # Flush what is left when the user count is not a multiple of max_lines
    # (never the case while max_lines is 1).
    if data_out_list_a:
        save_to_csv_a(data_out_list_a, client)

    print('全部用户采集完毕'.center(40, '*'))
def setUp(self):
    """Create ``self.client`` from the token under ``test/`` or skip.

    When there is no ``test`` directory but ``TOKEN_FILE_NAME`` exists in
    the current directory, the working directory is moved one level up
    first.  The test is skipped when the token file is missing or when
    loading it raises ``ValueError``.
    """
    super(ZhihuClientClassTest, self).setUp()

    if not os.path.isdir('test'):
        if os.path.isfile(TOKEN_FILE_NAME):
            os.chdir('..')

    token_file_path = os.path.join('test', TOKEN_FILE_NAME)
    token_present = os.path.isfile(token_file_path)
    if not token_present:
        print('\nno token file, skip all tests.')
        self.skipTest('no token file.')

    self.client = ZhihuClient()
    try:
        self.client.load_token(token_file_path)
    except ValueError:
        print('\ntoken version not math python version, skip all tests.')
        self.skipTest('token version not math python version.')
def main():
    """Crawl the answers of every user id listed in ``USER_TRY_CSV_PATH``.

    After logging in (with a captcha prompt when required) every line of
    the CSV file is treated as one user id.  Results whose length is a
    multiple of 60 are cut into batches of 60 and handed to
    ``save_to_csv_a`` after every ``max_lines`` users; a random pause of one
    to three seconds follows each user.
    """
    # NOTE(review): credentials are hard-coded in source.
    account = '*****@*****.**'
    secret = 'durant'

    client = ZhihuClient()
    try:
        client.login(account, secret)
    except NeedCaptchaException:
        print("Login Error")
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login(account, secret, captcha)

    max_lines = 1
    line_saved = 0
    data_out_list_a = []
    with open(USER_TRY_CSV_PATH) as users:
        for raw_line in users:
            craw_id = raw_line.strip("\n")
            crawler = MyCrawler(craw_id, client)
            print(craw_id)
            data_a = crawler.crawling_answer(craw_id)

            total = len(data_a)
            if total % 60 != 0:
                print("Invalid Output")
            else:
                data_out_list_a.extend(
                    data_a[start:start + 60] for start in range(0, total, 60))

            pause = random.randint(1, 3)
            time.sleep(pause)

            line_saved += 1
            if line_saved == max_lines:
                save_to_csv_a(data_out_list_a)
                data_out_list_a = []
                line_saved = 0
def download(uid):
    """Save every answer written by user *uid* into a directory named *uid*.

    For each answer the pictures are fetched with ``downloadImg``, the text
    is converted with ``dealArticle`` and the result is written to
    ``<uid>/<title>``.

    :param uid: user id handed to ``ZhihuClient.people``; also used as the
        output directory name.
    """
    zhihu = ZhihuClient()
    # NOTE(review): the extension is "pk1" (digit one), not "pkl" - confirm.
    zhihu.load_token('token.pk1')  # log in with the saved token
    author = zhihu.people(uid)

    for position, answer in enumerate(author.answers):
        # Fetch the pictures first.
        downloadImg(answer.content)
        # Convert the answer into a dict with 'title' and 'content'.
        parsed = dealArticle(answer)
        # Write it to disk, creating the directory on first use.
        if not os.path.exists(uid):
            os.makedirs(uid)
        target = os.path.join(uid, parsed['title'])
        with open(target, 'w+') as out:
            out.write(parsed['content'])
        print('[%s] download %s success!' % (str(position), answer.question.title))
# coding=utf-8
# https://pypi.org/project/zhihu-oauth
#
# Log in to Zhihu (reusing the saved token when there is one) and print a
# few counters of the logged-in account.
from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient
from zhihu_oauth import SearchType

TOKEN_FILE = 'token.pkl'

client = ZhihuClient()

if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    # First run: log in interactively and keep the token for next time.
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

me = client.me()

print('name', me.name)
print('headline', me.headline)
print('description', me.description)

print('following topic count', me.following_topic_count)
# This line used to print following_topic_count a second time.
print('following people count', me.following_count)
print('followers count', me.follower_count)

print('voteup count', me.voteup_count)
from __future__ import unicode_literals, print_function
from RedisQueue import RedisQueue
from zhihu_oauth import ZhihuClient
import datetime
import time
import random
import sys
from timeout import timeout
import os
from utils import print_err
from pymongo import MongoClient

MAX_SLEEP_TIME = 15
# The first command-line argument selects both the cookie file and the
# account line in client_info_list.data.
Cookies_File = './cookies/cookies%s.json' % sys.argv[1]

client = ZhihuClient()
if os.path.isfile(Cookies_File):
    client.load_token(Cookies_File)
else:
    # Each line of the account list is "<username>\t<password>".
    # The file is closed as soon as it has been read.
    with open('./cookies/client_info_list.data') as info_file:
        client_info = info_file.readlines()
    client_info = client_info[int(sys.argv[1])].strip().split('\t')
    client.login_in_terminal(client_info[0], client_info[1])
    client.save_token(Cookies_File)


def get_user_questions(uname):
    """Print *uname*; an empty string is ignored.

    :param uname: user name to process.
    """
    global client
    if uname == '':
        return
    print(uname)
class Crawler:
    """Collect answers, comments and images for a set of Zhihu topics.

    The class logs in on construction (reusing ``TOKEN_FILE`` when it
    exists) and opens the MongoDB database ``DB_NAME`` on localhost.
    """

    client_ = ZhihuClient()
    topic_question_df = pd.DataFrame()
    topics_list_ = [19610354, 20010203]  # default list of monitored topic ids
    # Intended per-question details (title, content, follower count, answer
    # count, answer content, comment content, topics) according to the
    # original author; nothing in this class fills it yet.
    questions_detail_ = {}
    answers_persisted = set()
    answers_to_download = []

    def __init__(self, topics_list=None):
        """Log in and connect to MongoDB.

        :param topics_list: optional iterable of topic ids to monitor
            instead of the class default.  ``None`` or an empty value keeps
            the default.
        """
        if topics_list:
            # The argument used to be stored under the misspelled name
            # ``topic_lists_`` and was therefore never used.
            self.topics_list_ = list(topics_list)
        # Per-instance queue, so that ids collected by one crawler do not
        # leak into another through the shared class-level list.
        self.answers_to_download = []

        if os.path.isfile(TOKEN_FILE):
            self.client_.load_token(TOKEN_FILE)
        else:
            self.client_.login_in_terminal()
            self.client_.save_token(TOKEN_FILE)
            # NOTE(review): second copy under a fixed name; redundant when
            # TOKEN_FILE is 'token.pkl' - confirm whether it is still needed.
            self.client_.save_token('token.pkl')
        self.db_ = MongoClient(host="localhost", port=27017)[DB_NAME]

    def get_topics_detail(self):
        '''
        Queue answer ids for download and save the queue to disk.

        For every monitored topic the unanswered-question feed is walked
        until the first question with more than 10 answers has been queued.
        The whole queue is finally written to ``answers_to_download.npy``.

        :return: None
        '''
        for topic_id in self.topics_list_:
            count = 0
            topic = self.client_.topic(topic_id)
            for q in topic.unanswered_questions:
                if count > 0:
                    # One qualifying question per topic is enough.
                    break
                answers = [ans.id for ans in q.answers]
                # Only questions with enough answers are recorded, to keep
                # the stored volume small.
                if len(answers) > 10:
                    self.answers_to_download.extend(answers)
                    count += 1
                    print(len(self.answers_to_download))
                    # TODO: persist {'tid', 'qid', 'aids'} records to
                    # TOPICS_COLLECTION; get_questions_detail reads that
                    # collection but nothing here writes to it.
                # Pause for 1-3 seconds on roughly one question in ten.
                if random.randint(0, 100) % 10 == 0:
                    time.sleep(random.randint(1, 3))
                    print("sleep done")
        arr = np.array(self.answers_to_download)
        np.save("answers_to_download.npy", arr)
        # TODO: find out how to update this list

    def get_questions_detail(self):
        """Fetch, clean and store every answer listed in TOPICS_COLLECTION.

        Each document of the collection is expected to carry an ``aids``
        list of answer ids.  For each id the answer text and its comments
        are stripped of tags with ``Cleaner.filter_tags`` and one document
        is inserted into ANSWERS_COLLECTION.

        Notes from the original author: only questions above a (small)
        follower/answer threshold are meant to be collected, and answers
        already stored are not refreshed when their author edits them,
        because answers are long and the I/O cost is high.
        """
        all_answers_id = pd.DataFrame(list(self.db_[TOPICS_COLLECTION].find()))
        for answers in all_answers_id['aids']:  # answer ids of one question
            for ans_id in answers:
                ans = self.client_.answer(ans_id)
                ans_content = Cleaner.filter_tags(ans.content)
                comment_content = ""
                if ans.comment_count > 0:
                    # Comments are joined with "@" so they can later be
                    # analysed together or split apart again.
                    comment_content = "".join(
                        Cleaner.filter_tags(comment.content) + "@"
                        for comment in ans.comments)
                time.sleep(random.randint(1, 4))
                print("ans {} done retrival and cleaning".format(ans.id))
                ans_detail = {
                    'aid': ans_id,
                    'votes': ans.voteup_count,
                    'content': ans_content,
                    'comments': comment_content,
                    'author_follower_num': ans.author.follower_count
                }
                self.db_[ANSWERS_COLLECTION].insert_one(ans_detail)
        # TODO: data cleaning
        # TODO: decide in what format to save these data

    def parse_img_src(self, html):
        """Return the ``src`` URL of every ``<img>`` tag found in *html*.

        :param html: HTML text of an answer.
        :return: list of URL strings in document order; tags without a
            ``src`` attribute are skipped.
        """
        # The former pattern '<[img|IMG]' was a character class matching any
        # tag starting with one of those letters, and indexing the first
        # src match raised IndexError for tags without one.
        img_src_pattern = r'<img\b[^>]*?\ssrc="([^"\s]+)"'
        return re.findall(img_src_pattern, html, re.IGNORECASE)

    def save_img(self, img_url, file_name, file_path='./images/'):
        """Download *img_url* and write it to ``file_path/file_name``.

        The directory is created when missing.  Failures are reported on
        stdout and do not propagate, so one bad picture does not abort a
        whole download run.

        :param img_url: URL of the picture.
        :param file_name: name of the file to create.
        :param file_path: target directory.
        """
        try:
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            filename = os.path.join(file_path, file_name)
            img = requests.get(img_url)
            # Do not store an HTTP error page as if it were the picture.
            img.raise_for_status()
            with open(filename, "wb") as f:
                f.write(img.content)
        except Exception as e:
            print("error: {}".format(e))

    def download_images(self, path):
        '''
        Download the images of every queued answer.

        :param path: directory the pictures are written to; a falsy value
            selects ``./images``.
        :return: None
        '''
        # The parameter used to be ignored in favour of a fixed directory.
        target_dir = path or "./images"
        for ans_id in self.answers_to_download:
            answer = self.client_.answer(ans_id)
            img_src_list = self.parse_img_src(answer.content)
            print("img src list : {}".format(img_src_list))
            for img_url in img_src_list:
                # NOTE(review): the file name is the last 10 characters of
                # the URL, so different pictures may collide.
                self.save_img(img_url, img_url[-10:], target_dir)
                print("picture {} download complete".format(img_url))
            print("answer {} download complete".format(ans_id))
def login():
    """Log in, then save every answer of one topic's unanswered-question feed.

    The token in ``token.pkl`` is reused when the file exists; otherwise a
    terminal login is performed and the token is saved.  For every question
    of topic 19560072 the id, title and answer count are printed and each
    answer is saved under ``Data\\Gene\\<question id>#<title>`` in a file
    named ``<author id>#<author name>``.  The summed answer count is
    printed at the end.
    """
    token_path = 'token.pkl'
    zhihu = ZhihuClient()
    if not os.path.isfile(token_path):
        zhihu.login_in_terminal()
        zhihu.save_token(token_path)
    else:
        zhihu.load_token(token_path)

    # 19560072 is labelled "GMO" by the original author; other ids noted
    # there: 19578906 (climate change), 19551296 (online games).
    topic = zhihu.topic(19560072)

    answers_count = 0
    for question in topic.unanswered_questions:
        print(question.id)
        print(question.title)
        print(question.answer_count)
        answers_count += question.answer_count

        question_dir = 'Data\\Gene\\' + str(question.id) + '#' + question.title
        for answer in question.answers:
            writer = answer.author
            print(writer.id, writer.name)
            answer.save(question_dir, str(writer.id) + '#' + writer.name)

    print("总共有{0}个回答".format(answers_count))
def __init__(self):
    """Pick a token file per major Python version and create the client.

    ``self.TOKEN_FILE`` becomes ``token.pkl.<major version>``.
    """
    major_version = sys.version_info[0]
    self.TOKEN_FILE = 'token.pkl.' + str(major_version)
    self.client = ZhihuClient()
def __init__(self):
    """Create the ``ZhihuClient`` used by this object; no login happens here."""
    zhihu = ZhihuClient()
    self.client = zhihu