Python ZhihuClient.ZhihuClient示例，zhihu_oauth.ZhihuClient.ZhihuClient Python示例

示例#1

0

显示文件

文件： zhuanlan.py 项目： kexin9752/crawler

def download(cid):
    """Download every article of a Zhihu column into a local directory.

    A directory named after the column title, containing an ``images``
    sub-directory, is created relative to the current working directory.
    Each article is passed through ``dealArticle``; the images referenced
    by its content are fetched with ``downloadImg``; image host prefixes in
    the content are rewritten to ``./images/``; the result is written to a
    file named after the article title.

    :param cid: column identifier handed to ``ZhihuClient.column``.
    """
    client = ZhihuClient()
    client.load_token('token.pk1')  # log in with a previously saved token
    column = client.column(cid)
    images_dir = os.path.join(column.title, 'images')
    # The dots are escaped so only the literal host "pic<digit>.zhimg.com"
    # is rewritten (an unescaped "." would match any character).
    regex = re.compile(r"https://pic\d\.zhimg\.com/", re.IGNORECASE)

    # makedirs also creates the parent column directory when it is missing.
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    for index, article in enumerate(column.articles):
        # Process the article
        article_f = dealArticle(article)

        # Download the images referenced by the article
        downloadImg(article_f['content'], images_dir)

        # Point image URLs at the local images directory
        article_f['content'] = regex.sub('./images/', article_f['content'])

        # Write the article to disk
        with open(os.path.join(column.title, article_f['title']), 'w+') as f:
            f.write(article_f['content'])

        print('[%s] download %s success!' % (index, article.title))

示例#2

0

显示文件

文件： url_parser.py 项目： handuoZhang/ee-book

        def parse_author(command):
            result = Match.author(command)
            author_id = result.group('author_id')
            task = SingleTask()
            task.kind = 'author'
            task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
            task.book.kind = 'author'

            client = ZhihuClient()
            try:
                client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
            except IOError:
                print u"没有找到登录信息文件，请先登录"
                sys.exit()
            except NeedLoginException:
                print u"登录信息过期，请重新登录"
                sys.exit()
            people_oauth = client.people(author_id)
            _ = people_oauth.follower_count    # zhihu-oauth, issues #4
            author_id_hash = people_oauth.id
            task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
            task.book.sql.question = 'select * from Question where question_id in (select question_id from \
            Answer where author_id = "{}")'.format(author_id_hash)
            task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
            return task

示例#3

0

显示文件

def crawling(id):
    """Download the images embedded in every answer of a Zhihu question.

    Images are stored in a directory named after the question title,
    created in the current working directory. Files are named
    ``<author>_<n>.<ext>``, where ``n`` is a running counter over all
    matched images and ``ext`` is the extension found in the image URL
    (``jpg`` or ``png``). Files that already exist are not downloaded again.

    :param id: the question id handed to ``ZhihuClient.question``.
    """
    client = ZhihuClient()
    # Log in
    client.load_token('token.pkl')  # load the saved token file
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)
    path = question.title
    if not os.path.exists(path):
        os.mkdir(path)

    # Compiled once instead of once per answer. Group 1 is the image URL,
    # group 2 its extension.
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    total = question.answer_count
    index = 1  # running image number
    for i, answer in enumerate(question.answers):
        content = answer.content  # answer body
        author = answer.author.name
        for img_url, ext in img_pattern.findall(content):
            # Keep the real extension so PNG files are not saved as ".jpg".
            image_name = '%s_%d.%s' % (author, index, ext)
            target = os.path.join(path, image_name)
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / total * 100))
            index += 1
        # i is zero-based: report the number of answers completed so far.
        print('第%d个答案爬取完成,当前总进度%.2f%%' % (i + 1, (i + 1) / total * 100))

示例#4

0

显示文件

文件： start_crawl.py 项目： jryyufeng/zhihu_spider

 def Analyse(self):
     client = ZhihuClient()
     self.aa.denglu(client)
     self.aa.findquestion(client)
     print'------------find','\n'
     list_q = self.aa.Analyse_question(self.aa.dic_name)
     print'----------q','\n'
     self.aa.Analyse_answer(list_q)#最慢
     print'-------------a','\n'

示例#5

0

显示文件

文件： topictree.py 项目： xiaoaxe/xiao-zhihu-spider

class TopicTree:
    """Walks a Zhihu topic tree and appends each topic to a JSON-lines file."""

    # Created once at class definition, so every instance uses this client.
    client = ZhihuClient()

    def __init__(self):
        pass

    def login(self):
        """Load the token from TOKEN_FILE, or log in and save one if absent."""
        if not os.path.isfile(TOKEN_FILE):
            self.client.login_in_terminal(username='******',
                                          password=self.getpass())
            self.client.save_token(TOKEN_FILE)
            return
        self.client.load_token(TOKEN_FILE)

    def login_next(self):
        """Log in again through the terminal flow without saving a token."""
        secret = self.getpass()
        self.client.login_in_terminal(username='******', password=secret)

    def test(self):
        """Print the name of the logged-in account."""
        print('name', self.client.me().name)

    def get_topic(self, uid):
        """Append topic ``uid`` to ``out_path`` and recurse into its children.

        One JSON object per topic is written, holding its id, name and a
        list of ``{'id', 'name'}`` entries for the direct children. When
        handling a child raises, the traceback is printed and the child is
        retried through ``run`` (which logs in again first).
        """
        topic = self.client.topic(uid)

        record = {
            'id': uid,
            'name': topic.name,
            'children': [
                {'id': child._id, 'name': child.name}
                for child in topic.children
            ],
        }

        with open(out_path, 'a', encoding='utf-8') as fw:
            json.dump(record, fw, ensure_ascii=False, sort_keys=True)
            fw.write('\n')

        for child in record['children']:
            cid = child['id']
            try:
                self.get_topic(cid)
                time.sleep(random.random())
            except Exception:
                traceback.print_exc()

                print('ban: ', cid)
                self.run(cid)

    def run(self, uid):
        """Log in again, then crawl the tree rooted at ``uid``."""
        self.login_next()
        self.get_topic(uid)

    def getpass(self):
        """Return the stripped contents of the hard-coded password file."""
        # Alternative location: 'C:\\Users\\BaoQiang\\Desktop\\password.txt'
        with open('/mnt/home/baoqiang/password.txt', 'r') as f:
            contents = f.read()
        return contents.strip()

示例#6

0

显示文件

文件： login.py 项目： jajohe/taoguba_xueqiu_book

 def start(self):
     try:
         client = ZhihuClient()
         client.login_in_terminal()
         client.save_token(Path.ZHIHUTOKEN)
     except NeedLoginException:
         print u"Oops, please try again."
         sys.exit()
     return

示例#7

0

显示文件

def LoginZhihuClient(token_name):
    """Log in to Zhihu and return the profile object from ``client.me()``.

    The token stored in ``liuximing.pkl`` is loaded when that file exists;
    otherwise an interactive terminal login runs and the token is saved.

    :param token_name: currently unused.
    :return: the value of ``client.me()``.
    """
    # NOTE(review): token_name is never read; the token file is hard-coded.
    # Confirm whether the parameter was meant to select the file.
    token_path = 'liuximing.pkl'
    zhihu = ZhihuClient()
    if not os.path.isfile(token_path):
        zhihu.login_in_terminal()
        zhihu.save_token(token_path)
    else:
        zhihu.load_token(token_path)
    return zhihu.me()

示例#8

0

显示文件

文件： sp_zhihu.py 项目： HIT-Three-Friends/Friend-Reader

    def prepare(self):
        """Create the data directory, load the name map and create the client.

        ``self.name_map`` is unpickled from ``self.friends_file`` when that
        file exists; otherwise it starts as an empty dict.
        """
        if not os.path.isdir(self.data_path):
            os.makedirs(self.data_path)

        if not os.path.isfile(self.friends_file):
            self.name_map = {}
        else:
            # NOTE(review): pickle.load can execute code embedded in the
            # file; make sure friends_file only ever comes from a trusted
            # source.
            with open(self.friends_file, "rb") as stored:
                self.name_map = pickle.load(stored)

        self.client = ZhihuClient()

示例#9

0

显示文件

 def __init__(self, dbname,email,key):
     """Open the SQLite database and log in to Zhihu.

     :param dbname: database path handed to ``sqlite3.connect``.
     :param email: account name handed to ``ZhihuClient.login``.
     :param key: password handed to ``ZhihuClient.login``.

     When the login requires a captcha, a message is printed and the
     client is left as it was after the failed login attempt.
     """
     self.con = sqlite3.connect(dbname)
     self.cursor = self.con.cursor()
     self.zhclient = ZhihuClient()
     try:
         self.zhclient.login(email, key)
     except NeedCaptchaException:
         print("需要输入验证码，账号 %s 可能已失效" %(email))

示例#10

0

显示文件

def main():
    """Log in with a captcha typed by the user, then fetch and write data.

    Calls ``log_in``, builds an ``Image`` from ``./a.gif``, prompts for
    the captcha text, logs in with it, and finally runs ``get_data``,
    ``write_hk_student_info`` and ``write_modules``.
    """
    # login
    zhihu = ZhihuClient()
    log_in(zhihu)
    Image('./a.gif')
    code = input('please input captcha:')
    zhihu.login('account', 'psw', code)

    get_data(zhihu)
    write_hk_student_info()
    write_modules()

示例#11

0

显示文件

 def login_zhihu(self):
     """
     Log in to Zhihu.

     Loads the token from ``self.TOKEN_FILE`` when that file exists;
     otherwise runs the terminal login and saves the token there.
     :return: None
     """
     self.client = ZhihuClient()
     zhihu = self.client
     token_path = self.TOKEN_FILE
     if not os.path.isfile(token_path):
         zhihu.login_in_terminal()
         zhihu.save_token(self.TOKEN_FILE)
     else:
         zhihu.load_token(self.TOKEN_FILE)

示例#12

0

显示文件

def zhihu_login():
    """Return a logged-in ``ZhihuClient`` and print the account name.

    The token in ``TOKEN_FILE`` is reused when the file exists; otherwise
    a password login is performed and the token is saved.
    """
    zhihu = ZhihuClient()

    if not os.path.isfile(TOKEN_FILE):
        # NOTE(review): the credentials are hard-coded in source; consider
        # moving them to configuration.
        zhihu.login('*****@*****.**', 'a4906639')
        zhihu.save_token(TOKEN_FILE)
    else:
        zhihu.load_token(TOKEN_FILE)

    print(zhihu.me().name)
    return zhihu

示例#13

0

显示文件

 def get_client(self, reset_=0):
     """Return a logged-in ``ZhihuClient``.

     :param reset_: any value other than 0 forces a fresh terminal login
         even when ``TOKEN_FILE`` already exists.
     :return: the client, authenticated from the saved token or from a
         terminal login whose token has been written to ``TOKEN_FILE``.
     """
     client = ZhihuClient()
     # A single login branch covers both "forced reset" and "no token yet";
     # the token just obtained is not reloaded from disk afterwards.
     if reset_ != 0 or not os.path.isfile(TOKEN_FILE):
         client.login_in_terminal()
         client.save_token(TOKEN_FILE)
     else:
         client.load_token(TOKEN_FILE)
     return client

示例#14

0

显示文件

    def login(self):
        """Return a ``ZhihuClient`` authenticated via ``token.pkl``.

        When the token file is missing, a terminal login runs and the new
        token is saved to that file.
        """
        token_path = 'token.pkl'
        zhihu = ZhihuClient()

        token_saved = os.path.isfile(token_path)
        if token_saved:
            zhihu.load_token(token_path)
            return zhihu

        zhihu.login_in_terminal()
        zhihu.save_token(token_path)
        return zhihu

示例#15

0

显示文件

def zhihu_login():
    r"""
    Log in to Zhihu.

    Reuses the token stored in ``TOKEN_FILE_NAME`` when that file exists;
    otherwise logs in through the terminal and saves the token.
    :return:        the client after login
    """
    zhihu = ZhihuClient()
    # Log in
    if not os.path.isfile(TOKEN_FILE_NAME):
        zhihu.login_in_terminal()
        zhihu.save_token(TOKEN_FILE_NAME)
    else:
        zhihu.load_token(TOKEN_FILE_NAME)
    return zhihu

示例#16

0

显示文件

def login(username, password):
    """Log in with a password and save the token to ``token.pkl``.

    When the login asks for a captcha, the captcha image is written to
    ``a.gif``, the user is prompted for its text and the login is retried.

    :param username: account name handed to ``ZhihuClient.login``.
    :param password: password handed to ``ZhihuClient.login``.
    """
    from zhihu_oauth import ZhihuClient
    from zhihu_oauth.exception import NeedCaptchaException

    zhihu = ZhihuClient()
    try:
        zhihu.login(username, password)
    except NeedCaptchaException:  # the server asked for a captcha
        # Save the captcha image, ask for its text and log in again
        with open('a.gif', 'wb') as image_file:
            image_file.write(zhihu.get_captcha())
        typed = input('please input captcha:')
        zhihu.login(username, password, typed)
    else:
        print(u"登陆成功!")
    zhihu.save_token('token.pkl')  # save the token

示例#17

0

显示文件

文件： problems.py 项目： sxhylkl/zhihu-problems

def login(account, password):
    """Return a ``ZhihuClient`` logged in from a saved token or a password.

    The token in ``TOKEN_FILE`` is loaded when the file exists. Otherwise a
    password login is performed; if a captcha is required, its image is
    written to ``./captcha/a.gif`` and the user is prompted for the text.
    After a successful login the token is saved to ``TOKEN_FILE`` so the
    next call can reuse it.

    :param account: account name handed to ``ZhihuClient.login``.
    :param password: password handed to ``ZhihuClient.login``.
    :return: the logged-in client.
    :raises Exception: any error other than a missing token file or a
        captcha request propagates to the caller. It is no longer discarded
        by a ``return`` inside ``finally``.
    """
    client = ZhihuClient()
    try:
        client.load_token(TOKEN_FILE)
    except FileNotFoundError:
        try:
            client.login(account, password)
        except NeedCaptchaException:
            # Save the captcha image, ask for its text and log in again
            with open('./captcha/a.gif', 'wb') as f:
                f.write(client.get_captcha())
            captcha = input('please input captcha:')
            client.login(account, password, captcha)
        # Saved on both login paths, to the same file load_token reads.
        client.save_token(TOKEN_FILE)
    return client

示例#18

0

显示文件

文件： client_test_base.py 项目： xiaoxing1120/zhihu-spider1.0

    def setUp(self):
        """Create ``self.client`` from ``test/token.pkl`` or skip the test.

        The test is skipped when the token file is missing or when loading
        it raises ``ValueError``.
        """
        super(ZhihuClientClassTest, self).setUp()

        # Presumably the run was started inside the test directory: step up
        # one level so the relative 'test/...' path resolves.
        no_test_dir_here = not os.path.isdir('test')
        if no_test_dir_here and os.path.isfile('token.pkl'):
            os.chdir('..')

        token_path = 'test/token.pkl'
        if not os.path.isfile(token_path):
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')

        self.client = ZhihuClient()

        try:
            self.client.load_token(token_path)
        except ValueError:
            print('\ntoken version not math python version, skip all tests.')
            self.skipTest('token version not math python version.')

示例#19

0

显示文件

文件： my_zhihu_use_oauth.py 项目： ttggaa/Python_example

def main():
    """Log in, then crawl the answers of every user listed in USER_CSV_PATH.

    Each line of the file is treated as one user id. The data returned by
    ``MyCrawler.crawling_answer`` is split into rows of 60 items when its
    length is a multiple of 60. Collected rows are handed to
    ``save_to_csv_a`` every ``max_lines`` users.
    """
    client = ZhihuClient()

    try:
        client.login_in_terminal(username=email_or_phone, password=password)
    except NeedCaptchaException:
        # Save the captcha image, ask for its text and log in again
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('请输入验证码: ')
        client.login(email_or_phone, password, captcha)
    # Save the session for later logins, on the captcha path as well.
    client.save_token(TOKEN_FILE)

    data_out_list_a = []
    line_saved = 0
    max_lines = 1

    with open(USER_CSV_PATH) as user_file:
        # Iterate the file directly instead of materialising readlines().
        for line in user_file:
            crawl_id = line.strip('\n')
            my_crawl = MyCrawler(crawl_id, client)
            print('------>>>| 待爬取的用户的知乎id为: ', crawl_id)

            data_a = my_crawl.crawling_answer(crawl_id)
            print('该用户爬取完毕'.center(60, '*'))
            if len(data_a) % 60 == 0:
                # Split the flat result into consecutive rows of 60 items.
                for start in range(0, len(data_a), 60):
                    data_out_list_a.append(data_a[start:start + 60])
            else:
                print('无用的输出!')

            line_saved += 1

            if line_saved == max_lines:
                save_to_csv_a(data_out_list_a, client)
                data_out_list_a = []
                line_saved = 0

    # Flush a partial batch (only possible when max_lines is larger than 1).
    if line_saved:
        save_to_csv_a(data_out_list_a, client)

    print('全部用户采集完毕'.center(40, '*'))

示例#20

0

显示文件

    def setUp(self):
        """Create ``self.client`` from the token under ``test/`` or skip.

        The test is skipped when the token file is missing or when loading
        it raises ``ValueError``.
        """
        super(ZhihuClientClassTest, self).setUp()

        # Presumably the run was started inside the test directory: step up
        # one level so the path built below resolves.
        no_test_dir_here = not os.path.isdir('test')
        if no_test_dir_here and os.path.isfile(TOKEN_FILE_NAME):
            os.chdir('..')

        token_file_path = os.path.join('test', TOKEN_FILE_NAME)
        token_missing = not os.path.isfile(token_file_path)
        if token_missing:
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')

        self.client = ZhihuClient()

        try:
            self.client.load_token(token_file_path)
        except ValueError:
            print('\ntoken version not math python version, skip all tests.')
            self.skipTest('token version not math python version.')

示例#21

0

显示文件

文件： use_ques_tag.py 项目： ttggaa/Python_example

def main():
    """Log in, then crawl the answers of every user in USER_TRY_CSV_PATH.

    Each line of the file is one user id. Data returned by
    ``MyCrawler.crawling_answer`` is split into rows of 60 items when its
    length is a multiple of 60; rows are handed to ``save_to_csv_a`` every
    ``max_lines`` users. A random 1-3 second pause follows each user.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # moving them to configuration.
    zhihu = ZhihuClient()

    try:
        zhihu.login('*****@*****.**', 'durant')
    except NeedCaptchaException:
        print("Login Error")
        with open('a.gif', 'wb') as image_file:
            image_file.write(zhihu.get_captcha())
        typed = input('please input captcha:')
        zhihu.login('*****@*****.**', 'durant', typed)

    max_lines = 1
    users_pending = 0
    rows = []

    with open(USER_TRY_CSV_PATH) as user_file:
        for raw_line in user_file:
            craw_id = raw_line.strip("\n")
            crawler = MyCrawler(craw_id, zhihu)
            print(craw_id)

            data_a = crawler.crawling_answer(craw_id)
            if len(data_a) % 60 != 0:
                print("Invalid Output")
            else:
                # Consecutive rows of 60 items each
                rows.extend(
                    data_a[start:start + 60]
                    for start in range(0, len(data_a), 60)
                )

            time.sleep(random.randint(1, 3))

            users_pending += 1
            if users_pending == max_lines:
                save_to_csv_a(rows)
                rows = []
                users_pending = 0

示例#22

0

显示文件

def download(uid):
    """Save every answer written by user ``uid`` into a directory named ``uid``.

    For each answer, ``downloadImg`` is called on the content and
    ``dealArticle`` on the answer. The processed content is then written
    to a file named after the processed title.

    :param uid: user id handed to ``ZhihuClient.people``; also used as the
        output directory name.
    """
    zhihu = ZhihuClient()
    zhihu.load_token('token.pk1')  # log in
    person = zhihu.people(uid)

    for position, answer in enumerate(person.answers):
        # Download the images
        downloadImg(answer.content)

        # Process the text
        processed = dealArticle(answer)

        # Write to local disk
        if not os.path.exists(uid):
            os.makedirs(uid)

        target = os.path.join(uid, processed['title'])
        with open(target, 'w+') as out:
            out.write(processed['content'])

        message = '[%s] download %s success!' % (str(position),
                                                 answer.question.title)
        print(message)

示例#23

0

显示文件

# coding=utf-8
# https://pypi.org/project/zhihu-oauth

from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient
from zhihu_oauth import SearchType

TOKEN_FILE = 'token.pkl'

client = ZhihuClient()

# Reuse the saved token when present; otherwise log in interactively and
# save the token for the next run.
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

me = client.me()

print('name', me.name)
print('headline', me.headline)
print('description', me.description)

print('following topic count', me.following_topic_count)
# The people count has its own attribute; the topic count was printed here
# a second time by mistake.
print('following people count', me.following_count)
print('followers count', me.follower_count)

print('voteup count', me.voteup_count)

示例#24

0

显示文件

from __future__ import unicode_literals, print_function
from RedisQueue import RedisQueue
from zhihu_oauth import ZhihuClient
import datetime
import time
import random
import sys
from timeout import timeout
import os
from utils import print_err
from pymongo import MongoClient

MAX_SLEEP_TIME = 15
# The first command-line argument selects both the token file and the
# account line in client_info_list.data.
Cookies_File = './cookies/cookies%s.json' % sys.argv[1]
client = ZhihuClient()
if os.path.isfile(Cookies_File):
    client.load_token(Cookies_File)
else:
    # Each line is "<username>\t<password>". The file is closed as soon as
    # it has been read.
    with open('./cookies/client_info_list.data') as _info_file:
        client_info = _info_file.readlines()[int(sys.argv[1])].strip().split('\t')
    client.login_in_terminal(client_info[0], client_info[1])
    client.save_token(Cookies_File)


def get_user_questions(uname):
    """Print ``uname`` unless it is the empty string.

    :param uname: user name to report.
    :return: None in every case.
    """
    global client
    if uname != '':
        print(uname)

示例#25

0

显示文件

文件： Crawler.py 项目： brucechin/zhihu-monitoring

class Crawler:
    '''
    Collects answers from monitored Zhihu topics.

    Answer ids are gathered per topic (``get_topics_detail``), answer and
    comment text can be stored in MongoDB (``get_questions_detail``), and
    the images embedded in the gathered answers can be downloaded
    (``download_images``).
    '''

    # NOTE(review): these are class-level attributes, so the client and the
    # mutable containers below are shared by every Crawler instance.
    # Confirm that sharing is intended.
    client_ = ZhihuClient()
    topic_question_df = pd.DataFrame()
    topics_list_ = [19610354, 20010203]  # topics being monitored
    # Intended to hold, per question: title, content, follower count, answer
    # count, answer content, comment content and topics.
    questions_detail_ = {}
    answers_persisted = set()
    answers_to_download = []

    def __init__(self, topics_list=None):
        '''
        Log in to Zhihu and open the MongoDB database ``DB_NAME``.

        :param topics_list: optional topic ids to monitor instead of the
            class default. ``None`` or an empty sequence keeps the default.
        '''
        if topics_list:
            # The attribute name now matches the one get_topics_detail
            # reads, so the custom topics take effect.
            self.topics_list_ = list(topics_list)

        if os.path.isfile(TOKEN_FILE):
            self.client_.load_token(TOKEN_FILE)
        else:
            self.client_.login_in_terminal()
            self.client_.save_token(TOKEN_FILE)

        self.client_.save_token('token.pkl')
        self.db_ = MongoClient(host="localhost", port=27017)[DB_NAME]

    def get_topics_detail(self):
        '''
        Gather answer ids for every monitored topic.

        For each topic, questions from ``topic.unanswered_questions`` are
        scanned until one with more than 10 answers is found; its answer
        ids are appended to ``self.answers_to_download``. The accumulated
        list is finally saved to ``answers_to_download.npy``.

        :return: None
        '''
        # Persisting per-question records to TOPICS_COLLECTION is currently
        # disabled; only the answer ids are collected.
        for topic_id in self.topics_list_:
            count = 0
            topic = self.client_.topic(topic_id)
            for q in topic.unanswered_questions:
                if count > 0:
                    break
                answers = [ans.id for ans in q.answers]
                # Only keep questions with more than 10 answers, to limit
                # how much is stored.
                if len(answers) > 10:
                    self.answers_to_download.extend(answers)
                    count += 1
                    print(len(self.answers_to_download))
                # TODO: some rest function here
                # Pause on a fraction of iterations, chosen at random.
                if random.randint(0, 100) % 10 == 0:
                    time.sleep(random.randint(1, 3))
                    print("sleep done")

        arr = np.array(self.answers_to_download)
        np.save("answers_to_download.npy", arr)
        # TODO: find out how to update this list

    def get_questions_detail(self):
        '''
        Store the cleaned content of every answer listed in MongoDB.

        Reads all documents of ``TOPICS_COLLECTION``, iterates the answer
        ids in their ``aids`` field, and inserts one document per answer
        into ``ANSWERS_COLLECTION`` holding the vote count, cleaned
        content, cleaned comments (each followed by "@") and the author's
        follower count.
        '''
        # Original design notes: only questions meeting some threshold
        # (followers/answers) should be collected; already stored answers
        # are not refreshed when their author edits them, because answers
        # are long and the I/O cost is high.
        all_answers_id = pd.DataFrame(list(self.db_[TOPICS_COLLECTION].find()))
        for answers in all_answers_id['aids']:  # all answer ids of a question
            for ans_id in answers:

                ans = self.client_.answer(ans_id)
                ans_content = Cleaner.filter_tags(ans.content)
                comment_content = ""
                if ans.comment_count > 0:
                    # "@" terminates every comment so they can later be
                    # analysed together or split apart.
                    comment_content = "".join(
                        Cleaner.filter_tags(comment.content) + "@"
                        for comment in ans.comments)
                time.sleep(random.randint(1, 4))
                print("ans {} done retrival and cleaning".format(ans.id))

                ans_detail = {
                    'aid': ans_id,
                    'votes': ans.voteup_count,
                    'content': ans_content,
                    'comments': comment_content,
                    'author_follower_num': ans.author.follower_count
                }

                self.db_[ANSWERS_COLLECTION].insert_one(ans_detail)

        # TODO: data cleaning

        # TODO: in what format to save these data

    def parse_img_src(self, html):
        '''
        Return the ``src`` URL of every ``<img>`` tag found in ``html``.

        Tag matching is case-insensitive and accepts both ``<img ...>``
        and ``<img ... />``. Tags without a ``src`` attribute are skipped.

        :param html: HTML text to scan.
        :return: list of URL strings in document order.
        '''
        # "<img" followed by a word boundary, then everything up to the
        # closing ">" of that tag.
        img_tag_pattern = r'<img\b[^>]*>'
        # Whitespace is required before "src" so attributes such as
        # data-src are not mistaken for it.
        img_url_pattern = r'\ssrc="([^"\s]+)"'
        img_url_list = []
        for tag in re.findall(img_tag_pattern, html, re.IGNORECASE):
            found = re.search(img_url_pattern, tag, re.IGNORECASE)
            if found:
                img_url_list.append(found.group(1))
        return img_url_list

    def save_img(self, img_url, file_name, file_path='./images/'):
        '''
        Download ``img_url`` and write it to ``file_path/file_name``.

        The directory is created when missing. Any error is reported on
        stdout and otherwise ignored, so one failed image does not stop a
        batch.

        :param img_url: URL fetched with ``requests.get``.
        :param file_name: name of the file to create.
        :param file_path: target directory, ``./images/`` by default.
        '''
        try:
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            # Full path of the image file
            filename = os.path.join(file_path, file_name)
            # Download the image and store it in the directory
            img = requests.get(img_url)
            with open(filename, "wb") as f:
                f.write(img.content)
        except Exception as e:
            print("error: failed to save {} ({})".format(img_url, e))

    def download_images(self, path):
        '''
        Download the images of every answer in ``answers_to_download``.

        Each image is stored under ``./images`` using the last 10
        characters of its URL as the file name.

        :param path: currently unused.
        :return: None
        '''
        # NOTE(review): path is never read; the target directory is
        # hard-coded. Confirm whether the parameter was meant to select it.
        for ans_id in self.answers_to_download:
            answer = self.client_.answer(ans_id)
            content = answer.content
            img_src_list = self.parse_img_src(content)
            print("img src list : {}".format(img_src_list))
            for img_url in img_src_list:
                self.save_img(img_url, img_url[-10:], "./images")
                print("picture {} download complete".format(img_url))
            print("answer {} download complete".format(ans_id))

示例#26

0

显示文件

文件： main.py 项目： Zavier-Wang/Zhihu-1

def login():
    """Log in, then save every answer of one topic's unanswered questions.

    The token in ``token.pkl`` is reused when present; otherwise a terminal
    login runs and the token is saved. For topic 19560072, the id, title
    and answer count of each question are printed and every answer is
    saved through ``answer.save``. The summed answer count is printed last.
    """
    TOKEN_FILE = 'token.pkl'
    client = ZhihuClient()

    if not os.path.isfile(TOKEN_FILE):
        client.login_in_terminal()
        client.save_token(TOKEN_FILE)
    else:
        client.load_token(TOKEN_FILE)

    # The string below is disabled sample code; it is never executed.
    """
    me = client.me()
    print('name', me.name)
    print('headline', me.headline)
    print('description', me.description)

    print('following topic count', me.following_topic_count)
    print('following people count', me.following_topic_count)
    print('followers count', me.follower_count)

    print('voteup count', me.voteup_count)
    print('get thanks count', me.thanked_count)

    print('answered question', me.answer_count)
    print('question asked', me.question_count)
    print('collection count', me.collection_count)
    print('article count', me.articles_count)
    print('following column count', me.following_column_count)

    # 获取最近 5 个回答
    for _, answer in zip(range(5), me.answers):
        print(answer.question.title, answer.voteup_count)

    print('----------')

    # 获取点赞量最高的 5 个回答
    for _, answer in zip(range(5), me.answers.order_by('votenum')):
        print(answer.question.title, answer.voteup_count)

    print('----------')

    # 获取最近提的 5 个问题
    for _, question in zip(range(5), me.questions):
        print(question.title, question.answer_count)

    print('----------')

    # 获取最近发表的 5 个文章
    for _, article in zip(range(5), me.articles):
        print(article.title, article.voteup_count)
    """
    topic = client.topic(19560072)  # genetically modified organisms
    # topic = client.topic(19578906)  # climate change
    # topic = client.topic(19551296)  # online games

    total_answers = 0
    for question in topic.unanswered_questions:
        print(question.id)
        print(question.title)
        print(question.answer_count)
        total_answers += question.answer_count
        for answer in question.answers:
            writer = answer.author
            print(writer.id,writer.name)
            folder = 'Data\\Gene\\'+str(question.id)+'#'+question.title
            file_name = str(answer.author.id)+'#'+answer.author.name
            answer.save(folder, file_name)
    print("总共有{0}个回答".format(total_answers))

示例#27

0

显示文件

 def __init__(self):
     """Create the client and pick a token file name per Python major version."""
     major_version = sys.version_info[0]
     # The major version is appended to the token file name.
     self.TOKEN_FILE = 'token.pkl.' + str(major_version)
     self.client = ZhihuClient()

示例#28

0

显示文件

文件： crawl.py 项目： yug930/zhihulive

 def __init__(self):
     """Create a new ``ZhihuClient`` and keep it on ``self.client``."""
     self.client = ZhihuClient()