Пример #1
0
def grab(url, threadID):
    """Copy the profile fields of every followee of ``url`` onto a ``Users`` record.

    For each followee, the ``Users`` document with the same ``data_id`` is
    reused when the lookup finds one; otherwise a fresh ``Users()`` is filled
    in.  The visible body only assigns attributes; it returns nothing.

    Args:
        url: profile URL handed to ``User``.
        threadID: part of the caller-facing signature; not used in this body.
    """
    logging.info(url)
    seed = User(url)

    for followee in seed.get_followees():
        user = Users()
        try:
            # A single query replaces count() followed by first(); first() is
            # expected to yield None when no document matches.
            existing = Users.objects(data_id=followee.get_data_id()).first()
            if existing is not None:
                user = existing
        except Exception as e:
            # Lookup failed: log, pause, then carry on with the fresh record.
            logging.error("========error1")
            logging.error(e)
            time.sleep(300)
        try:
            user.user_id = followee.get_user_id()
            user.data_id = followee.get_data_id()
            user.followees_num = followee.get_followees_num()
            user.followers_num = followee.get_followers_num()
            user.asks_num = followee.get_asks_num()
            user.answers_num = followee.get_answers_num()
            user.collections_num = followee.get_collections_num()
            user.agree_num = followee.get_agree_num()
            user.thanks_num = followee.get_thanks_num()
            user.url = followee.get_user_url()
            user.modify_time = datetime.utcnow()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            logging.debug(followee.get_user_url())
Пример #2
0
def grab(url, threadID):
    """Copy the profile fields of every follower of ``url`` onto a ``Users`` record.

    For each follower, the ``Users`` document with the same ``data_id`` is
    reused when the lookup finds one; otherwise a fresh ``Users()`` is filled
    in.  The lookup is retried every 300 seconds until it stops raising.
    The visible body only assigns attributes; it returns nothing.

    Args:
        url: profile URL handed to ``User``.
        threadID: part of the caller-facing signature; not used in this body.
    """
    print(url)
    seed = User(url)

    for follower in seed.get_followers():
        user = Users()
        while True:
            try:
                # A single query replaces count() followed by first();
                # first() is expected to yield None when nothing matches.
                existing = Users.objects(data_id=follower.get_data_id()).first()
                if existing is not None:
                    user = existing
                break
            except Exception as e:
                # No break here: wait and retry the lookup.
                logging.error("========error1")
                logging.error(e)
                time.sleep(300)
        # These four assignments are deliberately outside the try below, so
        # an error here propagates to the caller as it did before.
        user.user_id = follower.get_user_id()
        user.data_id = follower.get_data_id()
        user.followees_num = follower.get_followees_num()
        user.followers_num = follower.get_followers_num()
        try:
            user.asks_num = follower.get_asks_num()
            user.answers_num = follower.get_answers_num()
            user.collections_num = follower.get_collections_num()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            logging.debug(follower.get_user_url())
Пример #3
0
def user_test(user_url):
    """Read a profile through ``User`` and print its counters and collections.

    Every accessor is called first, then the results are printed in the same
    order they were fetched.  Nothing is returned.

    Args:
        user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Scalar fields; trailing numbers are sample values from an earlier run.
    name = profile.get_user_id()
    follower_count = profile.get_followers_num()      # 614840
    followee_count = profile.get_followees_num()      # 8408
    question_count = profile.get_asks_num()           # 1323
    answer_count = profile.get_answers_num()          # 786
    collection_count = profile.get_collections_num()  # 44
    upvote_count = profile.get_agree_num()            # 46387
    thanks_count = profile.get_thanks_num()           # 11477

    # Collection accessors.  Earlier sample output showed generator objects
    # for the first four (e.g. <generator object get_followee at 0x...>).
    following = profile.get_followees()
    fans = profile.get_followers()
    questions = profile.get_asks()
    replies = profile.get_answers()
    favourites = profile.get_collections()

    scalars = (name, follower_count, followee_count, question_count,
               answer_count, collection_count, upvote_count, thanks_count)
    for value in scalars:
        print(value)

    for lazy in (following, fans, questions, replies, favourites):
        print(lazy)
Пример #4
0
def user_test(user_url):
    """Read a profile through ``User`` and print its counters and collections.

    Every accessor is called first, then the results are printed in the same
    order they were fetched.  Nothing is returned.

    Args:
        user_url: profile URL handed to ``User``.
    """
    account = User(user_url)

    # Scalar fields; trailing numbers are sample values from an earlier run.
    display_name = account.get_user_id()
    n_followers = account.get_followers_num()      # 614840
    n_followees = account.get_followees_num()      # 8408
    n_asks = account.get_asks_num()                # 1323
    n_answers = account.get_answers_num()          # 786
    n_collections = account.get_collections_num()  # 44
    n_agree = account.get_agree_num()              # 46387
    n_thanks = account.get_thanks_num()            # 11477

    # Collection accessors.  Earlier sample output showed generator objects
    # for the first four (e.g. <generator object get_follower at 0x...>).
    followee_iter = account.get_followees()
    follower_iter = account.get_followers()
    ask_iter = account.get_asks()
    answer_iter = account.get_answers()
    collection_iter = account.get_collections()

    for number in (display_name, n_followers, n_followees, n_asks,
                   n_answers, n_collections, n_agree, n_thanks):
        print(number)

    for iterator in (followee_iter, follower_iter, ask_iter, answer_iter,
                     collection_iter):
        print(iterator)
Пример #5
0
 def __init__(self, user_uuid, layer):
     """Load a profile and cache its summary fields on this instance.

     Args:
         user_uuid: identifier appended to ``prefix_people`` to build the
             profile URL; stored as ``self.uuid``.
         layer: stored as ``self.layer``; followees are only collected when
             it is below 3 (presumably a crawl depth -- confirm with caller).
     """
     def _strip_prefix(followee):
         # Reduce a followee's URL to the part after ``prefix_people``.
         return followee.user_url.replace(prefix_people, "")

     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()
     if layer < 3:
         self.followees = map(_strip_prefix, profile.get_followees())
     else:
         self.followees = []
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Пример #6
0
 def __init__(self, user_uuid, layer):
     """Load a profile and cache its summary fields on this instance.

     Args:
         user_uuid: identifier appended to ``prefix_people`` to build the
             profile URL; stored as ``self.uuid``.
         layer: stored as ``self.layer``; followees are only collected when
             it is below 3 (presumably a crawl depth -- confirm with caller).
     """
     def _to_uuid(person):
         # Reduce a followee's URL to the part after ``prefix_people``.
         return person.user_url.replace(prefix_people, "")

     account = User(prefix_people + user_uuid)
     self.user = account
     self.uuid = user_uuid
     self.user_id = account.get_user_id()
     if layer < 3:
         self.followees = map(_to_uuid, account.get_followees())
     else:
         self.followees = []
     self.answer_num = account.get_answers_num()
     self.following_num = account.get_followees_num()
     self.follower_num = account.get_followers_num()
     self.upvote_num = account.get_agree_num()
     self.thank_num = account.get_thanks_num()
     self.layer = layer
Пример #7
0
 def __init__(self, user_uuid, layer):
     """Load a profile and cache its summary fields on this instance.

     Args:
         user_uuid: the unique id of the user
         layer: the number of hops to reach this user from the seed user;
             followees are only collected when it is below 3
     """
     def _strip_prefixes(followee):
         # Remove both URL prefixes so only the trailing identifier remains.
         without_default = followee.user_url.replace(prefix_people, "")
         return without_default.replace(prefix_people_http, "")

     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()
     if layer < 3:
         self.followees = map(_strip_prefixes, profile.get_followees())
     else:
         self.followees = []
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Пример #8
0
 def __init__(self, user_uuid, layer):
     """Load a profile and cache its summary fields on this instance.

     Args:
         user_uuid: the unique id of the user
         layer: the number of hops to reach this user from the seed user;
             followees are only collected when it is below 3
     """
     def _bare_id(person):
         # Remove both URL prefixes so only the trailing identifier remains.
         trimmed = person.user_url.replace(prefix_people, "")
         return trimmed.replace(prefix_people_http, "")

     account = User(prefix_people + user_uuid)
     self.user = account
     self.uuid = user_uuid
     self.user_id = account.get_user_id()
     if layer < 3:
         self.followees = map(_bare_id, account.get_followees())
     else:
         self.followees = []
     self.answer_num = account.get_answers_num()
     self.following_num = account.get_followees_num()
     self.follower_num = account.get_followers_num()
     self.upvote_num = account.get_agree_num()
     self.thank_num = account.get_thanks_num()
     self.layer = layer
Пример #9
0
def user_test(user_url):
    """Read a profile through ``User`` and print its fields and connections.

    All accessors are called up front.  The scalar values are printed first,
    then the followee and follower collections (each followed by the user id
    of at most 41 of its members), then the remaining collections.  Nothing
    is returned.

    Args:
        user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Scalar fields; trailing values are samples from an earlier run.
    name = profile.get_user_id()
    gender = profile.get_gender()                     # male
    follower_count = profile.get_followers_num()      # 614840
    followee_count = profile.get_followees_num()      # 8408
    question_count = profile.get_asks_num()           # 1323
    answer_count = profile.get_answers_num()          # 786
    collection_count = profile.get_collections_num()  # 44
    upvote_count = profile.get_agree_num()            # 46387
    thanks_count = profile.get_thanks_num()           # 11477
    avatar_url = profile.get_head_img_url()

    # Collection accessors.  Earlier sample output showed generator objects
    # for the first four (e.g. <generator object get_followee at 0x...>).
    following = profile.get_followees()
    fans = profile.get_followers()
    questions = profile.get_asks()
    replies = profile.get_answers()
    favourites = profile.get_collections()

    for value in (name, gender, follower_count, followee_count,
                  question_count, answer_count, collection_count,
                  upvote_count, thanks_count, avatar_url):
        print(value)

    print(following)
    for position, followee in enumerate(following, 1):
        print(followee.get_user_id())
        if position == 41:
            # Stop after the 41st followee.
            break

    print(fans)
    for position, follower in enumerate(fans, 1):
        print(follower.get_user_id())
        if position == 41:
            # Stop after the 41st follower.
            break

    for lazy in (questions, replies, favourites):
        print(lazy)
Пример #10
0
def user_test(user_url):
    """Read a profile through ``User`` and print its fields and connections.

    All accessors are called up front.  The scalar values are printed first,
    then the followee and follower collections (each followed by the user id
    of at most 41 of its members), then every topic, then the remaining
    collections.  Nothing is returned.

    Args:
        user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Scalar fields; trailing values are samples from an earlier run.
    name = profile.get_user_id()
    gender = profile.get_gender()                     # male
    follower_count = profile.get_followers_num()      # 614840
    followee_count = profile.get_followees_num()      # 8408
    question_count = profile.get_asks_num()           # 1323
    answer_count = profile.get_answers_num()          # 786
    collection_count = profile.get_collections_num()  # 44
    upvote_count = profile.get_agree_num()            # 46387
    thanks_count = profile.get_thanks_num()           # 11477
    avatar_url = profile.get_head_img_url()

    # Collection accessors.  Earlier sample output showed generator objects
    # for followees, followers, asks and answers.
    following = profile.get_followees()
    fans = profile.get_followers()
    followed_topics = profile.get_topics()
    questions = profile.get_asks()
    replies = profile.get_answers()
    favourites = profile.get_collections()

    for value in (name, gender, follower_count, followee_count,
                  question_count, answer_count, collection_count,
                  upvote_count, thanks_count, avatar_url):
        print(value)

    print(following)
    for position, followee in enumerate(following, 1):
        print(followee.get_user_id())
        if position == 41:
            # Stop after the 41st followee.
            break

    print(fans)
    for position, follower in enumerate(fans, 1):
        print(follower.get_user_id())
        if position == 41:
            # Stop after the 41st follower.
            break

    for entry in followed_topics:
        print(entry)

    for lazy in (questions, replies, favourites):
        print(lazy)
Пример #11
0
# -*- coding: utf-8 -*-
from zhihu import User

user_url = "http://www.zhihu.com/people/jixin"
user = User(user_url)
# 获取用户ID
user_id = user.get_user_id()
# 获取该用户的关注者人数
followers_num = user.get_followers_num()
# 获取该用户关注的人数
followees_num =user.get_followees_num()
# 获取该用户提问的个数
asks_num = user.get_asks_num()
# 获取该用户回答的个数
answers_num = user.get_answers_num()
# 获取该用户收藏夹个数
collections_num = user.get_collections_num()
# 获取该用户获得的赞同数
agree_num = user.get_agree_num()
# 获取该用户获得的感谢数
thanks_num = user.get_thanks_num()

# 获取该用户关注的人
followees = user.get_followees()
# 获取关注该用户的人
followers = user.get_followers()
# 获取该用户提的问题
asks = user.get_asks()
# 获取该用户回答的问题的答案
answers = user.get_answers()
# 获取该用户的收藏夹
Пример #12
0
def user_test(user_url):
    """Read a profile through ``User`` and print its fields and connections.

    All accessors are called up front.  The scalar values are printed first,
    then the followee and follower collections (each followed by the user id
    of at most 41 of its members), then the question, answer, 2014-answer
    and collection results.  Nothing is returned.

    Args:
        user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Scalar fields; trailing numbers are sample values from an earlier run.
    name = profile.get_user_id()
    follower_count = profile.get_followers_num()      # 614840
    followee_count = profile.get_followees_num()      # 8408
    question_count = profile.get_asks_num()           # 1323
    answer_count = profile.get_answers_num()          # 786
    collection_count = profile.get_collections_num()  # 44
    upvote_count = profile.get_agree_num()            # 46387
    thanks_count = profile.get_thanks_num()           # 11477

    # Collection accessors.  Earlier sample output showed generator objects
    # for followees, followers, asks and answers.
    following = profile.get_followees()
    fans = profile.get_followers()
    questions = profile.get_asks()
    replies = profile.get_answers()
    # Same accessor, restricted to the calendar year 2014.
    replies_2014 = profile.get_answers(begin_date='2014-01-01', end_date='2014-12-31')
    favourites = profile.get_collections()

    for value in (name, follower_count, followee_count, question_count,
                  answer_count, collection_count, upvote_count, thanks_count):
        print(value)

    print(following)
    for position, followee in enumerate(following, 1):
        print(followee.get_user_id())
        if position == 41:
            # Stop after the 41st followee.
            break

    print(fans)
    for position, follower in enumerate(fans, 1):
        print(follower.get_user_id())
        if position == 41:
            # Stop after the 41st follower.
            break

    for lazy in (questions, replies, replies_2014, favourites):
        print(lazy)
Пример #13
0
def _enqueue_if_new(urllist, user_url):
    """Insert ``user_url`` into the queue as pending ("1") unless already present."""
    # One indexed-style lookup per URL instead of pulling every queued URL
    # with distinct() and scanning the resulting list.
    if urllist.find_one({"user_url": user_url}) is None:
        urllist.insert({"user_url": user_url, "jlzt": "1"})


def _collect_user_data(user, urllist):
    """Read every field of ``user`` and return the document to store.

    While walking the followers, each follower URL that is not queued yet is
    added to ``urllist`` as pending.
    """
    zhihu_id = user.get_data_id()          # unique user id
    zhihu_name = user.get_user_id()        # user name
    followees_num = user.get_followees_num()
    followers_num = user.get_followers_num()
    gender = user.get_gender()

    asks_num = user.get_asks_num()
    answers_num = user.get_answers_num()
    collections_num = user.get_collections_num()
    agree_num = user.get_agree_num()
    thanks_num = user.get_thanks_num()
    head_img_url = user.get_head_img_url()
    topics_num = user.get_topics_num()

    followees = user.get_followees()
    followers = user.get_followers()
    questions = user.get_asks()
    answers = user.get_answers()
    topics = user.get_topics()

    print("start process " + zhihu_name + ";\n")

    followee_l = []
    follower_l = []
    questions_l = []
    answers_l = []
    topics_l = []

    print(u'开始处理关注的人')
    for followee in followees:
        # Element 4 of the '/'-split URL is stored as the followee's id.
        followee_l.append(followee.user_url.split('/')[4])
        if len(followee_l) % 100 == 0:
            print(zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait...")
        time.sleep(0.05)  # short pause between items, kept from the original
    print(u'添加完成')

    print(u'开始添加关注者至处理队列')
    for follower in followers:
        follower_l.append(follower.user_url.split('/')[4])
        _enqueue_if_new(urllist, follower.user_url)
        if len(follower_l) % 100 == 0:
            print(zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait...")
        time.sleep(0.05)
    print(u'添加完成')

    for q in questions:
        questions_l.append("url=" + q.url + "|title=" + q.get_title())
        time.sleep(0.01)
    for a in answers:
        answers_l.append(a.answer_url)
        time.sleep(0.01)
    for t in topics:
        topics_l.append(t)
        time.sleep(0.01)

    return {
        "zhihu_id": zhihu_id,
        "zhihu_name": zhihu_name,
        "followees_num": followees_num,
        "followers_num": followers_num,
        "followees": followee_l,
        "followers": follower_l,
        "questions": questions_l,
        "gender": gender,
        "asks_num": asks_num,
        "answers_num": answers_num,
        # NOTE(review): "ansers" is misspelled, but it is the key already
        # written to stored documents, so it is kept unchanged here.
        "ansers": answers_l,
        "collections_num": collections_num,
        "agree_num": agree_num,
        "thanks_num": thanks_num,
        "topics_num": topics_num,
        "topics": topics_l,
        "head_img_url": head_img_url,
    }


def main():
    """Crawl queued profile URLs and store one document per user in MongoDB.

    Queue entries live in ``zhihu_user.urllist`` with a ``jlzt`` state:
    "1" is picked up for processing, "3" is set while a URL is being
    processed, and "2" is set once it has been handled.  A failed URL is put
    back to "1" after a 10 second pause.  User documents are written to
    ``zhihu_user.userlist``.  The loop ends when no "1" entry is left.
    """
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu_user
    urllist = db.urllist
    userlist = db.userlist

    origin_users = [
        "https://www.zhihu.com/people/jixin",
        "https://www.zhihu.com/people/zhang-jia-wei",
        "https://www.zhihu.com/people/zhu-xuan-86",
        "https://www.zhihu.com/people/kaifulee",
        "https://www.zhihu.com/people/e-miao-de-nai-ba",
    ]
    for seed_url in origin_users:
        _enqueue_if_new(urllist, seed_url)

    while True:
        item = urllist.find_one({'jlzt': '1'})
        if item is None:
            print(u'已全部处理完成')
            break
        user_url = item["user_url"]

        starttime = datetime.datetime.now()
        urllist.update({"user_url": user_url}, {"$set": {"jlzt": "3"}})

        try:
            user_data = _collect_user_data(User(user_url), urllist)
            print("user_data prepared:")

            # Stored documents carry no "user_url" field, so the previous
            # check against distinct("user_url") could never match; compare
            # on the unique id that is actually stored.
            if userlist.find_one({"zhihu_id": user_data["zhihu_id"]}) is None:
                userlist.insert(user_data)
                print("user_data inserted: \n")

            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "2"}})
            endtime = datetime.datetime.now()
            # total_seconds() keeps whole days; .seconds alone drops them.
            interval = int((endtime - starttime).total_seconds())
            print(user_data["zhihu_name"] + "finnished. spent " + str(interval) + "seconds.")
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still stop the crawler.
            traceback.print_exc()
            time.sleep(10)
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})
            continue

    print("处理完毕")