def grab(url, threadID):
    """Copy the profile fields of every followee of ``url`` onto ``Users`` documents.

    For each object yielded by ``User(url).get_followees()`` the ``Users``
    document with the same ``data_id`` is reused when the lookup finds one;
    otherwise a fresh ``Users()`` is filled in.

    Args:
        url: Profile URL handed to ``User``.
        threadID: Not read anywhere in this block; kept so callers need not
            change.

    NOTE(review): the indentation of this block was lost in SOURCE, so the
    nesting is reconstructed. No ``save()`` call is visible in this block;
    confirm where the populated document is persisted.
    """
    logging.info(url)
    for followee in User(url).get_followees():
        user = Users()
        try:
            # A single first() replaces the former count()-then-first() pair:
            # one round trip, and no window in which the document can vanish
            # between the two queries and leave ``user`` set to None.
            existing = Users.objects(data_id=followee.get_data_id()).first()
            if existing is not None:
                user = existing
        except Exception as e:
            logging.error("========error1")
            logging.error(e)
            # Back off, then carry on with the fresh document. The original
            # wrapped this in a ``while True`` that broke on both paths, so
            # there was never a retry here.
            time.sleep(300)

        try:
            user.user_id = followee.get_user_id()
            user.data_id = followee.get_data_id()
            user.followees_num = followee.get_followees_num()
            user.followers_num = followee.get_followers_num()
            user.asks_num = followee.get_asks_num()
            user.answers_num = followee.get_answers_num()
            user.collections_num = followee.get_collections_num()
            user.agree_num = followee.get_agree_num()
            user.thanks_num = followee.get_thanks_num()
            user.url = followee.get_user_url()
            user.modify_time = datetime.utcnow()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            # NOTE(review): assumed to belong to the handler (records which
            # profile failed); confirm against the original layout.
            logging.debug(followee.get_user_url())
def grab(url, threadID):
    """Copy the profile fields of every follower of ``url`` onto ``Users`` documents.

    For each object yielded by ``User(url).get_followers()`` the ``Users``
    document with the same ``data_id`` is reused when the lookup finds one;
    otherwise a fresh ``Users()`` is filled in. A failing lookup is retried
    after a 300 second pause until it succeeds.

    Args:
        url: Profile URL handed to ``User``; also printed on entry.
        threadID: Not read anywhere in this block; kept so callers need not
            change.

    NOTE(review): the indentation of this block was lost in SOURCE, so the
    nesting is reconstructed. No ``save()`` call is visible in this block;
    confirm where the populated document is persisted.
    """
    print(url)
    for follower in User(url).get_followers():
        user = Users()
        while True:
            try:
                # A single first() replaces the former count()-then-first()
                # pair: one round trip, and no window in which the document
                # can vanish between the two queries.
                existing = Users.objects(data_id=follower.get_data_id()).first()
            except Exception as e:
                logging.error("========error1")
                logging.error(e)
                # Wait, then retry the lookup.
                time.sleep(300)
            else:
                if existing is not None:
                    user = existing
                break

        # These four assignments are not guarded: an exception raised by one
        # of these getters propagates to the caller, as in the original.
        user.user_id = follower.get_user_id()
        user.data_id = follower.get_data_id()
        user.followees_num = follower.get_followees_num()
        user.followers_num = follower.get_followers_num()
        try:
            user.asks_num = follower.get_asks_num()
            user.answers_num = follower.get_answers_num()
            user.collections_num = follower.get_collections_num()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            # NOTE(review): assumed to belong to the handler (records which
            # profile failed); confirm against the original layout.
            logging.debug(follower.get_user_url())
def user_test(user_url):
    """Query one user's profile through ``User`` and print every result.

    All getters run first, in a fixed order; the collected values are then
    printed one per line in that same order.

    Args:
        user_url: Profile URL handed to ``User``.
    """
    profile = User(user_url)

    uid = profile.get_user_id()                  # user ID
    n_followers = profile.get_followers_num()    # people following this user
    n_followees = profile.get_followees_num()    # people this user follows
    n_asks = profile.get_asks_num()              # questions asked
    n_answers = profile.get_answers_num()        # answers written
    n_collections = profile.get_collections_num()  # collections owned
    n_agree = profile.get_agree_num()            # upvotes received
    n_thanks = profile.get_thanks_num()          # thanks received

    # The original sample output shows the next four as generator objects
    # (e.g. "<generator object get_followee at 0x...>"), so printing them
    # shows the object rather than its items.
    followee_gen = profile.get_followees()       # people this user follows
    follower_gen = profile.get_followers()       # people following this user
    ask_gen = profile.get_asks()                 # questions this user asked
    answer_gen = profile.get_answers()           # answers this user wrote
    collection_set = profile.get_collections()   # this user's collections

    # Sample counter output from the original comments, in order:
    # 614840, 8408, 1323, 786, 44, 46387, 11477.
    report = (
        uid, n_followers, n_followees, n_asks, n_answers, n_collections,
        n_agree, n_thanks, followee_gen, follower_gen, ask_gen, answer_gen,
        collection_set,
    )
    for value in report:
        print(value)
def user_test(user_url):
    """Print the profile fields of the user at ``user_url``, one per line.

    The getters are invoked in a fixed order before anything is printed; the
    output follows the same order.

    Args:
        user_url: Profile URL handed to ``User``.
    """
    target = User(user_url)

    # Scalar fields, in call order: user ID, follower count, followee count,
    # questions asked, answers written, collections owned, upvotes received,
    # thanks received.
    scalar_getters = (
        target.get_user_id,
        target.get_followers_num,
        target.get_followees_num,
        target.get_asks_num,
        target.get_answers_num,
        target.get_collections_num,
        target.get_agree_num,
        target.get_thanks_num,
    )
    # Listings, in call order: followees, followers, questions asked, answers
    # written, collections. The original sample output shows the first four
    # printing as "<generator object ...>".
    listing_getters = (
        target.get_followees,
        target.get_followers,
        target.get_asks,
        target.get_answers,
        target.get_collections,
    )

    fetched = [fetch() for fetch in scalar_getters]
    fetched.extend(fetch() for fetch in listing_getters)

    for entry in fetched:
        print(entry)
def __init__(self, user_uuid, layer):
    """Load the profile for ``user_uuid`` and cache its headline numbers.

    Args:
        user_uuid: Identifier appended to ``prefix_people`` to form the
            profile URL passed to ``User``.
        layer: Followees are collected only when this is below 3; otherwise
            ``self.followees`` is an empty list.

    The follower list is not collected (the original kept that line
    commented out).
    """
    def _followee_uuid(followee):
        # Strip the profile URL prefix, leaving the followee's identifier.
        return followee.user_url.replace(prefix_people, "")

    profile = User(prefix_people + user_uuid)
    self.user = profile
    self.uuid = user_uuid
    self.user_id = profile.get_user_id()
    if layer < 3:
        self.followees = map(_followee_uuid, profile.get_followees())
    else:
        self.followees = []
    self.answer_num = profile.get_answers_num()
    self.following_num = profile.get_followees_num()
    self.follower_num = profile.get_followers_num()
    self.upvote_num = profile.get_agree_num()
    self.thank_num = profile.get_thanks_num()
    self.layer = layer
def __init__(self, user_uuid, layer):
    """Load the profile for ``user_uuid`` and cache its headline numbers.

    Args:
        user_uuid: The unique id of the user; appended to ``prefix_people``
            to form the profile URL passed to ``User``.
        layer: The number of hops needed to reach this user from the seed
            user. Followees are collected only when this is below 3;
            otherwise ``self.followees`` is an empty list.
    """
    def _followee_uuid(followee):
        # Remove both URL prefixes so only the followee's identifier remains.
        without_prefix = followee.user_url.replace(prefix_people, "")
        return without_prefix.replace(prefix_people_http, "")

    profile = User(prefix_people + user_uuid)
    self.user = profile
    self.uuid = user_uuid
    self.user_id = profile.get_user_id()
    if layer < 3:
        self.followees = map(_followee_uuid, profile.get_followees())
    else:
        self.followees = []
    self.answer_num = profile.get_answers_num()
    self.following_num = profile.get_followees_num()
    self.follower_num = profile.get_followers_num()
    self.upvote_num = profile.get_agree_num()
    self.thank_num = profile.get_thanks_num()
    self.layer = layer
def __init__(self, user_uuid, layer):
    """Build the node for one user and cache the profile's headline numbers.

    Args:
        user_uuid: The unique id of the user; appended to ``prefix_people``
            to form the profile URL passed to ``User``.
        layer: The number of hops needed to reach this user from the seed
            user. Followees are collected only when this is below 3;
            otherwise ``self.followees`` is an empty list.
    """
    source = User(prefix_people + user_uuid)
    self.user = source
    self.uuid = user_uuid
    self.user_id = source.get_user_id()

    collect_followees = layer < 3
    if not collect_followees:
        self.followees = []
    else:
        def _bare_id(person):
            # Drop either URL prefix, keeping only the identifier part.
            shortened = person.user_url.replace(prefix_people, "")
            return shortened.replace(prefix_people_http, "")

        self.followees = map(_bare_id, source.get_followees())

    self.answer_num = source.get_answers_num()
    self.following_num = source.get_followees_num()
    self.follower_num = source.get_followers_num()
    self.upvote_num = source.get_agree_num()
    self.thank_num = source.get_thanks_num()
    self.layer = layer
def user_test(user_url):
    """Query one user's profile through ``User`` and print every result.

    All getters run first, in a fixed order. The scalar fields are then
    printed one per line, followed by the followee and follower listings
    (each with the user IDs of at most its first 41 entries) and finally the
    question, answer and collection listings.

    Args:
        user_url: Profile URL handed to ``User``.
    """
    profile = User(user_url)

    uid = profile.get_user_id()                  # user ID
    gender = profile.get_gender()                # gender, e.g. "male"
    n_followers = profile.get_followers_num()    # people following this user
    n_followees = profile.get_followees_num()    # people this user follows
    n_asks = profile.get_asks_num()              # questions asked
    n_answers = profile.get_answers_num()        # answers written
    n_collections = profile.get_collections_num()  # collections owned
    n_agree = profile.get_agree_num()            # upvotes received
    n_thanks = profile.get_thanks_num()          # thanks received
    avatar_url = profile.get_head_img_url()      # avatar image URL

    # The original sample output shows the next four printing as
    # "<generator object ...>".
    followee_gen = profile.get_followees()       # people this user follows
    follower_gen = profile.get_followers()       # people following this user
    ask_gen = profile.get_asks()                 # questions this user asked
    answer_gen = profile.get_answers()           # answers this user wrote
    collection_set = profile.get_collections()   # this user's collections

    scalars = (
        uid, gender, n_followers, n_followees, n_asks, n_answers,
        n_collections, n_agree, n_thanks, avatar_url,
    )
    for value in scalars:
        print(value)

    print(followee_gen)
    for seen, followee in enumerate(followee_gen, 1):
        print(followee.get_user_id())
        if seen == 41:
            break

    print(follower_gen)
    for seen, follower in enumerate(follower_gen, 1):
        print(follower.get_user_id())
        if seen == 41:
            break

    print(ask_gen)
    print(answer_gen)
    print(collection_set)
def user_test(user_url):
    """Query one user's profile through ``User`` and print every result.

    All getters run first, in a fixed order. Output order: the scalar
    fields, the followee listing plus the user IDs of at most its first 41
    entries, the same for followers, every topic, and finally the question,
    answer and collection listings.

    Args:
        user_url: Profile URL handed to ``User``.
    """
    def _show_leading_ids(people, limit=41):
        # Print the user ID of at most ``limit`` entries, stopping without
        # pulling a further item from ``people``.
        for position, person in enumerate(people, 1):
            print(person.get_user_id())
            if position == limit:
                break

    profile = User(user_url)

    uid = profile.get_user_id()                  # user ID
    gender = profile.get_gender()                # gender, e.g. "male"
    n_followers = profile.get_followers_num()    # people following this user
    n_followees = profile.get_followees_num()    # people this user follows
    n_asks = profile.get_asks_num()              # questions asked
    n_answers = profile.get_answers_num()        # answers written
    n_collections = profile.get_collections_num()  # collections owned
    n_agree = profile.get_agree_num()            # upvotes received
    n_thanks = profile.get_thanks_num()          # thanks received
    avatar_url = profile.get_head_img_url()      # avatar image URL

    # The original sample output shows followees, followers, asks and
    # answers printing as "<generator object ...>".
    followee_gen = profile.get_followees()       # people this user follows
    follower_gen = profile.get_followers()       # people following this user
    topic_iter = profile.get_topics()
    ask_gen = profile.get_asks()                 # questions this user asked
    answer_gen = profile.get_answers()           # answers this user wrote
    collection_set = profile.get_collections()   # this user's collections

    for value in (uid, gender, n_followers, n_followees, n_asks, n_answers,
                  n_collections, n_agree, n_thanks, avatar_url):
        print(value)

    print(followee_gen)
    _show_leading_ids(followee_gen)

    print(follower_gen)
    _show_leading_ids(follower_gen)

    for topic in topic_iter:
        print(topic)

    print(ask_gen)
    print(answer_gen)
    print(collection_set)
# -*- coding: utf-8 -*- from zhihu import User user_url = "http://www.zhihu.com/people/jixin" user = User(user_url) # 获取用户ID user_id = user.get_user_id() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 followees_num =user.get_followees_num() # 获取该用户提问的个数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 answers = user.get_answers() # 获取该用户的收藏夹
def user_test(user_url):
    """Query one user's profile through ``User`` and print every result.

    All getters run first, in a fixed order. Output order: the scalar
    fields, the followee listing plus the user IDs of at most its first 41
    entries, the same for followers, then the question listing, the answer
    listing, the 2014-only answer listing and the collections.

    Args:
        user_url: Profile URL handed to ``User``.
    """
    account = User(user_url)

    uid = account.get_user_id()                  # user ID
    n_followers = account.get_followers_num()    # people following this user
    n_followees = account.get_followees_num()    # people this user follows
    n_asks = account.get_asks_num()              # questions asked
    n_answers = account.get_answers_num()        # answers written
    n_collections = account.get_collections_num()  # collections owned
    n_agree = account.get_agree_num()            # upvotes received
    n_thanks = account.get_thanks_num()          # thanks received

    # The original sample output shows followees, followers, asks and
    # answers printing as "<generator object ...>".
    followee_gen = account.get_followees()       # people this user follows
    follower_gen = account.get_followers()       # people following this user
    ask_gen = account.get_asks()                 # questions this user asked
    answer_gen = account.get_answers()           # answers this user wrote
    # Answers restricted to the 2014 calendar year.
    answers_2014_gen = account.get_answers(begin_date='2014-01-01',
                                           end_date='2014-12-31')
    collection_set = account.get_collections()   # this user's collections

    for value in (uid, n_followers, n_followees, n_asks, n_answers,
                  n_collections, n_agree, n_thanks):
        print(value)

    # Each people listing is printed, then previewed with up to 41 user IDs.
    print(followee_gen)
    for ordinal, followee in enumerate(followee_gen, 1):
        print(followee.get_user_id())
        if ordinal == 41:
            break

    print(follower_gen)
    for ordinal, follower in enumerate(follower_gen, 1):
        print(follower.get_user_id())
        if ordinal == 41:
            break

    for listing in (ask_gen, answer_gen, answers_2014_gen, collection_set):
        print(listing)
def main():
    """Crawl Zhihu user profiles, using MongoDB collections as the work queue.

    ``zhihu_user.urllist`` holds one document per profile URL with a ``jlzt``
    status field, which this function uses as follows: ``"1"`` waiting to be
    processed, ``"3"`` picked up, ``"2"`` finished. ``zhihu_user.userlist``
    receives one document per crawled user.

    The loop takes a waiting URL, reads the profile through ``User``, queues
    every follower URL that is not queued yet, stores the collected data and
    marks the URL finished. When anything in that sequence raises, the
    traceback is printed, the URL is put back to ``"1"`` after a 10 second
    pause, and the loop moves on. The function returns once no waiting URL
    is left.

    NOTE(review): the indentation of this block was lost in SOURCE, so the
    nesting is reconstructed. In particular the per-item sleeps are placed at
    loop level and the final status print inside the ``while`` loop (the
    trailing ``continue`` in the handler only makes sense that way).
    NOTE(review): ``insert``/``update`` are the legacy PyMongo collection
    methods the original used; they are kept because the installed PyMongo
    version is not visible here.
    """
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu_user
    urllist = db.urllist
    userlist = db.userlist

    origin_users = ["https://www.zhihu.com/people/jixin",
                    "https://www.zhihu.com/people/zhang-jia-wei",
                    "https://www.zhihu.com/people/zhu-xuan-86",
                    "https://www.zhihu.com/people/kaifulee",
                    "https://www.zhihu.com/people/e-miao-de-nai-ba"]
    # Seed the queue with the starting profiles that are not in it yet.
    queued_urls = set(urllist.distinct("user_url"))
    for seed_url in origin_users:
        if seed_url not in queued_urls:
            urllist.insert({"user_url": seed_url, "jlzt": "1"})

    while True:
        item = urllist.find_one({'jlzt': '1'})
        if item is None:
            print(u'已全部处理完成')
            break

        user_url = item["user_url"]
        starttime = datetime.datetime.now()
        # Mark the URL as picked up before doing any work on it.
        urllist.update({"user_url": user_url}, {"$set": {"jlzt": "3"}})
        try:
            user = User(user_url)
            zhihu_id = user.get_data_id()  # unique id of the user
            zhihu_name = user.get_user_id()  # user name
            followees_num = user.get_followees_num()  # people this user follows
            followers_num = user.get_followers_num()  # people following this user
            gender = user.get_gender()  # gender
            asks_num = user.get_asks_num()  # questions asked
            answers_num = user.get_answers_num()  # answers written
            collections_num = user.get_collections_num()  # collections owned
            agree_num = user.get_agree_num()  # upvotes received
            thanks_num = user.get_thanks_num()  # thanks received
            head_img_url = user.get_head_img_url()  # avatar image URL
            topics_num = user.get_topics_num()  # topics followed
            followees = user.get_followees()  # people this user follows
            followers = user.get_followers()  # people following this user
            questions = user.get_asks()  # questions asked
            answers = user.get_answers()  # answers written
            topics = user.get_topics()  # topics followed
            print("start process " + zhihu_name + ";\n")

            followee_l = []
            follower_l = []
            questions_l = []
            answers_l = []
            topics_l = []

            print(u'开始处理关注的人')
            for followee in followees:
                # Keep only the fifth '/'-separated piece of the profile URL.
                followee_l.append(followee.user_url.split('/')[4])
                if len(followee_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait...")
                time.sleep(0.05)
            print(u'添加完成')

            print(u'开始添加关注者至处理队列')
            for follower in followers:
                follower_l.append(follower.user_url.split('/')[4])
                # Targeted lookup instead of re-reading every queued URL with
                # distinct() for each follower: the check stays just as
                # fresh, but the whole collection is no longer transferred on
                # every iteration.
                if urllist.find_one({"user_url": follower.user_url}) is None:
                    urllist.insert({"user_url": follower.user_url, "jlzt": "1"})
                if len(follower_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait...")
                time.sleep(0.05)
            print(u'添加完成')

            for q in questions:
                questions_l.append("url=" + q.url + "|title=" + q.get_title())
                time.sleep(0.01)
            for a in answers:
                answers_l.append(a.answer_url)
                time.sleep(0.01)
            for t in topics:
                topics_l.append(t)
                time.sleep(0.01)

            user_data = {"zhihu_id": zhihu_id,
                         "zhihu_name": zhihu_name,
                         "followees_num": followees_num,
                         "followers_num": followers_num,
                         "followees": followee_l,
                         "followers": follower_l,
                         "questions": questions_l,
                         "gender": gender,
                         "asks_num": asks_num,
                         "answers_num": answers_num,
                         # NOTE(review): "ansers" looks like a typo for
                         # "answers"; left unchanged because renaming the key
                         # would change the stored schema.
                         "ansers": answers_l,
                         "collections_num": collections_num,
                         "agree_num": agree_num,
                         "thanks_num": thanks_num,
                         "topics_num": topics_num,
                         "topics": topics_l,
                         "head_img_url": head_img_url
                         }
            print("user_data prepared:")
            # The original compared user_url with userlist's "user_url"
            # values, a field user_data never contains, so the check could
            # not match and a re-crawled user was inserted again. Matching on
            # zhihu_id as well makes the duplicate check effective while
            # still honouring any document that does carry user_url.
            already_stored = userlist.find_one(
                {"$or": [{"user_url": user_url}, {"zhihu_id": zhihu_id}]})
            if already_stored is None:
                userlist.insert(user_data)
                print("user_data inserted: \n")
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "2"}})
            endtime = datetime.datetime.now()
            # total_seconds() also counts whole days, which .seconds drops.
            interval = int((endtime - starttime).total_seconds())
            print(zhihu_name + "finnished. spent " + str(interval) + "seconds.")
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # can still stop the crawler.
            traceback.print_exc()
            time.sleep(10)
            # Put the URL back in the waiting state so it is retried.
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})
            continue
        print("处理完毕")