def snowball_following(poi_db, net_db, level, check='N'): #Processing max 200 users each time. start_level = level while True: count = poi_db.count({'level': start_level, 'protected': False, 'following_scrape_flag': {'$exists': False}}) if count == 0: return False else: # print 'have user', count for user in poi_db.find({'level': start_level, 'protected': False, 'following_scrape_flag': {'$exists': False}}, ['id_str']).limit(min(200, count)): # print 'a new user' next_cursor = -1 params = {'user_id': user['id_str'], 'count': 5000, 'stringify_ids':True} # followee getting while next_cursor != 0: params['cursor'] = next_cursor followees = get_followings(params) if followees: followee_ids = followees['ids'] list_size = len(followee_ids) length = int(math.ceil(list_size/100.0)) # print length print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Process followings', list_size, 'for user', user['id_str'] for index in xrange(length): index_begin = index*100 index_end = min(list_size, index_begin+100) profiles = lookup.get_users_info(followee_ids[index_begin:index_end]) print 'user prof:', index_begin, index_end, len(profiles) # if profiles: for profile in profiles: check_flag = profiles_check.check_user(profile, check) if check_flag: profile['following_prelevel_node'] = user['id_str'] profile['level'] = start_level+1 try: poi_db.insert(profile) except pymongo.errors.DuplicateKeyError: pass try: net_db.insert({'user': int(profile['id_str']), 'follower': int(user['id_str']), 'scraped_at': datetime.datetime.now()}) except pymongo.errors.DuplicateKeyError: pass # prepare for next iterator next_cursor = followees['next_cursor'] else: break poi_db.update_one({'id': int(user['id_str'])}, {'$set':{"following_scrape_flag": True }}, upsert=False) return True
def lookup_user_list(user_list, poi_db, level, check="N"): infos = get_users_info(user_list) if infos: for profile in infos: check_flag = profiles_check.check_user(profile, check) if check_flag: profile["level"] = level try: poi_db.insert(profile) user_list.remove(profile["id"]) except pymongo.errors.DuplicateKeyError: print "Existing user:"******"id_str"] user_list.remove(profile["id"]) pass else: user_list.remove(profile["id"]) print "Protected account", profile["screen_name"] print "Deleted accounts", user_list
def trans_seed_to_poi(seed_list, poi_db, check="N"): infos = get_users_info(seed_list) if infos: for profile in infos: check_flag = profiles_check.check_user(profile, check) if check_flag: profile["level"] = 1 try: poi_db.insert(profile) seed_list.remove(profile["id"]) except pymongo.errors.DuplicateKeyError: print "Existing user:"******"id_str"] seed_list.remove(profile["id"]) poi_db.update_one({"id": int(profile["id_str"])}, {"$set": {"level": 1}}, upsert=False) else: seed_list.remove(profile["id"]) print "Protected account", profile["screen_name"] print "Deleted accounts", seed_list
def lookup_user_list(user_list, poi_db, level, check='N'): infos = get_users_info(user_list) if infos: for profile in infos: check_flag = profiles_check.check_user(profile, check) if check_flag: profile['level'] = level try: poi_db.insert(profile) user_list.remove(profile['id']) except pymongo.errors.DuplicateKeyError: print 'Existing user:'******'id_str'] user_list.remove(profile['id']) pass else: user_list.remove(profile['id']) print 'Protected account', profile['screen_name'] print 'Deleted accounts', user_list
def get_tweet_retweeters(tweet_id, poi_db, check='N'): next_cursor = -1 params = {'id': tweet_id, 'stringify_ids': True} # followee getting while next_cursor != 0: params['cursor'] = next_cursor retweeters = get_retweeters(params) if retweeters: retweeter_ids = retweeters['ids'] print 'Retweeters size', len(retweeter_ids) profiles = lookup.get_users_info(retweeter_ids) # if profiles: for profile in profiles: check_flag = profiles_check.check_user(profile, check) if check_flag: try: poi_db.insert(profile) except pymongo.errors.DuplicateKeyError: pass # prepare for next iterator next_cursor = retweeters['next_cursor']
def trans_seed_to_poi(seed_list, poi_db, check='N'): infos = get_users_info(seed_list) if infos: for profile in infos: check_flag = profiles_check.check_user(profile, check) if check_flag: profile['level'] = 1 try: poi_db.insert(profile) seed_list.remove(profile['id']) except pymongo.errors.DuplicateKeyError: print 'Existing user:'******'id_str'] seed_list.remove(profile['id']) poi_db.update_one({'id': int(profile['id_str'])}, {'$set': { "level": 1 }}, upsert=False) else: seed_list.remove(profile['id']) print 'Protected account', profile['screen_name'] print 'Deleted accounts', seed_list
def snowball_follower(poi_db, net_db, level, check='N'): #Processing max 200 users each time. start_level = level while True: count = poi_db.find_one({ 'level': start_level, 'protected': False, 'follower_scrape_flag': { '$exists': False } }) if count is None: return False else: for user in poi_db.find( { 'level': start_level, 'protected': False, 'follower_scrape_flag': { '$exists': False } }, ['id_str']).limit(200): next_cursor = -1 params = { 'user_id': user['id_str'], 'count': 5000, 'stringify_ids': True } # follower getting while next_cursor != 0: params['cursor'] = next_cursor followers = get_followers(params) if followers: follower_ids = followers['ids'] list_size = len(follower_ids) length = int(math.ceil(list_size / 100.0)) # print length print datetime.datetime.now().strftime( "%Y-%m-%d-%H-%M-%S" ), 'Process followers', list_size, 'for user', user[ 'id_str'] for index in xrange(length): index_begin = index * 100 index_end = min(list_size, index_begin + 100) profiles = lookup.get_users_info( follower_ids[index_begin:index_end]) if profiles: print 'user prof:', index_begin, index_end, len( profiles) for profile in profiles: check_flag = profiles_check.check_user( profile, check) if check_flag: profile[ 'follower_prelevel_node'] = user[ 'id_str'] profile['level'] = start_level + 1 try: poi_db.insert(profile) except pymongo.errors.DuplicateKeyError: pass try: net_db.insert({ 'user': int(user['id_str']), 'follower': int(profile['id_str']), 'scraped_at': datetime.datetime.now(). strftime( '%a %b %d %H:%M:%S +0000 %Y' ) }) except pymongo.errors.DuplicateKeyError: pass # prepare for next iterator next_cursor = followers['next_cursor'] else: break poi_db.update_one({'id': int(user['id_str'])}, {'$set': { "follower_scrape_flag": True }}, upsert=False) continue
def snowball_following_proportion(poi_db, net_db, level, check='N', proportation=0.1): #Processing max 200 users each time., only retrieve 10% followings start_level = level while True: count = poi_db.find_one({ 'level': start_level, 'protected': False, 'following_scrape_flag': { '$exists': False } }) if count is None: return False else: # print 'have user', count for user in poi_db.find( { 'level': start_level, 'protected': False, 'following_scrape_flag': { '$exists': False } }, ['id_str', 'friends_count']).limit(200): # print 'a new user' following_limit = int(user['friends_count'] * proportation) next_cursor = -1 params = {'user_id': user['id_str'], 'stringify_ids': True} # followee getting while next_cursor != 0 and following_limit > 0: params['cursor'] = next_cursor print user['id_str'], ' following limit ', following_limit params['count'] = min(following_limit, 5000) followees = get_followings(params) if followees: followee_ids = followees['ids'] list_size = len(followee_ids) following_limit -= list_size length = int(math.ceil(list_size / 100.0)) # print length print datetime.datetime.now().strftime( "%Y-%m-%d-%H-%M-%S" ), 'Process followings', list_size, 'for user', user[ 'id_str'] for index in xrange(length): index_begin = index * 100 index_end = min(list_size, index_begin + 100) profiles = lookup.get_users_info( followee_ids[index_begin:index_end]) if profiles: print 'user prof:', index_begin, index_end, len( profiles) for profile in profiles: check_flag = profiles_check.check_user( profile, check) if check_flag: profile[ 'following_prelevel_node'] = user[ 'id_str'] profile['level'] = start_level + 1 probablity = float( user['friends_count'] + 1) / ( profile['followers_count'] + 1) # probablity = 1.0/profile['followers_count'] randomv = random.uniform(0, 1) if randomv <= probablity: try: poi_db.insert(profile) except pymongo.errors.DuplicateKeyError: pass try: net_db.insert({ 'user': int(profile['id_str']), 'follower': int(user['id_str']), 'scraped_at': datetime.datetime.now() }) except pymongo.errors.DuplicateKeyError: pass # prepare for next iterator next_cursor = followees['next_cursor'] else: break poi_db.update_one({'id': int(user['id_str'])}, {'$set': { "following_scrape_flag": True }}, upsert=False) return True