def populate_friends_from_collection(api, seed_collection, friend_collection, edge_collection=None,
    user_sample=1.0, friends_threshold=20000, update_threshold=None, requery=True,
    print_progress_every=1000):
    """
    Populates given 'friend_collection' with local user documents representing the friends
    of each user in given 'seed_collection'.

    Note: populated documents are NOT fully hydrated twitter user objects, just IDs with
    status fields.

    'friend_ids' field of seed_collection user docs will also be updated with IDs of friends
    fetched from twitter API (merged, de-duplicated, with any IDs already stored).

    Parameters:
        api                  - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection      - fully authenticated (read/write) mongo collection
        friend_collection    - fully authenticated (read/write) mongo collection
        edge_collection      - [OPTIONAL] collection to store simple edge: {to: ID, from: ID}
        user_sample          - proportion of seed users to fetch friends for
        requery              - If False, only query for user's friends if 'friend_ids' field
                               is empty (or missing)
        friends_threshold    - If user has > friends_threshold, DO NOT QUERY for friends. If
                               user's doc does not contain 'friends_count', ignore and query
                               anyway
        update_threshold     - Datetime threshold on users to update. Only queries friends of
                               users with 'friends_updated' field LT 'update_threshold'
        print_progress_every - Print a progress line every this-many users considered

    Returns nothing. Users whose friend request returned no ID list are only counted, and
    the count is logged at the end.
    """
    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(friend_collection)
    if edge_collection:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "friends_updated", update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Friends: Considering total {0} users".format(user_count))

    # Iterate over users, get friends, save user and friends
    friend_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress. Single-argument parenthesised print gives the same output
        # under the Python 2 print statement and the Python 3 print function.
        if user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # '== True' is kept on purpose so exactly the same stored values match as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        elif user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # Existing friend IDs; a doc without the field is treated as having none
        # instead of raising KeyError.
        known_friend_ids = user.get("friend_ids") or []

        # Check requery. If false, and user has friend_ids, skip user
        if not requery and known_friend_ids:
            logger.debug("User {0} has friends, not re-querying".format(user["id"]))
            continue

        # Check friends count for threshold
        if "friends_count" in user and user["friends_count"] > friends_threshold:
            logger.info("User {0} has friends {1} above threshold {2}, skipping".format(
                user["id"], user["friends_count"], friends_threshold))
            continue

        r, friend_ids = get_friends_ids(api, user["id"])

        # NOTE(review): a truthy result is treated as "user unreachable"; the helper
        # presumably also records that state on the user doc - confirm in its definition.
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue
        if friend_ids is None:
            friend_request_failed_for.append(user["id"])
            continue

        # Initialize (if necessary) and set user's friend_ids list (de-duplicated union)
        user["friend_ids"] = list(set(known_friend_ids).union(friend_ids))

        # Save all friends as userdocs in friends collection
        _save_userdocs(friend_ids, friend_collection)

        # Optionally save "edge" documents
        if edge_collection:
            _save_friend_edges(user["id"], friend_ids, edge_collection)

        # Update user doc's timestamps and save. One clock reading is used so both
        # fields carry the same instant.
        now = datetime.now()
        user["updated_timestamp"] = now
        user["friends_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find friends for {0} users".format(len(friend_request_failed_for)))
def populate_followers_from_collection(api, seed_collection, follower_collection,
    edge_collection=None, user_sample=1.0, followers_threshold=20000, update_threshold=None,
    requery=True, print_progress_every=1000):
    """
    Populates given 'follower_collection' with local user documents for the followers of
    each user in given 'seed_collection'. Follower counterpart of
    'populate_friends_from_collection'.

    'follower_ids' field of seed_collection user docs will also be updated with the IDs
    fetched (merged, de-duplicated, with any IDs already stored).

    Parameters:
        api                  - api object handed to get_followers_ids
        seed_collection      - mongo collection of seed user docs (read and written)
        follower_collection  - mongo collection receiving follower user docs
        edge_collection      - [OPTIONAL] collection to store follower edge documents
        user_sample          - proportion of seed users to fetch followers for
        followers_threshold  - If user has > followers_threshold, DO NOT QUERY for followers.
                               If user's doc does not contain 'followers_count', query anyway
        update_threshold     - Datetime threshold passed to the user sampler together with
                               the 'followers_updated' field name
        requery              - If False, only query for user's followers if 'follower_ids'
                               field is empty (or missing)
        print_progress_every - Print a progress line every this-many users considered

    Returns nothing. Users whose follower request returned no ID list are only counted, and
    the count is logged at the end.
    """
    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(follower_collection)
    if edge_collection:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "followers_updated",
                             update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Considering total {0} users".format(user_count))

    # Iterate over users, get followers, save user and followers
    follower_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress. Single-argument parenthesised print gives the same output
        # under the Python 2 print statement and the Python 3 print function.
        if user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # '== True' is kept on purpose so exactly the same stored values match as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        elif user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # Existing follower IDs; a doc without the field is treated as having none
        # instead of raising KeyError.
        known_follower_ids = user.get("follower_ids") or []

        # Check requery. If false, and user has follower_ids, skip user
        if not requery and known_follower_ids:
            logger.debug("User {0} has followers, not re-querying".format(user["id"]))
            continue

        # Check followers count for threshold
        if "followers_count" in user and user["followers_count"] > followers_threshold:
            logger.info("User {0} has followers {1} above threshold {2}, skipping".format(
                user["id"], user["followers_count"], followers_threshold))
            continue

        r, follower_ids = get_followers_ids(api, user["id"])

        # NOTE(review): a truthy result is treated as "user unreachable"; the helper
        # presumably also records that state on the user doc - confirm in its definition.
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue
        if follower_ids is None:
            follower_request_failed_for.append(user["id"])
            continue

        # Initialize (if necessary) and set user's follower_ids list (de-duplicated union)
        user["follower_ids"] = list(set(known_follower_ids).union(follower_ids))

        # Save all followers as userdocs in followers collection
        _save_userdocs(follower_ids, follower_collection)

        # Optionally save "edge" documents
        if edge_collection:
            _save_follower_edges(user["id"], follower_ids, edge_collection)

        # Update user doc's timestamps and save. One clock reading is used so both
        # fields carry the same instant.
        now = datetime.now()
        user["updated_timestamp"] = now
        user["followers_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find followers for {0} users".format(
        len(follower_request_failed_for)))
# Set up DB connection (authenticate only when both credentials were supplied)
client = MongoClient(args.server, args.port)
database = client[args.database]
if args.user and args.password:
    database.authenticate(args.user, args.password)
collection = database[args.collection]

# Get user list: one ID per line, surrounding whitespace stripped, duplicates removed.
# Blank lines are skipped so that no userdoc is created for an empty-string ID.
unique_ids = set()
with open(args.users_file, "r") as handle:
    for line in handle:
        stripped = line.strip()
        if stripped:
            unique_ids.add(stripped)
user_ids = list(unique_ids)
# Single-argument parenthesised print gives the same output under the Python 2
# print statement and the Python 3 print function.
print("Uploading {0} IDs from {1}".format(len(user_ids), args.users_file))

# Ensure indexes on user collection
print("Ensuring collection indexes")
ensure_userdoc_indexes(collection)

# Create and save userdocs for all userids
for uid in user_ids:
    print(".. Processing user {0}".format(uid))
    userdoc = create_userdoc(uid)
    try:
        collection.save(userdoc)
    except DuplicateKeyError:
        # A duplicate key means this user is already stored; report it and move on.
        print(".... Userdoc for user {0} already in DB. Skipping".format(uid))
print("Complete")
# Set up DB connection (authenticate only when both credentials were supplied)
client = MongoClient(args.server, args.port)
database = client[args.database]
if args.user and args.password:
    database.authenticate(args.user, args.password)
collection = database[args.collection]

# Get user list: one ID per line, surrounding whitespace stripped, duplicates removed.
# Blank lines are skipped so that no userdoc is created for an empty-string ID.
unique_ids = set()
with open(args.users_file, "r") as handle:
    for line in handle:
        stripped = line.strip()
        if stripped:
            unique_ids.add(stripped)
user_ids = list(unique_ids)
# Single-argument parenthesised print gives the same output under the Python 2
# print statement and the Python 3 print function.
print("Uploading {0} IDs from {1}".format(len(user_ids), args.users_file))

# Ensure indexes on user collection
print("Ensuring collection indexes")
ensure_userdoc_indexes(collection)

# Create and save userdocs for all userids
for uid in user_ids:
    print(".. Processing user {0}".format(uid))
    userdoc = create_userdoc(uid)
    try:
        collection.save(userdoc)
    except DuplicateKeyError:
        # A duplicate key means this user is already stored; report it and move on.
        print(".... Userdoc for user {0} already in DB. Skipping".format(uid))
print("Complete")