Пример #1
0
def populate_friends_from_collection(api,
                                     seed_collection,
                                     friend_collection,
                                     edge_collection=None,
                                     user_sample=1.0,
                                     friends_threshold=20000,
                                     update_threshold=None,
                                     requery=True,
                                     print_progress_every=1000):
    """
    Populates given 'friend_collection' with local user documents representing the
    friends of each user in given 'seed_collection'.
    Note: populated documents are NOT fully hydrated twitter user objects, just IDs
    with status fields.
    'friend_ids' field of seed_collection user docs will also be updated with IDs of
    friends fetched from twitter API (merged with any IDs already stored).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection    - fully authenticated (read/write) mongo collection
        friend_collection  - fully authenticated (read/write) mongo collection
        edge_collection    - [OPTIONAL] collection to store simple edge: {to: ID, from: ID}
        user_sample        - proportion of seed users to fetch friends for
        requery            - If False, only query for user's friends if 'friend_ids' field
                             is missing or empty
        friends_threshold  - If user has > friends_threshold, DO NOT QUERY for friends. If
                             user's doc does not contain 'friends_count', ignore and query
                             anyway
        update_threshold   - Datetime threshold on users to update. Only queries friends of
                             users with 'friends_updated' field LT 'update_threshold'
        print_progress_every - Print a progress line every N users considered. A falsy
                             value (0 / None) disables progress printing
    Returns nothing; the number of users whose friend request failed is logged.
    """
    # None is the "no edge collection" sentinel. Compare against it explicitly
    # rather than truth-testing the collection object.
    # NOTE(review): pymongo Collection objects are not meant to be truth-tested
    # (some releases raise on bool(collection)) - confirm against installed version.
    store_edges = edge_collection is not None

    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(friend_collection)
    if store_edges:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "friends_updated",
                             update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Friends: Considering total {0} users".format(user_count))

    # Iterate over users, get friends, save user and friends
    friend_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress (guarded so print_progress_every=0 cannot divide by zero)
        if print_progress_every and user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # Kept as an explicit comparison with True (not plain truthiness) so the
        # flags are interpreted exactly as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        if user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # A seed doc may not carry 'friend_ids' yet; treat missing/None as empty
        # instead of raising KeyError.
        known_friend_ids = user.get("friend_ids") or []

        # Check requery. If false, and user has friend_ids, skip user
        if not requery and known_friend_ids:
            logger.debug("User {0} has friends, not re-querying".format(
                user["id"]))
            continue

        # Check friends count for threshold (only when the doc has the field)
        if "friends_count" in user and user[
                "friends_count"] > friends_threshold:
            logger.info(
                "User {0} has friends {1} above threshold {2}, skipping".
                format(user["id"], user["friends_count"], friends_threshold))
            continue

        r, friend_ids = get_friends_ids(api, user["id"])
        # A truthy result means the user could not be reached; the helper is
        # also handed the doc and collection (presumably to record that status).
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue

        # None (as opposed to an empty list) signals a failed request
        if friend_ids is None:
            friend_request_failed_for.append(user["id"])
            continue

        # Merge newly fetched IDs with those already stored, without duplicates
        user["friend_ids"] = list(set(known_friend_ids).union(friend_ids))

        # Save all friends as userdocs in friends collection
        _save_userdocs(friend_ids, friend_collection)

        # Optionally save "edge" documents
        if store_edges:
            _save_friend_edges(user["id"], friend_ids, edge_collection)

        # Update user doc's timestamps (one shared instant) and save
        now = datetime.now()
        user["updated_timestamp"] = now
        user["friends_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find friends for {0} users".format(
        len(friend_request_failed_for)))
Пример #2
0
def populate_followers_from_collection(api,
                                       seed_collection,
                                       follower_collection,
                                       edge_collection=None,
                                       user_sample=1.0,
                                       followers_threshold=20000,
                                       update_threshold=None,
                                       requery=True,
                                       print_progress_every=1000):
    """
    Populates given 'follower_collection' with local user documents representing the
    followers of each user in given 'seed_collection'.
    See 'populate_friends_from_collection'. Exactly the same, but for followers.
    'follower_ids' field of seed_collection user docs will also be updated with IDs of
    followers fetched from twitter API (merged with any IDs already stored).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection      - fully authenticated (read/write) mongo collection
        follower_collection  - fully authenticated (read/write) mongo collection
        edge_collection      - [OPTIONAL] collection to store follower edge documents
        user_sample          - proportion of seed users to fetch followers for
        requery              - If False, only query for user's followers if 'follower_ids'
                               field is missing or empty
        followers_threshold  - If user has > followers_threshold, DO NOT QUERY for
                               followers. If user's doc does not contain
                               'followers_count', ignore and query anyway
        update_threshold     - Datetime threshold passed to the user sampler together with
                               the 'followers_updated' field name
        print_progress_every - Print a progress line every N users considered. A falsy
                               value (0 / None) disables progress printing
    Returns nothing; the number of users whose follower request failed is logged.
    """
    # None is the "no edge collection" sentinel. Compare against it explicitly
    # rather than truth-testing the collection object.
    # NOTE(review): pymongo Collection objects are not meant to be truth-tested
    # (some releases raise on bool(collection)) - confirm against installed version.
    store_edges = edge_collection is not None

    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(follower_collection)
    if store_edges:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "followers_updated",
                             update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Considering total {0} users".format(user_count))

    # Iterate over users, get followers, save user and followers
    follower_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress (guarded so print_progress_every=0 cannot divide by zero)
        if print_progress_every and user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # Kept as an explicit comparison with True (not plain truthiness) so the
        # flags are interpreted exactly as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        if user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # A seed doc may not carry 'follower_ids' yet; treat missing/None as
        # empty instead of raising KeyError.
        known_follower_ids = user.get("follower_ids") or []

        # Check requery. If false, and user has follower_ids, skip user
        if not requery and known_follower_ids:
            logger.debug("User {0} has followers, not re-querying".format(
                user["id"]))
            continue

        # Check followers count for threshold (only when the doc has the field)
        if "followers_count" in user and user[
                "followers_count"] > followers_threshold:
            logger.info(
                "User {0} has followers {1} above threshold {2}, skipping".
                format(user["id"], user["followers_count"],
                       followers_threshold))
            continue

        r, follower_ids = get_followers_ids(api, user["id"])
        # A truthy result means the user could not be reached; the helper is
        # also handed the doc and collection (presumably to record that status).
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue

        # None (as opposed to an empty list) signals a failed request
        if follower_ids is None:
            follower_request_failed_for.append(user["id"])
            continue

        # Merge newly fetched IDs with those already stored, without duplicates
        user["follower_ids"] = list(
            set(known_follower_ids).union(follower_ids))

        # Save all followers as userdocs in followers collection
        _save_userdocs(follower_ids, follower_collection)

        # Optionally save "edge" documents
        if store_edges:
            _save_follower_edges(user["id"], follower_ids, edge_collection)

        # Update user doc's timestamps (one shared instant) and save
        now = datetime.now()
        user["updated_timestamp"] = now
        user["followers_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find followers for {0} users".format(
        len(follower_request_failed_for)))
Пример #3
0
    # Set up DB connection; authenticate only when both credentials are given
    client = MongoClient(args.server, args.port)
    database = client[args.database]
    if args.user and args.password:
        database.authenticate(args.user, args.password)
    collection = database[args.collection]

    # Get user list: one ID per line. Blank lines are skipped so that no
    # userdoc is ever created for an empty ID; the set removes duplicates.
    with open(args.users_file, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        user_ids = list(set(uid for uid in stripped_lines if uid))
    print("Uploading {0} IDs from {1}".format(len(user_ids), args.users_file))

    # Ensure indexes on user collection
    print("Ensuring collection indexes")
    ensure_userdoc_indexes(collection)

    # Create and save userdocs for all userids
    for uid in user_ids:
        print(".. Processing user {0}".format(uid))
        userdoc = create_userdoc(uid)
        try:
            collection.save(userdoc)
        except DuplicateKeyError:
            # Best effort: an already-stored user is reported and skipped
            print(".... Userdoc for user {0} already in DB. Skipping".format(uid))

    print("Complete")
Пример #4
0
def populate_friends_from_collection(api,
                                     seed_collection,
                                     friend_collection,
                                     edge_collection=None,
                                     user_sample=1.0,
                                     friends_threshold=20000,
                                     update_threshold=None,
                                     requery=True,
                                     print_progress_every=1000):
    """
    Populates given 'friend_collection' with local user documents representing the
    friends of each user in given 'seed_collection'.
    Note: populated documents are NOT fully hydrated twitter user objects, just IDs
    with status fields.
    'friend_ids' field of seed_collection user docs will also be updated with IDs of
    friends fetched from twitter API (merged with any IDs already stored).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection    - fully authenticated (read/write) mongo collection
        friend_collection  - fully authenticated (read/write) mongo collection
        edge_collection    - [OPTIONAL] collection to store simple edge: {to: ID, from: ID}
        user_sample        - proportion of seed users to fetch friends for
        requery            - If False, only query for user's friends if 'friend_ids' field
                             is missing or empty
        friends_threshold  - If user has > friends_threshold, DO NOT QUERY for friends. If
                             user's doc does not contain 'friends_count', ignore and query
                             anyway
        update_threshold   - Datetime threshold on users to update. Only queries friends of
                             users with 'friends_updated' field LT 'update_threshold'
        print_progress_every - Print a progress line every N users considered. A falsy
                             value (0 / None) disables progress printing
    Returns nothing; the number of users whose friend request failed is logged.
    """
    # None is the "no edge collection" sentinel. Compare against it explicitly
    # rather than truth-testing the collection object.
    # NOTE(review): pymongo Collection objects are not meant to be truth-tested
    # (some releases raise on bool(collection)) - confirm against installed version.
    store_edges = edge_collection is not None

    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(friend_collection)
    if store_edges:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "friends_updated",
                             update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Friends: Considering total {0} users".format(user_count))

    # Iterate over users, get friends, save user and friends
    friend_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress (guarded so print_progress_every=0 cannot divide by zero)
        if print_progress_every and user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # Kept as an explicit comparison with True (not plain truthiness) so the
        # flags are interpreted exactly as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        if user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # A seed doc may not carry 'friend_ids' yet; treat missing/None as empty
        # instead of raising KeyError.
        known_friend_ids = user.get("friend_ids") or []

        # Check requery. If false, and user has friend_ids, skip user
        if not requery and known_friend_ids:
            logger.debug("User {0} has friends, not re-querying".format(
                user["id"]))
            continue

        # Check friends count for threshold (only when the doc has the field)
        if "friends_count" in user and user[
                "friends_count"] > friends_threshold:
            logger.info(
                "User {0} has friends {1} above threshold {2}, skipping".
                format(user["id"], user["friends_count"], friends_threshold))
            continue

        r, friend_ids = get_friends_ids(api, user["id"])
        # A truthy result means the user could not be reached; the helper is
        # also handed the doc and collection (presumably to record that status).
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue

        # None (as opposed to an empty list) signals a failed request
        if friend_ids is None:
            friend_request_failed_for.append(user["id"])
            continue

        # Merge newly fetched IDs with those already stored, without duplicates
        user["friend_ids"] = list(set(known_friend_ids).union(friend_ids))

        # Save all friends as userdocs in friends collection
        _save_userdocs(friend_ids, friend_collection)

        # Optionally save "edge" documents
        if store_edges:
            _save_friend_edges(user["id"], friend_ids, edge_collection)

        # Update user doc's timestamps (one shared instant) and save
        now = datetime.now()
        user["updated_timestamp"] = now
        user["friends_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find friends for {0} users".format(
        len(friend_request_failed_for)))
Пример #5
0
def populate_followers_from_collection(api,
                                       seed_collection,
                                       follower_collection,
                                       edge_collection=None,
                                       user_sample=1.0,
                                       followers_threshold=20000,
                                       update_threshold=None,
                                       requery=True,
                                       print_progress_every=1000):
    """
    Populates given 'follower_collection' with local user documents representing the
    followers of each user in given 'seed_collection'.
    See 'populate_friends_from_collection'. Exactly the same, but for followers.
    'follower_ids' field of seed_collection user docs will also be updated with IDs of
    followers fetched from twitter API (merged with any IDs already stored).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection      - fully authenticated (read/write) mongo collection
        follower_collection  - fully authenticated (read/write) mongo collection
        edge_collection      - [OPTIONAL] collection to store follower edge documents
        user_sample          - proportion of seed users to fetch followers for
        requery              - If False, only query for user's followers if 'follower_ids'
                               field is missing or empty
        followers_threshold  - If user has > followers_threshold, DO NOT QUERY for
                               followers. If user's doc does not contain
                               'followers_count', ignore and query anyway
        update_threshold     - Datetime threshold passed to the user sampler together with
                               the 'followers_updated' field name
        print_progress_every - Print a progress line every N users considered. A falsy
                               value (0 / None) disables progress printing
    Returns nothing; the number of users whose follower request failed is logged.
    """
    # None is the "no edge collection" sentinel. Compare against it explicitly
    # rather than truth-testing the collection object.
    # NOTE(review): pymongo Collection objects are not meant to be truth-tested
    # (some releases raise on bool(collection)) - confirm against installed version.
    store_edges = edge_collection is not None

    # Ensure indexes
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(follower_collection)
    if store_edges:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "followers_updated",
                             update_threshold)

    # Progress vars
    user_count = users.count(with_limit_and_skip=True)
    logger.info("Considering total {0} users".format(user_count))

    # Iterate over users, get followers, save user and followers
    follower_request_failed_for = []
    for user_it, user in enumerate(users, 1):
        # Print progress (guarded so print_progress_every=0 cannot divide by zero)
        if print_progress_every and user_it % print_progress_every == 0:
            print(".. Processing user {0} of {1}".format(user_it, user_count))

        # Check user private/deleted fields - don't requery if unreachable.
        # Kept as an explicit comparison with True (not plain truthiness) so the
        # flags are interpreted exactly as before.
        if user.get("deleted") == True:
            logger.info("User {0} deleted, skipping".format(user["id"]))
            continue
        if user.get("private") == True:
            logger.info("User {0} private, skipping".format(user["id"]))
            continue

        # A seed doc may not carry 'follower_ids' yet; treat missing/None as
        # empty instead of raising KeyError.
        known_follower_ids = user.get("follower_ids") or []

        # Check requery. If false, and user has follower_ids, skip user
        if not requery and known_follower_ids:
            logger.debug("User {0} has followers, not re-querying".format(
                user["id"]))
            continue

        # Check followers count for threshold (only when the doc has the field)
        if "followers_count" in user and user[
                "followers_count"] > followers_threshold:
            logger.info(
                "User {0} has followers {1} above threshold {2}, skipping".
                format(user["id"], user["followers_count"],
                       followers_threshold))
            continue

        r, follower_ids = get_followers_ids(api, user["id"])
        # A truthy result means the user could not be reached; the helper is
        # also handed the doc and collection (presumably to record that status).
        if _check_return_set_user(r, user, seed_collection):
            logger.info("User {0} unreachable, skipping".format(user["id"]))
            continue

        # None (as opposed to an empty list) signals a failed request
        if follower_ids is None:
            follower_request_failed_for.append(user["id"])
            continue

        # Merge newly fetched IDs with those already stored, without duplicates
        user["follower_ids"] = list(
            set(known_follower_ids).union(follower_ids))

        # Save all followers as userdocs in followers collection
        _save_userdocs(follower_ids, follower_collection)

        # Optionally save "edge" documents
        if store_edges:
            _save_follower_edges(user["id"], follower_ids, edge_collection)

        # Update user doc's timestamps (one shared instant) and save
        now = datetime.now()
        user["updated_timestamp"] = now
        user["followers_updated"] = now
        seed_collection.save(user)

    # Print failure numbers
    logger.info("Failed to find followers for {0} users".format(
        len(follower_request_failed_for)))
Пример #6
0
    # Set up DB connection; authenticate only when both credentials are given
    client = MongoClient(args.server, args.port)
    database = client[args.database]
    if args.user and args.password:
        database.authenticate(args.user, args.password)
    collection = database[args.collection]

    # Get user list: one ID per line. Blank lines are skipped so that no
    # userdoc is ever created for an empty ID; the set removes duplicates.
    with open(args.users_file, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        user_ids = list(set(uid for uid in stripped_lines if uid))
    print("Uploading {0} IDs from {1}".format(len(user_ids), args.users_file))

    # Ensure indexes on user collection
    print("Ensuring collection indexes")
    ensure_userdoc_indexes(collection)

    # Create and save userdocs for all userids
    for uid in user_ids:
        print(".. Processing user {0}".format(uid))
        userdoc = create_userdoc(uid)
        try:
            collection.save(userdoc)
        except DuplicateKeyError:
            # Best effort: an already-stored user is reported and skipped
            print(".... Userdoc for user {0} already in DB. Skipping".format(
                uid))

    print("Complete")