def create_documents(cur, conn, tl_start_date, tl_end_date):
    """ This function creates a 'document' for TF-IDF calculations.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param tl_start_date: datetime object to indicate beginning of time line
    :param tl_end_date: datetime object to indicate the end of a time line
    """
    # Select relevant users that do not already have timeline-documents created
    cur.execute(cur.mogrify("SELECT DISTINCT (tweets.user_id) FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE users.timeline_document IS NULL AND users.has_timeline = TRUE AND (users.expand_user IS NULL OR users.expand_user = TRUE) AND (tweets.created_at >= %s AND tweets.created_at <= %s);", (tl_start_date, tl_end_date)))
    uids = cur.fetchall()
    print "\nCreate documents for {} users".format(len(uids))
    # Create the timeline-documents
    for u in range(len(uids)):
        print "\tCreate document for user {}: {} out of {}".format(uids[u][0], u, len(uids))
        timeline_document = []
        # Grab relevant tweets
        cur.execute(cur.mogrify("SELECT tweet FROM tweets WHERE user_id = %s AND (created_at <= %s AND created_at >= %s);", (uids[u][0], tl_end_date, tl_start_date)))
        for twt in cur:
            timeline_document.extend(twt[0].split(' '))
        # Process each word in timeline: convert to lower case, remove punctuation, remove English stop-words
        timeline_document = process_word_list(timeline_document)
        # Add timeline_document to table
        json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET timeline_document = %s WHERE user_id = %s;", (timeline_document, uids[u][0])))
        if len(timeline_document) < 1:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET timeline_is_relevant = FALSE WHERE user_id = {};".format(uids[u][0]))
def find_top_scores_to_expand(cur, conn, user_id_list, threshold_percentile, tf_type):
    """ This function returns the top users from user_id_list.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_id_list: List of Twitter user IDs
    :param threshold_percentile: Fraction of top scores to expand, value in the range [0, 1]
    :param tf_type: Term-frequency calculation type: 'raw', 'boolean', or 'augmented'
    """
    assert (0 <= threshold_percentile <= 1), "The parameter 'threshold_percentile' must fall in the range [0, 1]."
    # Compute the CANDID scores
    scores = []
    print "Compute TF-DF scores"
    for f in user_id_list:
        scores.append(compute_tfdf_score(cur=cur, user_id=f, tf_type=tf_type) / float(len(user_id_list)))
    # Choose top % of friends to expand
    threshold_value = np.percentile(scores, threshold_percentile * 100)
    for i in range(len(scores)):
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET decision_candid_tfdf_score = {} WHERE user_id = {};".format(scores[i], user_id_list[i]))
        if scores[i] >= threshold_value:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE user_id = {} AND expand_user IS NULL;".format(user_id_list[i]))
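
# Usage sketch (illustrative only, not part of the collection pipeline): how
# find_top_scores_to_expand might be invoked after a hop of timelines has been
# collected. The user IDs below are hypothetical placeholders; the percentile and
# TF type mirror the defaults used by depth_first_causal_search.
def _example_expand_top_scores(cur, conn):
    # Score a hypothetical list of friends/followers and flag scores at or above
    # the requested percentile for expansion.
    hypothetical_friends = [123456789, 987654321, 555555555]
    find_top_scores_to_expand(cur=cur, conn=conn,
                              user_id_list=hypothetical_friends,
                              threshold_percentile=0.05,
                              tf_type='raw')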
def _find_relevant_users(cur, conn, user_ids):
    """ This function flags users whose tweets match at least one topic hashtag, and expands them.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_ids: List of Twitter user IDs
    :return: Set of Twitter user IDs expanded by the hashtag rule
    """
    expand_count = 0
    if isinstance(user_ids, set):
        user_ids = list(user_ids)
    users_expanded_by_rule = set([])
    # Do not expand users who have more than 1000 friends+followers
    json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")
    # Get topic query vector
    topic_queries = []
    cur.execute("SELECT topic FROM topics;")
    for t in cur:
        topic_queries.append("'%{}%' ~* ARRAY_TO_STRING(hashtag_entities, ',', '*')".format(t[0]))
    # Get users/tweets, and expand users with relevant tweets
    for uu in range(len(user_ids)):
        print "\tLabel user {} relevant: expansion {} of {}.".format(user_ids[uu], uu, len(user_ids))
        idx = 0
        r = 20      # Check 20 hashtags at a time
        while (idx + r) < len(topic_queries):
            topic_query = " OR ".join(topic_queries[idx:(idx + r)])
            cur.execute("SELECT COUNT(tweet_id) FROM tweets WHERE (user_id = {}) AND ({});".format(user_ids[uu], topic_query))
            q = cur.fetchone()
            # If user has no tweets, stop checking this user
            if q is None:
                break
            if q[0] > 0:
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE, timeline_is_relevant=TRUE WHERE user_id = {};".format(user_ids[uu]))
                users_expanded_by_rule.add(user_ids[uu])
                expand_count += 1
                break
            idx += r
        # Check the remaining topics, if the user has not already been expanded
        if (user_ids[uu] not in users_expanded_by_rule) and (idx < len(topic_queries)):
            topic_query = " OR ".join(topic_queries[idx:])
            cur.execute("SELECT COUNT(tweet_id) FROM tweets WHERE (user_id = {}) AND ({});".format(user_ids[uu], topic_query))
            q = cur.fetchone()
            if (q is not None) and (q[0] > 0):
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE, timeline_is_relevant=TRUE WHERE user_id = {};".format(user_ids[uu]))
                users_expanded_by_rule.add(user_ids[uu])
                expand_count += 1
        # Set remaining timeline_is_relevant=NULL to timeline_is_relevant=FALSE
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET timeline_is_relevant=FALSE WHERE (user_id={}) AND (timeline_is_relevant IS NULL);".format(user_ids[uu]))
    print "We are expanding {} users from hashtags.".format(len(users_expanded_by_rule))
    return users_expanded_by_rule
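
# Minimal sketch of the chunked-query idea used in _find_relevant_users: topic
# predicates are joined in fixed-size windows so a single SELECT never carries
# thousands of OR clauses. The topic list and window size below are hypothetical.
def _example_chunked_topic_queries():
    topics = ['earthquake', 'relief', 'donate', 'rescue']
    predicates = ["'%{}%' ~* ARRAY_TO_STRING(hashtag_entities, ',', '*')".format(t) for t in topics]
    window = 2      # _find_relevant_users checks 20 hashtags at a time
    chunks = [" OR ".join(predicates[i:i + window]) for i in range(0, len(predicates), window)]
    return chunks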
def _add_all_hashtags(cur, conn, user_id, tl_start_date, tl_end_date, khop):
    """ Add all hashtags from a user to the topic table.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_id: Twitter user ID
    :param tl_start_date: Timeline start date, datetime object
    :param tl_end_date: Timeline end date, datetime object
    :param khop: hop count
    :return nAddedHT: Number of hashtags added to the topic table
    """
    nAddedHT = 0
    cur.execute(cur.mogrify("SELECT hashtag_entities FROM tweets WHERE (user_id = " + str(user_id) + ") AND (created_at >= %s AND created_at <= %s) AND (hashtag_entities IS NOT NULL AND hashtag_entities != '{}');", (tl_start_date, tl_end_date)))
    ht = cur.fetchall()
    for j in ht:
        for k in j[0]:
            json_to_database.make_sql_edit(cur, conn, "INSERT INTO topics (topic, khop) VALUES ('{}', {});".format(k.lower(), khop))
            nAddedHT += 1
    return nAddedHT
def _filter_by_timeline(cur, conn, tl_start_date, tl_end_date, khop):
    """ This function filters out users whose timelines contain no topic-relevant tweets.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param tl_start_date: Timeline start date, datetime object
    :param tl_end_date: Timeline end date, datetime object
    :param khop: Current hop of sampling loop
    """
    # Get topic query
    topic_query = _get_topic_query(cur)
    # Get users/tweets, and expand users with relevant tweets
    cur.execute(cur.mogrify("SELECT DISTINCT (users.user_id) FROM users INNER JOIN tweets ON users.user_id=tweets.user_id WHERE (users.has_timeline_filter IS NULL) AND (users.expand_user IS NULL) AND (users.has_timeline = TRUE) AND (users.khop = {}) AND (tweets.created_at >= %s AND tweets.created_at <= %s);".format(khop), (tl_start_date, tl_end_date)))
    uids = cur.fetchall()
    for u in range(len(uids)):
        if u % 50 == 0:
            print "\nFilter timeline for user {}: {} out of {}".format(uids[u][0], u, len(uids))
        cur.execute(cur.mogrify("SELECT COUNT(tweet_id) FROM tweets WHERE (user_id = " + str(uids[u][0]) + ") AND (created_at >= %s AND created_at <= %s)", (tl_start_date, tl_end_date)) + " AND ({});".format(topic_query))
        q = cur.fetchone()[0]
        if q < 1:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = FALSE WHERE user_id = {};".format(uids[u][0]))
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline_filter = TRUE WHERE user_id = {};".format(uids[u][0]))
def _get_timeline_wrapper(cur, conn, user_id, tl_start_date, proxies, auth):
    """ This function is a wrapper for grabbing user timelines.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_id: Twitter user ID
    :param tl_start_date: Start date of timeline, datetime object
    :param proxies: proxy dictionary, ex. {'http': 'http://%s:%s' % (HOST, PORT), 'https': 'http://%s:%s' % (HOST, PORT)}
    :param auth: Twitter application authentication, see the get_authorization method
    """
    if user_id is None:
        return
    if isinstance(user_id, basestring) and (user_id.strip() == ''):
        return
    # Has the timeline already been collected?
    try:
        cur.execute("SELECT expand_user, has_timeline FROM users WHERE user_id = {};".format(user_id))
        q = cur.fetchone()
    except psycopg2.ProgrammingError:
        print "strange programming error"
        print "user id is ", user_id
        print type(user_id)
        return
    except psycopg2.InternalError:
        conn.rollback()
        cur.execute("SELECT expand_user, has_timeline FROM users WHERE user_id = {};".format(user_id))
        q = cur.fetchone()
    if (q is None) or (q[0] is False) or (q[1] is not None):
        # User is not in database, or user shouldn't be expanded, or TL has already been collected
        return
    # Is the user's profile protected or deleted?
    cur.execute("SELECT profile_id FROM lost_profiles WHERE user_id = {};".format(user_id))
    qq = cur.fetchone()
    if qq is not None:
        return
    # Check to see if the user already has a timeline
    if (q[0] is not False) and (q[1] is None):
        print '\tGet user timeline for user ', user_id
        TL = pyTweet.collect_user_timeline(USER=user_id, USER_type='user_id', start_date=tl_start_date, proxies=proxies, auth=auth)
        # Ignore empty TL
        if TL == []:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline=FALSE, timeline_is_relevant=FALSE WHERE user_id = {};".format(user_id))
            return
        # Add date of collection to each tweet in the timeline
        for tl in range(len(TL)):
            TL[tl]['DOC'] = datetime.datetime.utcnow()
        # Update has_timeline
        tweetAdded = json_to_database.add_timeline(cur=cur, conn=conn, tldata=TL)
        if tweetAdded:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline = TRUE WHERE user_id = {};".format(user_id))
        else:
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline = FALSE, timeline_is_relevant = FALSE WHERE user_id = {};".format(user_id))
def _add_new_topics(cur, conn, tl_start_date, tl_end_date, hop):
    """ This function adds new topics to the topic table after a hop has been collected. If a tweet contains at least
    one of the topic-like words, then its non-stop-word terms are added to the topic table.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param tl_start_date: Timeline start date, datetime object
    :param tl_end_date: Timeline end date, datetime object
    :param hop: Current hop count
    """
    # Count current topics in table
    cur.execute("SELECT COUNT(*) FROM topics;")
    init_nTopics = cur.fetchone()[0]
    # Get topic query for previous loop
    topic_query = _get_topic_query(cur)
    # Get users/tweets
    cur.execute(cur.mogrify("SELECT DISTINCT (users.user_id) FROM users INNER JOIN tweets ON users.user_id=tweets.user_id WHERE (users.expand_user = TRUE) AND (users.khop = %s) AND (users.has_timeline = TRUE) AND (users.friends_count > 0 OR users.followers_count > 0 OR tweets.retweet_count > 0) AND (tweets.created_at >= %s AND tweets.created_at <= %s);", (hop, tl_start_date, tl_end_date)))
    users = cur.fetchall()      # [(1,), (2,), ...]
    print "\tAdd new topics from {} user timelines ...".format(len(users))
    for u in users:
        print "\t\tAdd topics from user: {}".format(u[0])
        cmd = "SELECT DISTINCT tweet_id FROM tweets WHERE user_id = %s AND (created_at >= %s AND created_at <= %s) AND (" + topic_query + ");"
        cur.execute(cur.mogrify(cmd, (u[0], tl_start_date, tl_end_date)))
        tweet_ids = cur.fetchall()
        print "\t\tThis user has {} relevant tweets to consider.".format(len(tweet_ids))
        for t in tweet_ids:
            # Label tweet as relevant
            json_to_database.make_sql_edit(cur, conn, "UPDATE tweets SET tweet_is_relevant = TRUE WHERE tweet_id ={};".format(t[0]))
            # Load tweet data
            cur.execute("SELECT * FROM tweets WHERE tweet_id = {};".format(t[0]))
            twt_data = cur.fetchone()
            # Add hashtags to topic table
            if twt_data['hashtag_entities'] is not None:
                new_hash = candid_tfidf.process_word_list(twt_data['hashtag_entities'])
                for h in new_hash:
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("INSERT INTO topics (topic, khop) VALUES (%s, %s);", (h, hop)))
            # Add URLs to topic table
            if twt_data['url_entities'] is not None:
                for url in twt_data['url_entities']:
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("INSERT INTO topics (topic, khop) VALUES (%s, %s);", (url, hop)))
    # Count how many topics were added
    cur.execute("SELECT COUNT(*) FROM topics;")
    nTopics = cur.fetchone()[0]
    print "\t\tAdded {} new topics to the table.".format(nTopics - init_nTopics)
def compute_df(cur, conn, tl_start_date, tl_end_date, user_set):
    """ This function computes the document frequency (DF) for each term in the topic table. Note that a 'document'
    is defined as a user's timeline between tl_start_date and tl_end_date.

    DF(t) = log_10( # of timelines containing the term t / # of timelines in the user set )

    :param cur: Cursor to database
    :param conn: Connection to database
    :param tl_start_date: datetime object to indicate beginning of time line
    :param tl_end_date: datetime object to indicate the end of a time line
    :param user_set: Subset of users to restrict calculation, list object
    """
    print "\nCompute DF for each topic."
    # Get total number of timelines in database
    a = " OR ".join(['user_id = ' + str(j) for j in user_set])
    cur.execute(cur.mogrify("SELECT COUNT (DISTINCT user_id) FROM tweets WHERE (created_at >= %s AND created_at <= %s) AND ({});".format(a), (tl_start_date, tl_end_date)))
    q = cur.fetchone()
    if (q is None) or (q[0] is None):
        print "WARNING: q or q[0] is None!"
        json_to_database.make_sql_edit(cur, conn, "UPDATE topics SET document_frequency = 0.0;")
        return
    total_timelines = float(q[0])
    print "\tThere are {} timelines for this set of friends/followers".format(total_timelines)
    # Case: No timelines
    if total_timelines < 1.0:
        json_to_database.make_sql_edit(cur, conn, "UPDATE topics SET document_frequency = 0.0;")
        return
    # Get count of timelines containing topic t, for each topic
    cur.execute("SELECT topic FROM topics;")
    topics = cur.fetchall()
    for t in topics:
        # Count the number of timelines that the topic appears in
        topic_freq = 0.0
        if 'http' in t[0]:
            cur.execute(cur.mogrify("SELECT DISTINCT user_id FROM tweets WHERE ({}) AND ((tweet ~ %s) OR (%s = ANY(url_entities))) AND (created_at >= %s AND created_at <= %s);".format(a), ('%' + t[0] + '%', '%' + t[0] + '%', tl_start_date, tl_end_date)))
        else:
            cur.execute(cur.mogrify("SELECT DISTINCT user_id FROM tweets WHERE ({}) AND ((tweet ~* %s) OR (LOWER(%s) = ANY(hashtag_entities))) AND (created_at >= %s AND created_at <= %s);".format(a), ('\m' + t[0] + '\M', t[0], tl_start_date, tl_end_date)))
        q = cur.fetchall()
        topic_freq += float(len(q))
        # Compute DF
        df = 0.0
        if topic_freq > 0:
            df = math.log(topic_freq / total_timelines, 10.0)
        json_to_database.make_sql_edit(cur, conn, "UPDATE topics SET document_frequency = {} WHERE topic = '{}';".format(df, t[0]))
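
# Toy illustration of the document-frequency convention used by compute_df:
# DF(t) = log10(#timelines containing t / #timelines), and DF = 0 when the term
# never appears. The timeline counts below are made up for the example.
def _example_df_value():
    import math
    total_timelines = 200.0
    timelines_with_term = 25.0
    if timelines_with_term > 0:
        # log10(25 / 200) = log10(0.125) ~= -0.903
        return math.log(timelines_with_term / total_timelines, 10.0)
    return 0.0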
def find_most_similar_followers(cur, conn, tl_start_date, tl_end_date, user_ids, prev_users):
    """ This function identifies the friends/followers whose timelines are most textually similar to the previous hop
    of users, and flags them for expansion.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param tl_start_date: datetime object to indicate beginning of time line
    :param tl_end_date: datetime object to indicate the end of a time line
    :param user_ids: set of user IDs
    :param prev_users: set of user IDs from the previous hop (hop k-1)
    :return: The original set of user IDs passed in
    """
    # Import the TF-IDF tools lazily so sklearn is only required for this routine
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    original_user_ids = set(user_ids)
    print "\nFind friends/followers most similar to the previous hop using a TF-IDF transformation."
    print "\tBegin with {} friends and followers for similarity test".format(len(user_ids))
    user_timeline_hash = {}     # hash table for user IDs and indexes in the TF-IDF matrix
    # Create document for khop-1 users
    user_doc = ''
    for jj in prev_users:
        cur.execute(cur.mogrify("SELECT tweets.tweet FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (users.user_id = %s) AND (users.has_timeline=TRUE) AND (users.expand_user=TRUE) AND (tweets.created_at >= %s AND tweets.created_at <= %s);", (jj, tl_start_date, tl_end_date)))
        for t in cur:
            user_doc += t[0]
    corpus = [user_doc]
    user_timeline_hash[0] = 'prev_users'
    # Create document for all hop users
    idx = 1
    jj_users = list(user_ids)
    for jj in jj_users:
        user_doc = ''
        cur.execute(cur.mogrify("SELECT tweets.tweet FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (users.user_id=%s) AND (users.has_timeline=TRUE) AND (tweets.created_at >= %s AND tweets.created_at <= %s) AND (users.expand_user IS NOT FALSE);", (jj, tl_start_date, tl_end_date)))
        for t in cur:
            user_doc += t[0]
        if user_doc.strip() != '':
            corpus.append(user_doc)
            user_timeline_hash[idx] = jj
            idx += 1
        else:
            user_ids.remove(jj)
    print "\tCompare previous hop with {} friends and followers".format(len(user_timeline_hash) - 1)
    if corpus != ['']:
        # Perform TF-IDF transformation
        tfidf_vectorizer = TfidfVectorizer(min_df=1)
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
        # Compute cosine similarity between khop-1 and all other timelines
        score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
        if len(score[0]) < 2:
            return original_user_ids
        # Expand the most similar users: scores at or above the 80th percentile
        threshold = np.percentile(score[0][1:], 80)
        expand_idx = np.where(score[0] >= threshold)[0]
        expand_count = 0
        for k in user_timeline_hash.keys():
            if k < 1:
                continue
            if k in expand_idx:
                expand_count += 1
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE, decision_tfidf={} WHERE user_id={};".format(score[0][k], user_timeline_hash[k]))
            else:
                user_ids.remove(user_timeline_hash[k])
        print "\tExpand {} friends/followers".format(expand_count)
    return original_user_ids
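
# Self-contained sketch of the similarity test performed above: build a TF-IDF matrix
# whose first row is the previous hop's combined document, compare every other row to
# it with cosine similarity, and keep rows at or above the 80th percentile. The corpus
# strings are hypothetical.
def _example_tfidf_similarity():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    corpus = ['protest election vote turnout',          # previous hop document
              'vote election polling station',          # candidate user 1
              'cats dogs weather today',                 # candidate user 2
              'election fraud vote recount protest']     # candidate user 3
    tfidf_matrix = TfidfVectorizer(min_df=1).fit_transform(corpus)
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)   # shape (1, len(corpus))
    threshold = np.percentile(score[0][1:], 80)
    keep_idx = np.where(score[0] >= threshold)[0]                # index 0 is the reference document
    return [i for i in keep_idx if i > 0]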
def depth_first_cascade_search(user_seed, tl_start_date, tl_end_date, postgres_params, host, port, save_dir={}, hop_limits={}):
    """ This function builds a network based on users relevant to seed keywords.
    Requires that a PostgreSQL database already exist.

    :param user_seed: List of user names
    :param tl_start_date: Beginning date (datetime.date object) of timelines in collection
    :param tl_end_date: End date (datetime.date object) of timelines in collection
    :param postgres_params: Dictionary containing the fields 'dbname', 'user', and 'password' (and optionally 'host'
                            and 'port') required to connect to a database
    :param host: Proxy host
    :param port: Proxy port
    :param save_dir: Directory storing sampling place savers and growth parameters
                     EX. save_dir = {'place_saver_filename': 'name of file'}
    :param hop_limits: Specify your graph constraints with the variable hop_limits. Set the maximum number of hops to
                       make a graph with 'max_hops'.
                       EX. hop_limits = {'max_hops': 2}     # Maximum number of hops in graph
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), "Both tl_start_date and tl_end_date must be datetime.date objects (i.e. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), "Verify the parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x, y: "{}='{}'".format(x, y), postgres_params.keys(), postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials. Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximum number of hops in the graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])
    breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
    # Load growth parameters
    growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles'])
    # API AUTHORIZATION
    print "\nAPI Authorization"
    OAUTH = pyTweet.get_authorization(twitter_keys)
    # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS
    print "\nConfigure database for TF-IDF analysis"
    json_to_database.configure_database_to_build_network(cur, conn)
    new_columns = [{'table': 'users', 'col': 'decision_tfidf', 'type': 'FLOAT'}]
    for i in new_columns:
        try:
            json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type']))
        except psycopg2.ProgrammingError:
            conn.rollback()
    # SAMPLING LOOP
    print "\nBegin collection"
    cur_hop = place_savers['cur_hop']
    for ii in range(cur_hop, hop_limits['max_hops']):
        print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list']))
        if ii < 1:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii)
            # Replace user names in place_savers['cur_user_list'] with user IDs!
            user_id_set = set([])
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT user_id FROM users WHERE screen_name = '{}';".format(jj))
                user_id_set.add(cur.fetchone()[0])
            place_savers['cur_user_list'] = set(user_id_set)
            del user_id_set
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
        else:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii)
        # Do not expand users who have more than 1000 friends+followers
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")
        # GET TIMELINES
        for jj in place_savers['cur_user_list']:
            _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
        # Add all of the hashtags from the seed of users
        if ii < 1:
            total_ht_h0 = 0
            print "\nAdd all of the hashtags from the seed of users"
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (khop=0) AND (expand_user IS NULL);")
            cur.execute("SELECT DISTINCT user_id FROM users WHERE (khop=0) AND (expand_user IS TRUE);")
            q = cur.fetchall()
            for qq in q:
                nAdd = _add_all_hashtags(cur=cur, conn=conn, user_id=qq[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii)
                total_ht_h0 = total_ht_h0 + nAdd
            print "Added {} hashtags from hop {}.".format(total_ht_h0, ii)
            cur.execute("SELECT COUNT(*) FROM users WHERE (khop=0) AND (expand_user=TRUE);")
            print "Expand {} users from hop {}".format(cur.fetchone()[0], ii)
        # SAVE GRAPH PARAMS
        growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list'])
        growth_params['h{}_missing.json'.format(ii)] = set([])
        growth_params['h{}_extendTRUE.json'.format(ii)] = set([])
        growth_params['h{}_extendFALSE.json'.format(ii)] = set([])
        growth_params['h{}_extendNULL.json'.format(ii)] = set([])
        for uu in place_savers['cur_user_list']:
cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: growth_params['h{}_missing.json'.format(ii)].add(uu) continue if q[0] is None: growth_params['h{}_extendNULL.json'.format(ii)].add(uu) elif q[0] is True: growth_params['h{}_extendTRUE.json'.format(ii)].add(uu) elif q[0] is False: growth_params['h{}_extendFALSE.json'.format(ii)].add(uu) else: print "ERROR in saving growth parameters! Invalid data type..." continue breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii) # GET NEXT SET OF USERS if ii < (hop_limits['max_hops'] - 1): print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS" new_um = set([]) # Add user mentions to next hop for jj in place_savers['cur_user_list']: cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (users.user_id = %s) AND (tweets.created_at >= %s AND tweets.created_at <= %s) AND (users.expand_user IS TRUE) AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (jj, tl_start_date, tl_end_date))) uids = cur.fetchall() for kk in uids: new_um.update(set(kk[0])) print "There are {} user mentions from hop {}".format(len(new_um), ii) # Get user mention profiles _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1) # Expand, or not, user mentions json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);") # Expand remaining user mentions growth_params['h{}_um_missing.json'.format(ii)] = set([]) growth_params['h{}_um_extendTRUE.json'.format(ii)] = set([]) growth_params['h{}_um_extendFALSE.json'.format(ii)] = set([]) growth_params['h{}_um_extendNULL.json'.format(ii)] = set([]) new_um_tracker = set(new_um) for uu in new_um_tracker: cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: new_um.remove(uu) growth_params['h{}_um_missing.json'.format(ii)].add(uu) continue if q[0] is False: new_um.remove(uu) growth_params['h{}_um_extendFALSE.json'.format(ii)].add(uu) else: json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (user_id = {}) AND (expand_user IS NULL);".format(uu)) new_um_tracker = set(new_um) for uu in new_um_tracker: cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: # print "this is strange" new_um.remove(uu) growth_params['h{}_um_missing.json'.format(ii)].add(uu) continue if q[0] is True: growth_params['h{}_um_extendTRUE.json'.format(ii)].add(uu) if q[0] is None: growth_params['h{}_um_extendNULL.json'.format(ii)].add(uu) new_um.remove(uu) print "This is not supposed to happen!!!" del new_um_tracker assert (len(growth_params['h{}_um_extendNULL.json'.format(ii)]) < 1), "There are user mentions assigned expand_user=NULL!" 
            place_savers['next_user_list'].update(new_um)
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS"
            print "Collect friends"
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (friends_count > 0) AND (friends_list IS NULL);".format(jj))
                q = cur.fetchone()
                if q is None:
                    continue
                if q[0] is True:
                    print "\tCollect friends for user {}.".format(jj)
                    friends_list = pyTweet.get_user_friends(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000)
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, jj)))
            print "Collect followers"
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (followers_count > 0) AND (followers_list IS NULL);".format(jj))
                q = cur.fetchone()
                if q is None:
                    continue
                if q[0] is True:
                    print "\tCollect followers for user {}.".format(jj)
                    followers_list = pyTweet.get_user_followers(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000)
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, jj)))
            print "Get profiles and timelines of friends and followers"
            fids = set([])
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT friends_list,followers_list FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (((friends_list IS NOT NULL) AND (ARRAY_LENGTH(friends_list,1) > 0)) OR ((followers_list IS NOT NULL) AND (ARRAY_LENGTH(followers_list,1) > 0)));".format(jj))
                flist = cur.fetchone()
                if flist is None:
                    continue
                if flist[0] is not None:
                    fids.update(flist[0])
                if flist[1] is not None:
                    fids.update(flist[1])
            print "There are {} friends/followers of hop {}".format(len(fids), ii)
            # Get profiles of friends/followers
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=fids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Filter with high degree rule and get timelines
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")
            # Remove expand_user=FALSE from friend/follower list
            growth_params['h{}_frfo_missing.json'.format(ii)] = set([])
            growth_params['h{}_frfo_extendFALSE.json'.format(ii)] = set([])
            jj_list = list(fids)
            for jj in jj_list:
                cur.execute("SELECT expand_user,has_timeline FROM users WHERE user_id = {};".format(jj))
                q = cur.fetchone()
                if q is None:
                    growth_params['h{}_frfo_missing.json'.format(ii)].add(jj)
                    fids.remove(jj)
                    continue
                if q[0] is False:
                    fids.remove(jj)
                    growth_params['h{}_frfo_extendFALSE.json'.format(ii)].add(jj)
                    continue
                if (q[0] is not False) and (q[1] is None):
                    _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            del jj_list
            # Find the most similar friends/followers, and expand the top of the similarity distribution
            original_frfo_set = candid_tfidf.find_most_similar_followers(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, user_ids=fids, prev_users=place_savers['cur_user_list'])
            growth_params['h{}_frfo_extendTRUE.json'.format(ii)] = set(fids)
            growth_params['h{}_frfo_extendNULL.json'.format(ii)] = set(original_frfo_set.difference(fids))
            del original_frfo_set
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            place_savers['next_user_list'].update(fids)
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
            if ii > 0:
                print "\nFIND USERS WITH AT LEAST ONE HASHTAG IN COMMON WITH TOPICS"
                new_relevant_users = _find_relevant_users(cur=cur, conn=conn, user_ids=growth_params['h{}_frfo_extendNULL.json'.format(ii)])
                growth_params['h{}_relevant_extendTRUE.json'.format(ii)] = set(new_relevant_users)
                breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
                place_savers['next_user_list'].update(new_relevant_users)
                breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
        # PREPARE FOR NEXT HOP
        place_savers['cur_hop'] += 1
        place_savers['cur_user_list'] = set(place_savers['next_user_list'])
        place_savers['next_user_list'] = set([])
        breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
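
# Hypothetical driver for depth_first_cascade_search. The seed users, dates, database
# credentials, proxy settings, and save directory are placeholders only; a real run
# also requires the pyTweet API keys to be configured.
def _example_run_cascade_search():
    import datetime
    postgres_params = {'dbname': 'twitter_sample', 'user': 'postgres', 'password': 'secret'}
    depth_first_cascade_search(user_seed=['example_screen_name_1', 'example_screen_name_2'],
                               tl_start_date=datetime.date(2014, 1, 1),
                               tl_end_date=datetime.date(2014, 6, 1),
                               postgres_params=postgres_params,
                               host='proxy.example.com', port=8080,
                               save_dir={'twitter_profiles': '/tmp/profiles'},
                               hop_limits={'max_hops': 2})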
def _get_profiles_wrapper(cur, conn, user_list, proxies, auth, list_type, hop):
    """ This function is a wrapper for grabbing user profiles.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_list: List of Twitter user IDs or screen names
    :param proxies: proxy dictionary, ex. {'http': 'http://%s:%s' % (HOST, PORT), 'https': 'http://%s:%s' % (HOST, PORT)}
    :param auth: Twitter application authentication, see the get_authorization method
    :param list_type: Must be 'user_id' for user IDs or 'screen_name' for screen names
    :param hop: Hop at which the profiles are collected
    """
    assert ((list_type == 'user_id') or (list_type == 'screen_name')), "The parameter list_type must be set to either 'user_id' or 'screen_name'"
    db_user_ids = set([])   # List of redundant profiles
    # Filter out profiles that have already been collected
    cur.execute("SELECT DISTINCT {} FROM users;".format(list_type))
    q = cur.fetchall()      # q = [('sn',), ..., ('sn2',)]
    for ii in q:
        db_user_ids.add(ii[0])
    # Filter out deleted or protected profiles
    cur.execute("SELECT DISTINCT {} FROM lost_profiles;".format(list_type))
    q = cur.fetchall()
    for ii in q:
        db_user_ids.add(ii[0])
    get_profiles = list(set(user_list).difference(db_user_ids))
    if get_profiles is None:
        return
    # Partition IDs into batches of at most 100 (the user-lookup limit)
    USERS = [get_profiles[z:z+100] for z in range(0, len(get_profiles), 100)]
    get_profiles = set(get_profiles)
    for j in range(len(USERS)):
        # Look up information of users, 100 at a time
        print "\tLook up profile information for up to 100 users at a time"
        if list_type == 'screen_name':
            user_info = pyTweet.user_lookup_usernames(user_list=list(USERS[j]), proxies=proxies, auth=auth)
            lost_cmd = "INSERT INTO lost_profiles (screen_name) VALUES ('{}');"
        elif list_type == 'user_id':
            user_info = pyTweet.user_lookup_userids(user_list=list(USERS[j]), proxies=proxies, auth=auth)
            lost_cmd = "INSERT INTO lost_profiles (user_id) VALUES ({});"
        else:
            print "The type '{}' is not recognized. Set list_type to either 'user_id' or 'screen_name'".format(list_type)
            return
        # Are there profiles that are either protected/deleted?
        if (not isinstance(user_info, list)) and ('errors' in user_info.keys()) and (user_info['errors'][0]['code'] == 17):
            # Every user in this batch is protected or deleted; record them and move to the next batch
            for u in USERS[j]:
                json_to_database.make_sql_edit(cur, conn, lost_cmd.format(u))
                get_profiles.remove(u)
            continue
        if len(user_info) < len(USERS[j]):
            for u in USERS[j]:
                profile_collected = False
                for ui in user_info:
                    if (u in ui.values()) or (str(u) in ui.values()):
                        profile_collected = True
                        break
                # Add profile to table deleted_profiles if necessary
                if not profile_collected:
                    json_to_database.make_sql_edit(cur, conn, lost_cmd.format(u))
                    get_profiles.remove(u)
        # Add user info to database
        for k in user_info:
            if k == 'errors':
                continue
            k['khop'] = hop
            k['DOC'] = datetime.datetime.utcnow()
            if hop < 1:
                k['expand_user'] = True
            json_to_database.add_user(userdata=k, cur=cur, conn=conn)
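
# Minimal sketch of the batching idiom used in _get_profiles_wrapper: the Twitter
# user-lookup endpoint accepts at most 100 users per request, so the outstanding IDs
# are partitioned into blocks of 100 before any requests are made. The ID list below
# is hypothetical.
def _example_partition_user_ids():
    get_profiles = list(range(1, 251))      # 250 hypothetical user IDs
    batches = [get_profiles[z:z + 100] for z in range(0, len(get_profiles), 100)]
    return batches      # 3 batches of sizes 100, 100, and 50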
def depth_first_causal_search(user_seed, topic_seed, tl_start_date, tl_end_date, postgres_params, host, port, save_dir={}, hop_limits={}, collection_limits={}):
    """ This function builds a network based on users relevant to seed keywords.
    Requires that a PostgreSQL database already exist.

    :param user_seed: List of user names
    :param topic_seed: List of seed topics
    :param tl_start_date: Beginning date (datetime.date object) of timelines in collection
    :param tl_end_date: End date (datetime.date object) of timelines in collection
    :param postgres_params: Dictionary containing the fields 'dbname', 'user', and 'password' (and optionally 'host'
                            and 'port') required to connect to a database
    :param host: Proxy host
    :param port: Proxy port
    :param save_dir: Directory to save sampling and growth parameters
    :param hop_limits: Specify your graph constraints with the variable hop_limits. Set the maximum number of hops to
                       make a graph with 'max_hops'.
                       EX. hop_limits = {'max_hops': 2}     # Maximum number of hops in graph
    :param collection_limits: Specify the term-frequency calculation and threshold percentile
                              EX. collection_limits = {'threshold_percentile': 0.05,   # Threshold percentile for expanding users
                                                       'tf_type': 'raw'}               # TF calculation type
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), "Both tl_start_date and tl_end_date must be datetime.date objects (i.e. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), "Verify the parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x, y: "{}='{}'".format(x, y), postgres_params.keys(), postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials. Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximum number of hops in the graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    # Check collection_limits dictionary
    if 'threshold_percentile' not in collection_limits:
        collection_limits['threshold_percentile'] = 0.05
        print "\tNo value was specified for collection_limits['threshold_percentile'], the threshold percentile for expanding users, so it will be set to 0.05."
    assert (0 <= collection_limits['threshold_percentile'] <= 1), "The value collection_limits['threshold_percentile'] must fall within [0,1]."
    if 'tf_type' not in collection_limits:
        collection_limits['tf_type'] = 'raw'
        print "\tNo value was specified for collection_limits['tf_type'], the method of calculating the term frequency, so it will be set to 'raw'."
    assert ((collection_limits['tf_type'] == 'raw') or (collection_limits['tf_type'] == 'augmented') or (collection_limits['tf_type'] == 'boolean')), "The value collection_limits['tf_type'] is not recognized. Please enter 'raw', 'boolean' or 'augmented' as its value."
    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles'])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])
    # Load growth parameters
    growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles'])
    # API AUTHORIZATION
    print "\nAPI Authorization"
    OAUTH = pyTweet.get_authorization(twitter_keys)
    print "Start with key {}".format(OAUTH['KEY_FILE'])
    # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS
    print "\nConfigure database for TF-IDF analysis"
    json_to_database.configure_database_to_build_network(cur, conn)
    # Load topics
    for t in topic_seed:
        if (t is None) or (t.strip() == ''):
            continue
        json_to_database.make_sql_edit(cur, conn, "INSERT INTO topics (topic, khop) VALUES ('{}', -1);".format(t.strip()))
    # Add columns for this sampling method
    new_columns = [{'table': 'users', 'col': 'has_timeline_filter', 'type': 'BOOLEAN'},         # Indicates if a user's timeline has already been filtered
                   {'table': 'users', 'col': 'timeline_document', 'type': 'TEXT[]'},            # Document created from relevant tweets
                   {'table': 'topics', 'col': 'document_frequency', 'type': 'FLOAT'},           # Document frequency
                   {'table': 'users', 'col': 'decision_candid_tfdf_score', 'type': 'FLOAT'}]    # CANDID TF-DF score used for the expansion decision
    for i in new_columns:
        try:
            json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type']))
            print "Add column {} to table {}.".format(i['col'], i['table'])
        except psycopg2.ProgrammingError:
            conn.rollback()
    new_ind = [{'table': 'users', 'col': 'has_timeline_filter'}]
    # SAMPLING LOOP
    print "\nBegin collection"
    cur_hop = place_savers['cur_hop']
    for ii in range(cur_hop, hop_limits['max_hops']):
        print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list']))
        # GET PROFILE INFORMATION
        if ii < 1:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii)
        else:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii)
        growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list'])
        breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
        # GET TIMELINES
        cur.execute("SELECT user_id FROM users WHERE (has_timeline IS NULL) AND (expand_user = TRUE OR expand_user IS NULL) AND (khop = {});".format(ii))
        uids = cur.fetchall()
        for j in uids:
            # Get timeline
            _get_timeline_wrapper(cur=cur, conn=conn, user_id=j[0], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
        # Filter users by timeline
        _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii)
        # Create documents from timelines
        candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
        # Expand relevant seed users
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE khop = 0 AND expand_user IS NULL;")
        # GET NEXT SET OF USERS
        if ii < (hop_limits['max_hops'] - 1):
            # USER MENTIONS
            print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS"
            new_um = set([])
            cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (tweets.created_at >= %s AND tweets.created_at <= %s) AND users.expand_user = TRUE AND users.khop = %s AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (tl_start_date, tl_end_date, ii)))
            uids = cur.fetchall()
            for t in uids:
                new_um = new_um.union(set(t[0]))
            # Get user mention profiles
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Get timelines
            for um in new_um:
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=um, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter users by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # User mentions who have expand_user = NULL will be set to TRUE
            for um in new_um:
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE expand_user IS NULL AND user_id = {};".format(um))
            growth_params['h{}_user_mentions.json'.format(ii)] = set(new_um)
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            del new_um
            # FRIENDS AND FOLLOWERS
            print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS"
            print "Collect friends"
            # Only collect friends for users whose friends_list has not yet been stored
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND friends_count > 0 AND friends_list IS NULL;".format(ii))
            hasfriends = cur.fetchall()
            growth_params['h{}_friends.json'.format(ii)] = set([])
            for u in hasfriends:
                print "\nCollect friends for user {}.".format(u[0])
                friends_list = pyTweet.get_user_friends(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, u[0])))
                growth_params['h{}_friends.json'.format(ii)].update(set(friends_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Collect followers"
            # Only collect followers for users whose followers_list has not yet been stored
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND followers_count > 0 AND followers_list IS NULL;".format(ii))
            hasfollowers = cur.fetchall()
            growth_params['h{}_followers.json'.format(ii)] = set([])
            for u in hasfollowers:
                print "\nCollect followers for user {}.".format(u[0])
                followers_list = pyTweet.get_user_followers(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, u[0])))
                growth_params['h{}_followers.json'.format(ii)].update(set(followers_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Get profiles and timelines of friends and followers"
            cur.execute("SELECT user_id, friends_list,followers_list FROM users WHERE expand_user = TRUE AND khop = {} AND (ARRAY_LENGTH(friends_list, 1) > 0 OR ARRAY_LENGTH(followers_list, 1) > 0);".format(ii))
            flist = cur.fetchall()
            ids = set([])
            for f in flist:
                if f is not None:
                    if f[1] is not None:
                        ids.update(f[1])
                    if f[2] is not None:
                        ids.update(f[2])
            ids = list(ids)
            # Get profiles of friends/followers
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=ids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            for i in range(len(ids)):
                print "\nGet timeline for friend/follower {}: {} out of {}".format(ids[i], i, len(ids))
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=ids[i], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter profiles by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # Compute CANDID information score, and discriminate users
            for f in flist:
                if (f is not None) and (f[0] is not None):
                    candid_tfidf.compute_candid_score(cur=cur, conn=conn, parent_id=f[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, threshold_percentile=collection_limits['threshold_percentile'], tf_type=collection_limits['tf_type'])
        # PREPARE FOR NEXT HOP
        place_savers['cur_hop'] = ii + 1
        place_savers['cur_user_list'] = set([])
        cur.execute("SELECT user_id FROM users WHERE khop = {} AND expand_user = TRUE;".format(ii + 1))
        new_profiles = cur.fetchall()
        for new_profile in new_profiles:
            place_savers['cur_user_list'].add(new_profile[0])
        breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
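
# Hypothetical driver for depth_first_causal_search, mirroring the cascade example.
# Seed users, topics, credentials, proxy settings, and the save directory are
# placeholders; the collection_limits values shown are the documented defaults.
def _example_run_causal_search():
    import datetime
    depth_first_causal_search(user_seed=['example_screen_name'],
                              topic_seed=['earthquake', 'relief'],
                              tl_start_date=datetime.date(2014, 1, 1),
                              tl_end_date=datetime.date(2014, 6, 1),
                              postgres_params={'dbname': 'twitter_sample', 'user': 'postgres', 'password': 'secret'},
                              host='proxy.example.com', port=8080,
                              save_dir={'twitter_profiles': '/tmp/profiles'},
                              hop_limits={'max_hops': 2},
                              collection_limits={'threshold_percentile': 0.05, 'tf_type': 'raw'})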