# Standard-library and third-party imports used throughout this module.
import csv
import glob
import json
import os
import pickle
import sys
import multiprocessing as mp
from collections import defaultdict
from datetime import datetime, timezone
from functools import partial

import altair as alt
import networkx as nx
from psycopg2 import sql
from psycopg2 import extras as ext
from tqdm import tqdm

# Project-local helpers (open_database, close_database, sql_statements,
# to_pandas, to_list_of_dicts, save_to_csv, and the other bare names used
# below) are assumed to come from twitter2sql.core / twitter2sql.core.util,
# per the docstring in export_reply_threads.


def generate_users_table(database_name, db_config_file, input_table_name,
                         user_table_name, user_id_col, admins,
                         event_split_date=None, overwrite=False):

    database, cursor = open_database(database_name, db_config_file)

    if overwrite:
        cursor.execute(sql_statements.drop_table_statement(user_table_name))

    users_create_statement = users_sql_statement(
        input_table_name, user_table_name, user_id_col, event_split_date)
    cursor.execute(users_create_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        user_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    return
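
# Usage sketch (the database name, config file, and table/column names below
# are hypothetical, not part of this module):
#
#     generate_users_table('election_db', 'db_config.txt', 'tweets', 'users',
#                          'user_id', admins=['analyst_account'],
#                          overwrite=True)
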
def filter_by_time(database_name, db_config_file, start_date, end_date,
                   time_column='example', num_returned=10000,
                   table_name='table', return_headers=None, distinct=None,
                   output_filename=None, output_column_headers=None):

    database, cursor = open_database(database_name, db_config_file)

    cursor.execute(
        sql.SQL("""
            SELECT *
            FROM {}
            WHERE {} BETWEEN %s AND %s
            LIMIT %s;
            """).format(sql.Identifier(table_name),
                        sql.Identifier(time_column)),
        [start_date, end_date, num_returned])

    results = cursor.fetchall()
    dict_result = [dict(row) for row in results]
    results = remove_duplicates(dict_result, limit=100)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return results
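
# Usage sketch (hypothetical names and dates). start_date/end_date can be
# anything psycopg2 can adapt into a timestamp comparison, e.g. datetimes:
#
#     rows = filter_by_time('election_db', 'db_config.txt',
#                           datetime(2020, 1, 1), datetime(2020, 2, 1),
#                           time_column='created_ts', table_name='tweets',
#                           output_filename='january.csv')
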
def generate_hashtag_table(database_name, db_config_file, input_table_name,
                           hashtag_table_name, hashtag_col, id_col, admins,
                           overwrite=False, verbose=True):

    database, cursor = open_database(database_name, db_config_file)

    if overwrite:
        cursor.execute(
            sql_statements.drop_table_statement(hashtag_table_name))

    hashtag_create_statement = hashtag_sql_statement(
        input_table_name, hashtag_table_name, hashtag_col, id_col)

    if verbose:
        print('Generating hashtag table...')
    cursor.execute(hashtag_create_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        hashtag_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    return
def timeline_chart(database_name, db_config_file, tweet_table, start_date,
                   end_date, output_filename):

    database, cursor = open_database(database_name, db_config_file)

    time_aggregate_statement = sql.SQL("""
        SELECT date_trunc('day', created_ts) AS "date",
            count(*) AS "tweet_total"
        FROM {tweet_table}
        WHERE created_ts >= %s AND created_ts <= %s
        GROUP BY 1
        ORDER BY 1;
        """).format(tweet_table=sql.Identifier(tweet_table))
    cursor.execute(time_aggregate_statement, [start_date, end_date])
    results = to_pandas(cursor)

    chart = alt.Chart(results).mark_line().encode(
        x='date',
        y='tweet_total',
        # color='symbol'
    )
    chart.save(output_filename)

    return
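
# Usage sketch (hypothetical names). Altair infers the output format from the
# file extension; .html and .json work without extra dependencies:
#
#     timeline_chart('election_db', 'db_config.txt', 'tweets',
#                    '2020-01-01', '2020-02-01', 'timeline.html')
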
def get_user_flows(database_name, db_config_file, input_table_name,
                   user_accounts, user_aliases=None,
                   output_filename='flows.csv', data_limit=100,
                   time_interval='month', verbose=True):

    database, cursor = open_database(database_name, db_config_file)

    sql_statement = sql.SQL("""
        SELECT user_screen_name, user_name, user_description, user_created_ts,
            user_id, retweeted_status_user_screen_name,
            quoted_status_user_screen_name, in_reply_to_screen_name,
            created_ts
            -- complete_text
        FROM {input_table}
        """).format(input_table=sql.Identifier(input_table_name))

    for idx, user in enumerate(user_accounts):
        if idx == 0:
            sql_statement += sql.SQL("""
                WHERE retweeted_status_user_screen_name = {user_name}
                OR quoted_status_user_screen_name = {user_name}
                OR in_reply_to_screen_name = {user_name}""").format(
                user_name=sql.Literal(user))
        else:
            sql_statement += sql.SQL("""
                OR retweeted_status_user_screen_name = {user_name}
                OR quoted_status_user_screen_name = {user_name}
                OR in_reply_to_screen_name = {user_name}""").format(
                user_name=sql.Literal(user))

    if data_limit is not None:
        sql_statement += sql.SQL(' LIMIT %s')
        cursor.execute(sql_statement, [data_limit])
    else:
        cursor.execute(sql_statement)

    results = to_list_of_dicts(cursor)

    if verbose:
        for result in results:
            print(result)

    aggregate_dict = aggregate_flows(results, user_accounts)
    write_to_d3_sankey_csv(output_filename, aggregate_dict, user_accounts,
                           user_aliases)

    return
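
# Usage sketch (hypothetical account names). user_aliases, if given, is
# presumably a display-name list parallel to user_accounts for the Sankey
# output:
#
#     get_user_flows('election_db', 'db_config.txt', 'tweets',
#                    user_accounts=['account_a', 'account_b'],
#                    user_aliases=['Candidate A', 'Candidate B'],
#                    output_filename='flows.csv', data_limit=10000)
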
def grab_top(database_name=None, db_config_file=None, cursor=None,
             column_name='example', num_returned=100, table_name='table',
             return_headers=None, distinct=None, output_filename=None,
             output_column_headers=None):

    # Open a new connection only if the caller did not supply a cursor.
    if cursor is None:
        database, cursor = open_database(database_name, db_config_file)

    if distinct is None:
        # One row per user (DISTINCT ON user_id), ordered by column_name.
        sql_statement = sql.SQL("""
            SELECT user_screen_name, user_name, user_description,
                user_created_ts, user_followers_count, user_id, created_at,
                complete_text
            FROM (SELECT DISTINCT ON (user_id) user_screen_name, user_name,
                    user_description, user_created_ts, user_followers_count,
                    user_id, created_at, complete_text
                FROM {}
                WHERE lang='en') as sub_table
            ORDER BY {} DESC
            LIMIT %s;
            """).format(sql.Identifier(table_name),
                        sql.Identifier(column_name))
        cursor.execute(sql_statement, [num_returned])
    else:
        # DISTINCT ON requires the ORDER BY to lead with the DISTINCT ON
        # expression, so the outer query re-sorts by column_name.
        cursor.execute(
            sql.SQL("""
                SELECT * FROM (
                    SELECT DISTINCT ON ({distinct}) *
                    FROM {table}
                    ORDER BY {distinct}, {col} DESC) AS sub_table
                ORDER BY {col} DESC
                LIMIT %s;
                """).format(distinct=sql.Identifier(distinct),
                            table=sql.Identifier(table_name),
                            col=sql.Identifier(column_name)),
            [num_returned])

    results = to_list_of_dicts(cursor)
    # results = remove_duplicates(results, limit=100)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return results
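
# Usage sketch (hypothetical names). Open a connection by name, or pass an
# already-open cursor to reuse a connection:
#
#     top = grab_top('election_db', 'db_config.txt',
#                    column_name='user_followers_count', table_name='tweets',
#                    num_returned=50, output_filename='top_accounts.csv')
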
def aggregate(database_name, db_config_file, aggregate_column='example',
              output_columns='example', table_name='table',
              count_column_name='total_tweets', num_returned=1000,
              return_headers=None, output_filename=None,
              output_column_headers=None, verbose=False):

    database, cursor = open_database(database_name, db_config_file)

    # MAX() is an arbitrary way to pick one representative value per group;
    # any aggregate that collapses the group to a single row would do.
    output_columns_sql = sql.SQL(',').join([
        sql.SQL("MAX({output}) as {output}").format(
            output=sql.Identifier(output)) for output in output_columns
    ])

    sql_statement = sql.SQL("""
        SELECT {agg}, COUNT({count}) as {count_name}, {outputs}
        FROM {table}
        GROUP BY {agg}
        ORDER BY {count_name} DESC
        LIMIT %s;
        """).format(agg=sql.Identifier(aggregate_column),
                    count=sql.Identifier(aggregate_column),
                    count_name=sql.Identifier(count_column_name),
                    table=sql.Identifier(table_name),
                    outputs=output_columns_sql)
    cursor.execute(sql_statement, [num_returned])
    results = to_list_of_dicts(cursor)

    if verbose:
        for result in results:
            print(result)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return
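
# Usage sketch (hypothetical names): count rows per hashtag, carrying along
# one representative text per group via the MAX() trick above:
#
#     aggregate('election_db', 'db_config.txt', aggregate_column='hashtags',
#               output_columns=['complete_text'], table_name='hashtag_table',
#               count_column_name='total_tweets', num_returned=100,
#               output_filename='top_hashtags.csv')
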
def append_hashtag_clusters(database_name, db_config_file, hashtag_table_name,
                            hashtag_cluster_csv, hashtag_col='hashtags',
                            cluster_col='clusters'):

    database, cursor = open_database(database_name, db_config_file)

    # Use sql.Identifier so the table and column names are safely quoted.
    create_col_statement = sql.SQL("""ALTER TABLE {table}
        ADD COLUMN IF NOT EXISTS {cluster_col} VARCHAR;""").format(
        table=sql.Identifier(hashtag_table_name),
        cluster_col=sql.Identifier(cluster_col))
    cursor.execute(create_col_statement)

    classify_statement, classify_values = sql_statements.category_statement(
        hashtag_table_name, hashtag_cluster_csv, hashtag_col, cluster_col)
    cursor.execute(classify_statement, classify_values)
    database.commit()
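
# Usage sketch (hypothetical names). The CSV is assumed to map each hashtag
# to a cluster label, in whatever layout sql_statements.category_statement
# expects:
#
#     append_hashtag_clusters('election_db', 'db_config.txt', 'hashtag_table',
#                             'hashtag_clusters.csv')
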
def stream_connection_data(database_name, db_config_file, output_gefx_file,
                           output_columns, connect_column,
                           connect_column_screen_name, where_statement,
                           save_pkl=True, dict_pkl_file=None,
                           users_pkl_file=None, table_name='tweets',
                           connection_type='retweet', attributes=None,
                           label='screen_name', itersize=1000, limit=None):

    # NOTE: output_gefx_file is currently unused; see the export sketch
    # after this function.
    if limit is None:
        limit_statement = sql.SQL('')
    else:
        limit_statement = sql.SQL('LIMIT {}').format(sql.Literal(limit))

    select_columns = sql.SQL(', ').join(
        [sql.Identifier(item) for item in output_columns])

    # A named (server-side) cursor streams rows in batches of itersize.
    database, cursor = open_database(
        database_name, db_config_file,
        named_cursor='network_connections_retrieval', itersize=itersize)

    user_statement = sql.SQL("""
        SELECT {select}
        FROM {table_name}
        {where_statement}
        {limit_statement}
        """).format(table_name=sql.Identifier(table_name),
                    select=select_columns,
                    where_statement=where_statement,
                    limit_statement=limit_statement)
    cursor.execute(user_statement)

    connections_dict = defaultdict(dict_dict)
    username_dict = defaultdict(set_dict)

    count = 0
    progress_bar = tqdm()
    while True:
        result = cursor.fetchmany(cursor.itersize)
        if result:
            for item in result:
                item = dict(item)
                username_dict[item['user_id']]['screen_name'].add(
                    item['user_screen_name'])

                if connection_type in ['reply', 'quote', 'retweet']:
                    # Count how often this user connects to each target.
                    if 'count' not in connections_dict[item['user_id']][
                            item[connect_column]]:
                        connections_dict[item['user_id']][
                            item[connect_column]]['count'] = 0
                    connections_dict[item['user_id']][
                        item[connect_column]]['count'] += 1
                    username_dict[item[connect_column]]['screen_name'].add(
                        item[connect_column_screen_name])
                elif connection_type == 'all':
                    pass

                if attributes is not None:
                    for attribute in attributes:
                        connections_dict[item['user_id']][
                            item[connect_column]][attribute] = item[attribute]

            count += len(result)
            progress_bar.set_description(
                f"Iteration {count // itersize}, {count} rows retrieved.")
        else:
            cursor.close()
            break

    if save_pkl:
        with open(dict_pkl_file, 'wb') as openfile:
            pickle.dump(connections_dict, openfile)
        with open(users_pkl_file, 'wb') as openfile:
            pickle.dump(username_dict, openfile)

    return connections_dict, username_dict
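
# A minimal sketch (hypothetical helper, not part of the original module) of
# one way to write the dictionaries returned by stream_connection_data to the
# otherwise-unused output_gefx_file, assuming networkx is acceptable here:
def connections_to_gexf(connections_dict, username_dict, output_gefx_file):
    G = nx.DiGraph()
    # One weighted edge per (source user, target user) pair.
    for source, targets in connections_dict.items():
        for target, data in targets.items():
            G.add_edge(source, target, weight=data.get('count', 1))
    # GEXF cannot serialize sets, so join the screen names into one string.
    for user_id, data in username_dict.items():
        if user_id in G:
            G.nodes[user_id]['label'] = '|'.join(sorted(data['screen_name']))
    nx.write_gexf(G, output_gefx_file)
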
def generate_user_statistics(database_name, db_config_file, tweet_table_name,
                             user_table_name, overwrite=False):

    database, cursor = open_database(database_name, db_config_file)

    # The `if True:` blocks are for testing; remove them or make them a
    # parameter eventually.

    # Total posts
    if True:
        total_post_statement = sql.SQL("""UPDATE {user_table}
            SET {column_name} = temp.idcount
            FROM (SELECT COUNT(user_id) as idcount, user_id
                FROM {tweet_table}
                GROUP BY user_id) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)
        execute_user_sql(total_post_statement, 'total_database_tweets', 'INT',
                         cursor, database, 'total post count')

    # First post, statuses count
    if True:
        first_post_statement = sql.SQL("""UPDATE {user_table}
            SET first_created_ts = temp.first_post
            FROM (SELECT user_id, min(created_ts) first_post
                FROM {tweet_table}
                GROUP BY user_id) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)
        execute_user_sql(first_post_statement, 'first_created_ts',
                         'TIMESTAMP', cursor, database, 'first post')

        first_post_statement = sql.SQL("""UPDATE {user_table}
            SET first_status_count = temp.first_status
            FROM (SELECT DISTINCT ON (user_id) user_id,
                    user_statuses_count as first_status, user_created_ts
                FROM {tweet_table}
                ORDER BY user_id, user_created_ts ASC) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)
        execute_user_sql(first_post_statement, 'first_status_count', 'INT',
                         cursor, database, 'first status count')

    # Time differences
    if True:
        date_range_statement = sql.SQL("""UPDATE {user_table}
            SET database_active_days =
                extract(day FROM last_created_ts - first_created_ts)
            """)
        execute_user_sql(date_range_statement, 'database_active_days', 'INT',
                         cursor, database, 'active days in database')

        date_range_statement = sql.SQL("""UPDATE {user_table}
            SET previous_active_days =
                extract(day FROM first_created_ts - user_created_ts)
            """)
        execute_user_sql(date_range_statement, 'previous_active_days', 'INT',
                         cursor, database, 'previously active days')

    # Rates
    if True:
        rate_statement = sql.SQL("""UPDATE {user_table}
            SET database_posting_rate = cast(total_database_tweets as numeric)
                / NULLIF(database_active_days, 0)
            """)
        execute_user_sql(rate_statement, 'database_posting_rate', 'NUMERIC',
                         cursor, database, 'database posting rate')

        rate_statement = sql.SQL("""UPDATE {user_table}
            SET previous_posting_rate = cast(first_status_count as numeric)
                / NULLIF(previous_active_days, 0)
            """)
        execute_user_sql(rate_statement, 'previous_posting_rate', 'NUMERIC',
                         cursor, database, 'previous rate statement')

        rate_statement = sql.SQL("""UPDATE {user_table}
            SET previous_current_posting_rate_ratio = database_posting_rate
                / NULLIF(previous_posting_rate, 0)
            """)
        execute_user_sql(rate_statement,
                         'previous_current_posting_rate_ratio', 'NUMERIC',
                         cursor, database, 'posting rate ratio statement')

    return
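
# execute_user_sql is defined elsewhere (possibly as a closure over
# user_table_name and tweet_table_name, since the calls above never pass the
# table names). Based solely on how it is called, it appears to add the
# target column and fill in the {user_table}/{tweet_table} placeholders
# before executing. A hypothetical sketch of that reading, commented out so
# it does not shadow the real implementation:
#
#     def execute_user_sql(statement, column_name, column_type, cursor,
#                          database, description):
#         # Hypothetical: user_table_name and tweet_table_name come from the
#         # enclosing scope; column_type is a trusted literal like 'INT'.
#         cursor.execute(sql.SQL(
#             'ALTER TABLE {user_table} ADD COLUMN IF NOT EXISTS '
#             '{col} {col_type};').format(
#                 user_table=sql.Identifier(user_table_name),
#                 col=sql.Identifier(column_name),
#                 col_type=sql.SQL(column_type)))
#         cursor.execute(statement.format(
#             user_table=sql.Identifier(user_table_name),
#             tweet_table=sql.Identifier(tweet_table_name),
#             column_name=sql.Identifier(column_name)))
#         database.commit()
#         print('Finished: ' + description)
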
def upload_twitter_2_sql(database_name, db_config_file, twitter_json_folders,
                         table_format_csv, table_name='example_table',
                         owner='example', admins=[], search_text=False,
                         keywords=['keyword1', 'keyword2'],
                         all_keywords=False, match_dates=True,
                         start_time=None, end_time=None,
                         use_regex_match=False, reg_expr='example_regex',
                         overwrite_db=True, overwrite=True,
                         timestamp='modified', json_mode='newline'):

    # Resolve the default at call time; a datetime default in the signature
    # would be frozen at import time.
    if end_time is None:
        end_time = datetime.now(timezone.utc)

    create_table_statement = sql_statements.create_table_statement(
        table_format_csv, table_name)
    insert_table_statement = sql_statements.insert_statement(
        table_format_csv, table_name)

    database, cursor = open_database(database_name, db_config_file,
                                     overwrite_db, owner, admins)

    if overwrite:
        # Drop the target table, not a hard-coded 'tweets' table.
        cursor.execute(sql_statements.drop_table_statement(table_name))
    else:
        # Not sure what should happen in this case.
        print('Table already exists, and overwrite=False. Exiting.')
        return

    cursor.execute(create_table_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    column_header_dict = get_column_header_dict(table_format_csv)

    try:
        # Keep track of how many tweets have been inserted (just to make
        # sure it's running).
        total_tweets_inserted = 0

        # Process each folder.
        for folder_path in twitter_json_folders:

            # Make sure only valid .json files are processed. glob already
            # returns paths prefixed with folder_path.
            json_files_to_process = glob.glob(
                os.path.join(folder_path, '*.json'))

            # Filter to only include files within the date range. This
            # filters JSON files by last-modified time (UNIX timestamp),
            # keeping only files written on or after the early bound and on
            # or before the late bound plus one day (to allow time for tweets
            # to be written to the file).
            if match_dates:
                json_files_to_process = sorted(
                    json_files_to_process,
                    key=lambda json_file: get_last_modified(
                        os.path.abspath(json_file)))
                print("{} JSON files before time filtering: from {} to {}".
                      format(len(json_files_to_process),
                             json_files_to_process[0],
                             json_files_to_process[-1]))
                json_files_to_process = [
                    json_file for json_file in json_files_to_process
                    if within_time_bounds(os.path.abspath(json_file),
                                          start_time, end_time)
                ]
                print("{} JSON files after time filtering: from {} to {}".
                      format(len(json_files_to_process),
                             json_files_to_process[0],
                             json_files_to_process[-1]))

            progress_bar = tqdm(json_files_to_process,
                                desc='0/0 tweets inserted')
            for idx, json_file in enumerate(progress_bar):
                # For each file, extract the tweets and add the number
                # extracted to total_tweets_inserted.
                total_tweets_inserted += extract_json_file(
                    json_file, cursor, database, keywords, search_text,
                    all_keywords, insert_table_statement, match_dates,
                    start_time, end_time, use_regex_match, reg_expr,
                    column_header_dict, json_mode=json_mode)
                progress_bar.set_description(
                    "{fnum}/{ftotal} files: {tnum} tweets inserted".format(
                        fnum=idx + 1,
                        ftotal=len(json_files_to_process),
                        tnum=total_tweets_inserted))
                sys.stdout.flush()

        # Close everything.
        close_database(cursor, database)

    except KeyboardInterrupt:
        close_database(cursor, database)
    except Exception:
        close_database(cursor, database)
        raise

    return
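
# Usage sketch (hypothetical paths and names):
#
#     upload_twitter_2_sql('election_db', 'db_config.txt',
#                          twitter_json_folders=['/data/stream_2020_01'],
#                          table_format_csv='table_format.csv',
#                          table_name='tweets', owner='postgres',
#                          admins=['analyst_account'], search_text=True,
#                          keywords=['election'])
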
def export_reply_threads(database_name, db_config_file, table_name,
                         select_columns=[
                             'tweet', 'user_id', 'user_name',
                             'user_screen_name', 'created_at',
                             'in_reply_to_user_id',
                             'in_reply_to_user_screen_name',
                             'in_reply_to_status_id'
                         ],
                         seed_conditions=None, seed_db_config_file=None,
                         seed_database_name=None, seed_table_name=None,
                         seed_limit=500, seed_random_percent=None,
                         reply_range=[1, 10000000000000], verbose=False,
                         output_type='csv', output_filepath=None):
    """ If no seed database is provided, we assume that you are pulling your
        seed posts from the same database you are pulling replies from.
    """

    if seed_database_name is None:
        seed_database_name = database_name
    if seed_db_config_file is None:
        seed_db_config_file = db_config_file

    """ The cursor is a curious concept in SQL. All queries need to run
        through the cursor object, and you need to generate one with Python.
        I use a function in twitter2sql.core.util to do this easily. You need
        a unique cursor for each server/database combination.
    """
    seed_database, seed_cursor = open_database(seed_database_name,
                                               seed_db_config_file)
    database, cursor = open_database(database_name, db_config_file)

    """ I have a variable, select_columns, which determines which columns
        users want returned in this process. The 'id' column, however, is
        necessary for this whole operation to work, so I add it here as a
        check on the user (me) who might forget it.
    """
    if 'id' not in select_columns:
        select_columns = select_columns + ['id']

    """ Step 1: get the seed posts! Check the function below this one for
        additional documentation.
    """
    seed_posts = get_seed_posts(seed_cursor, seed_limit, seed_conditions,
                                select_columns, seed_random_percent,
                                seed_table_name, verbose)

    """ A bit of magic is done here to make sure that 'id' and
        'in_reply_to_status_id' are in select_columns, and that the 'id'
        column is the first one in select_columns. This is over-complicated,
        and should probably be revised later :).
    """
    if 'in_reply_to_status_id' not in select_columns:
        select_columns = select_columns + ['in_reply_to_status_id']
    select_columns.insert(0, select_columns.pop(select_columns.index('id')))

    """ Step 2: retrieve the replies and write them out.
    """
    if output_type == 'csv':
        with open(output_filepath, 'w') as openfile:
            writer = csv.writer(openfile, delimiter=',')
            writer.writerow(select_columns + ['path', 'depth', 'is_seed'])

            for seed_post in tqdm(seed_posts):
                """ First, write the row for your seed post. The last three
                    columns are only applicable for replies, so fill them
                    with dummy values.
                """
                writer.writerow([seed_post[key] for key in select_columns] +
                                ['NONE', 1, 'TRUE'])

                """ Then, pull the replies! See the get_reply_thread function
                    for more.
                """
                results = get_reply_thread(cursor, seed_post, table_name,
                                           select_columns, reply_range,
                                           verbose)

                """ If there are replies, this recursive function will write
                    them to your csv file in 'reply order.' In other words,
                    replies will be threaded as they are in e.g. Reddit.
                """
                if results:
                    for idx, result in enumerate(results):
                        if result['depth'] == 2:
                            construct_tree(result, results, idx, writer,
                                           select_columns)

    elif output_type == 'networkx':
        """ This code segment is for pulling out data into networkx format.
            It will be documented later :)
        """
        with open(output_filepath, 'wb') as openfile:
            thread_networks = []
            for seed_post in tqdm(seed_posts):
                results = get_reply_thread(cursor, seed_post, table_name,
                                           select_columns, reply_range)
                G = nx.Graph(id=seed_post['id'])
                G.add_node(seed_post['id'], time=seed_post['created_at'],
                           user_id=seed_post['user_id'], depth=1)
                if results:
                    for idx, result in enumerate(results):
                        G.add_node(result['id'], time=result['created_at'],
                                   user_id=result['user_id'],
                                   depth=result['depth'])
                        G.add_edge(result['id'],
                                   result['in_reply_to_status_id'])
                thread_networks += [G]
            pickle.dump(thread_networks, openfile)
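
# Usage sketch for reading the pickled thread networks back (the filename is
# hypothetical):
#
#     with open('threads.pkl', 'rb') as openfile:
#         thread_networks = pickle.load(openfile)
#     for G in thread_networks:
#         print(G.graph['id'], G.number_of_nodes())
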
def unwind_urls(database_name, db_config_file, twitter_json_folders,
                input_table_name, output_table_name, admins,
                cache_dir='./url_unwind_cache',
                column_headers={
                    'expanded_url_0': ['TEXT', ''],
                    'resolved_url': ['TEXT', ''],
                    'domain': ['TEXT', '']
                },
                chunk_size=1000, overwrite_urls=False, overwrite_table=False):
    """ chunk_size: how many URLs per chunk - can be adjusted.
        NOTE: twitter_json_folders is currently unused.
    """

    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    # Open database.
    database, cursor = open_database(database_name, db_config_file)

    # Create table.
    create_table_statement = sql_statements.create_table_statement(
        column_headers, output_table_name)
    if overwrite_table:
        cursor.execute(
            sql_statements.drop_table_statement(output_table_name))
    cursor.execute(create_table_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        output_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    # Collect existing URLs from the table, caching them as JSON.
    url_cache = os.path.join(cache_dir,
                             'distinct_urls_' + database_name + '.json')
    if not os.path.isfile(url_cache) or overwrite_urls:
        not_null_statement = sql_statements.not_null_statement(
            input_table_name, 'expanded_url_0', select='expanded_url_0',
            distinct='expanded_url_0')
        cursor.execute(not_null_statement)
        short_urls = sorted([u[0] for u in cursor.fetchall()])
        with open(url_cache, 'w') as f:
            json.dump(short_urls, f)
    else:
        with open(url_cache) as f:
            short_urls = json.load(f)

    # Split URLs into chunks.
    url_chunks = []
    for index in range(0, len(short_urls), chunk_size):
        chunk_id = index // chunk_size
        url_chunks += [(chunk_id, short_urls[index:(index + chunk_size)])]
    total_chunks = len(url_chunks)

    # Feed those chunks to parallel processes. pool.map (rather than
    # map_async) blocks until completion and propagates worker exceptions.
    func = partial(process_chunk, cache_dir, total_chunks, overwrite_urls)
    with mp.Pool() as pool:
        pool.map(func, url_chunks)

    # Make sure we've got them all.
    for chunk_id, _ in url_chunks:
        assert os.path.isfile(os.path.join(cache_dir, f"{chunk_id}.json")), \
            f"No file exists for chunk {chunk_id}"

    # Compile all URL chunks into one big file.
    compiled_urls = []
    for chunk_id, _ in url_chunks:
        fname = os.path.join(cache_dir, f"{chunk_id}.json")
        with open(fname) as f:
            chunk_urls = json.load(f)
        compiled_urls.extend(chunk_urls)

    insert_table_statement = sql_statements.insert_statement(
        column_headers, output_table_name)
    ext.execute_batch(cursor, insert_table_statement, compiled_urls)

    close_database(cursor, database)

    return
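
# Usage sketch (hypothetical names). Resolved chunks are cached as JSON files
# in cache_dir, so an interrupted run can presumably be resumed without
# re-resolving completed chunks:
#
#     unwind_urls('election_db', 'db_config.txt', [], 'tweets',
#                 'resolved_urls', admins=['analyst_account'],
#                 chunk_size=500)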