Example #1
def generate_users_table(database_name,
                         db_config_file,
                         input_table_name,
                         user_table_name,
                         user_id_col,
                         admins,
                         event_split_date=None,
                         overwrite=False):

    database, cursor = open_database(database_name, db_config_file)

    if overwrite:
        cursor.execute(sql_statements.drop_table_statement(user_table_name))

    users_create_statement = users_sql_statement(input_table_name,
                                                 user_table_name, user_id_col,
                                                 event_split_date)

    cursor.execute(users_create_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        user_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    return
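
All of these examples rely on a shared open_database helper that is not shown here. A minimal sketch of what it is assumed to do, written against psycopg2's public API; the JSON config layout is hypothetical, and the extra options some examples pass (named_cursor, itersize, owner, admins, overwrite_db) are omitted:

import json

import psycopg2
import psycopg2.extras


def open_database(database_name, db_config_file):
    # Hypothetical config layout: {"host": ..., "port": ..., "user": ..., "password": ...}
    with open(db_config_file) as f:
        config = json.load(f)

    connection = psycopg2.connect(dbname=database_name,
                                  host=config['host'],
                                  port=config.get('port', 5432),
                                  user=config['user'],
                                  password=config['password'])
    # A dict-style cursor, so callers can treat rows as dicts, as several of
    # the examples below do.
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    return connection, cursor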
Example #2
def filter_by_time(database_name,
                   db_config_file,
                   start_date,
                   end_date,
                   time_column='example',
                   num_returned=10000,
                   table_name='table',
                   return_headers=None,
                   distinct=None,
                   output_filename=None,
                   output_column_headers=None):

    database, cursor = open_database(database_name, db_config_file)

    # Compose the time column as an identifier, and pass the date bounds and
    # row limit as query parameters.
    cursor.execute(
        sql.SQL("""
        SELECT *
        FROM {table}
        WHERE {time_column} BETWEEN %s AND %s
        LIMIT %s;
        """).format(table=sql.Identifier(table_name),
                    time_column=sql.Identifier(time_column)),
        [start_date, end_date, num_returned])

    results = cursor.fetchall()
    dict_result = []
    for row in results:
        dict_result.append(dict(row))

    results = remove_duplicates(dict_result, limit=100)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return results
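
A hypothetical call, to illustrate the expected argument types; the database name, config path, table, and column names here are placeholders:

from datetime import datetime, timezone

recent_posts = filter_by_time('example_database',
                              'db_config.json',
                              start_date=datetime(2020, 1, 1, tzinfo=timezone.utc),
                              end_date=datetime(2020, 2, 1, tzinfo=timezone.utc),
                              time_column='created_ts',
                              table_name='tweets',
                              num_returned=5000)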
Example #3
def generate_hashtag_table(database_name,
                           db_config_file,
                           input_table_name,
                           hashtag_table_name,
                           hashtag_col,
                           id_col,
                           admins,
                           overwrite=False,
                           verbose=True):

    database, cursor = open_database(database_name, db_config_file)

    if overwrite:
        cursor.execute(sql_statements.drop_table_statement(hashtag_table_name))

    hashtag_create_statement = hashtag_sql_statement(input_table_name,
                                                     hashtag_table_name,
                                                     hashtag_col, id_col)

    if verbose:
        print('Generating hashtag table...')
    cursor.execute(hashtag_create_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        hashtag_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    return
Example #4
def timeline_chart(database_name, db_config_file, tweet_table, start_date,
                   end_date, output_filename):

    database, cursor = open_database(database_name, db_config_file)

    time_aggregate_statement = sql.SQL("""SELECT 
    date_trunc('day', created_ts) AS "date" , count(*) AS "tweet_total"
    FROM {tweet_table}
    WHERE created_ts >= %s
    AND created_ts <= %s
    GROUP BY 1 
    ORDER BY 1;
        """).format(tweet_table=sql.Identifier(tweet_table))
    cursor.execute(time_aggregate_statement, [start_date, end_date])

    results = to_pandas(cursor)

    chart = alt.Chart(results).mark_line().encode(
        x='date',
        y='tweet_total',
        # color='symbol'
    )

    chart.save(output_filename)

    return
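
The to_pandas helper above is not shown; a plausible minimal version, assuming it simply wraps the cursor's results in a DataFrame (the real helper may do more):

import pandas as pd


def to_pandas(cursor):
    # Column names are available from the cursor's description after execute().
    columns = [col[0] for col in cursor.description]
    return pd.DataFrame(cursor.fetchall(), columns=columns)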
Example #5
def get_user_flows(database_name,
                   db_config_file,
                   input_table_name,
                   user_accounts,
                   user_aliases=None,
                   output_filename='flows.csv',
                   data_limit=100,
                   time_interval='month',
                   verbose=True):

    database, cursor = open_database(database_name, db_config_file)

    sql_statement = sql.SQL("""
    SELECT user_screen_name,user_name,user_description,user_created_ts,user_id,
    retweeted_status_user_screen_name,quoted_status_user_screen_name,in_reply_to_screen_name,
    created_ts
    -- complete_text
    FROM {input_table}
    """).format(input_table=sql.Identifier(input_table_name))

    for idx, user in enumerate(user_accounts):
        if idx == 0:
            sql_statement += sql.SQL("""
                WHERE retweeted_status_user_screen_name = {user_name}
                OR quoted_status_user_screen_name = {user_name}
                OR in_reply_to_screen_name = {user_name}""").format(
                user_name=sql.Literal(user))
        else:
            sql_statement += sql.SQL("""
                OR retweeted_status_user_screen_name = {user_name}
                OR quoted_status_user_screen_name = {user_name}
                OR in_reply_to_screen_name = {user_name}""").format(
                user_name=sql.Literal(user))

    # Only pass a LIMIT parameter when a data limit is actually requested;
    # otherwise the query has no placeholder for it.
    if data_limit is not None:
        sql_statement += sql.SQL(' LIMIT %s')
        cursor.execute(sql_statement, [data_limit])
    else:
        cursor.execute(sql_statement)
    results = to_list_of_dicts(cursor)

    if verbose:
        for result in results:
            print(result)

    aggregate_dict = aggregate_flows(results, user_accounts)
    write_to_d3_sankey_csv(output_filename, aggregate_dict, user_accounts,
                           user_aliases)

    return
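
A hypothetical call, to show the expected shape of user_accounts (a list of screen names, matched against the retweet/quote/reply screen-name columns); every name and path here is a placeholder:

get_user_flows('example_database',
               'db_config.json',
               input_table_name='tweets',
               user_accounts=['account_one', 'account_two'],
               output_filename='flows.csv',
               data_limit=1000)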
Example #6
def grab_top(database_name=None,
             db_config_file=None,
             cursor=None,
             column_name='example',
             num_returned=100,
             table_name='table',
             return_headers=None,
             distinct=None,
             output_filename=None,
             output_column_headers=None):

    # Open a new connection only if the caller did not supply a cursor.
    if cursor is None:
        database, cursor = open_database(database_name, db_config_file)

    if distinct is None:
        sql_statement = sql.SQL("""
            SELECT user_screen_name,user_name,user_description,user_created_ts,user_followers_count,user_id,created_at,complete_text
            FROM (SELECT DISTINCT ON (user_id) user_screen_name,user_name,user_description,user_created_ts,user_followers_count,user_id,created_at,complete_text 
            FROM {} WHERE lang='en') as sub_table
            ORDER BY {} DESC
            LIMIT %s;
            """).format(sql.Identifier(table_name),
                        sql.Identifier(column_name))
        cursor.execute(sql_statement, [num_returned])
    else:
        # Deduplicate on an arbitrary column, then take the top rows by column_name.
        cursor.execute(
            sql.SQL("""
            SELECT * FROM (
                SELECT DISTINCT ON ({distinct}) *
                FROM {table}
                ORDER BY {distinct}, {order} DESC) AS sub_table
            ORDER BY {order} DESC
            LIMIT %s;
            """).format(distinct=sql.Identifier(distinct),
                        table=sql.Identifier(table_name),
                        order=sql.Identifier(column_name)), [num_returned])

    results = to_list_of_dicts(cursor)
    # results = remove_duplicates(results, limit=100)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return results
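
Several of these functions call a to_list_of_dicts helper that is not shown. A minimal sketch of what it is assumed to do, using the cursor's description to recover column names (with a dict-style cursor factory it could simply be [dict(row) for row in cursor.fetchall()]):

def to_list_of_dicts(cursor):
    # Pair each returned row with its column names.
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]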
Example #7
def aggregate(database_name,
              db_config_file,
              aggregate_column='example',
              output_columns=('example',),
              table_name='table',
              count_column_name='total_tweets',
              num_returned=1000,
              return_headers=None,
              output_filename=None,
              output_column_headers=None,
              verbose=False):

    database, cursor = open_database(database_name, db_config_file)

    # MAX() just picks one arbitrary representative value per group here, which is a bit shady.
    output_columns_sql = sql.SQL(',').join([
        sql.SQL("MAX({output}) as {output}").format(
            output=sql.Identifier(output)) for output in output_columns
    ])
    sql_statement = sql.SQL("""
        SELECT {agg}, COUNT({count}) as {count_name},
        {outputs}
        FROM {table}
        GROUP BY {agg}
        ORDER BY {count_name} DESC
        LIMIT %s;
        """).format(agg=sql.Identifier(aggregate_column),
                    count=sql.Identifier(aggregate_column),
                    count_name=sql.Identifier(count_column_name),
                    table=sql.Identifier(table_name),
                    outputs=output_columns_sql)
    cursor.execute(sql_statement, [num_returned])

    results = to_list_of_dicts(cursor)

    if verbose:
        for result in results:
            print(result)

    if output_filename is not None:
        save_to_csv(results, output_filename, output_column_headers)

    return
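
For debugging, the composed statement above can be rendered with psycopg2's Composable.as_string() against an open connection or cursor. With hypothetical values such as aggregate_column='user_screen_name' and output_columns=['user_name'], every identifier comes out double-quoted:

# Inside aggregate(), after sql_statement has been built:
print(sql_statement.as_string(cursor))
# Renders roughly as:
#   SELECT "user_screen_name", COUNT("user_screen_name") as "total_tweets",
#   MAX("user_name") as "user_name"
#   FROM "table"
#   GROUP BY "user_screen_name"
#   ORDER BY "total_tweets" DESC
#   LIMIT %s;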
Example #8
def append_hashtag_clusters(database_name,
                            db_config_file,
                            hashtag_table_name,
                            hashtag_cluster_csv,
                            hashtag_col='hashtags',
                            cluster_col='clusters'):

    database, cursor = open_database(database_name, db_config_file)

    create_col_statement = sql.SQL("""ALTER TABLE {table}\n
        ADD COLUMN IF NOT EXISTS {cluster_col} VARCHAR;""").format(
        table=sql.SQL(hashtag_table_name), cluster_col=sql.SQL(cluster_col))
    cursor.execute(create_col_statement)

    classify_statement, classify_values = sql_statements.category_statement(
        hashtag_table_name, hashtag_cluster_csv, hashtag_col, cluster_col)

    cursor.execute(classify_statement, classify_values)
    database.commit()
Example #9
def stream_connection_data(database_name,
                           db_config_file,
                           output_gefx_file,
                           output_columns,
                           connect_column,
                           connect_column_screen_name,
                           where_statement,
                           save_pkl=True,
                           dict_pkl_file=None,
                           users_pkl_file=None,
                           table_name='tweets',
                           connection_type='retweet',
                           attributes=None,
                           label='screen_name',
                           itersize=1000,
                           limit=None):

    if limit is None:
        limit_statement = sql.SQL('')
    else:
        limit_statement = sql.SQL(f'LIMIT {limit}')

    select_columns = sql.SQL(', ').join(
        [sql.Identifier(item) for item in output_columns])

    database, cursor = open_database(
        database_name,
        db_config_file,
        named_cursor='network_connections_retrieval',
        itersize=itersize)

    user_statement = sql.SQL("""
        SELECT {select}
        FROM {table_name}
        {where_statement}
        {limit_statement}
        """).format(table_name=sql.SQL(table_name),
                    select=select_columns,
                    where_statement=where_statement,
                    limit_statement=limit_statement)

    cursor.execute(user_statement)

    connections_dict = defaultdict(dict_dict)
    username_dict = defaultdict(set_dict)

    count = 0
    progress_bar = tqdm()
    while True:
        result = cursor.fetchmany(cursor.itersize)
        if result:
            for item in result:
                item = dict(item)

                username_dict[item['user_id']]['screen_name'].add(
                    item['user_screen_name'])

                if connection_type in ['reply', 'quote', 'retweet']:
                    if 'count' not in connections_dict[item['user_id']][
                            item[connect_column]]:
                        connections_dict[item['user_id']][
                            item[connect_column]]['count'] = 0
                    connections_dict[item['user_id']][
                        item[connect_column]]['count'] += 1
                    username_dict[item[connect_column]]['screen_name'].add(
                        item[connect_column_screen_name])
                elif connection_type == 'all':
                    pass

                if attributes is not None:
                    for attribute in attributes:
                        connections_dict[item['user_id']][
                            item[connect_column]][attribute] = item[attribute]

            count += len(result)
            progress_bar.set_description(
                f"Iteration {count // itersize}, {count} rows retrieved.")
        else:
            cursor.close()
            break

    if save_pkl:
        with open(dict_pkl_file, 'wb') as openfile:
            pickle.dump(connections_dict, openfile)
        with open(users_pkl_file, 'wb') as openfile:
            pickle.dump(username_dict, openfile)

    return connections_dict, username_dict
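
The two defaultdicts above are built from factory helpers, dict_dict and set_dict, that are not shown. They are assumed to be plain module-level functions rather than lambdas, since the resulting dictionaries are pickled above and pickle cannot serialize a defaultdict whose factory is a lambda; a minimal sketch:

from collections import defaultdict


def dict_dict():
    # connections_dict[user_id][connected_id] is a plain dict of attributes.
    return defaultdict(dict)


def set_dict():
    # username_dict[user_id]['screen_name'] is a set of observed screen names.
    return defaultdict(set)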
Example #10
def generate_user_statistics(database_name,
                             db_config_file,
                             tweet_table_name,
                             user_table_name,
                             overwrite=False):

    database, cursor = open_database(database_name, db_config_file)

    # The 'if True:' blocks below are for testing individual statistics; remove them or make them parameters eventually.

    # Total Posts
    if True:
        total_post_statement = sql.SQL("""UPDATE {user_table}\n
            SET {column_name} = temp.idcount
            FROM
            (SELECT COUNT(user_id) as idcount, user_id FROM {tweet_table} 
            GROUP BY user_id) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)

        execute_user_sql(total_post_statement, 'total_database_tweets', 'INT',
                         cursor, database, 'total post count')

    # First Post, Statuses Count
    if True:
        first_post_statement = sql.SQL("""UPDATE {user_table}\n
            SET first_created_ts = temp.first_post
            FROM
            (SELECT user_id, min(created_ts) first_post
                FROM {tweet_table}
                GROUP BY user_id) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)

        execute_user_sql(first_post_statement, 'first_created_ts', 'TIMESTAMP',
                         cursor, database, 'first post')

        first_post_statement = sql.SQL("""UPDATE {user_table}\n
            SET first_status_count = temp.first_status
            FROM
            (SELECT DISTINCT ON (user_id) user_id,
            user_statuses_count as first_status, user_created_ts
            FROM {tweet_table}
            ORDER BY user_id, user_created_ts ASC) AS temp
            WHERE {user_table}.user_id = temp.user_id
            """)

        execute_user_sql(first_post_statement, 'first_status_count', 'INT',
                         cursor, database, 'first status count')

    # Time Differences
    if True:
        date_range_statement = sql.SQL("""UPDATE {user_table}\n
            SET database_active_days = extract(day 
            FROM last_created_ts - first_created_ts)
            """)

        execute_user_sql(date_range_statement, 'database_active_days', 'INT',
                         cursor, database, 'active days in database')

        date_range_statement = sql.SQL("""UPDATE {user_table}\n
            SET previous_active_days = extract(day 
            FROM first_created_ts - user_created_ts)
            """)

        execute_user_sql(date_range_statement, 'previous_active_days', 'INT',
                         cursor, database, 'previously active days')

    # Rates
    if True:
        rate_statement = sql.SQL("""UPDATE {user_table}\n
            SET database_posting_rate = cast(total_database_tweets as numeric)
            / NULLIF(database_active_days,0)
            """)

        execute_user_sql(rate_statement, 'database_posting_rate', 'NUMERIC',
                         cursor, database, 'database posting rate')

        rate_statement = sql.SQL("""UPDATE {user_table}\n
            SET previous_posting_rate = cast(first_status_count as numeric)
            / NULLIF(previous_active_days,0)
            """)

        execute_user_sql(rate_statement, 'previous_posting_rate', 'NUMERIC',
                         cursor, database, 'previous rate statement')

        rate_statement = sql.SQL("""UPDATE {user_table}\n
            SET previous_current_posting_rate_ratio = database_posting_rate
            / NULLIF(previous_posting_rate,0)
            """)

        execute_user_sql(rate_statement, 'previous_current_posting_rate_ratio',
                         'NUMERIC', cursor, database,
                         'posting rate ratio statement')

    return
Example #11
def upload_twitter_2_sql(
        database_name,
        db_config_file,
        twitter_json_folders,
        table_format_csv,
        table_name='example_table',
        owner='example',
        admins=[],
        search_text=False,
        keywords=['keyword1', 'keyword2'],
        all_keywords=False,
        match_dates=True,
        start_time=None,
        end_time=None,
        use_regex_match=False,
        reg_expr='example_regex',
        overwrite_db=True,
        overwrite=True,
        timestamp='modified',
        json_mode='newline'):

    if end_time is None:
        # Default to "now" at call time, rather than freezing the timestamp
        # at module import, which is what a default argument of
        # datetime.utcnow() would do.
        end_time = datetime.utcnow().replace(tzinfo=timezone.utc)

    create_table_statement = sql_statements.create_table_statement(
        table_format_csv, table_name)
    insert_table_statement = sql_statements.insert_statement(
        table_format_csv, table_name)
    database, cursor = open_database(database_name, db_config_file,
                                     overwrite_db, owner, admins)

    if overwrite:
        # DROP TABLE IF EXISTS is a no-op when the table does not yet exist.
        cursor.execute("DROP TABLE IF EXISTS {};".format(table_name))
    else:
        # Creating the table below would fail if it already exists, so exit
        # early rather than guess at what the caller intended.
        print('overwrite=False; not recreating table. Exiting.')
        return
    cursor.execute(create_table_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    column_header_dict = get_column_header_dict(table_format_csv)

    try:

        # Keep track of how many tweets have been inserted (just make sure it's running)
        total_tweets_inserted = 0

        # Process each folder
        for folder_path in twitter_json_folders:

            # Make sure only valid .json files are processed
            json_files_to_process = glob.glob(
                os.path.join(folder_path, '*.json'))

            # Filter to only include files within the date range. Files are
            # filtered by their last-modified time (a UNIX timestamp), keeping
            # only those written on or after the early time bound and on or
            # before the late time bound plus one day (to allow time for
            # tweets to be written to the file). See the helper sketch after
            # this function.
            if match_dates:

                json_files_to_process = sorted(
                    json_files_to_process,
                    key=lambda json_file: get_last_modified(
                        os.path.abspath(json_file)))

                print("{} JSON files before time filtering: from {} to {}".
                      format(len(json_files_to_process),
                             json_files_to_process[0],
                             json_files_to_process[-1]))

                json_files_to_process = [
                    json_file for json_file in json_files_to_process
                    if within_time_bounds(os.path.abspath(json_file),
                                          start_time, end_time)
                ]

                print(
                    "{} JSON files after time filtering: from {} to {}".format(
                        len(json_files_to_process), json_files_to_process[0],
                        json_files_to_process[-1]))

            progress_bar = tqdm(json_files_to_process,
                                desc='0/0 tweets inserted')
            for idx, json_file in enumerate(progress_bar):
                # For each file, extract the tweets and add the number extracted to the total_tweets_inserted
                total_tweets_inserted += extract_json_file(
                    json_file,  # glob already returns the folder-prefixed path
                    cursor,
                    database,
                    keywords,
                    search_text,
                    all_keywords,
                    insert_table_statement,
                    match_dates,
                    start_time,
                    end_time,
                    use_regex_match,
                    reg_expr,
                    column_header_dict,
                    json_mode=json_mode)

                progress_bar.set_description(
                    "{fnum}/{ftotal} files: {tnum} tweets inserted".format(
                        fnum=idx + 1,
                        ftotal=len(json_files_to_process),
                        tnum=total_tweets_inserted))
                sys.stdout.flush()

        # Close everything
        close_database(cursor, database)

    except KeyboardInterrupt:
        close_database(cursor, database)
    except Exception:
        close_database(cursor, database)
        raise

    return
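
The date filtering above relies on two helpers, get_last_modified and within_time_bounds, that are not shown. A minimal sketch consistent with the comment in the loop (file modification time, with a one-day grace period on the upper bound); the exact signatures are assumptions:

import os
from datetime import datetime, timedelta, timezone


def get_last_modified(filepath):
    # Last-modified time of the file, as a UNIX timestamp.
    return os.path.getmtime(filepath)


def within_time_bounds(filepath, start_time, end_time):
    modified = datetime.fromtimestamp(get_last_modified(filepath),
                                      tz=timezone.utc)
    if start_time is not None and modified < start_time:
        return False
    # Allow one extra day for tweets to finish being written to the file.
    return modified <= end_time + timedelta(days=1)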
Example #12
def export_reply_threads(database_name,
                         db_config_file,
                         table_name,
                         select_columns=[
                             'tweet', 'user_id', 'user_name',
                             'user_screen_name', 'created_at',
                             'in_reply_to_user_id',
                             'in_reply_to_user_screen_name',
                             'in_reply_to_status_id'
                         ],
                         seed_conditions=None,
                         seed_db_config_file=None,
                         seed_database_name=None,
                         seed_table_name=None,
                         seed_limit=500,
                         seed_random_percent=None,
                         reply_range=[1, 10000000000000],
                         verbose=False,
                         output_type='csv',
                         output_filepath=None):
    """ If not seed database is provided, we assume that you are pulling your seed posts
    from the same database your are pulling replies from.
    """
    if seed_database_name is None:
        seed_database_name = database_name
    if seed_db_config_file is None:
        seed_db_config_file = db_config_file
    """ The cursor is a curious concept in SQL. All queries need to run through the cursor
    object, and you need to generate one with Python. I use a function in twitter2sql.core.util
    to do this easily. You need a unique cursor for each server/database combination.
    """
    seed_database, seed_cursor = open_database(seed_database_name,
                                               seed_db_config_file)
    database, cursor = open_database(database_name, db_config_file)
    """ I have a variable, select_columns, which determines which columns users want returned in
    this process. The 'id' column, however, is necessary for this whole operation to work, so I
    add it here as a check on the user (me) who might forget it.
    """
    if 'id' not in select_columns:
        select_columns = select_columns + ['id']
    """ Step 1, get the seed posts! Check the function below this one for additional documentation.
    """
    seed_posts = get_seed_posts(seed_cursor, seed_limit, seed_conditions,
                                select_columns, seed_random_percent,
                                seed_table_name, verbose)
    """ A bit of magic is done here to make sure that 'id' and 'in_reply_to_status_id'
    are in select_columns, and makes sure that the 'id' column is the first one in
    select_columns. This is over-complicated, and should probably be revised later :).
    """
    if 'id' not in select_columns:
        select_columns = select_columns + ['in_reply_to_status_id']
    select_columns.insert(0, select_columns.pop(select_columns.index('id')))
    """ Step 2, retrieve the replies and write them to a csv document.
    """
    if output_type == 'csv':
        with open(output_filepath, 'w') as openfile:

            writer = csv.writer(openfile, delimiter=',')
            writer.writerow(select_columns + ['path', 'depth', 'is_seed'])

            for seed_post in tqdm(seed_posts):
                """ First, write the row for your seed post. The last three columns are only applicable
                    for replies, so fill them with dummy values.
                """
                writer.writerow([seed_post[key] for key in select_columns] +
                                ['NONE', 1, 'TRUE'])
                """ Then, pull the replies! See the get_reply_thread function for more.
                """
                results = get_reply_thread(cursor, seed_post, table_name,
                                           select_columns, reply_range,
                                           verbose)
                """ IF there are replies, this recursive function will write them to your csv-file in
                    'reply order.' In other words, replies will be threaded as they are in e.g. Reddit.
                """
                if results:
                    for idx, result in enumerate(results):
                        if result['depth'] == 2:
                            construct_tree(result, results, idx, writer,
                                           select_columns)

    elif output_type == 'networkx':
        """ This code segment is for pulling out data into networkx format. It will be documented later :)
        """

        with open(output_filepath, 'wb') as openfile:

            thread_networks = []

            for seed_post in tqdm(seed_posts):

                results = get_reply_thread(cursor, seed_post, table_name,
                                           select_columns, reply_range)

                G = nx.Graph(id=seed_post['id'])
                G.add_node(seed_post['id'],
                           time=seed_post['created_at'],
                           user_id=seed_post['user_id'],
                           depth=1)

                if results:
                    for idx, result in enumerate(results):
                        G.add_node(result['id'],
                                   time=result['created_at'],
                                   user_id=result['user_id'],
                                   depth=result['depth'])
                        G.add_edge(result['id'],
                                   result['in_reply_to_status_id'])

                thread_networks += [G]

            pickle.dump(thread_networks, openfile)
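
construct_tree, which writes a reply thread to the CSV in threaded order, is not shown. A hedged sketch of how such a function might look, assuming each result row carries 'id', 'in_reply_to_status_id', and 'depth', and that a post's replies are the rows whose 'in_reply_to_status_id' matches its 'id'; the 'path' value written here is a guess at what the real column contains:

def construct_tree(result, results, idx, writer, select_columns, path='NONE'):
    # idx (the row's position in results) is accepted to match the call above,
    # but is unused in this sketch.
    reply_path = '{}>{}'.format(path, result['in_reply_to_status_id'])
    writer.writerow([result[key] for key in select_columns] +
                    [reply_path, result['depth'], 'FALSE'])
    # Recurse into replies to this post, so rows come out depth-first
    # ("Reddit-style" threading).
    for child_idx, child in enumerate(results):
        if child['in_reply_to_status_id'] == result['id']:
            construct_tree(child, results, child_idx, writer, select_columns,
                           path=reply_path)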
Example #13
def unwind_urls(
        database_name,
        db_config_file,
        twitter_json_folders,
        input_table_name,
        output_table_name,
        admins,
        cache_dir='./url_unwind_cache',
        column_headers={
            'expanded_url_0': ['TEXT', ''],
            'resolved_url': ['TEXT', ''],
            'domain': ['TEXT', '']
        },
        chunk_size=1000,
        overwrite_urls=False,
        overwrite_table=False):
    """ chunk_size: how many URLs per chunk - can be adjusted
    """

    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    # Open database
    database, cursor = open_database(database_name, db_config_file)

    # Create table
    create_table_statement = sql_statements.create_table_statement(
        column_headers, output_table_name)

    if overwrite_table:
        cursor.execute(sql_statements.drop_table_statement(output_table_name))

    cursor.execute(create_table_statement)
    database.commit()

    # Add admins to the table.
    admin_add_statement = sql_statements.table_permission_statement(
        output_table_name, admins)
    cursor.execute(admin_add_statement)
    database.commit()

    # Collect existing URLs from table.
    if not os.path.isfile(
            os.path.join(cache_dir, 'distinct_urls_' + database_name +
                         ".json")) or overwrite_urls:

        not_null_statement = sql_statements.not_null_statement(
            input_table_name,
            'expanded_url_0',
            select="expanded_url_0",
            distinct='expanded_url_0')
        cursor.execute(not_null_statement)
        short_urls = sorted([u[0] for u in cursor.fetchall()])

        with open(
                os.path.join(cache_dir,
                             "distinct_urls_" + database_name + ".json"),
                "w+") as f:
            json.dump(short_urls, f)

    else:
        with open(
                os.path.join(cache_dir,
                             "distinct_urls_" + database_name + ".json")) as f:
            short_urls = json.load(f)

    # Split URLs into chunks
    url_chunks = []
    for index in range(0, len(short_urls), chunk_size):
        chunk_id = index // chunk_size
        url_chunks += [(chunk_id, short_urls[index:(index + chunk_size)])]
    total_chunks = len(url_chunks)

    # Feed those chunks to parallel processes.
    func = partial(process_chunk, cache_dir, total_chunks, overwrite_urls)
    pool = mp.Pool()
    _ = pool.map_async(func, url_chunks)
    pool.close()
    pool.join()

    # Make sure we've got them all
    for chunk_id, _ in url_chunks:
        assert os.path.isfile(os.path.join(cache_dir, f"{chunk_id}.json")), \
            "No file exists for chunk {}".format(chunk_id)

    # Compile all URL chunks into one big file
    compiled_urls = []
    for chunk_id, _ in url_chunks:
        fname = os.path.join(cache_dir, f"{chunk_id}.json")
        with open(fname) as f:
            chunk_urls = json.load(f)
            compiled_urls.extend(chunk_urls)

    insert_table_statement = sql_statements.insert_statement(
        column_headers, output_table_name)

    ext.execute_batch(cursor, insert_table_statement, compiled_urls)
    close_database(cursor, database)

    return
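
process_chunk, the worker mapped over url_chunks above, is not shown. A hedged sketch of what it is assumed to do, resolving each shortened URL and caching one JSON file per chunk; the use of requests and the exact row layout are assumptions, though the three fields mirror the column_headers default above:

import json
import os
from urllib.parse import urlparse

import requests


def process_chunk(cache_dir, total_chunks, overwrite_urls, chunk):
    chunk_id, urls = chunk
    output_file = os.path.join(cache_dir, f"{chunk_id}.json")

    # Skip chunks that were already resolved on a previous run.
    if os.path.isfile(output_file) and not overwrite_urls:
        return

    rows = []
    for url in urls:
        try:
            # Follow redirects to find the final destination of the short URL.
            response = requests.head(url, allow_redirects=True, timeout=10)
            resolved = response.url
        except requests.RequestException:
            resolved = url
        rows.append([url, resolved, urlparse(resolved).netloc])

    print(f"Resolved chunk {chunk_id + 1}/{total_chunks}")
    with open(output_file, 'w') as f:
        json.dump(rows, f)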