def transform_subs_data():
    """Run the transform step for the raw subs files.

    Calls ``load_dataframes`` on ``SUBS_RAW_DIR`` with the "reddit" and
    ".txt" arguments and ``SUBS_HEADERS``. If the loader returns ``None``
    the function returns immediately. Otherwise the frame goes through
    ``transform_data``, is handed to ``df_to_csv`` together with
    ``SUBS_TRANSFORMED_DIR`` and the name "reddit_subs", and finally
    ``prefix_files`` is called on the raw directory with "transformed".
    """
    raw_frame = load_dataframes(SUBS_RAW_DIR, "reddit", ".txt", SUBS_HEADERS)
    if raw_frame is None:
        # Loader produced no frame: skip the export and the prefixing.
        return

    df_to_csv(transform_data(raw_frame), SUBS_TRANSFORMED_DIR, "reddit_subs")
    # presumably marks the raw files as already processed -- confirm
    # against prefix_files.
    prefix_files(SUBS_RAW_DIR, "transformed")
def transform_posts_data():
    """Run the transform step for the raw posts files.

    Calls ``load_dataframes`` on ``POSTS_RAW_DIR`` with the "reddit" and
    ".txt" arguments and ``POSTS_HEADERS``. If the loader returns ``None``
    the function returns immediately. Otherwise the frame is passed
    through ``transform_data`` and then ``filter_data``, handed to
    ``df_to_csv`` with ``POSTS_TRANSFORMED_DIR`` and the name
    "reddit_posts", and ``prefix_files`` is called on the raw directory
    with "transformed".
    """
    raw_frame = load_dataframes(POSTS_RAW_DIR, "reddit", ".txt", POSTS_HEADERS)
    if raw_frame is None:
        # Loader produced no frame: skip the export and the prefixing.
        return

    # Transform first, then filter -- the order matters to the helpers.
    posts = filter_data(transform_data(raw_frame))
    df_to_csv(posts, POSTS_TRANSFORMED_DIR, "reddit_posts")
    prefix_files(POSTS_RAW_DIR, "transformed")
def transform_users_data():
    """Run the transform step for the raw users files.

    Calls ``load_dataframes`` on ``USERS_RAW_DIR`` with the "reddit" and
    ".txt" arguments and ``USERS_HEADERS``. If the loader returns ``None``
    the function returns immediately (and nothing is logged). Otherwise
    the frame is passed through ``transform_data`` and
    ``filter_users_stats``, handed to ``df_to_csv`` with
    ``USERS_TRANSFORMED_DIR`` and the name "reddit_user_stats",
    ``prefix_files`` is called on the raw directory with "transformed",
    and ``log_errors`` is called with ``ERROR_LOG_PATH``, the tag
    "reddit.users_transform" and ``ERRORS``.
    """
    raw_frame = load_dataframes(USERS_RAW_DIR, "reddit", ".txt", USERS_HEADERS)
    if raw_frame is None:
        # Loader produced no frame: skip export, prefixing and logging.
        return

    user_stats = filter_users_stats(transform_data(raw_frame))
    df_to_csv(user_stats, USERS_TRANSFORMED_DIR, "reddit_user_stats")
    prefix_files(USERS_RAW_DIR, "transformed")
    # presumably ERRORS is filled by the helpers above -- confirm where
    # it is populated.
    log_errors(ERROR_LOG_PATH, "reddit.users_transform", ERRORS)
def parse_tweets_data():
    """Parse every tweet found in the raw "stf" files, then save results.

    Obtains file paths from ``get_data_filepaths(RAW_DIR, "stf", ".txt")``
    and feeds each item yielded by ``load_tweets`` for each path to
    ``parse_tweet``. Once all files have been processed it calls
    ``separate_users_tweets``, ``save_parsed_tweets`` and
    ``save_chosen_for_scoring`` in that order, and finally
    ``prefix_files(RAW_DIR, "parsed")``.
    """
    for path in get_data_filepaths(RAW_DIR, "stf", ".txt"):
        for tweet in load_tweets(path):
            # parse_tweet's return value is not used here; presumably it
            # accumulates into module-level state -- confirm.
            parse_tweet(tweet)

    # Post-processing runs once, after all files have been parsed.
    separate_users_tweets()
    save_parsed_tweets()
    save_chosen_for_scoring()

    prefix_files(RAW_DIR, "parsed")
def transform_g1_data():
    """Run the filter/transform step for the raw "g1" files.

    Calls ``load_dataframes`` on ``RAW_DIR`` with "g1", "txt" and
    ``HEADERS``. If the loader returns ``None`` the function returns
    immediately. Otherwise the frame goes through ``filter_dataframe``;
    when the filtered frame is not empty it is passed through
    ``transform_data`` and handed to ``df_to_csv`` with
    ``TRANSFORMED_DIR`` and the name "g1". ``prefix_files`` is then
    called on the raw directory with "transformed", whether or not
    anything was exported.
    """
    # NOTE(review): the extension is passed as "txt" without a leading
    # dot, while load_g1_data passes ".csv" -- confirm load_dataframes
    # accepts both forms.
    raw_frame = load_dataframes(RAW_DIR, "g1", "txt", HEADERS)
    if raw_frame is None:
        return

    kept_rows = filter_dataframe(raw_frame)
    if not kept_rows.empty:
        # Only transform and export when the filter left something behind.
        df_to_csv(transform_data(kept_rows), TRANSFORMED_DIR, "g1")

    prefix_files(RAW_DIR, "transformed")
def load_subs_data():
    """Load the transformed subs CSV data through ``sqlalch_load``.

    Calls ``load_dataframes`` on ``SUBS_TRANSFORMED_DIR`` with "reddit"
    and ".csv". If the loader returns ``None`` the function returns
    immediately. Otherwise ``sqlalch_load`` is called with the frame,
    "reddit", "sub_counts" and ``ERRORS``. A ``SQLAlchError`` is
    suppressed; only when no such error is raised is ``prefix_files``
    called on the directory with "loaded". In both cases ``log_errors``
    is called afterwards with ``ERROR_LOG_PATH``, the tag
    "reddit.subs_load" and ``ERRORS``.
    """
    subs_frame = load_dataframes(SUBS_TRANSFORMED_DIR, "reddit", ".csv")
    if subs_frame is None:
        return

    try:
        sqlalch_load(subs_frame, "reddit", "sub_counts", ERRORS)
    except SQLAlchError:
        # Deliberately suppressed; presumably sqlalch_load records the
        # failure details in ERRORS before raising -- confirm.
        pass
    else:
        # Prefix the files only after a load that raised no SQLAlchError.
        prefix_files(SUBS_TRANSFORMED_DIR, "loaded")

    log_errors(ERROR_LOG_PATH, "reddit.subs_load", ERRORS)
def load_g1_data():
    """Load the transformed "g1" CSV data through ``sqlalch_load``.

    Calls ``load_dataframes`` on ``TRANSFORMED_DIR`` with "g1" and ".csv".
    If the loader returns ``None`` the function returns immediately.
    Otherwise ``sqlalch_load`` is called with the frame, "noticias",
    "noticias" and ``ERRORS``. A ``SQLAlchError`` is suppressed; only
    when no such error is raised is ``prefix_files`` called on the
    directory with "loaded". In both cases ``log_errors`` is called
    afterwards with ``ERROR_LOG_PATH``, the tag "g1.load" and ``ERRORS``.
    """
    news_frame = load_dataframes(TRANSFORMED_DIR, "g1", ".csv")
    if news_frame is None:
        return

    try:
        sqlalch_load(news_frame, "noticias", "noticias", ERRORS)
    except SQLAlchError:
        # Deliberately suppressed; presumably sqlalch_load records the
        # failure details in ERRORS before raising -- confirm.
        pass
    else:
        # Prefix the files only after a load that raised no SQLAlchError.
        prefix_files(TRANSFORMED_DIR, "loaded")

    log_errors(ERROR_LOG_PATH, "g1.load", ERRORS)
def load_users_data():
    """Load the users CSV data through ``sqlalch_load``.

    Calls ``load_dataframes`` on ``USER_CSV_DIR`` with "users" and ".csv".
    If the loader returns ``None`` the function returns immediately.
    Otherwise ``sqlalch_load`` is called with the frame, "twitter",
    "users" and ``ERRORS``. A ``SQLAlchError`` is suppressed; only when
    no such error is raised is ``prefix_files`` called on the directory
    with "loaded". In both cases ``log_errors`` is called afterwards with
    ``ERROR_LOG_PATH``, the tag "twitter.users_load" and ``ERRORS``.
    """
    users_frame = load_dataframes(USER_CSV_DIR, "users", ".csv")
    if users_frame is None:
        return

    try:
        sqlalch_load(users_frame, "twitter", "users", ERRORS)
    except SQLAlchError:
        # Deliberately suppressed; presumably sqlalch_load records the
        # failure details in ERRORS before raising -- confirm.
        pass
    else:
        # Prefix the files only after a load that raised no SQLAlchError.
        prefix_files(USER_CSV_DIR, "loaded")

    log_errors(ERROR_LOG_PATH, "twitter.users_load", ERRORS)