Пример #1
0
def generate_model_comparison_by_user(input_directory, output_directory):
    """Produce one cross-model analysis per user.

    Entries of ``input_directory`` whose names contain no '.' are treated as
    model directories, and entries inside each of those (again without a '.')
    are treated as user directories. For every user found under at least one
    model, a directory ``output_directory/<user>`` is created and
    ``generate_analysis_output`` is called with one
    ``(model, own_values, other_values)`` tuple for each model that has a
    directory for that user.

    Args:
        input_directory: Root directory laid out as ``<model>/<user>/``, with
            ``own.json`` and ``other.json`` in each user directory.
        output_directory: Root under which per-user output directories are
            created.
    """
    def _names_without_dot(path):
        # A name without a '.' is taken to be a sub-directory.
        return [entry for entry in os.listdir(path) if '.' not in entry]

    def _first_column(directory, filename):
        # Column 0 of the JSON file, as returned by pandas' ``.values``.
        return pd.read_json(os.path.join(directory, filename))[0].values

    model_names = _names_without_dot(input_directory)

    # Collect users across all models; not every model is assumed to have
    # data for every user, so the union is what gets plotted.
    collected_users = []
    for model_name in model_names:
        collected_users.extend(
            _names_without_dot(os.path.join(input_directory, model_name)))
    unique_users = list(set(collected_users))

    for user_name in unique_users:
        destination = os.path.join(output_directory, user_name)
        create_dir_if_not_there(destination)

        per_model = []
        for model_name in model_names:
            source = os.path.join(input_directory, model_name, user_name)
            # Only models that actually have this user contribute a row.
            if os.path.exists(source):
                per_model.append((model_name,
                                  _first_column(source, 'own.json'),
                                  _first_column(source, 'other.json')))

        generate_analysis_output(destination, per_model, 'Model', user_name)
def generate_model_comparison(in_dir, out_dir):
    """Produce a single analysis comparing all models found in ``in_dir``.

    Entries of ``in_dir`` whose names contain no '.' are treated as model
    directories. For each one, column 0 of ``own.json`` and ``other.json`` is
    loaded, and the resulting ``(model, own_values, other_values)`` tuples are
    passed to ``generate_analysis_output`` with the label ``'Model'``.

    Args:
        in_dir: Directory containing one sub-directory per model, each holding
            ``own.json`` and ``other.json``.
        out_dir: Directory that receives the analysis output; created if it
            does not exist.
    """
    # Use the function's own parameters: the previous body referred to
    # `output_directory` / `input_directory`, which are not defined here.
    create_dir_if_not_there(out_dir)
    models = [x for x in os.listdir(in_dir) if '.' not in x]

    dat = [(model,
            pd.read_json(os.path.join(in_dir, model, 'own.json'))[0].values,
            pd.read_json(os.path.join(in_dir, model, 'other.json'))[0].values)
           for model in models]

    generate_analysis_output(out_dir, dat, 'Model')
Пример #3
0
def generate_user_comparison_by_model(input_directory, output_directory):
    """Produce one cross-user analysis per model.

    Entries of ``input_directory`` whose names contain no '.' are treated as
    model directories, and entries inside each (again without a '.') as user
    directories. For every model, ``output_directory/<model>`` is created and
    ``generate_analysis_output`` is called with one
    ``(user, own_values, other_values)`` tuple per user of that model.

    Args:
        input_directory: Root directory laid out as ``<model>/<user>/``, with
            ``own.json`` and ``other.json`` in each user directory.
        output_directory: Root under which per-model output directories are
            created.
    """
    def _names_without_dot(path):
        # A name without a '.' is taken to be a sub-directory.
        return [entry for entry in os.listdir(path) if '.' not in entry]

    def _first_column(directory, filename):
        # Column 0 of the JSON file, as returned by pandas' ``.values``.
        return pd.read_json(os.path.join(directory, filename))[0].values

    for model_name in _names_without_dot(input_directory):
        source_root = os.path.join(input_directory, model_name)
        target_root = os.path.join(output_directory, model_name)
        create_dir_if_not_there(target_root)

        per_user = []
        for user_name in _names_without_dot(source_root):
            user_dir = os.path.join(source_root, user_name)
            per_user.append((user_name,
                             _first_column(user_dir, 'own.json'),
                             _first_column(user_dir, 'other.json')))

        generate_analysis_output(target_root, per_user, 'User', model_name)
Пример #4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Takes user tweet files from 'raw_data_path', applies filters, and writes output
to preprocessed_data_path.

June, 2019
@author: Joshua Rubin
"""

from get_config import (get_config, create_dir_if_not_there)
from tweetvalidator.data_processing import filter_tweets_from_directories

# Load the global configuration holding the paths and filter settings.
config = get_config()

# The destination for filtered tweets must exist before anything is written.
preprocessed_data_path = config['preprocessed_data_path']
create_dir_if_not_there(preprocessed_data_path)

# Read the remaining settings; the minimum length is cast to int before use.
raw_data_path = config['raw_data_path']
regexp_tweet_filters = config['regexp_tweet_filters']
min_tweet_characters = int(config['min_tweet_characters'])

filter_tweets_from_directories(raw_data_path,
                               preprocessed_data_path,
                               regexp_tweet_filters,
                               min_tweet_characters)
Пример #5
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generates similarity scores for a variety of models and configurations.

September, 2019
@author: Joshua Rubin
"""

import time
from get_config import (get_config, create_dir_if_not_there)
# Load configuration and make sure the evaluation output directory exists
# before the model code is imported and run.
config = get_config()
create_dir_if_not_there(config['eval_output_path'])

from tweetvalidator.models import RandomForestModel
from tweetvalidator import train_models

# Directory keyword arguments forwarded to train_models.
dir_args = {
    'input_directory': config['processed_data_path'],
    'negative_input_directory': config['processed_negative_data_path'],
    'output_directory': config['eval_output_path'],
}

# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() can jump if the system clock is adjusted during a long run.
start = time.perf_counter()
train_models(RandomForestModel(verbose=True),
             'embedding', **dir_args,
             file_prefix='random_forest_model')
end = time.perf_counter()

# Report the elapsed training time in seconds.
print(end - start)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Downloads tweets for users specified in 'twitter_users' field of config.json

Jan, 2020
@author: Yagna
"""

from get_config import (get_config, create_dir_if_not_there)
from tweetvalidator.data_processing import get_tweets_by_user

# Load the global configuration.
config = get_config()

# Where downloaded tweets are written, and the per-user download limit.
output_directory = config['raw_data_path']
max_tweets_per_user = config['max_tweets_per_user']

# The output directory must exist before any user is fetched.
create_dir_if_not_there(output_directory)

# Fetch each configured user in turn, printing the name as progress output.
twitter_users_to_fetch = config['twitter_users']
for user in twitter_users_to_fetch:
    print(user)
    get_tweets_by_user(
        user,
        max_tweets=max_tweets_per_user,
        output_path=output_directory,
    )
Пример #7
0
# -*- coding: utf-8 -*-
"""
Takes filtered user tweet files from 'preprocessed_data_path', generates
embeddings, and writes output to 'processed_data_path'.

June, 2019
@author: Joshua Rubin
"""
import time
from get_config import (get_config, create_dir_if_not_there)
from tweetvalidator.data_processing import embed_tweets_from_directories
from tweetvalidator.models import RandomForestModel
from tweetvalidator import train_models

# Load configuration and make sure the embedding output directory exists.
config = get_config()
processed_data_path = config['processed_data_path']
create_dir_if_not_there(processed_data_path)

# The timer starts before embedding begins.
start = time.time()
embed_tweets_from_directories(config['preprocessed_data_path'],
                              processed_data_path)

# The evaluation output directory is needed by the training step that follows.
eval_output_path = config['eval_output_path']
create_dir_if_not_there(eval_output_path)

# Directory keyword arguments forwarded to train_models.
dir_args = {
    'input_directory': processed_data_path,
    'negative_input_directory': config['processed_negative_data_path'],
    'output_directory': eval_output_path,
}


train_models(RandomForestModel(verbose=True),
            'embedding', **dir_args,