def load_data(messages_filepath, categories_filepath):
    """
    Loads the data from the given locations, ready to be cleaned
    :param messages_filepath: The file path for the messages data
    :param categories_filepath: The file path for the categorization data
    :return: The two sources merged on 'id' (inner join)
    """
    messages = ut.read_csv(messages_filepath)
    categories = ut.read_csv(categories_filepath)
    return pd.merge(messages, categories, on='id', how='inner')

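# Hedged usage sketch: disaster_categories.csv appears elsewhere in this repo,
# but the messages csv name below is only the conventional companion file,
# not a verified path:
# disaster = load_data('data/disaster_messages.csv',
#                      'data/disaster_categories.csv')
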
def clean_raw_data():
    """ Cleans the raw data by removing unnecessary columns """
    interactions = ut.read_csv('data/raw/user-item-interaction.csv')
    articles = ut.read_csv('data/raw/articles_community.csv')

    # Drop the leftover index columns written by earlier to_csv calls
    del interactions['Unnamed: 0']
    del articles['Unnamed: 0']
    del interactions['Unnamed: 0.1']
    del articles['Unnamed: 0.1']

    interactions.to_csv('data/interactions.csv', index=False)
    articles.to_csv('data/articles.csv', index=False)

def create_word_bias_data(disaster_csv, bias_file_name):
    """
    Based on the disaster data, generates a file to store the bias data for
    word ==> category
    :param disaster_csv: The disaster.csv file path
    :param bias_file_name: The file name of the output file with bias data
    """
    # Read data
    disaster = ut.read_csv(disaster_csv)
    disaster['message'] = disaster['message'].apply(ast.literal_eval)
    non_category_names = [
        'id', 'message', 'original', 'genre_direct', 'genre_news',
        'genre_social'
    ]
    category_names = list(
        dropwhile(lambda x: x in non_category_names, disaster.columns))

    # Record word to category frequency mapping
    bias_data = {}
    total = ut.row_count(disaster)
    for index, row in disaster.iterrows():
        for word in row['message']:
            if word not in bias_data:
                bias_data[word] = {}
                for category_name in category_names:
                    bias_data[word][category_name + '_ones'] = 0
                    bias_data[word][category_name + '_total'] = 0
            for category_name in category_names:
                bias_data[word][category_name + '_ones'] += row[category_name]
                bias_data[word][category_name + '_total'] += 1
        if index % 100 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    # Generate a data frame from the frequency mapping
    bias = pd.DataFrame()
    bias['word'] = list(bias_data.keys())

    # Populate each category's ones and total columns
    columns = bias_data[next(iter(bias_data))].keys()
    for column in columns:
        bias[column] = [bias_data[word][column] for word in bias_data]

    # For each category, calculate the bias based on the ones and total data
    for category_name in category_names:
        bias[category_name + '_bias'] = (bias[category_name + '_ones'] /
                                         bias[category_name + '_total'])

    bias.to_csv(bias_file_name, index=False)

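# A quick worked example of the bias value computed above: if a (hypothetical)
# word 'FLOOD' appears in 40 messages and 30 of them are labelled 1 for
# 'weather_related', then weather_related_ones = 30, weather_related_total = 40
# and weather_related_bias = 30 / 40 = 0.75. The toy frame below only
# illustrates that arithmetic; it is not project data.
def _bias_example():
    toy = pd.DataFrame({'word': ['FLOOD'],
                        'weather_related_ones': [30],
                        'weather_related_total': [40]})
    toy['weather_related_bias'] = (toy['weather_related_ones'] /
                                   toy['weather_related_total'])
    print(toy)  # weather_related_bias == 0.75
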
def create_disaster_pipeline(disaster_csv_path, category_name):
    disaster = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y,
                                                           test_size=0.3)

    print('Creating pipeline...')
    pipeline = pi.Pipeline([
        ('vect',
         st.CountVectorizer(tokenizer=lambda text: (pt.pipe
                                                    | __normalize_text__
                                                    | __tokenize_text__
                                                    | __remove_stopwords__
                                                    | __lemmatize_text__)(text))),
        ('tfidf', st.TfidfTransformer()),
        ('clf', en.RandomForestClassifier())
    ])

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    y_pred = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, y_pred)

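# The `pt.pipe | f | g` syntax used in the tokenizer above comes from a small
# pipe utility imported as `pt`; its implementation is not shown in this file.
# A minimal sketch of how such a composition helper could work, assuming it
# simply chains unary callables left to right (the name _Pipe is illustrative):
class _Pipe:
    def __init__(self, funcs=()):
        self.funcs = tuple(funcs)

    def __or__(self, func):
        # `pipe | f` returns a new pipe with `f` appended
        return _Pipe(self.funcs + (func,))

    def __call__(self, value):
        # Apply each stage in order: `(pipe | f | g)(x)` computes g(f(x))
        for func in self.funcs:
            value = func(value)
        return value

# e.g. (_Pipe() | str.strip | str.lower)('  Hello ') == 'hello'
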
def create_disaster_sequence(disaster_csv_path, category_name):
    disaster = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y,
                                                           test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=lambda message: (pt.pipe
                                                         | __normalize_text__
                                                         | __tokenize_text__
                                                         | __remove_stopwords__
                                                         # | __stem_text__
                                                         | __lemmatize_text__)(message))

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    x_train_counts = vect.fit_transform(x_train)
    x_train_tfidf = tfidf.fit_transform(x_train_counts)
    classifier.fit(x_train_tfidf, y_train)

    print('Running classifier on test...')
    x_test_counts = vect.transform(x_test)
    x_test_tfidf = tfidf.transform(x_test_counts)
    y_pred = classifier.predict(x_test_tfidf)

    print('Displaying results...')
    display_results(y_test, y_pred)

def create_normalized_disaster_to(file_name):
    """
    Normalizes the disaster data and writes the result to a csv file
    :param file_name: The file name to output to
    """
    disaster = (ut.read_csv('../../data/disaster.csv')
                .pipe(nlp.remove_columns)
                .pipe(nlp.one_hot_encode_genre)
                .pipe(nlp.normalize_related_category_values)
                .pipe(nlp.normalize_messages))
    disaster.to_csv(file_name, index=False)

def show_disaster_pca_for(category_name):
    """
    Show a PCA where the data points are the word vectors and the targets are
    the values in the given category
    :param category_name: The disaster category name
    """
    model = gensim.models.Word2Vec.load('disaster.model')
    disaster = ut.read_csv('disaster.csv')

    X = []
    Y = []
    num_rows = ut.row_count(disaster)
    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word in model.wv.vocab:
                # Index via model.wv; indexing the model directly is deprecated
                X.append(model.wv[word])
                Y.append(row[category_name])
        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows) + ' rows')

    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    final_df = pd.DataFrame(
        data=principal_components,
        columns=['principal component 1', 'principal component 2'])
    final_df['Is' + category_name] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    targets = [0, 1]
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indices_to_keep = final_df['Is' + category_name] == target
        ax.scatter(final_df.loc[indices_to_keep, 'principal component 1'],
                   final_df.loc[indices_to_keep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()
    plt.show()

def create_word_vectors(model_bin_file,
                        weather_words_csv,
                        all_words_csv,
                        output_dir,
                        all_word_sample_size=500):
    weather = ut.read_csv(weather_words_csv)
    # `all_words` avoids shadowing the built-in all()
    all_words = ut.read_csv(all_words_csv).sample(all_word_sample_size)
    model = gensim.models.KeyedVectors.load_word2vec_format(model_bin_file,
                                                            binary=True)
    for index, row in weather.iterrows():
        try:
            np.save(output_dir + '/' + row['word'], model[row['word']])
        except (KeyError, OSError):
            # Skip words missing from the vocabulary or unusable as filenames
            continue
    for index, row in all_words.iterrows():
        try:
            np.save(output_dir + '/' + row['word'], model[row['word']])
        except (KeyError, OSError):
            continue

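# Hedged usage sketch: the paths below are placeholders, not checked-in files.
# The pretrained GoogleNews binary is one common choice for
# load_word2vec_format(..., binary=True), but any word2vec .bin file works:
# create_word_vectors('GoogleNews-vectors-negative300.bin',
#                     'weather_words.csv',
#                     'all_words.csv',
#                     'word_vectors',
#                     all_word_sample_size=500)
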
def print_unique_lengths_of_categories():
    """
    Prints all the different lengths that the 'categories' column has
    (If this prints more than one number, the data has a problem)
    """
    lengths = set()
    categories = ut.read_csv('../data/disaster_categories.csv')
    for index, row in categories.iterrows():
        lengths.add(len(row['categories'].split(';')))
    for length in lengths:
        print(length)

def print_disaster_category_values():
    """
    Prints all the disaster category values
    (To find out if the '2's are a mistake)
    """
    disaster = ut.read_csv('data/disaster.csv')
    non_cat_names = ['id', 'message', 'original', 'genre']
    for cat in list(dropwhile(lambda x: x in non_cat_names,
                              disaster.columns)):
        print(cat)
        print('-------------------------')
        for value in disaster[cat].unique():
            print(
                str(value) + ' - ' +
                str(ut.row_count(disaster[disaster[cat] == value])))
        print()

def show_weather_pca(word_vector_dir, weather_words_csv):
    """
    Plots a PCA of weather-related words vs a sample of general words
    :param word_vector_dir: The directory containing the saved word vector
        .npy files
    :param weather_words_csv: The file name of the csv containing weather
        related words
    """
    weather_words = ut.read_csv(weather_words_csv)
    X = []
    Y = []
    for filename in os.listdir(word_vector_dir):
        X.append(np.load(word_vector_dir + '/' + filename))
        if weather_words['word'].str.contains(
                filename.replace('.npy', '')).any():
            Y.append('weather_related')
        else:
            Y.append('general')

    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    final_df = pd.DataFrame(
        data=principal_components,
        columns=['principal component 1', 'principal component 2'])
    final_df['Category'] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    targets = ['weather_related', 'general']
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indices_to_keep = final_df['Category'] == target
        ax.scatter(final_df.loc[indices_to_keep, 'principal component 1'],
                   final_df.loc[indices_to_keep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()
    plt.show()

def find_most_biased_word_for(category_name):
    """
    Goes into the disaster.csv and prints the words that are the strongest
    indicators of the given category
    :param category_name: The name of the target category
    """
    disaster = ut.read_csv('disaster.csv')
    num_rows = ut.row_count(disaster)

    # word -> [count in rows labelled 0, count in rows labelled 1, ones/zeros]
    word_target_count = {}
    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word not in word_target_count:
                word_target_count[word] = [0, 0, 0]
            word_target_count[word][row[category_name]] += 1
            # The sentinel 2147483648 marks words with no 0-labelled occurrences
            word_target_count[word][2] = (
                word_target_count[word][1] / word_target_count[word][0]
                if word_target_count[word][0] > 0 else 2147483648)
        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows))

    word_corrs = pd.DataFrame()
    word_corrs['word'] = list(word_target_count.keys())
    word_corrs['zeros'] = pd.Series(
        map(lambda x: x[0], word_target_count.values()))
    word_corrs['ones'] = pd.Series(
        map(lambda x: x[1], word_target_count.values()))
    word_corrs['one2zero'] = pd.Series(
        map(lambda x: x[2], word_target_count.values()))
    word_corrs = word_corrs.sort_values(by=['one2zero'], ascending=False)
    word_corrs.to_csv('word_corrs.csv', index=False)

    for index, row in word_corrs[
            word_corrs['one2zero'] < 2147483648].iterrows():
        print(row['word'] + ' - Ones: ' + str(row['ones']) + ', Zeros: ' +
              str(row['zeros']))
        # Pause between words so the output can be browsed
        input()

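# Worked example of the one2zero ratio above: if a (hypothetical) word
# 'SHELTER' appears in 30 messages labelled 1 and 5 messages labelled 0,
# one2zero = 30 / 5 = 6.0, so it sorts near the top. Words that never appear
# in a 0-labelled message get the sentinel and are excluded from the printout.
def _one2zero_example():
    zeros, ones = 5, 30
    ratio = ones / zeros if zeros > 0 else 2147483648
    print(ratio)  # 6.0
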
def print_disaster_dupe_summary():
    """
    Goes through the merged, categorized disaster.csv and prints the ids that
    are duplicated, plus a preview of the messages
    """
    disaster = ut.read_csv('../data/disaster.csv')

    # Check for dupes
    ids = set(disaster['id'])
    dupe_ids = []
    for id in ids:
        if ut.row_count(disaster[disaster['id'] == id]) > 1:
            print(id)
            dupe_ids.append(id)
    for dupe_id in dupe_ids:
        print(disaster[disaster['id'] == dupe_id]['message'])

def print_word_frequency():
    """
    Prints the word frequency in messages, from most frequent word to least
    frequent
    """
    messages = ut.read_csv('../disaster.csv')
    message_words = messages['message'].apply(lambda x: x.lower().split(' '))
    word_count = {}
    for message in message_words:
        for word in message:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    for key, value in sorted(word_count.items(),
                             key=lambda item: item[1],
                             reverse=True):
        print(key + ' - ' + str(value))

def create_readble_bias(bias_file_name, database_filename, table_name):
    """
    Based on the bias file output, creates a readable table and saves it to an
    SQLite DB
    :param bias_file_name: The file with all the word ==> category indicator data
    :param database_filename: The database file name
    :param table_name: The name of the table
    """
    bias = ut.read_csv(bias_file_name)
    readable_bias = pd.DataFrame()
    # The _bias columns are appended last in create_word_bias_data, so
    # dropwhile yields exactly those columns
    for column in list(dropwhile(lambda x: '_bias' not in x, bias.columns)):
        category = column.replace('_bias', '')
        bias = bias.sort_values(by=[column], ascending=False)
        # Assign .values so each column keeps its own sort order; assigning
        # the Series directly would re-align on the index and undo the sort
        readable_bias[category + '_word'] = bias['word'].values
        readable_bias[category + '_ones'] = bias[category + '_ones'].values
        readable_bias[category + '_total'] = bias[category + '_total'].values
        readable_bias[category + '_bias'] = bias[category + '_bias'].values
    ut.to_db(readable_bias, database_filename, table_name)

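# `ut.to_db` above is a repo utility whose implementation is not shown in this
# file. A minimal sketch of an equivalent, assuming it simply wraps
# DataFrame.to_sql over an SQLite engine (the name to_db_sketch and the
# if_exists policy are illustrative, not the repo's actual code):
def to_db_sketch(df, database_filename, table_name):
    from sqlalchemy import create_engine
    engine = create_engine('sqlite:///' + database_filename)
    df.to_sql(table_name, engine, index=False, if_exists='replace')
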
import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

# Get and reshape data (column 1 is the opening price in this csv layout)
aapl_train = ut.read_csv('aapl_train.csv')
training_set = aapl_train.iloc[:, 1:2].values

scaler = pp.MinMaxScaler(feature_range=(0, 1))
training_set_scaled = scaler.fit_transform(training_set)

# Build sliding windows: each sample is the previous 200 scaled opens and the
# target is the next open
x_train = []
y_train = []
for i in range(200, 2000):
    x_train.append(training_set_scaled[i - 200:i, 0])
    y_train.append(training_set_scaled[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Build the LSTM
regressor = km.Sequential()
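
# The script above stops right after creating the Sequential model. A minimal
# sketch of how the LSTM could be finished, assuming the common stacked-LSTM
# setup for this kind of windowed regression (layer sizes, dropout rate,
# optimizer, epochs and batch size are illustrative choices, not the repo's
# confirmed hyperparameters):
regressor.add(kl.LSTM(units=50, return_sequences=True,
                      input_shape=(x_train.shape[1], 1)))
regressor.add(kl.Dropout(0.2))
regressor.add(kl.LSTM(units=50))
regressor.add(kl.Dropout(0.2))
regressor.add(kl.Dense(units=1))
regressor.compile(optimizer='adam', loss='mean_squared_error')
regressor.fit(x_train, y_train, epochs=100, batch_size=32)
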
import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

# Get and reshape data
ftse_train = ut.read_csv('ftse_train.csv')
training_set = ftse_train.iloc[:, 1:2].values

scaler = pp.MinMaxScaler(feature_range=(0, 1))
training_set_scaled = scaler.fit_transform(training_set)

# 50-day windows this time (vs 200 for AAPL above)
x_train = []
y_train = []
for i in range(50, 2000):
    x_train.append(training_set_scaled[i - 50:i, 0])
    y_train.append(training_set_scaled[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Build the LSTM
regressor = km.Sequential()
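
# Once a regressor like the one above has been built and fit (e.g. as sketched
# after the AAPL script), predictions come back in the scaled (0, 1) space and
# must go through scaler.inverse_transform to get back to index points. A
# minimal sketch, assuming a held-out csv with the same column layout
# ('ftse_test.csv' is an illustrative name, not a verified file); it predicts
# from the 51st test day onward:
ftse_test = ut.read_csv('ftse_test.csv')
test_set_scaled = scaler.transform(ftse_test.iloc[:, 1:2].values)

x_test = []
for i in range(50, len(test_set_scaled)):
    x_test.append(test_set_scaled[i - 50:i, 0])
x_test = np.reshape(np.array(x_test), (len(x_test), 50, 1))

predicted = scaler.inverse_transform(regressor.predict(x_test))
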
import utility.util as ut
import sections as se

se.widen_df_display()
articles = ut.read_csv('data/articles.csv')
interactions = ut.read_csv('data/interactions.csv')

##################################################################################
# Part I: Exploratory Data Analysis
##################################################################################
print('\n\nPress Enter to run Part I...')
input()

# SECTION 1.1
max_views_by_user = se.get_max_views_by_user(interactions)
se.show_num_article_interaction_distribution(interactions)

# SECTION 1.2
print(f'median: {se.get_median_num_article_interaction(interactions)}')
print(f'max views by user: {se.get_max_num_article_interaction(interactions)}')

# SECTION 1.3
articles = se.remove_dupes(articles)
articles.to_csv('data/articles.csv', index=False)

# SECTION 1.4
unique_articles = se.get_num_articles_with_interaction(interactions)
total_articles = se.get_num_articles(articles)
unique_users = se.get_unique_users(interactions)
user_article_interactions = len(interactions)

import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import statistics as st

ut.widen_df_display()

# Get and reshape data
msft_train = ut.read_csv('msft_train.csv')

# One-day relative change in the opening price
msft_train['Open_Delta'] = (msft_train['Open'] -
                            msft_train['Open'].shift(1)) / msft_train['Open']
msft_train.at[0, 'Open_Delta'] = 0

# Select the Open_Delta column by name rather than via the fragile iloc[:, 7]
training_set = msft_train['Open_Delta'].values
training_set = np.reshape(training_set, (-1, 1))

x_train = []
y_train = []
for i in range(50, 2000):
    x_train.append(training_set[i - 50:i, 0])
    y_train.append(training_set[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
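
# Note on inverting Open_Delta: since delta_t = (open_t - open_{t-1}) / open_t,
# solving for open_t gives open_t = open_{t-1} / (1 - delta_t), so a model
# that predicts deltas can be rolled back into prices. A quick worked check
# with illustrative numbers:
def _delta_roundtrip_example():
    prev_open, cur_open = 100.0, 104.0
    delta = (cur_open - prev_open) / cur_open  # ~0.0385
    recovered = prev_open / (1 - delta)        # == 104.0
    print(delta, recovered)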