def get_baseline_results(data, emotion, output_folder, n_grams):
    print("--- " + str(emotion) + " ---")
    results = []
    results.append(
        run.run_knn_classification(data, emotion, None, "baseline", n_grams))
    results.append(
        run.run_decision_tree_classification(data, emotion, "baseline",
                                             n_grams))
    results.append(
        run.run_linear_svm_classification(data, emotion, None, "baseline",
                                          n_grams))
    results.append(
        run.run_naive_bayes_classification(data, emotion, None, "baseline",
                                           n_grams))
    results.append(
        run.run_random_forest_classification(data, emotion, None, "baseline",
                                             n_grams))
    columns = [
        "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", emotion + "_precision",
        emotion + "_recall", emotion + "_f1-score",
        "no_" + emotion + "_precision", "no_" + emotion + "_recall",
        "no_" + emotion + "_f1-score"
    ]
    results_df = pd.DataFrame(results, columns=columns)
    helpers.dataframe_to_csv(results_df,
                             output_folder + emotion + "_results.csv")
    print(results_df[[
        "algorithm", "hyperparameter", "macro_avg_f1-score", "accuracy",
        emotion + "_f1-score", "no_" + emotion + "_f1-score"
    ]])
def get_baseline_results(data, mpt, output_folder, n_grams):
    print("--- " + str(mpt) + "% ---")
    data = data[data.words_matched_percentage >= mpt]
    results = []
    results.append(run.run_knn_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_decision_tree_classification(data, "baseline", n_grams))
    results.append(
        run.run_linear_svm_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_naive_bayes_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_random_forest_classification(data, None, "baseline", n_grams))
    columns = [
        "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "positive_precision",
        "positive_recall", "positive_f1-score", "neutral_precision",
        "neutral_recall", "neutral_f1-score", "negative_precision",
        "negative_recall", "negative_f1-score"
    ]
    results_df = pd.DataFrame(results, columns=columns)
    helpers.dataframe_to_csv(results_df,
                             output_folder + str(mpt) + "_mpt_results.csv")
    print(results_df)
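# The two functions above share a name but belong to separate pipelines
# (per-emotion detection vs. sentiment analysis), so a driver would import
# each from its own module. A minimal driver sketch that side-steps the name
# clash by taking each function as a parameter; the threshold range mirrors
# the 0-90% steps used elsewhere in this project, and the argument names are
# illustrative assumptions:
def run_all_baselines(emotion_fn, sentiment_fn, data, output_folder, n_grams):
    # one baseline sweep per emotion label
    for emotion in ds.emotion_list:
        emotion_fn(data, emotion, output_folder, n_grams)
    # one baseline sweep per words-matched-percentage threshold
    for mpt in range(0, 100, 10):
        sentiment_fn(data, mpt, output_folder, n_grams)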
def get_existing_results(folder, dataset_type, n_grams):
    experiments = []
    for file in get_results_filenames(folder):
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        # keep the three best runs per algorithm, ranked by weighted F1
        results_df = results_df.sort_values(
            ['weighted_avg_f1-score'],
            ascending=False).groupby('algorithm').head(3)
        results_df = results_df.reset_index(drop=True)
        algorithms = algorithm_single_list(results_df.algorithm.tolist())
        for algorithm in algorithms:
            relevant_rows = results_df[results_df.algorithm == algorithm]
            for index, row in relevant_rows.iterrows():
                experiments = next_experiments(mpt, algorithm,
                                               row.hyperparameter,
                                               experiments)
                break
    new_experiments_df = pd.DataFrame(
        experiments, columns=["mpt", "algorithm", "hyperparameter"])
    helpers.dataframe_to_csv(
        new_experiments_df,
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_" + dataset_type + "_experiments.csv")
def import_best_results_and_sort(folder):
    best_results_df = helpers.load_dataset(folder +
                                           "best_result_per_emotion.csv")
    best_results_df = best_results_df.sort_values(['macro_avg_f1-score'],
                                                  ascending=False)
    best_results_df = best_results_df.reset_index(drop=True)
    helpers.dataframe_to_csv(best_results_df,
                             folder + "best_result_per_emotion_sorted.csv")
def single_list_generator():
    df = helpers.load_dataset("original_keywords.csv")
    list_of_terms = df.keywords.tolist()
    individual_terms = word_extractor(list_of_terms)
    list_df = pd.DataFrame(individual_terms, columns=["keyword"])
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(list_df, output_file)
    return output_file
def negation_handled(folder, n_grams):
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    results_files = get_results_filenames(folder)
    for results_file in results_files:
        mpt = results_file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, mpt, results_df,
                                                    n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    for results_file in results_files:
        emotion = results_file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, emotion, results_df,
                                                    n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['weighted_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        # keep only the top-ranked row for this mpt
        for index, row in results_df.iterrows():
            new_results.append([
                mpt, row.algorithm, row.hyperparameter,
                row.weighted_avg_precision, row.weighted_avg_recall,
                row["weighted_avg_f1-score"], row.accuracy,
                row.experiment_type, row.metric_dump_id,
                row.positive_precision, row.positive_recall,
                row["positive_f1-score"], row.neutral_precision,
                row.neutral_recall, row["neutral_f1-score"],
                row.negative_precision, row.negative_recall,
                row["negative_f1-score"]
            ])
            break
    columns = [
        "mpt", "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "positive_precision",
        "positive_recall", "positive_f1-score", "neutral_precision",
        "neutral_recall", "neutral_f1-score", "negative_precision",
        "negative_recall", "negative_f1-score"
    ]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df,
                             folder + "best_result_per_mpt.csv")
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    experiments_df = helpers.load_dataset(
        "/home/michael/MRes/actual_project/emotion_detection/" + n_grams +
        "/next_experiments.csv")
    for results_filename in results_files:
        emotion = results_filename.split("_")[0]
        if emotion == "best":
            continue
        experiments = experiments_df[experiments_df.emotion == emotion]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, emotion, experiments,
                                         results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
def process_negation_handled_experiments(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    experiments_df = helpers.load_dataset(
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_negation_handled_experiments.csv")
    for results_filename in results_files:
        mpt = results_filename.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        experiments = experiments_df[experiments_df.mpt == mpt]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, mpt, experiments, results_df,
                                         n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
def merge(dataset_type):
    print(" - Processing " + dataset_type + " files:")
    for file in ds.all_datasets:
        print(" - " + file)
        file_path = file.split("/")
        f_name = (ds.output_data + "first_dataset_extraction/" +
                  dataset_type + "/" + file_path[0] + "/" + file_path[1])
        df = helpers.load_dataset(f_name)
        if file == ds.all_datasets[0]:
            merge_hold = df
        else:
            merge_hold = pd.concat([merge_hold, df], sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name
def tweet_extractor():
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data +
                                          "actual_keyword_matches/generic/" +
                                          file)
        specific_df = helpers.load_dataset(ds.output_data +
                                           "actual_keyword_matches/specific/" +
                                           file)
        print(" - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]
        # iterating a DataFrame yields its column names
        columns = []
        for h in df.head():
            columns.append(h)
        columns.append("matches")
        columns.append("source_file")
        columns.append("month")
        columns.append("year")
        df["matches"] = ""
        df["source_file"] = ""
        df["tweet_time"] = pd.to_datetime(df["tweet_time"])
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets, generic_tweets = pd.DataFrame(
            columns=columns), pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets,
                                          file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file,
                                         "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset = file.split("/")[0]
        filename = file.split("/")[1]
        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)
        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific
def total_frequency_graph_generator(df, path):
    col_name = "total"
    filename = path + "total_freq.png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    # split off keywords that never matched and store them separately
    zero = df[df[col_name] == 0]
    df = df[df[col_name] != 0]
    helpers.dataframe_to_csv(zero,
                             ds.output_data + "statistics/zero_matches.csv")
    df.set_index('keyword', drop=True, inplace=True)
    if len(df) > 0:
        ax = df[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("Keyword frequency")
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return [filename, ds.output_data + "statistics/zero_matches.csv"]
def date_selection():
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        # keep August-December 2013 plus everything from 2014-2018
        df_2013 = df[df.year == 2013]
        df_2013_8 = df_2013[df_2013.month == 8]
        df_2013_9 = df_2013[df_2013.month == 9]
        df_2013_10 = df_2013[df_2013.month == 10]
        df_2013_11 = df_2013[df_2013.month == 11]
        df_2013_12 = df_2013[df_2013.month == 12]
        df = df[(df.year == 2014) | (df.year == 2015) | (df.year == 2016) |
                (df.year == 2017) | (df.year == 2018)]
        df = pd.concat(
            [df_2013_8, df_2013_9, df_2013_10, df_2013_11, df_2013_12, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        emotion = file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['macro_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        helpers.dataframe_to_csv(results_df, folder + file)
        # keep only the top-ranked row for this emotion
        for index, row in results_df.iterrows():
            new_results.append([
                emotion, row.algorithm, row.hyperparameter,
                row.weighted_avg_precision, row.weighted_avg_recall,
                row["weighted_avg_f1-score"], row.accuracy,
                row.experiment_type, row.metric_dump_id,
                row.macro_avg_precision, row.macro_avg_recall,
                row["macro_avg_f1-score"], row[emotion + "_precision"],
                row[emotion + "_recall"], row[emotion + "_f1-score"],
                row["no_" + emotion + "_precision"],
                row["no_" + emotion + "_recall"],
                row["no_" + emotion + "_f1-score"]
            ])
            break
    # note: the emotion-specific column labels take whatever emotion was
    # processed last; each row's values still refer to that row's own emotion
    columns = [
        "emotion", "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", emotion + "_precision",
        emotion + "_recall", emotion + "_f1-score",
        "no_" + emotion + "_precision", "no_" + emotion + "_recall",
        "no_" + emotion + "_f1-score"
    ]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df,
                             folder + "best_result_per_emotion.csv")
# load generic and specific datasets and merge them together
dfs = []
for df in ds.all_datasets:
    dfs.append(helpers.load_dataset(ds.dataset + df))
dfs = pd.concat(dfs, ignore_index=True)
dfs['preprocessed_tweet_text'] = ""

# loop over dataframe, perform preprocessing on the tweet text
for index, row in dfs.iterrows():
    tweet_text = row.tweet_text
    tweet_text = lowercase_conversion(tweet_text)
    tweet_text = remove_urls(tweet_text)
    tweet_text = remove_accents(tweet_text)
    tweet_text = remove_usernames(tweet_text)
    tweet_text = transform_hashtags(tweet_text)
    tweet_text = contraction_expansion(tweet_text)
    tweet_text = remove_special_chars(tweet_text)
    # collapse runs of whitespace into single spaces
    tweet_text = " ".join(tweet_text.split())
    dfs.preprocessed_tweet_text.at[index] = tweet_text

# Store processed data
helpers.dataframe_to_csv(dfs, ds.output_data + 'preprocessed_data.csv')

# print complete message
print('Preprocessing complete')
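# For reference, minimal sketches of two of the helpers chained above,
# assuming straightforward regex implementations; the project's real
# remove_urls and remove_usernames may differ:
import re

def remove_urls(text):
    # drop anything that looks like an http(s) link
    return re.sub(r"https?://\S+", "", text)

def remove_usernames(text):
    # drop @-mentions
    return re.sub(r"@\w+", "", text)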
def store_metrics(metrics_dict, emotion, algorithm, modifier, n_grams):
    location = ds.metric_storage_location
    metric_id = get_new_metric_storage_identifier()
    metric_list_for_df = []
    # one row per cross-validation fold (1-10)...
    for index in range(1, 11):
        metric_list_for_df.append([
            metric_id, index, algorithm, modifier, n_grams,
            metrics_dict[emotion]["precision"][index - 1],
            metrics_dict[emotion]["recall"][index - 1],
            metrics_dict[emotion]["f1-score"][index - 1],
            metrics_dict[emotion]["support"][index - 1],
            metrics_dict["no_" + emotion]["precision"][index - 1],
            metrics_dict["no_" + emotion]["recall"][index - 1],
            metrics_dict["no_" + emotion]["f1-score"][index - 1],
            metrics_dict["no_" + emotion]["support"][index - 1],
            metrics_dict["accuracy"]["list"][index - 1],
            metrics_dict["macro avg"]["precision"][index - 1],
            metrics_dict["macro avg"]["recall"][index - 1],
            metrics_dict["macro avg"]["f1-score"][index - 1],
            metrics_dict["macro avg"]["support"][index - 1],
            metrics_dict["weighted avg"]["precision"][index - 1],
            metrics_dict["weighted avg"]["recall"][index - 1],
            metrics_dict["weighted avg"]["f1-score"][index - 1],
            metrics_dict["weighted avg"]["support"][index - 1],
        ])
    # ...plus one summary row holding the fold averages
    metric_list_for_df.append([
        metric_id, "average", algorithm, modifier, n_grams,
        metrics_dict[emotion]["avg"]["precision"],
        metrics_dict[emotion]["avg"]["recall"],
        metrics_dict[emotion]["avg"]["f1-score"],
        metrics_dict[emotion]["avg"]["support"],
        metrics_dict["no_" + emotion]["avg"]["precision"],
        metrics_dict["no_" + emotion]["avg"]["recall"],
        metrics_dict["no_" + emotion]["avg"]["f1-score"],
        metrics_dict["no_" + emotion]["avg"]["support"],
        metrics_dict["accuracy"]["avg"],
        metrics_dict["macro avg"]["avg"]["precision"],
        metrics_dict["macro avg"]["avg"]["recall"],
        metrics_dict["macro avg"]["avg"]["f1-score"],
        metrics_dict["macro avg"]["avg"]["support"],
        metrics_dict["weighted avg"]["avg"]["precision"],
        metrics_dict["weighted avg"]["avg"]["recall"],
        metrics_dict["weighted avg"]["avg"]["f1-score"],
        metrics_dict["weighted avg"]["avg"]["support"],
    ])
    columns = [
        "metric_dump_id", "fold", "algorithm", "modifier", "n_grams",
        emotion + "_precision", emotion + "_recall", emotion + "_f1-score",
        emotion + "_support", "no_" + emotion + "_precision",
        "no_" + emotion + "_recall", "no_" + emotion + "_f1-score",
        "no_" + emotion + "_support", "accuracy", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", "macro_avg_support",
        "weighted_avg_precision", "weighted_avg_recall",
        "weighted_avg_f1-score", "weighted_avg_support"
    ]
    metric_df = pd.DataFrame(metric_list_for_df, columns=columns)
    helpers.dataframe_to_csv(metric_df, location + str(metric_id) + ".csv")
    return metric_id
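# For reference, the nested layout store_metrics expects of metrics_dict,
# inferred from the lookups above. A sketch assuming 10-fold cross-validation;
# "joy" stands in for any emotion and the numbers are dummies, not project
# results:
n_folds = 10

def _metric_block():
    # per-fold lists plus a fold-averaged summary
    return {
        "precision": [0.8] * n_folds,
        "recall": [0.7] * n_folds,
        "f1-score": [0.75] * n_folds,
        "support": [120] * n_folds,
        "avg": {"precision": 0.8, "recall": 0.7,
                "f1-score": 0.75, "support": 120},
    }

example_metrics_dict = {
    "joy": _metric_block(),
    "no_joy": _metric_block(),
    "accuracy": {"list": [0.78] * n_folds, "avg": 0.78},
    "macro avg": _metric_block(),
    "weighted avg": _metric_block(),
}
# store_metrics(example_metrics_dict, "joy", "naive_bayes", "baseline",
#               "unigrams")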
def processing():
    tagged_keywords = load_tagged_keywords()
    match_files = find_match_files()
    files_created = []
    for dataset in match_files:
        for file in match_files[dataset]:
            print(" - " + file)
            generic_matches = {}
            specific_matches = {}
            matches_df = load_match_file(file)
            for match_index, match_row in matches_df.iterrows():
                generic_counter = 0
                specific_counter = 0
                tempstore = []
                # parse the stringified list of matches, then de-duplicate
                match_row.matches = match_row.matches.strip("''][").split(
                    "', '")
                match_row.matches = remove_duplicates(match_row.matches)
                matches_df.matches.at[match_index] = match_row.matches
                for keyword_index, keyword_row in tagged_keywords.iterrows():
                    if comparison(match_row.matches, keyword_row['split']):
                        if keyword_row.tag == "generic":
                            generic_counter += 1
                        if keyword_row.tag == "specific":
                            specific_counter += 1
                        tempstore.append(keyword_row.term)
                # a single specific keyword outranks any generic ones
                if specific_counter != 0:
                    specific_matches[match_row.tweet_id] = tempstore
                else:
                    if generic_counter != 0:
                        generic_matches[match_row.tweet_id] = tempstore
            generic_data_list = []
            specific_data_list = []
            for tweet_id in generic_matches:
                generic_data_list.append([tweet_id, generic_matches[tweet_id]])
            for tweet_id in specific_matches:
                specific_data_list.append(
                    [tweet_id, specific_matches[tweet_id]])
            generic_file_path = (ds.output_data +
                                 "actual_keyword_matches/generic/" +
                                 file.split("/")[-2] + "/")
            helpers.path_checker(generic_file_path)
            generic_file_name = generic_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(generic_data_list,
                                            '"tweet_id","matches"',
                                            generic_file_name)
            files_created.append(generic_file_name)
            specific_file_path = (ds.output_data +
                                  "actual_keyword_matches/specific/" +
                                  file.split("/")[-2] + "/")
            helpers.path_checker(specific_file_path)
            specific_file_name = specific_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(specific_data_list,
                                            '"tweet_id","matches"',
                                            specific_file_name)
            files_created.append(specific_file_name)
            file_path = (ds.output_data +
                         "single_keyword_matches_dup_removed/" +
                         file.split("/")[-2] + "/")
            helpers.path_checker(file_path)
            file_name = file_path + file.split("/")[-1]
            helpers.dataframe_to_csv(matches_df, file_name)
            files_created.append(file_name)
    return files_created
def import_results(folder):
    unigram_df = helpers.load_dataset(ds.output_data + "unigrams/" + folder +
                                      "/best_result_per_mpt.csv")
    bigram_df = helpers.load_dataset(ds.output_data + "bigrams/" + folder +
                                     "/best_result_per_mpt.csv")
    trigram_df = helpers.load_dataset(ds.output_data + "trigrams/" + folder +
                                      "/best_result_per_mpt.csv")
    unigram_bigram_df = helpers.load_dataset(ds.output_data +
                                             "unigrams_bigrams/" + folder +
                                             "/best_result_per_mpt.csv")
    unigram_bigram_trigram_df = helpers.load_dataset(
        ds.output_data + "unigrams_bigrams_trigrams/" + folder +
        "/best_result_per_mpt.csv")
    new_results, res_for_storage = [], []
    # one row per match-percentage threshold (0, 10, ..., 90)
    for idx in range(0, 100, 10):
        unigram = unigram_df[unigram_df.mpt == idx]
        bigram = bigram_df[bigram_df.mpt == idx]
        trigram = trigram_df[trigram_df.mpt == idx]
        unigram_bigram = unigram_bigram_df[unigram_bigram_df.mpt == idx]
        unigram_bigram_trigram = unigram_bigram_trigram_df[
            unigram_bigram_trigram_df.mpt == idx]
        new_results.append([
            idx, unigram['weighted_avg_f1-score'].tolist()[0],
            bigram['weighted_avg_f1-score'].tolist()[0],
            trigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram_trigram['weighted_avg_f1-score'].tolist()[0]
        ])
        res_for_storage.append([
            idx, unigram['weighted_avg_f1-score'].tolist()[0],
            unigram['algorithm'].tolist()[0],
            unigram['hyperparameter'].tolist()[0],
            bigram['weighted_avg_f1-score'].tolist()[0],
            bigram['algorithm'].tolist()[0],
            bigram['hyperparameter'].tolist()[0],
            trigram['weighted_avg_f1-score'].tolist()[0],
            trigram['algorithm'].tolist()[0],
            trigram['hyperparameter'].tolist()[0],
            unigram_bigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram['algorithm'].tolist()[0],
            unigram_bigram['hyperparameter'].tolist()[0],
            unigram_bigram_trigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram_trigram['algorithm'].tolist()[0],
            unigram_bigram_trigram['hyperparameter'].tolist()[0]
        ])
    new_results_df = pd.DataFrame(new_results,
                                  columns=[
                                      "mpt", "Unigrams", "Bigrams",
                                      "Trigrams", "Unigrams & Bigrams",
                                      "Unigrams, Bigrams and Trigrams"
                                  ])
    cols = [
        "mpt", "unigram_f1", "unigram_algorithm", "unigram_hyperparameter",
        "bigram_f1", "bigram_algorithm", "bigram_hyperparameter",
        "trigram_f1", "trigram_algorithm", "trigram_hyperparameter",
        "unigram_bigram_f1", "unigram_bigram_algorithm",
        "unigram_bigram_hyperparameter", "unigram_bigram_trigram_f1",
        "unigram_bigram_trigram_algorithm",
        "unigram_bigram_trigram_hyperparameter"
    ]
    res_for_storage_df = pd.DataFrame(res_for_storage, columns=cols)
    helpers.dataframe_to_csv(
        res_for_storage_df,
        ds.output_data + "results/best_results_" + folder + ".csv")
    return new_results_df
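# A plausible downstream use of the frame returned by import_results: plot
# weighted F1 against the match-percentage threshold for each n-gram scheme.
# A sketch only; the folder name and output path are illustrative:
import matplotlib.pyplot as plt

results = import_results("baseline")
results.plot(x="mpt", figsize=(12, 8))
plt.xlabel("Minimum words-matched percentage (mpt)")
plt.ylabel("Weighted average F1-score")
plt.savefig("f1_by_mpt.png")
plt.close()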
import helpers
import dataset as ds

df = helpers.load_dataset(ds.dataset + ds.data)
# keep only original tweets
new_df = df[df.is_retweet == False]
helpers.dataframe_to_csv(new_df, ds.output_data + "rt_removed_unlabelled.csv")
import helpers
import dataset as ds

df = helpers.load_dataset(ds.output_data + "nrc_labelled.csv")

# add a string label column for each emotion
for emotion in ds.emotion_list:
    emotion_str = emotion + "_str"
    df[emotion_str] = ""
for index, row in df.iterrows():
    for emotion in ds.emotion_list:
        emotion_str = emotion + "_str"
        if row[emotion] == 0:
            df[emotion_str].at[index] = "no_" + emotion
        else:
            df[emotion_str].at[index] = emotion
helpers.dataframe_to_csv(df, ds.output_data + "nrc_labelled_for_detection.csv")
print(df)
counter["no_emotion"] = 0 counter["no_lexicon_matches"] = 0 df = helpers.load_dataset(ds.output_data + "nrc_labelled.csv") print(len(df)) for index, row in df.iterrows(): zero_emotion = 0 for emotion in emotion_list: if row[emotion] != 0: counter[emotion] += 1 else: zero_emotion += 1 if zero_emotion == 8 and row.lexicon_matches != 0: counter["no_emotion"] += 1 if row.lexicon_matches == 0: counter["no_lexicon_matches"] += 1 temp_list = [] for emotion in counter: temp_list.append([emotion, counter[emotion]]) cols = ["emotion", "count"] emo_df = pd.DataFrame(temp_list, columns=cols) emo_df = emo_df.sort_values(by=['count'], ascending=False) print(emo_df) helpers.dataframe_to_csv(emo_df, ds.output_data + "emotion_counts.csv")
# assumed setup: this fragment labels the preprocessed dataset with NRC
# emotion counts; the original imports and data loading were not part of
# the snippet
import helpers
import dataset as ds

emotion_list = ds.emotion_list
df = helpers.load_dataset(ds.output_data + "preprocessed_data.csv")

for emotion in emotion_list:
    df[emotion] = 0
df["lexicon_matches"] = 0
df["total_words"] = 0

for index, row in df.iterrows():
    print(index)
    match_data = {}
    for emotion in emotion_list:
        match_data[emotion] = 0
    lexicon_matches = 0
    for word in row.preprocessed_tweet_text.split():
        word = str(word)
        # match_checker returns one flag per emotion, or None on no match
        word_match_data = match_checker(word)
        if word_match_data is not None:
            lexicon_matches += 1
            for emotion, data in zip(emotion_list, word_match_data):
                match_data[emotion] += data
    for emotion in emotion_list:
        df[emotion].at[index] = match_data[emotion]
    df.lexicon_matches.at[index] = lexicon_matches
    df.total_words.at[index] = len(row.preprocessed_tweet_text.split())

helpers.dataframe_to_csv(df, ds.output_data + "nrc_labelled.csv")
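# A minimal sketch of the match_checker contract assumed above: look the
# word up in the NRC lexicon and return its eight association flags in
# emotion_list order (anger, anticipation, disgust, fear, joy, sadness,
# surprise, trust), or None when the word is absent. The lexicon literal is
# a hypothetical stand-in for the project's real lexicon loading:
nrc_lexicon = {
    "happy": (0, 1, 0, 0, 1, 0, 0, 1),
}

def match_checker(word):
    return nrc_lexicon.get(word)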
def processing():
    create_storage_dataframes()
    create_freq_matrix()
    global generic_list, specific_list, year_freq_df, month_year_freq_df, generic_specific_freq_df
    created_files = []
    file_path = ds.output_data + "time_filtered_dataset_extraction/"
    generic_tweets = file_path + "generic.csv"
    specific_tweets = file_path + "specific.csv"
    all_tweets_df = pd.concat([
        helpers.load_dataset(specific_tweets),
        helpers.load_dataset(generic_tweets)
    ])
    all_tweets_df.reset_index(inplace=True, drop=True)
    limit = len(all_tweets_df) + 1
    counter = 0
    for index, row in all_tweets_df.iterrows():
        if counter == limit:
            break
        counter += 1
        if index % 100 == 0:
            print(" -", str(index), "/", str(len(all_tweets_df)))
        generic_matches = []
        specific_matches = []
        # preprocessing: parse the stringified list of matches
        row.matches = row.matches.strip("''][").split("', '")
        for match in row.matches:
            generic_check = keyword_checker(match, generic_list)
            generic_matches.append(generic_check)
            specific_check = keyword_checker(match, specific_list)
            specific_matches.append(specific_check)
            if generic_check or specific_check:
                year_freq_update(match, str(row.year))
                month_year_freq_update(match, str(row.month), str(row.year))
        if True in generic_matches:
            if True in specific_matches:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic_specific")
            else:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic")
        else:
            # no generic hit: count every match as specific
            for match in row.matches:
                generic_specific_freq_update(match, "specific")
        freq_matrix_update(row.matches)
    # store dataframes
    storage_path = ds.output_data + "statistics/"
    helpers.path_checker(storage_path)
    # Store year frequency
    helpers.dataframe_to_csv(year_freq_df, storage_path + "year_frequency.csv")
    created_files.append(storage_path + "year_frequency.csv")
    # Store month year frequency
    helpers.dataframe_to_csv(month_year_freq_df,
                             storage_path + "month_year_frequency.csv")
    created_files.append(storage_path + "month_year_frequency.csv")
    # Store generic specific frequency
    helpers.dataframe_to_csv(generic_specific_freq_df,
                             storage_path + "generic_specific_frequency.csv")
    created_files.append(storage_path + "generic_specific_frequency.csv")
    # Store frequency matrix
    helpers.dataframe_to_csv(freq_matrix,
                             storage_path + "frequency_matrix.csv")
    created_files.append(storage_path + "frequency_matrix.csv")
    month_year_freq_output_files = preprocess_month_year_graph(
        month_year_freq_df)
    year_freq_output_files = preprocess_year_graph(year_freq_df)
    frequency_total_output_files = preprocess_frequency_total_graph(
        generic_specific_freq_df)
    for file in month_year_freq_output_files:
        created_files.append(file)
    for file in year_freq_output_files:
        created_files.append(file)
    for file in frequency_total_output_files:
        created_files.append(file)
    return created_files
import helpers
import dataset as ds

df = helpers.load_dataset(ds.output_data + 'sentiwordnet_labelled.csv')
# split classified rows from anything SentiWordNet failed to classify
df2 = df[(df.sentiment_class == "Positive") |
         (df.sentiment_class == "Negative") |
         (df.sentiment_class == "Neutral")]
df3 = df[(df.sentiment_class != "Positive") &
         (df.sentiment_class != "Negative") &
         (df.sentiment_class != "Neutral")]
print(df3)
helpers.dataframe_to_csv(
    df2, ds.output_data + "sentiwordnet_labelled_unclassified_removed.csv")
helpers.dataframe_to_csv(df3, ds.output_data + "unclassified.csv")
def store_dataset(df):
    helpers.path_checker(ds.output_data)
    helpers.dataframe_to_csv(df, ds.output_data + "sentiwordnet_labelled.csv")