Example #1
def get_baseline_results(data, emotion, output_folder, n_grams):
    print("--- " + str(emotion) + " ---")
    results = []

    results.append(
        run.run_knn_classification(data, emotion, None, "baseline", n_grams))
    results.append(
        run.run_decision_tree_classification(data, emotion, "baseline",
                                             n_grams))
    results.append(
        run.run_linear_svm_classification(data, emotion, None, "baseline",
                                          n_grams))
    results.append(
        run.run_naive_bayes_classification(data, emotion, None, "baseline",
                                           n_grams))
    results.append(
        run.run_random_forest_classification(data, emotion, None, "baseline",
                                             n_grams))
    columns = [
        "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", emotion + "_precision",
        emotion + "_recall", emotion + "_f1-score",
        "no_" + emotion + "_precision", "no_" + emotion + "_recall",
        "no_" + emotion + "_f1-score"
    ]
    results_df = pd.DataFrame(results, columns=columns)
    helpers.dataframe_to_csv(results_df,
                             output_folder + emotion + "_results.csv")
    print(results_df[[
        "algorithm", "hyperparameter", "macro_avg_f1-score", "accuracy",
        emotion + "_f1-score", "no_" + emotion + "_f1-score"
    ]])
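Every snippet on this page leans on a project-local helpers module (plus pandas, a dataset/ds settings module, and a run module wrapping the classifiers). A minimal sketch of the three helper functions the snippets call, assuming they are thin wrappers around os and pandas; the project's real implementations are not shown here:

# Hypothetical sketch of the assumed helpers module; the real code may differ.
import os
import pandas as pd

def path_checker(path):
    # Create the output directory if it does not already exist.
    os.makedirs(path, exist_ok=True)

def load_dataset(file_path):
    # Read a stored CSV file back into a DataFrame.
    return pd.read_csv(file_path)

def dataframe_to_csv(df, file_path):
    # Persist a DataFrame as CSV without the index column.
    df.to_csv(file_path, index=False)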
Example #2
def get_baseline_results(data, mpt, output_folder, n_grams):
    print("--- " + str(mpt) + "% ---")
    data = data[data.words_matched_percentage >= mpt]
    results = []

    results.append(run.run_knn_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_decision_tree_classification(data, "baseline", n_grams))
    results.append(
        run.run_linear_svm_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_naive_bayes_classification(data, None, "baseline", n_grams))
    results.append(
        run.run_random_forest_classification(data, None, "baseline", n_grams))
    columns = [
        "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "positive_precision",
        "positive_recall", "positive_f1-score", "neutral_precision",
        "neutral_recall", "neutral_f1-score", "negative_precision",
        "negative_recall", "negative_f1-score"
    ]
    results_df = pd.DataFrame(results, columns=columns)
    helpers.dataframe_to_csv(results_df,
                             output_folder + str(mpt) + "_mpt_results.csv")
    print(results_df)
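A hypothetical driver for this variant could sweep the match-percentage thresholds used elsewhere on this page (0 to 90 in steps of 10); the ds.dataset, ds.file and "unigrams" values below are placeholders:

# Hypothetical usage sketch; dataset attributes and the n_grams label are placeholders.
data = helpers.load_dataset(ds.dataset + ds.file)
output_folder = ds.output_data + "unigrams/"
helpers.path_checker(output_folder)
for mpt in range(0, 100, 10):
    get_baseline_results(data, mpt, output_folder, "unigrams")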
Example #3
def get_existing_results(folder, dataset_type, n_grams):
    experiments = []
    for file in get_results_filenames(folder):
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(
            ['weighted_avg_f1-score'],
            ascending=False).groupby('algorithm').head(3)
        results_df = results_df.reset_index(drop=True)
        algorithms = algorithm_single_list(results_df.algorithm.tolist())
        for algorithm in algorithms:
            relevant_rows = results_df[results_df.algorithm == algorithm]
            for index, row in relevant_rows.iterrows():
                experiments = next_experiments(mpt, algorithm,
                                               row.hyperparameter, experiments)
                break
    new_experiments_df = pd.DataFrame(
        experiments, columns=["mpt", "algorithm", "hyperparameter"])
    helpers.dataframe_to_csv(
        new_experiments_df,
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_" + dataset_type + "_experiments.csv")
def import_best_results_and_sort(folder):
    best_results_df = helpers.load_dataset(folder +
                                           "best_result_per_emotion.csv")
    best_results_df = best_results_df.sort_values(['macro_avg_f1-score'],
                                                  ascending=False)
    best_results_df = best_results_df.reset_index(drop=True)
    helpers.dataframe_to_csv(best_results_df,
                             folder + "best_result_per_emotion_sorted.csv")
def single_list_generator():
    df = helpers.load_dataset("original_keywords.csv")
    list_of_terms = df.keywords.tolist()
    individual_terms = word_extractor(list_of_terms)
    list_df = pd.DataFrame(individual_terms, columns=["keyword"])
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(list_df, output_file)
    return output_file
def negation_handled(folder, n_grams):
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    results_files = get_results_filenames(folder)
    for results_file in results_files:
        mpt = results_file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, mpt, results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    for results_file in results_files:
        emotion = results_file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, emotion, results_df,
                                                    n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['weighted_avg_f1-score'], ascending=False)
        results_df = results_df.reset_index(drop=True)
        for index, row in results_df.iterrows():
            new_results.append([mpt,
                                row.algorithm,
                                row.hyperparameter,
                                row.weighted_avg_precision,
                                row.weighted_avg_recall,
                                row["weighted_avg_f1-score"],
                                row.accuracy,
                                row.experiment_type,
                                row.metric_dump_id,
                                row.positive_precision,
                                row.positive_recall,
                                row["positive_f1-score"],
                                row.neutral_precision,
                                row.neutral_recall,
                                row["neutral_f1-score"],
                                row.negative_precision,
                                row.negative_recall,
                                row["negative_f1-score"]])
            break
    columns = ["mpt",
            "algorithm",
            "hyperparameter",
            "weighted_avg_precision",
            "weighted_avg_recall",
            "weighted_avg_f1-score",
            "accuracy",
            "experiment_type",
            "metric_dump_id",
            "positive_precision",
            "positive_recall",
            "positive_f1-score",
            "neutral_precision",
            "neutral_recall",
            "neutral_f1-score",
            "negative_precision",
            "negative_recall",
            "negative_f1-score"]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df, folder + "best_result_per_mpt.csv")
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    experiments_df = helpers.load_dataset("/home/michael/MRes/actual_project/emotion_detection/" + n_grams + "/next_experiments.csv")
    for results_filename in results_files:
        emotion = results_filename.split("_")[0]
        if emotion == "best":
            continue
        experiments = experiments_df[experiments_df.emotion == emotion]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, emotion, experiments, results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
    return
def process_negation_handled_experiments(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    experiments_df = helpers.load_dataset("/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams + "/next_negation_handled_experiments.csv")
    for results_filename in results_files:
        mpt = results_filename.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        experiments = experiments_df[experiments_df.mpt == mpt]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, mpt, experiments, results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
    return
Example #11
def merge(dataset_type):
    print("      - Processing " + dataset_type + " files:")
    for file in ds.all_datasets:
        print("        - " + file)
        file_path = file.split("/")
        f_name = ds.output_data + "first_dataset_extraction/" + dataset_type + "/" + file_path[0] + "/" + file_path[1]
        df = helpers.load_dataset(f_name)
        if file == ds.all_datasets[0]:
            merge_hold = df
        else:
            merge_hold = pd.concat([merge_hold, df], sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name
Example #12
def tweet_extractor():
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data +
                                          "actual_keyword_matches/generic/" +
                                          file)
        specific_df = helpers.load_dataset(ds.output_data +
                                           "actual_keyword_matches/specific/" +
                                           file)
        print("      - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]
        # Carry over the original column names and add the four new ones.
        columns = list(df.columns) + ["matches", "source_file", "month", "year"]
        df["matches"] = ""
        df["source_file"] = ""
        df["tweet_time"] = df["tweet_time"].astype("datetime64")
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets, generic_tweets = pd.DataFrame(
            columns=columns), pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets,
                                          file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file,
                                         "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset = file.split("/")[0]
        filename = file.split("/")[1]

        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)

        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific
Example #13
def total_frequency_graph_generator(df, path):
    col_name = "total"
    filename = path + "total_freq.png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    zero = df[df[col_name] == 0]
    df = df[df[col_name] != 0]
    helpers.dataframe_to_csv(zero,
                             ds.output_data + "statistics/zero_matches.csv")
    df.set_index('keyword', drop=True, inplace=True)
    if len(df) > 0:
        ax = df[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("Keyword frequency")
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return [filename, ds.output_data + "statistics/zero_matches.csv"]
Example #14
def date_selection():
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        df_2013 = df[df.year == 2013]
        df_2013_8 = df_2013[df_2013.month == 8]
        df_2013_9 = df_2013[df_2013.month == 9]
        df_2013_10 = df_2013[df_2013.month == 10]
        df_2013_11 = df_2013[df_2013.month == 11]
        df_2013_12 = df_2013[df_2013.month == 12]
        df = df[(df.year == 2014) | (df.year == 2015) | (df.year == 2016) |
                (df.year == 2017) | (df.year == 2018)]
        df = pd.concat(
            [df_2013_8, df_2013_9, df_2013_10, df_2013_11, df_2013_12, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        emotion = file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['macro_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        helpers.dataframe_to_csv(results_df, folder + file)
        for index, row in results_df.iterrows():
            new_results.append([
                emotion, row.algorithm, row.hyperparameter,
                row.weighted_avg_precision, row.weighted_avg_recall,
                row["weighted_avg_f1-score"], row.accuracy,
                row.experiment_type, row.metric_dump_id,
                row.macro_avg_precision, row.macro_avg_recall,
                row["macro_avg_f1-score"], row[emotion + "_precision"],
                row[emotion + "_recall"], row[emotion + "_f1-score"],
                row["no_" + emotion + "_precision"],
                row["no_" + emotion + "_recall"],
                row["no_" + emotion + "_f1-score"]
            ])
            break
    # Generic per-emotion column names: each row holds the metrics for its own
    # emotion, so the columns are not named after any single emotion.
    columns = [
        "emotion", "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", "emotion_precision",
        "emotion_recall", "emotion_f1-score", "no_emotion_precision",
        "no_emotion_recall", "no_emotion_f1-score"
    ]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df,
                             folder + "best_result_per_emotion.csv")

# load generic and specific datasets and merge them together
dfs = []
for df in ds.all_datasets:
    dfs.append(helpers.load_dataset(ds.dataset + df))
dfs = pd.concat(dfs, ignore_index=True)
dfs['preprocessed_tweet_text'] = ""
# loop over dataframe, perform preprocessing on the tweet text
for index, row in dfs.iterrows():
    tweet_text = row.tweet_text
    tweet_text = lowercase_conversion(tweet_text)
    tweet_text = remove_urls(tweet_text)
    tweet_text = remove_accents(tweet_text)
    tweet_text = remove_usernames(tweet_text)
    tweet_text = transform_hashtags(tweet_text)
    tweet_text = contraction_expansion(tweet_text)
    tweet_text = remove_special_chars(tweet_text)

    # sorts out multiple spaces
    tweet_text = tweet_text.split()
    tweet_text = " ".join(tweet_text)

    dfs.at[index, "preprocessed_tweet_text"] = tweet_text

# Store processed data
helpers.dataframe_to_csv(dfs, ds.output_data + 'preprocessed_data.csv')

# print complete message
print('Preprocessing complete')
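The preprocessing helpers called in the loop above (lowercase_conversion, remove_urls, remove_usernames and the rest) are defined elsewhere in the project; a minimal sketch of what three of them might look like, assuming simple regex-based cleaning:

# Hypothetical sketches; the project's real preprocessing helpers may differ.
import re

def lowercase_conversion(text):
    # Normalise the tweet text to lower case.
    return text.lower()

def remove_urls(text):
    # Drop http/https links from the tweet text.
    return re.sub(r"https?://\S+", "", text)

def remove_usernames(text):
    # Drop @mentions from the tweet text.
    return re.sub(r"@\w+", "", text)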
Example #17
def store_metrics(metrics_dict, emotion, algorithm, modifier, n_grams):
    location = ds.metric_storage_location
    metric_id = get_new_metric_storage_identifier()
    metric_list_for_df = []

    for index in range(1,11):
        metric_list_for_df.append([
            metric_id,
            index,
            algorithm,
            modifier,
            n_grams,
            metrics_dict[emotion]["precision"][index - 1],
            metrics_dict[emotion]["recall"][index - 1],
            metrics_dict[emotion]["f1-score"][index - 1],
            metrics_dict[emotion]["support"][index - 1],
            metrics_dict["no_" + emotion]["precision"][index - 1],
            metrics_dict["no_" + emotion]["recall"][index - 1],
            metrics_dict["no_" + emotion]["f1-score"][index - 1],
            metrics_dict["no_" + emotion]["support"][index - 1],
            metrics_dict["accuracy"]["list"][index - 1],
            metrics_dict["macro avg"]["precision"][index - 1],
            metrics_dict["macro avg"]["recall"][index - 1],
            metrics_dict["macro avg"]["f1-score"][index - 1],
            metrics_dict["macro avg"]["support"][index - 1],
            metrics_dict["weighted avg"]["precision"][index - 1],
            metrics_dict["weighted avg"]["recall"][index - 1],
            metrics_dict["weighted avg"]["f1-score"][index - 1],
            metrics_dict["weighted avg"]["support"][index - 1],
        ])
    metric_list_for_df.append([
        metric_id,
        "average",
        algorithm,
        modifier,
        n_grams,
        metrics_dict[emotion]["avg"]["precision"],
        metrics_dict[emotion]["avg"]["recall"],
        metrics_dict[emotion]["avg"]["f1-score"],
        metrics_dict[emotion]["avg"]["support"],
        metrics_dict["no_" + emotion]["avg"]["precision"],
        metrics_dict["no_" + emotion]["avg"]["recall"],
        metrics_dict["no_" + emotion]["avg"]["f1-score"],
        metrics_dict["no_" + emotion]["avg"]["support"],
        metrics_dict["accuracy"]["avg"],
        metrics_dict["macro avg"]["avg"]["precision"],
        metrics_dict["macro avg"]["avg"]["recall"],
        metrics_dict["macro avg"]["avg"]["f1-score"],
        metrics_dict["macro avg"]["avg"]["support"],
        metrics_dict["weighted avg"]["avg"]["precision"],
        metrics_dict["weighted avg"]["avg"]["recall"],
        metrics_dict["weighted avg"]["avg"]["f1-score"],
        metrics_dict["weighted avg"]["avg"]["support"],
    ])

    columns = ["metric_dump_id",
                "fold",
                "algorithm",
                "modifier",
                "n_grams",
                emotion + "_precision",
                emotion + "_recall",
                emotion + "_f1-score",
                emotion + "_support",
                "no_" + emotion + "_precision",
                "no_" + emotion + "_recall",
                "no_" + emotion + "_f1-score",
                "no_" + emotion + "_support",
                "accuracy",
                "macro_avg_precision",
                "macro_avg_recall",
                "macro_avg_f1-score",
                "macro_avg_support",
                "weighted_avg_precision",
                "weighted_avg_recall",
                "weighted_avg_f1-score",
                "weighted_avg_support"]

    metric_df = pd.DataFrame(metric_list_for_df, columns=columns)
    helpers.dataframe_to_csv(metric_df, location + str(metric_id) + ".csv")
    return metric_id
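# store_metrics above expects a metrics_dict built from ten cross-validation
# folds. A hypothetical sketch of the assumed shape (the real structure is
# produced elsewhere in the project and may differ): the per-class keys, the
# "macro avg" and "weighted avg" keys each hold ten per-fold values plus an
# "avg" entry, while "accuracy" holds a "list"/"avg" pair.
def empty_metrics_dict(emotion, folds=10):
    def per_class():
        return {"precision": [0.0] * folds, "recall": [0.0] * folds,
                "f1-score": [0.0] * folds, "support": [0] * folds,
                "avg": {"precision": 0.0, "recall": 0.0,
                        "f1-score": 0.0, "support": 0}}
    return {emotion: per_class(), "no_" + emotion: per_class(),
            "macro avg": per_class(), "weighted avg": per_class(),
            "accuracy": {"list": [0.0] * folds, "avg": 0.0}}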
def processing():
    tagged_keywords = load_tagged_keywords()
    match_files = find_match_files()
    files_created = []
    for dataset in match_files:
        for file in match_files[dataset]:
            print("      - " + file)
            new_store = {}
            generic_matches = {}
            specific_matches = {}
            matches_df = load_match_file(file)
            for match_index, match_row in matches_df.iterrows():
                generic_counter = 0
                specific_counter = 0
                tempstore = []
                match_row.matches = match_row.matches.strip("''][").split(
                    "', '")
                match_row.matches = remove_duplicates(match_row.matches)
                matches_df.matches.at[match_index] = match_row.matches
                for keyword_index, keyword_row in tagged_keywords.iterrows():
                    if comparison(match_row.matches, keyword_row['split']):
                        if keyword_row.tag == "generic":
                            generic_counter += 1
                        if keyword_row.tag == "specific":
                            specific_counter += 1
                        tempstore.append(keyword_row.term)

                if specific_counter != 0:
                    specific_matches[match_row.tweet_id] = tempstore
                else:
                    if generic_counter != 0:
                        generic_matches[match_row.tweet_id] = tempstore
            generic_data_list = []
            specific_data_list = []

            for tweet_id in generic_matches:
                generic_data_list.append([tweet_id, generic_matches[tweet_id]])
            for tweet_id in specific_matches:
                specific_data_list.append(
                    [tweet_id, specific_matches[tweet_id]])

            generic_file_path = ds.output_data + "actual_keyword_matches/generic/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(generic_file_path)
            generic_file_name = generic_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(generic_data_list,
                                            '"tweet_id","matches"',
                                            generic_file_name)
            files_created.append(generic_file_name)

            specific_file_path = ds.output_data + "actual_keyword_matches/specific/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(specific_file_path)
            specific_file_name = specific_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(specific_data_list,
                                            '"tweet_id","matches"',
                                            specific_file_name)
            files_created.append(specific_file_name)
            file_path = ds.output_data + "single_keyword_matches_dup_removed/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(file_path)
            file_name = file_path + file.split("/")[-1]
            helpers.dataframe_to_csv(matches_df, file_name)
            files_created.append(file_name)
    return files_created
Example #19
def import_results(folder):
    unigram_df = helpers.load_dataset(ds.output_data + "unigrams/" + folder +
                                      "/best_result_per_mpt.csv")
    bigram_df = helpers.load_dataset(ds.output_data + "bigrams/" + folder +
                                     "/best_result_per_mpt.csv")
    trigram_df = helpers.load_dataset(ds.output_data + "trigrams/" + folder +
                                      "/best_result_per_mpt.csv")
    unigram_bigram_df = helpers.load_dataset(ds.output_data +
                                             "unigrams_bigrams/" + folder +
                                             "/best_result_per_mpt.csv")
    unigram_bigram_trigram_df = helpers.load_dataset(
        ds.output_data + "unigrams_bigrams_trigrams/" + folder +
        "/best_result_per_mpt.csv")
    new_results, res_for_storage = [], []
    for idx in range(0, 100, 10):
        unigram = unigram_df[unigram_df.mpt == idx]
        bigram = bigram_df[bigram_df.mpt == idx]
        trigram = trigram_df[trigram_df.mpt == idx]
        unigram_bigram = unigram_bigram_df[unigram_bigram_df.mpt == idx]
        unigram_bigram_trigram = unigram_bigram_trigram_df[
            unigram_bigram_trigram_df.mpt == idx]
        new_results.append([
            idx, unigram['weighted_avg_f1-score'].tolist()[0],
            bigram['weighted_avg_f1-score'].tolist()[0],
            trigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram_trigram['weighted_avg_f1-score'].tolist()[0]
        ])
        res_for_storage.append([
            idx, unigram['weighted_avg_f1-score'].tolist()[0],
            unigram['algorithm'].tolist()[0],
            unigram['hyperparameter'].tolist()[0],
            bigram['weighted_avg_f1-score'].tolist()[0],
            bigram['algorithm'].tolist()[0],
            bigram['hyperparameter'].tolist()[0],
            trigram['weighted_avg_f1-score'].tolist()[0],
            trigram['algorithm'].tolist()[0],
            trigram['hyperparameter'].tolist()[0],
            unigram_bigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram['algorithm'].tolist()[0],
            unigram_bigram['hyperparameter'].tolist()[0],
            unigram_bigram_trigram['weighted_avg_f1-score'].tolist()[0],
            unigram_bigram_trigram['algorithm'].tolist()[0],
            unigram_bigram_trigram['hyperparameter'].tolist()[0]
        ])
    new_results_df = pd.DataFrame(new_results,
                                  columns=[
                                      "mpt", "Unigrams", "Bigrams", "Trigrams",
                                      "Unigrams & Bigrams",
                                      "Unigrams, Bigrams and Trigrams"
                                  ])
    cols = [
        "mpt", "unigram_f1", "unigram_algorithm", "unigram_hyperparameter",
        "bigram_f1", "bigram_algorithm", "bigram_hyperparameter", "trigram_f1",
        "trigram_algorithm", "trigram_hyperparameter", "unigram_bigram_f1",
        "unigram_bigram_algorithm", "unigram_bigram_hyperparameter",
        "unigram_bigram_trigram_f1", "unigram_bigram_trigram_algorithm",
        "unigram_bigram_trigram_hyperparameter"
    ]
    res_for_storage_df = pd.DataFrame(res_for_storage, columns=cols)
    helpers.dataframe_to_csv(
        res_for_storage_df,
        ds.output_data + "results/best_results_" + folder + ".csv")
    return new_results_df
import helpers
import dataset as ds

df = helpers.load_dataset(ds.dataset + ds.data)

new_df = df[df.is_retweet == False]
helpers.dataframe_to_csv(new_df, ds.output_data + "rt_removed_unlabelled.csv")
Example #21
import helpers
import dataset as ds

df = helpers.load_dataset(ds.output_data + "nrc_labelled.csv")

for emotion in ds.emotion_list:
    emotion_str = emotion + "_str"
    df[emotion_str] = ""

for index, row in df.iterrows():
    for emotion in ds.emotion_list:
        emotion_str = emotion + "_str"
        if row[emotion] == 0:
            df.at[index, emotion_str] = "no_" + emotion
        else:
            df.at[index, emotion_str] = emotion

helpers.dataframe_to_csv(df, ds.output_data + "nrc_labelled_for_detection.csv")
print(df)
Example #22
counter["no_emotion"] = 0
counter["no_lexicon_matches"] = 0

df = helpers.load_dataset(ds.output_data + "nrc_labelled.csv")
print(len(df))

for index, row in df.iterrows():
    zero_emotion = 0
    for emotion in emotion_list:
        if row[emotion] != 0:
            counter[emotion] += 1
        else:
            zero_emotion += 1
    if zero_emotion == 8 and row.lexicon_matches != 0:
        counter["no_emotion"] += 1
    if row.lexicon_matches == 0:
        counter["no_lexicon_matches"] += 1

temp_list = []
for emotion in counter:
    temp_list.append([emotion, counter[emotion]])

cols = ["emotion", "count"]

emo_df = pd.DataFrame(temp_list, columns=cols)

emo_df = emo_df.sort_values(by=['count'], ascending=False)
print(emo_df)

helpers.dataframe_to_csv(emo_df, ds.output_data + "emotion_counts.csv")
for emotion in emotion_list:
    df[emotion] = 0
df["lexicon_matches"] = 0
df["total_words"] = 0

for index, row in df.iterrows():
    print(index)
    match_data = {}
    for emotion in emotion_list:
        match_data[emotion] = 0
    lexicon_matches = 0
    for word in row.preprocessed_tweet_text.split():
        word = str(word)
        word_match_data = match_checker(word)
        if word_match_data != None:
            lexicon_matches += 1
            for emotion, data in zip(emotion_list, word_match_data):
                match_data[emotion] += data
    # write the per-emotion counts and the match statistics back to the row
    for emotion in emotion_list:
        df.at[index, emotion] = match_data[emotion]
    df.at[index, "lexicon_matches"] = lexicon_matches
    df.at[index, "total_words"] = len(row.preprocessed_tweet_text.split())

helpers.dataframe_to_csv(df, ds.output_data + "nrc_labelled.csv")
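match_checker, used in the labelling loop above, is defined elsewhere in the project; a hypothetical sketch assuming an NRC-style lexicon held as a DataFrame indexed by word, with one 0/1 column per emotion in emotion_list:

# Hypothetical sketch of match_checker; the real lexicon lookup may differ.
def match_checker(word):
    # Return the per-emotion 0/1 flags for a word, or None when the word is
    # not in the lexicon. nrc_lexicon is an assumed DataFrame indexed by word.
    if word not in nrc_lexicon.index:
        return None
    row = nrc_lexicon.loc[word]
    return [int(row[emotion]) for emotion in emotion_list]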
Example #24
def processing():
    create_storage_dataframes()
    create_freq_matrix()
    global generic_list, specific_list, year_freq_df, month_year_freq_df, generic_specific_freq_df
    created_files = []
    file_path = ds.output_data + "time_filtered_dataset_extraction/"
    generic_tweets = file_path + "generic.csv"
    specific_tweets = file_path + "specific.csv"
    all_tweets_df = pd.concat([
        helpers.load_dataset(specific_tweets),
        helpers.load_dataset(generic_tweets)
    ])
    all_tweets_df.reset_index(inplace=True, drop=True)
    limit = len(all_tweets_df) + 1
    counter = 0
    for index, row in all_tweets_df.iterrows():
        if counter == limit:
            break
        counter += 1
        if index % 100 == 0:
            print("      -", str(index), "/", str(len(all_tweets_df)))
        generic_matches = []
        specific_matches = []
        # preprocessing
        row.matches = row.matches.strip("''][").split("', '")
        for match in row.matches:
            generic_check = keyword_checker(match, generic_list)
            generic_matches.append(generic_check)
            specific_check = keyword_checker(match, specific_list)
            specific_matches.append(specific_check)
            if generic_check | specific_check:
                year_freq_update(match, str(row.year))
                month_year_freq_update(match, str(row.month), str(row.year))
        if True in generic_matches:
            if True in specific_matches:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic_specific")
            else:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic")
        else:
            for match in row.matches:
                generic_specific_freq_update(match, "specific")
        freq_matrix_update(row.matches)

    #store dataframe
    storage_path = ds.output_data + "statistics/"
    helpers.path_checker(storage_path)
    # Store year frequency
    helpers.dataframe_to_csv(year_freq_df, storage_path + "year_frequency.csv")
    created_files.append(storage_path + "year_frequency.csv")
    # Store month year frequency
    helpers.dataframe_to_csv(month_year_freq_df,
                             storage_path + "month_year_frequency.csv")
    created_files.append(storage_path + "month_year_frequency.csv")
    # Store generic specific frequency
    helpers.dataframe_to_csv(generic_specific_freq_df,
                             storage_path + "generic_specific_frequency.csv")
    created_files.append(storage_path + "generic_specific_frequency.csv")
    # Store frequency matrix
    helpers.dataframe_to_csv(freq_matrix,
                             storage_path + "frequency_matrix.csv")
    created_files.append(storage_path + "frequency_matrix.csv")
    month_year_freq_output_files = preprocess_month_year_graph(
        month_year_freq_df)
    year_freq_output_files = preprocess_year_graph(year_freq_df)
    frequency_total_output_files = preprocess_frequency_total_graph(
        generic_specific_freq_df)
    for file in month_year_freq_output_files:
        created_files.append(file)
    for file in year_freq_output_files:
        created_files.append(file)
    for file in frequency_total_output_files:
        created_files.append(file)
    return created_files
import helpers
import dataset as ds

df = helpers.load_dataset(ds.output_data + 'sentiwordnet_labelled.csv')

df2 = df[(df.sentiment_class == "Positive") |
         (df.sentiment_class == "Negative") |
         (df.sentiment_class == "Neutral")]
df3 = df[(df.sentiment_class != "Positive")
         & (df.sentiment_class != "Negative") &
         (df.sentiment_class != "Neutral")]
print(df3)
helpers.dataframe_to_csv(
    df2, ds.output_data + "sentiwordnet_labelled_unclassified_removed.csv")
helpers.dataframe_to_csv(df3, ds.output_data + "unclassified.csv")
Example #26
def store_dataset(df):
    helpers.path_checker(ds.output_data)
    helpers.dataframe_to_csv(
        df, ds.output_data + "sentiwordnet_labelled.csv")