Example #1
        input_settings     = the input settings used: a subset of
                             {'dialogue act', 'speaker', 'level', 'utterance length'}. The list must consist of
                             abbreviations in the format '_<first letter>', for example ['_d', '_d_u'], which first uses
                             only dialogue acts and then dialogue acts and utterance lengths.

    The script outputs .csv files containing the accuracy scores of the dialogue acts (rows) per level and accuracy
    metric (columns).
"""

from data import Preprocessing, Statistics

models = ['unweighted', 'weighted']
sequence_length = 3

# Reads in the data containing the predictions of a model under the given settings.
filename = 'analyses/old_model_sequence_length_' + str(sequence_length) + '_test_set_predictions.csv'
data = Preprocessing(filename)
statistics = Statistics(data)

# Gets the precision, recall and f1-score for every dialogue act for different model input settings.
for weighted in models:
    accuracy_dict = dict()
    for dialogue_act in data.DAs:
        columns = ['labels_' + weighted, 'predictions_' + weighted]
        precision, recall, f1 = statistics.precision_recall_f1(
            data.data, columns, dialogue_act)

        if 'all_levels' not in accuracy_dict:
            accuracy_dict['all_levels'] = dict()
            accuracy_dict['all_levels']['p'] = dict()
            accuracy_dict['all_levels']['r'] = dict()
            accuracy_dict['all_levels']['f1'] = dict()
        accuracy_dict['all_levels']['p'][dialogue_act] = precision
        accuracy_dict['all_levels']['r'][dialogue_act] = recall
        accuracy_dict['all_levels']['f1'][dialogue_act] = f1
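    # The script is truncated here; per its docstring it presumably ends by
    # writing the collected scores to a .csv file. A minimal sketch of that
    # step, assuming pandas (the output filename is illustrative, not from
    # the original):
    import pandas as pd
    accuracy_frame = pd.DataFrame(accuracy_dict['all_levels'])
    accuracy_frame.to_csv('analyses/old_model_' + weighted + '_sequence_length_'
                          + str(sequence_length) + '_accuracy.csv')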

Example #2

from data import Preprocessing, Statistics
import pandas as pd

preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)
# Scratch driver: each call below computes one statistic over the corpus; uncomment the ones needed.
# statistics.get_average_utterance_length(['participant', 'interviewer'], [1, 2, 3, 4])
# statistics.get_da_distribution()
# statistics.get_da_distributions(['participant', 'interviewer'], [1, 2, 3, 4])
# statistics.get_bigram_distribution()
# statistics.get_most_common_bigrams(10, [1, 2, 3, 4])
# data = pd.read_csv('analyses/unweighted_model_sequence_length_3_predictions.csv', index_col=[0])
# statistics.get_da_counts(data, 'dialogue_act', [1, 2, 3, 4])
# statistics.get_n_dialogues_average_length([1, 2, 3, 4])

Example #3
''' This file plots all the statistics graphs needed for the thesis report. '''

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data import Statistics, Preprocessing

# Preprocesses the data.
preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)

#######################################################################################################################
#                                 PLOTTING DIALOGUE ACT DISTRIBUTIONAL DATA                                           #
#######################################################################################################################
statistics.get_da_distribution()
statistics.get_da_distributions(['participant', 'interviewer'], [1, 2, 3, 4])
statistics.get_da_counts(preprocessed.data, 'dialogue_act', [1, 2, 3, 4])
statistics.get_average_utterance_length(['participant', 'interviewer'],
                                        [1, 2, 3, 4])
statistics.get_speaker_ratios([1, 2, 3, 4])

# Get the dialogue act distribution written out by the get_da_distribution call above.
distribution_order = pd.read_csv('analyses/dialogue_act_distribution.csv',
                                 index_col=[0],
                                 header=None)

# Plot the dialogue act distribution.
sns.set_palette(sns.color_palette('Blues_r', 13))
graph = distribution_order.plot.bar()
plt.legend().remove()
_, labels = plt.xticks()
graph.set_xticklabels(labels, rotation=45)  # remaining arguments truncated in the source; rotation is an assumed completion
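# The script presumably continues by finishing and saving the figure; a
# minimal, assumed ending (the output path is illustrative):
plt.tight_layout()
plt.savefig('analyses/dialogue_act_distribution.png')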

Example #4
from data import Preprocessing, Statistics
import pandas as pd
import matplotlib.pyplot as plt

# Sets the font size of the x-axis tick labels.
plt.rcParams['xtick.labelsize'] = 6

weighted = 'unweighted'  # which model variant to plot: 'unweighted' or 'weighted'
sequence_lengths = [3]
input_settings = ['_d', '_d_s', '_d_s_l', '_d_s_l_u']
baselines = ['majority_class', 'random', 'weighted_random']
colors = ['b', 'r', 'y', 'g']

# Initialises variables to be defined later.
names = []
x = []

preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)
da_sorted_by_occurrence = list(statistics.get_da_distribution().index)

for sequence_length in sequence_lengths:

    # Initialises the plot format.
    fig, ax = plt.subplots()

    # Loads the precision, recall and f1-scores for every dialogue act for the different model input settings.
    for input_setting in input_settings:

        # Loads in the data for the plots.
        filename = 'analyses/' + weighted + '_model_sequence_length_' + str(sequence_length) + input_setting + \
                   '_accuracy.csv'
        accuracies = pd.read_csv(filename, index_col=[0], header=[0, 1])
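        # The plotting body of this loop is truncated in the source. A minimal
        # sketch of a presumable continuation, assuming the accuracy columns
        # form a (level, metric) MultiIndex with an 'all_levels' entry and an
        # 'f1' metric, as in the companion scripts:
        f1_scores = accuracies.loc[da_sorted_by_occurrence, ('all_levels', 'f1')]
        ax.plot(range(len(f1_scores)), f1_scores.values, label=input_setting)
    ax.legend()
    plt.show()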

Example #5

    The variables that need to be specified:
        filename    = a .csv file containing the labels belonging to the inputs.
        baselines   = a list containing the baselines used; choose from {'majority_class', 'random', 'weighted_random'}.

    The script outputs a .csv file containing the accuracy scores of the dialogue acts (rows) and the accuracy
    metric per baseline (columns).
"""

from data import Preprocessing, Statistics
import pandas as pd

filename = 'analyses/n_gram_models_predictions.csv'
baselines = ['majority_class', 'random', 'weighted_random']

# Reads in the data containing the predictions of a model under the given settings.
data = Preprocessing(filename)
for baseline in baselines:
    data.add_baseline(baseline)
statistics = Statistics(data)

# Saves the data with the added baseline prediction columns.
data.data.to_csv('analyses/simple_baselines_predictions.csv')

# Gets the precision, recall and f1-score for every dialogue act for different baselines.
for baseline in baselines:
    accuracy_dict = dict()
    columns = ['labels_2_gram', baseline]
    confusion_matrix = (
        statistics.get_normalised_confusion_matrix(data.data, columns) *
        100).round(2)
    confusion_matrix.to_csv('analyses/' + baseline + '_error_analysis.csv')
    for dialogue_act in data.DAs:
        precision, recall, f1 = statistics.precision_recall_f1(
            data.data, columns, dialogue_act)
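        # The loop is truncated in the source; presumably the scores are
        # collected and written out as in the companion scripts. A minimal,
        # assumed completion (the output path is illustrative):
        accuracy_dict[dialogue_act] = {'p': precision, 'r': recall, 'f1': f1}
    pd.DataFrame(accuracy_dict).transpose().to_csv('analyses/' + baseline + '_accuracy.csv')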

Example #6

    # Maps numeric class indices (stored as '<i>.0' strings) back to the dialogue act names.
    for i in range(len(DAs)):
        input_frame = input_frame.replace({str(i) + '.0': DAs[i]})

    # Adds the labels and predictions for test set as columns to the original test data in one DataFrame.
    test_data = test_data.merge(input_frame, how='left', left_index=True, right_index=True)

# Saves the DataFrame containing all the labels and predictions for the different n_gram models.
test_data.to_csv('analyses/n_gram_models_predictions.csv')

########################################################################################################################
#                               COMPUTING THE PRECISION, RECALL AND F1-SCORES                                          #
########################################################################################################################

# Reads in the data containing the predictions of a model under the given settings.
data = Preprocessing('analyses/n_gram_models_predictions.csv')
statistics = Statistics(data)
statistics.get_da_distribution()

# Gets the accuracy, precision, recall and f1-score for every dialogue act for the different n-gram models.
for n_gram in n:  # 'n', the list of n-gram orders, is defined in the truncated part of this script
    accuracy_dict = dict()
    accuracy_frame = data.data[['labels_' + str(n_gram) + '_gram', 'predictions_' + str(n_gram) + '_gram']].dropna()
    accuracy = accuracy_frame[accuracy_frame['labels_' + str(n_gram) + '_gram'] ==
                              accuracy_frame['predictions_' + str(n_gram) + '_gram']].shape[0] / accuracy_frame.shape[0]
    print("The accuracy of the " + str(n_gram) + "-gram model is: ", accuracy)

    columns = ['labels_' + str(n_gram) + '_gram', 'predictions_' + str(n_gram) + '_gram']
    confusion_matrix = (statistics.get_normalised_confusion_matrix(data.data, columns) * 100).round(2)
    confusion_matrix.to_csv('analyses/' + str(n_gram) + 'gram_error_analysis.csv')
    for dialogue_act in data.DAs:
        columns = ['labels_' + str(n_gram) + '_gram', 'predictions_' + str(n_gram) + '_gram']
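        # Truncated in the source; presumably this loop continues like the
        # baseline script above, computing and collecting per-class scores:
        precision, recall, f1 = statistics.precision_recall_f1(
            data.data, columns, dialogue_act)
        accuracy_dict[dialogue_act] = {'p': precision, 'r': recall, 'f1': f1}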

Example #7

        labels      predictions
3       a           a
4       b           a
5       b           b
6       b           c
7       b           c
8       c           a
9       c           b
10      c           a
11      c           b
12      d           a
13      b           e


"""

statistics = Statistics(data)  # 'data' and 'classes' are defined in the truncated part of this test script
for class_name in classes:
    precision, recall, f1 = statistics.precision_recall_f1(
        data, ['labels', 'predictions'], class_name)
    print(class_name + ' precision is: ' + str(round(precision, 4)))
    print(class_name + ' recall is: ' + str(round(recall, 4)))
    print(class_name + ' f1 is: ' + str(round(f1, 4)))
""" 
Desired output:

a precision = 2/6 = 0.33
a recall = 2/4 = 0.5
a f1 = 2 * (0.33 * 0.5) / (0.33 + 0.5) = 0.4

b precision = 1/4 = 0.25
b recall = 1/5 = 0.20
b f1 = 2 * (0.25 * 0.20) / (0.25 + 0.20) = 0.22

Example #8
        input_settings     = the input settings used: a subset of
                             {'dialogue act', 'speaker', 'level', 'utterance length'}. The list must consist of
                             abbreviations in the format '_<first letter>', for example ['_d', '_d_u'], which first uses
                             only dialogue acts and then dialogue acts and utterance lengths.

    The script outputs .csv files containing the normalised distribution of predictions (columns) for each label (rows).
"""

from data import Preprocessing, Statistics

settings = ['unweighted', 'weighted']
sequence_lengths = [2, 3, 5, 7, 10, 15, 20]
levels = [1, 2, 3, 4]

# Reads in the data containing the predictions of the model with sentence embeddings for different sequence lengths.
filename = 'analyses/new_model_predictions.csv'
data = Preprocessing(filename)
statistics = Statistics(data)

# Computes the confusion matrix for different sequence lengths.
for sequence_length in sequence_lengths:
    columns = [
        'labels_seq_len_' + str(sequence_length),
        'predictions_seq_len_' + str(sequence_length)
    ]
    matrix = (statistics.get_normalised_confusion_matrix(data.data, columns) *
              100).round(2)
    error_file = 'analyses/weighted_model_with_txt_sequence_length_' + str(
        sequence_length) + '_error_analysis.csv'
    matrix.to_csv(error_file)

    # Computes the confusion matrix for different levels with the sequence length.
    for level in levels:
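        # Truncated in the source; presumably the confusion matrix is computed
        # and saved per level in the same way. A minimal sketch, assuming the
        # predictions frame has a 'level' column (column name and output path
        # are assumptions):
        level_rows = data.data[data.data['level'] == level]
        level_matrix = (statistics.get_normalised_confusion_matrix(
            level_rows, columns) * 100).round(2)
        level_matrix.to_csv('analyses/weighted_model_with_txt_sequence_length_' +
                            str(sequence_length) + '_level_' + str(level) +
                            '_error_analysis.csv')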