Example #1
        sequence_lengths   = the sequence lengths with which the model made the predictions.
        input_settings     = the input settings used: a subset of 
                             {'dialogue act', 'speaker', 'level', 'utterance length'}. The list must consist of 
                             abbreviations in the format '_<first letter>', for example ['_d', '_d_u'], which first uses 
                             only dialogue acts and then dialogue acts and utterance lengths (see the sketch after 
                             this docstring).

    The script outputs .csv files containing the accuracy scores per dialogue act (rows), with one column for each
    combination of level and accuracy metric.
"""

from data import Preprocessing, Statistics

models = ['unweighted', 'weighted']
sequence_length = 3

# Reads in the data containing the predictions of a model under the given settings.
filename = 'analyses/old_model_sequence_length_3_test_set_predictions.csv'
data = Preprocessing(filename)
statistics = Statistics(data)

# Gets the precision, recall and f1-score for every dialogue act for different model input settings.
for weighted in models:
    accuracy_dict = dict()
    for dialogue_act in data.DAs:
        columns = ['labels_' + weighted, 'predictions_' + weighted]
        precision, recall, f1 = statistics.precision_recall_f1(
            data.data, columns, dialogue_act)

        if 'all_levels' not in accuracy_dict:
            accuracy_dict['all_levels'] = dict()
            accuracy_dict['all_levels']['p'] = dict()
            accuracy_dict['all_levels']['r'] = dict()
            accuracy_dict['all_levels']['f1'] = dict()
hidden_dimensions = [30, 35, 40, 45, 50]

# Hyperparameters that are fixed.
levels = [1, 2, 3, 4]
weighted = 'weighted'
sequence_length = 3
k = 10
number_of_layers = 1
learning_rate = 0.001
batch_size = 16
epochs = 20

# TODO: store convergence in a table to determine the number of epochs.

# Preprocesses the data for the sequence length.
preprocessed = Preprocessing('data/train_belc_das_2020.csv')
preprocessed.save_dialogues_as_matrices(sequence_length=sequence_length, store_index=True)

# Loops over all the settings, computes the accuracy and outputs it into a data frame.
output = np.empty((1, 3)).astype(str)

for hidden_dimension in hidden_dimensions:
    print("Cross-validation for hidden dimension {}".format(hidden_dimension))

    data = DataSet()

    # Performs cross-validation.
    cross_validation = CrossValidation(data, k)
    cross_validation.make_k_fold_cross_validation_split(levels)
    scores = cross_validation.validate(learning_rate,
                                       batch_size,
Example #3
""" This file plots all the statistics graphs needed for the thesis report. """
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data import Statistics, Preprocessing

# Preprocesses the data.
preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)

#######################################################################################################################
#                                 PLOTTING DIALOGUE ACT DISTRIBUTIONAL DATA                                           #
#######################################################################################################################
statistics.get_da_distribution()
statistics.get_da_distributions(['participant', 'interviewer'], [1, 2, 3, 4])
statistics.get_da_counts(preprocessed.data, 'dialogue_act', [1, 2, 3, 4])
statistics.get_average_utterance_length(['participant', 'interviewer'],
                                        [1, 2, 3, 4])
statistics.get_speaker_ratios([1, 2, 3, 4])

# Get the dialogue act distribution.
distribution_order = pd.read_csv('analyses/dialogue_act_distribution.csv',
                                 index_col=[0],
                                 header=None)

# Plot the dialogue act distribution.
sns.set_palette(sns.color_palette('Blues_r', 13))
graph = distribution_order.plot.bar()
plt.legend().remove()
_, labels = plt.xticks()
graph.set_xticklabels(labels,
input_classes = ['dialogue_act', 'speaker', 'level', 'utterance_length']
embedding_dimensions = [1, 7, 2]

# Training hyperparameters.
learning_rate = 0.001
batch_size = 16
epochs = 20

data_frame = pd.read_csv('data/test_belc_das_2020.csv')

# Makes predictions for the weighted and unweighted model and stores them.
for sequence_length in sequence_lengths:
    print("Prediction performance for sequence length " + str(sequence_length))

    # Preprocesses the training data for the sequence length.
    preprocessed_train = Preprocessing('data/train_belc_das_2020.csv')
    preprocessed_train.save_dialogues_as_matrices(
        sequence_length=sequence_length, store_index=True)
    preprocessed_train.save_dialogue_ids()
    preprocessed_train.save_class_representation()
    train_data = DataSet()

    # Preprocesses the test data for the sequence length.
    preprocessed_test = Preprocessing('data/test_belc_das_2020.csv')
    preprocessed_test.save_dialogues_as_matrices(
        sequence_length=sequence_length, store_index=True)
    preprocessed_test.save_dialogue_ids()
    preprocessed_test.save_class_representation()
    test_data = DataSet()

    # Initialises model.
    The data is read in from a .csv file containing the predictions of different models 
    and input settings. Only the labels will be used as these stay the same for all the model settings and baselines.

    The variables that need to be specified:
        filename    = a .csv file containing the labels belonging to the inputs.
        baselines   = a list containing the baselines used, chosen from {'majority_class', 'random', 'weighted_random'}
                      (a sketch of what these baselines predict follows this docstring).

    The script outputs a .csv file containing the accuracy scores of the dialogue acts (rows) and the accuracy 
    metric per baseline (columns).       
"""

filename = 'analyses/n_gram_models_predictions.csv'
baselines = ['majority_class', 'random', 'weighted_random']

# Reads in the data containing the predictions of a model under the given settings.
data = Preprocessing(filename)
for baseline in baselines:
    data.add_baseline(baseline)
statistics = Statistics(data)

data.data.to_csv('analyses/simple_baselines_predictions.csv')

# Gets the precision, recall and f1-score for every dialogue act for different baselines.
for baseline in baselines:
    accuracy_dict = dict()
    columns = ['labels_2_gram', baseline]
    confusion_matrix = (
        statistics.get_normalised_confusion_matrix(data.data, columns) *
        100).round(2)
    confusion_matrix.to_csv('analyses/' + baseline + '_error_analysis.csv')
    for dialogue_act in data.DAs:
    DAs = sorted(list(set(test_data['dialogue_act'])))
    for i in range(len(DAs)):
        input_frame = input_frame.replace({str(i) + '.0': DAs[i]})

    # Adds the labels and predictions for test set as columns to the original test data in one DataFrame.
    test_data = test_data.merge(input_frame, how='left', left_index=True, right_index=True)

# Saves the DataFrame containing all the labels and predictions for the different n_gram models.
test_data.to_csv('analyses/n_gram_models_predictions.csv')

########################################################################################################################
#                               COMPUTING THE PRECISION, RECALL AND F1-SCORES                                          #
########################################################################################################################

# Reads in the data containing the predictions of a model under the given settings.
data = Preprocessing('analyses/n_gram_models_predictions.csv')
statistics = Statistics(data)
statistics.get_da_distribution()

# Gets the precision, recall and f1-score for every dialogue act for different model input settings.
for n_gram in n:
    accuracy_dict = dict()
    accuracy_frame = data.data[['labels_' + str(n_gram) + '_gram', 'predictions_' + str(n_gram) + '_gram']].dropna()
    accuracy = accuracy_frame[accuracy_frame['labels_' + str(n_gram) + '_gram'] ==
                              accuracy_frame['predictions_' + str(n_gram) + '_gram']].shape[0] / accuracy_frame.shape[0]
    print("The accuracy of the " + str(n_gram) + "-gram model is: ", accuracy)

    columns = ['labels_' + str(n_gram) + '_gram', 'predictions_' + str(n_gram) + '_gram']
    confusion_matrix = (statistics.get_normalised_confusion_matrix(data.data, columns) * 100).round(2)
    confusion_matrix.to_csv('analyses/' + str(n_gram) + 'gram_error_analysis.csv')
    for dialogue_act in data.DAs:
""" This script preprocesses the data and saves it into files. """
from data import Preprocessing, Statistics

preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
preprocessed.save_dialogue_ids()
preprocessed.save_class_representation()
# preprocessed.save_dialogues_as_matrices_old(sequence_length=7)
preprocessed.save_dialogues_as_matrices()
batch_size = 16
epochs = 20

for weighted in models:

    if weighted == 'weighted':
        hidden_nodes = 20
    elif weighted == 'unweighted':
        hidden_nodes = 12
    output = np.empty((1, 3))
    for sequence_length in sequence_lengths:
        print(
            "Cross-validation for sequence length {}".format(sequence_length))

        # Preprocesses the data for the sequence length.
        preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
        preprocessed.save_dialogues_as_matrices(
            sequence_length=sequence_length, store_index=True)
        preprocessed.save_dialogue_ids()
        preprocessed.save_class_representation()
        data_frame = preprocessed.data
        data = DataSet()

        # Initialise cross validator.
        cross_validation = CrossValidation(data, k)
        cross_validation.make_k_fold_cross_validation_split(levels)

        input_short = '_'.join([c[0] for c in input_classes])
        print("Cross-validation for input {}".format(input_classes))

        # Performs cross-validation.