Example no. 1
def _plot_difficulties():
    """
    Plots difficulties over time in a scatter plot and excludes the ones where the difficulty is constant 2 or 3.

    Folder:     Report/
    Plot name:  difficulties.pdf

    """
    print("Plotting difficulties...")

    resolution = 10  # resample every x seconds -> the bigger, the smoother
    fig, ax = plt.subplots()
    total = 0
    high = 0
    for idx, df in enumerate(sd.df_list):
        df = pl.transform_df_to_numbers(df)
        df_num_resampled = ph.resample_dataframe(df, resolution)
        ax.scatter(df_num_resampled['Time'], df_num_resampled['physDifficulty'], c=ph.green_color, alpha=0.3)
        high += len(df_num_resampled[df_num_resampled['physDifficulty'] == 3])
        total += len(df_num_resampled)

    # print('Across all logfiles, the users are in ' + str(round(high/total, 2)) + '% on level HIGH')

    ax.set_ylabel('Physical Difficulty')
    ax.set_xlabel('Time (s)',)
    plt.yticks([1, 2, 3], ['Low', 'Medium', 'High'])
    plt.title('Difficulties')

    ph.save_plot(plt, 'Report/', 'difficulties.pdf')
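
# Illustrative sketch (an assumption, not the project's actual helper): resample_dataframe(df, resolution),
# used throughout these examples via ph/hp, presumably downsamples a logfile to one row every
# `resolution` seconds. A minimal pandas version, assuming a numeric 'Time' column in seconds:
import pandas as pd

def resample_dataframe_sketch(df, resolution):
    df = df.copy()
    df['timedelta'] = pd.to_timedelta(df['Time'], unit='s')  # turn seconds into a resampleable index
    resampled = df.resample(str(resolution) + 'S', on='timedelta').mean(numeric_only=True)
    return resampled.reset_index()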
Example no. 2
def _crashes_per_obstacle_arrangement():
    """
    Plots the percentage of crashes vs the obstacle arrangement

    Folder:     Logfiles/
    Plot name:  barplot_%crashes_per_obstacle_arrangement.pdf

    """

    df = pd.concat(sd.df_list, ignore_index=True)
    conc_dataframes = transform_df_to_numbers(df)

    # For each obstacle arrangement, make a dictionary entry with a list [#occurrences, #crashes]
    obst_dict = {}

    # For each crash, find the corresponding row that tells us which obstacle the player crashed into.
    for index, row in conc_dataframes.iterrows():
        if row['Logtype'] == 'EVENT_CRASH':
            obstacle = row['obstacle']
            if obstacle in obst_dict:
                obst_dict[obstacle] = [
                    obst_dict[obstacle][0] + 1, obst_dict[obstacle][1] + 1
                ]
            else:
                obst_dict[obstacle] = [1, 1]
        if row['Logtype'] == 'EVENT_OBSTACLE':
            obstacle = row['obstacle']
            if obstacle in obst_dict:
                obst_dict[obstacle] = [
                    obst_dict[obstacle][0] + 1, obst_dict[obstacle][1]
                ]
            else:
                obst_dict[obstacle] = [1, 0]

    obst_dict = collections.OrderedDict(
        sorted(obst_dict.items(), key=lambda s: len(s[0])))
    index = obst_dict.keys()
    columns = ["#Occurences", "#Crashes", "Crashes in %"]
    data = np.zeros(shape=(len(index), 3))
    count = 0
    for key, value in obst_dict.items():
        data[count][0] = value[0]  # #Occurrences
        data[count][1] = value[1]  # #Crashes
        data[count][2] = value[1] / value[0] * 100
        count += 1

    df = pd.DataFrame(data, index=index, columns=columns)

    fig, ax = plt.subplots()
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(0.3) for i in ax.spines.values()]
    plt.xticks(rotation=90)
    plt.title('Crashes vs. obstacle arrangement')
    plt.ylabel('Crashes at this arrangement [%]')
    plt.xlabel('Obstacle arrangement')
    plt.bar(df.index, df['Crashes in %'])

    hp.save_plot(plt, 'Logfiles/',
                 'barplot_%crashes_per_obstacle_arrangement.pdf')
Example no. 3
def _plot_difficulty_vs_size_obstacle_scatter_plot():
    """
    Plots obstacle size vs. level difficulty in a scatter plot, where the marker size reflects how often each combination occurred

    Folder:     Logfiles/
    Plot name:  scatter_difficulty_vs_num_obstacles.pdf

    """

    plt.figure()
    values = _get_number_of_obstacles_per_difficulty()

    for i in [0, 1, 2]:
        li = values[5 * i:5 * i + 5]
        maximum = max(li) if (max(li) > 0) else 1
        values[5 * i:5 * i + 5] = [x / maximum * 2000 for x in li]
    fig, ax = plt.subplots()
    plt.title('Size of obstacle vs difficulty ')
    plt.ylabel('obstacle size')
    ax.yaxis.set_major_locator(
        MaxNLocator(integer=True))  # Only show whole numbers as obstacle sizes
    plt.xticks([1, 2, 3], ['Low', 'Medium', 'High'])

    plt.xlabel('Difficulty')
    x = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    y = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4]

    plt.scatter(x, y, s=values)

    hp.save_plot(plt, 'Logfiles/', 'scatter_difficulty_vs_num_obstacles.pdf')
Example no. 4
def _plot_hr_vs_difficulty_scatter_plot():
    """
    Plots the heartrate vs the difficulty in a scatter plot

    Folder:     Logfiles/
    Plot name:  scatter_difficulty_vs_heartrate.pdf

    """

    df = pd.concat(sd.df_list, ignore_index=True)
    df_num = transform_df_to_numbers(df)
    df_num.set_index('timedelta', inplace=True)
    resolution = 3

    # resample and take mean over difficulty. This means that a point can now have a difficulty "between"
    # Low/Medium/High, depending on how many seconds out of the resolution seconds it was on which level.
    avg_hr_df_resampled = df_num.resample(str(resolution) + 'S').mean()

    plt.title('Difficulty vs. heartrate')
    plt.ylabel('heartrate')
    plt.xlabel('Difficulty')
    x = avg_hr_df_resampled['physDifficulty']
    y = avg_hr_df_resampled['Heartrate']
    plt.scatter(x, y, s=30)
    plt.xticks([1, 2, 3], ['Low', 'Medium', 'High'])

    hp.save_plot(plt, 'Logfiles/', 'scatter_difficulty_vs_heartrate.pdf')
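
# Tiny worked example of the resampling comment above (illustrative, toy numbers): averaging the
# numeric difficulty inside one 3-second window yields a value "between" the discrete levels.
import pandas as pd

toy = pd.Series([1, 2, 2],  # Low, Medium, Medium within a single 3-second window
                index=pd.to_timedelta([0, 1, 2], unit='s'))
print(toy.resample('3S').mean().iloc[0])  # ~1.67, i.e. between Low (1) and Medium (2)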
Example no. 5
def _plot_heartrate_change():
    """
    Plots heartrate changes from one time point to the next for each logfile

    Folder:     Logfiles/Abs Heartrate Changes/
    Plot name:  histogram_hr_change_percentage_{logfile name}.pdf

    """

    bpm_changes = []  # Stores all percentage changes in HR per logfile

    X = []
    for idx, df in enumerate(sd.df_list):
        if not (df['Heartrate'] == -1).all():
            X.append(idx)
            resampled = hp.resample_dataframe(df, 1)
            percentage_change = np.diff(
                resampled['Heartrate']) / resampled['Heartrate'][:-1] * 100.
            x = percentage_change[np.logical_not(np.isnan(percentage_change))]
            bpm_changes.append(x)

    for idx, l in enumerate(bpm_changes):  # Histogram per user
        name = str(sd.names_logfiles[X[idx]])  # X maps back to the original logfile index (some logfiles are skipped)
        plt.figure()
        plt.title('Heartrate change for logfile ' + name)
        plt.ylabel('#Times HR changed')
        plt.xlabel('Change in Heartrate [%]')
        plt.hist(l, color=hp.blue_color)
        hp.save_plot(plt, 'Logfiles/Abs Heartrate Changes/',
                     'histogram_hr_change_percentage_' + name + '.pdf')
Example no. 6
def _plot_mean_and_std_hr_boxplot():
    """
    Plots mean and std bpm per user in a box-chart

    Folder:     Logfiles/
    Plot name:  boxplot_mean_hr_per_user.pdf

    """

    conc_dataframes = pd.concat(sd.df_list, ignore_index=True)

    df2 = conc_dataframes.pivot(columns=conc_dataframes.columns[1],
                                index=conc_dataframes.index)
    df2.columns = df2.columns.droplevel()
    conc_dataframes[['Heartrate', 'userID']].boxplot(by='userID',
                                                     grid=False,
                                                     sym='r+')

    names = [n[:2] for n in sd.names_logfiles]
    locs, labels = plt.xticks()  # Get locations and labels
    plt.xticks(locs, list(OrderedDict.fromkeys(names)))

    plt.ylabel('Heartrate (bpm)')
    plt.xlabel('User name')
    plt.title('')
    hp.save_plot(plt, 'Logfiles/', 'boxplot_mean_hr_per_user.pdf')
Example no. 7
def _plot_average_hr_over_all_logfiles():
    """
    Plots average heartrate over all logfiles

    Folder:     Logfiles/
    Plot name:  lineplot_average_heartrate.pdf

    """

    plt.subplots()
    plt.ylabel('Heartrate (bpm)')
    plt.xlabel('Playing time (s)')
    plt.title('Average Heartrate across all users')

    conc_dataframes = pd.concat(sd.df_list, ignore_index=True)
    time_df = conc_dataframes.groupby(['userID', 'logID'])['Time'].max()

    min_time = time_df.min()

    conc_dataframes = conc_dataframes[
        conc_dataframes['Time'] <
        min_time]  # Cut all dataframes to the same minimal length

    df_copy = conc_dataframes.copy()  # to prevent SettingWithCopyWarning
    # Take the mean over all logfiles at each timedelta
    avg_hr_df = df_copy.groupby(['timedelta'])[['timedelta', 'Heartrate']].mean()
    avg_hr_df.reset_index(inplace=True)
    avg_hr_df_resampled = hp.resample_dataframe(avg_hr_df, 10)

    plt.plot(avg_hr_df_resampled['Time'], avg_hr_df_resampled['Heartrate'])
    hp.save_plot(plt, 'Logfiles/', 'lineplot_average_heartrate.pdf')
def _plot_feature_distributions(X):
    """
    Plots the distribution of the features in separate plots

    :param X: Feature matrix

    Folder:     Features/Feature_distributions/
    Plot name:  histogram_{feature name}.pdf

    """

    print("Plotting histogram of each feature...")

    f_names = f_factory.feature_names
    for idx, feature in enumerate(f_names):
        x = X[:, idx]
        plt.figure()
        if feature == 'timedelta_to_last_obst':
            mean: float = np.mean(x)
            std_dev: float = np.std(x)
            plt.hist(x,
                     bins=np.arange(mean - 2 * std_dev, mean + 2 * std_dev,
                                    0.005))
        else:
            plt.hist(x)
            # add a 'best fit' line
            # sb.distplot(x)

        plt.title(feature)
        plt.tight_layout()
        filename = 'histogram_' + feature + '.pdf'
        hp.save_plot(plt, 'Features/Feature_distributions/', filename)
Example no. 9
def _plot_hr_of_dataframes():
    """
    Generates one heartrate plot for each dataframe (used to compare normalized heartrate to the original heartrate).
    Only works for real data at the moment, because the logfile names do not exist for synthesized data.

    Folder:     Logfiles/Heartrate/
    Plot name:  lineplot_hr_{logfile name}.pdf

    """

    print("Plotting heartrate of dataframes over time...")
    resolution = 5
    for idx, df in enumerate(sd.df_list):
        if not (df['Heartrate'] == -1).all():
            df_num_resampled = hp.resample_dataframe(df, resolution)
            # Plot Heartrate
            _, ax1 = plt.subplots()
            ax1.plot(df_num_resampled['Time'], df_num_resampled['Heartrate'],
                     hp.blue_color)
            ax1.set_xlabel('Playing time (s)')
            ax1.set_ylabel('Heartrate', color=hp.blue_color)
            ax1.tick_params('y', colors=hp.blue_color)

            filename = 'lineplot_hr_' + sd.names_logfiles[idx] + '.pdf'
            hp.save_plot(plt, 'Logfiles/Heartrate/', filename)
Example no. 10
def _plot_heat_map_of_grid_search(cv_results, Classifier):
    """
    Plots a heatmap over the hyperparameters, showing the corresponding roc_auc score
    Problem: We can only show 2 hyperparameters

    :param cv_results: cv_results of RandomizedSearchCV
    :param Classifier: the classifier (its .name is used as the plot file name)

    """

    params = ([
        list(set(v.compressed())) for k, v in cv_results.items()
        if k.startswith('param_')
    ])
    plt.figure()
    results_df = pd.DataFrame(cv_results)
    scores = np.array(results_df.mean_test_score).reshape(
        len(params[0]), len(params[1]))
    sns.heatmap(scores,
                annot=True,
                xticklabels=params[0],
                yticklabels=params[1],
                cmap=plt.cm.RdYlGn)
    plt.title('Grid Search roc_auc Score')
    plots_helpers.save_plot(plt, 'Gridsearch/', Classifier.name + '.pdf')
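
# Illustrative note on the input (toy values, an assumption about its shape): RandomizedSearchCV.cv_results_
# stores each sampled hyperparameter under a 'param_*' key as a numpy masked array, which is why the code
# above calls .compressed() before de-duplicating. Note that the reshape above assumes every parameter
# combination was evaluated exactly once.
import numpy as np
import numpy.ma as ma

toy_cv_results = {
    'param_C': ma.masked_array([1, 1, 10, 10]),
    'param_gamma': ma.masked_array([0.1, 0.01, 0.1, 0.01]),
    'mean_test_score': np.array([0.71, 0.69, 0.75, 0.73]),
}
params = [sorted(set(v.compressed())) for k, v in toy_cv_results.items() if k.startswith('param_')]
print(params)  # [[1, 10], [0.01, 0.1]] -> the two axes of the heatmap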
Example no. 11
def _plot_mean_value_of_heartrate_at_crash():
    """
    Plots the mean and standard deviation of the heartrate when a crash happened vs. when no crash happened, per logfile

    Folder:     Report/
    Plot name:  barplot_mean_heartrate_at_crash.pdf

    """

    print("Plotting mean value of heartrate when crash vs no crash happened...")

    means_when_crash = []
    means_when_no_crash = []
    stds_when_crash = []
    stds_when_no_crash = []
    for df in sd.df_list:

        df_with_crash = df[df['Logtype'] == 'EVENT_CRASH']
        df_without_crash = df[df['Logtype'] == 'EVENT_OBSTACLE']

        means_when_crash.append(df_with_crash['Heartrate'].mean())
        means_when_no_crash.append(df_without_crash['Heartrate'].mean())
        stds_when_crash.append(df_with_crash['Heartrate'].std())
        stds_when_no_crash.append(df_without_crash['Heartrate'].std())

    fig, ax = plt.subplots()
    bar_width = 0.3
    line_width = 0.3

    index = np.arange(len(means_when_crash))
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(line_width) for i in ax.spines.values()]

    plt.bar(index, means_when_crash, bar_width,
            color=ph.red_color,
            label='Heartrate when crash',
            yerr=stds_when_crash,
            error_kw={'elinewidth': line_width,
                      'capsize': 1.4,
                      'markeredgewidth': line_width},
            )

    plt.bar(index + bar_width, means_when_no_crash, bar_width,
            color=ph.blue_color,
            label='Heartrate when no crash',
            yerr=stds_when_no_crash,
            error_kw={'elinewidth': line_width,
                      'capsize': 1.4,
                      'markeredgewidth': line_width},
            )

    plt.ylabel('Heartrate (normalized)')
    plt.xlabel('Logfile')
    plt.title('Average value of Heartrate when crash or not crash')
    plt.xticks(index + bar_width / 2, np.arange(1, 20), rotation='horizontal')
    plt.legend(prop={'size': 6})
    filename = 'barplot_mean_heartrate_at_crash.pdf'
    ph.save_plot(plt, 'Report/', filename)
def _plot_feature(X, i):
    """
    Plots the feature at position i of each logfile over time

    :param X: Feature matrix
    :param i: Feature index to plot (see features_factory for the order)

    Folder:     Features/Feature Plots/
    Plot name:  lineplot_{feature name}_{logfile_name}.pdf

    """

    print('Plotting feature ' + f_factory.feature_names[i] +
          ' of each logfile over time...')

    # df_num_resampled = resample_dataframe(samples, resolution)
    feature_name = f_factory.feature_names[i]
    for idx, _ in enumerate(sd.df_list):
        obst_df = sd.obstacle_df_list[idx]
        times = obst_df['Time']
        start = sum([len(l) for l in sd.obstacle_df_list[:idx]])
        samples = list(X[start:start + len(times), i])
        _, ax1 = plt.subplots()

        # Plot crashes
        crash_times = [
            row['Time'] for _, row in obst_df.iterrows() if row['crash']
        ]
        crash_values = [
            samples[index] for index, row in obst_df.iterrows() if row['crash']
        ]

        plt.scatter(crash_times,
                    crash_values,
                    c='r',
                    marker='.',
                    label='crash')
        plt.legend()
        ax1.plot(times, samples, c=hp.blue_color)
        ax1.set_xlabel('Playing time (s)')
        ax1.set_ylabel(feature_name, color=hp.blue_color)
        plt.title('Feature ' + feature_name + ' for logfile ' +
                  sd.names_logfiles[idx])
        ax1.tick_params('y', colors=hp.blue_color)
        # plt.ylim([max(np.mean(X[:, i]) - 3 * np.std(X[:, i]), min(X[:, i])), max(X[:, i])])
        # plt.ylim([0, 1])
        ax1.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
        ax1.set_axisbelow(True)

        ax1.spines['top'].set_linewidth(0.3)
        ax1.spines['right'].set_linewidth(0.3)
        filename = 'lineplot_' + feature_name + '_' + sd.names_logfiles[
            idx] + '.pdf'
        hp.save_plot(plt, 'Features/Feature Plots/' + feature_name + '/',
                     filename)
Example no. 13
def _plot_heartrate_and_events():
    """
    Plots the heartrate of logfile 4 (user Is), together with the crashes, Shieldtutorials and Brokenship events.
    Note: Same as plot_heartrate_and_events in plots_logfiles.py, but only for one specific logfile

    Folder:     Report/
    Plot name:  lineplot_hr_and_events.pdf

    """
    sd.setup(
        fewer_data=False,  # Specify if we want fewer data (for debugging purposes...)
        normalize_heartrate=False,
        remove_tutorials=False  # We want tutorial to be exactly at 3 and 7.5 minutes!
    )
    print("Plotting heartrate and events...")

    idx = 4
    df = sd.df_list[idx]

    # Plot Heartrate
    _, ax1 = plt.subplots()
    ax1.plot(df['Time'], df['Heartrate'], ph.blue_color, linewidth=1.0, label='Heartrate')
    ax1.set_xlabel('Playing time (s)')
    ax1.set_ylabel('Heartrate', color=ph.blue_color)
    ax1.tick_params('y', colors=ph.blue_color)

    times_crashes = [row['Time'] for _, row in sd.obstacle_df_list[idx].iterrows() if row['crash']]
    heartrate_crashes = [df[df['Time'] == row['Time']].iloc[0]['Heartrate']
                         for _, row in sd.obstacle_df_list[idx].iterrows() if row['crash']]
    plt.scatter(times_crashes, heartrate_crashes, c='r', marker='.', label='Crash')

    # Plot Brokenships
    times_repairing = [row['Time'] for _, row in df.iterrows() if row['Gamemode'] == 'BROKENSHIP']
    hr_max = df['Heartrate'].max()
    hr_min = df['Heartrate'].min()

    for xc in times_repairing:
        plt.vlines(x=xc, ymin=hr_min, ymax=hr_max+0.2, color='y', linewidth=1, label='Ship broken')

    # Plot Shieldtutorial
    times_repairing = [row['Time'] for _, row in
                       df.iterrows() if row['Gamemode'] == 'SHIELDTUTORIAL']
    hr_max = df['Heartrate'].max()
    hr_min = df['Heartrate'].min()

    for xc in times_repairing:
        plt.vlines(x=xc, ymin=hr_min, ymax=hr_max + 0.2, color='g', linewidth=1, label='Shield tutorial')

    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))  # Otherwise we'd have one label for each vline
    plt.legend(by_label.values(), by_label.keys())

    filename = 'lineplot_hr_and_events.pdf'
    ph.save_plot(plt, 'Report/', filename)
def _feature_selection(X, y, verbose=False):
    """
    Feature Selection with ExtraTreesClassifier. Prints and plots the importance of the features


    Source: http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    :param X:       Feature matrix
    :param y:       labels
    :param verbose: Whether a detailed report should be printed out

    :return: new feature matrix with the selected features, together with the labels

    """

    clf = ExtraTreesClassifier(n_estimators=250, class_weight='balanced')

    forest = clf.fit(X, y)

    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    X_new = SelectFromModel(clf).fit_transform(X, y)

    # Print the feature ranking
    if verbose:
        print("Feature ranking:")
        print('\n# features after feature-selection: ' + str(X_new.shape[1]))
    x_ticks = []
    for f in range(X.shape[1]):
        x_ticks.append(f_factory.feature_names[indices[f]])
        if verbose:
            print("%d. feature %s (%.3f)" % (f + 1, f_factory.feature_names[indices[f]], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), x_ticks, rotation='vertical')
    plt.xlim([-1, X.shape[1]])
    plt.tight_layout()

    plots_helpers.save_plot(plt, 'Features/', 'feature_importance_decision_tree.pdf')

    return X_new, y
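
# Illustrative note (toy data, standalone sketch): with no explicit threshold, SelectFromModel keeps the
# features whose importance is at least the mean importance, which is what determines the
# "# features after feature-selection" count printed above.
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X_toy, y_toy = make_classification(n_samples=300, n_features=8, n_informative=3, random_state=0)
selector = SelectFromModel(ExtraTreesClassifier(n_estimators=100, random_state=0)).fit(X_toy, y_toy)
print(selector.get_support())  # boolean mask of the features that survive the selection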
def _plot_mean_value_of_feature_at_crash(X, y):
    """
    For each feature, plots its average value when there was a crash vs. when there was no crash

    :param X: Feature matrix
    :param y: labels

    Folder:     Features/Crash Correlation/
    Plot name:  barplot_mean_{feature name}_at_crash.pdf

    """

    print(
        "Plotting mean value of each feature when crash vs no crash happened..."
    )

    rows_with_crash = [val for (idx, val) in enumerate(X) if y[idx] == 1]
    rows_without_crash = [val for (idx, val) in enumerate(X) if y[idx] == 0]

    # Iterate over all features and plot corresponding plot
    for i in range(0, len(X[0])):
        mean_when_crash = np.mean([l[i] for l in rows_with_crash])
        mean_when_no_crash = np.mean([l[i] for l in rows_without_crash])
        std_when_crash = np.std([l[i] for l in rows_with_crash])
        std_when_no_crash = np.std([l[i] for l in rows_without_crash])

        plt.subplots()

        plt.bar(1,
                mean_when_no_crash,
                width=0.5,
                yerr=std_when_no_crash,
                color=hp.blue_color)
        plt.bar(2,
                mean_when_crash,
                width=0.5,
                yerr=std_when_crash,
                color=hp.green_color)
        plt.ylim(0)
        plt.xticks([1, 2], ['No crash', 'Crash'])
        plt.ylabel(str(f_factory.feature_names[i]))

        plt.title('Average value of feature ' +
                  str(f_factory.feature_names[i]) + ' when crash or not crash')

        filename = 'barplot_mean_' + str(
            f_factory.feature_names[i]) + '_at_crash.pdf'
        hp.save_plot(plt, 'Features/Crash Correlation/', filename)
def _plot_hr(dataframe, i):
    """
    Plots the heartrate of the dataframe

    :param dataframe: Dataframe from which the heartrate should be plotted
    :param i: id to differentiate plots

    """

    fig, ax1 = plt.subplots()
    fig.suptitle('heartrate')

    ax1.plot(dataframe['Time'], dataframe['Heartrate'])
    ax1.set_xlabel('Playing time (s)')
    ax1.set_ylabel('Heartrate')

    plots_helpers.save_plot(plt, 'Logfiles/synthesized_data/',
                            'heartrate_testdata_' + str(i) + '.pdf')
def plot_precision_recall_curve(classifier, X, y, filename):
    """
    Plots and saves a precision recall curve

    :param classifier:  Classifier to generate precision-recall curve from
    :param X:           Feature matrix
    :param y:           labels
    :param filename:    Name of the file the plot should be stored to


    """

    # Split into train/test, scale the features, and drop highly correlated ones before fitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    scaler = MinMaxScaler(feature_range=(0, 1))

    X_train = scaler.fit_transform(X_train)  # Fit and transform on training set, then transform test set too
    X_test = scaler.transform(X_test)

    corr = FindCorrelation(threshold=0.9)
    X_train = corr.fit(X_train).transform(X_train)
    X_test = corr.transform(X_test)
    classifier.fit(X_train, y_train)

    decision_fct = getattr(classifier, "decision_function", None)
    if callable(decision_fct):
        y_score = classifier.decision_function(X_test)
        precision, recall, _ = precision_recall_curve(y_test, y_score)

        plt.step(recall, precision, color='b', alpha=0.2,
                 where='post')
        plt.fill_between(recall, precision, step='post', alpha=0.2,
                         color='b')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('2-class Precision-Recall curve')

        plots_helpers.save_plot(plt, 'Performance/Precision Recall Curves/', filename)
    else:
        print('\tThis classifier doesn\'t implement decision_function(), '
              'thus no precision_recall curve can be generated')
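
# Usage sketch (toy data; file name made up): any classifier that implements decision_function()
# works here, e.g. a linear SVM.
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_toy, y_toy = make_classification(n_samples=500, n_features=10, weights=[0.8, 0.2], random_state=0)
plot_precision_recall_curve(LinearSVC(), X_toy, y_toy, 'precision_recall_linear_svc.pdf')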
Example no. 18
def _plot_heartrate_histogram():
    """
    Plots a histogram of heartrate data accumulated over all logfiles

    Folder:     Logfiles/
    Plot name:  histogram_hr_all_logfiles.pdf

    """

    print("Plotting histogram of heartrate of accumulated logfiles...")

    _, ax = plt.subplots()
    df = pd.concat(sd.df_list, ignore_index=True)
    df = df[df['Heartrate'] != -1]['Heartrate']
    plt.hist(df)
    plt.title(r'Histogram of HR: $\mu=%.3f$, $\sigma=%.3f$' %
              (float(np.mean(df)), float(np.std(df))))
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(0.3) for i in ax.spines.values()]
    hp.save_plot(plt, 'Logfiles/', 'histogram_hr_all_logfiles.pdf')
Example no. 19
def _plot_crashes_vs_size_of_obstacle():
    """
    Plots the percentage of crashes depending on the size of the obstacle

    Folder:     Logfiles/
    Plot name:  barplot_%crashes_per_size_of_obstacles.pdf

    """

    conc_dataframes = pd.concat(sd.df_list, ignore_index=True)
    conc_dataframes = transform_df_to_numbers(conc_dataframes)
    new = conc_dataframes['obstacle'].apply(
        lambda x: 0 if x == 'none' else x.count(
            ",") + 1)  # count number of obstacle parts per obstacle
    conc_num = conc_dataframes.assign(numObstacles=new)
    # [a, b, c, d, e], e.g. the number of obstacles that had size 0, 1, 2, 3, 4 respectively
    num_obstacles_per_size = conc_num.groupby('numObstacles').size().tolist()

    # num_obstacles_per_size.insert(2, 0)  # No obstacles of size 2...
    num_crashes_per_size = [0, 0, 0, 0, 0]

    # For each crash, find the corresponding row that tells us the size of the obstacle the player crashed into.
    for index, row in conc_num.iterrows():
        if row['Logtype'] == 'EVENT_CRASH':
            sizeOfObstacle = row['numObstacles']
            num_crashes_per_size[sizeOfObstacle] += 1

    percentage_of_crashes = [
        0 if (x == 0 or y == 0) else x / y * 100.0
        for x, y in zip(num_crashes_per_size, num_obstacles_per_size)
    ]

    x = [0, 1, 2, 3, 4]
    plt.title('Crash percentage per size of obstacle')
    plt.ylabel('Crashes [%]')
    plt.xlabel('Size of obstacle')
    plt.bar(x, percentage_of_crashes)

    hp.save_plot(plt, 'Logfiles/',
                 'barplot_%crashes_per_size_of_obstacles.pdf')
Example no. 20
def _plot_hr_or_points_and_difficulty(to_compare):
    """
    Plots heartrate or points together with the difficulty in a line plot

    :param to_compare: 'Heartrate' or 'Points'

    Folder:     Logfiles/Heartrate Difficulty Corr/ or Logfiles/Points Difficulty Corr/
    Plot name:  lineplot_heartrate_difficulty_{logfile name}.pdf and lineplot_points_difficulty_{logfile name}.pdf

    """

    resolution = 10  # resample every x seconds -> the bigger, the smoother
    for idx, df in enumerate(sd.df_list):
        df = transform_df_to_numbers(df)
        if not (df['Heartrate'] == -1).all():
            df_num_resampled = hp.resample_dataframe(df, resolution)
            # Plot Heartrate
            fig, ax1 = plt.subplots()
            ax1.plot(df_num_resampled['Time'], df_num_resampled[to_compare],
                     hp.blue_color)
            ax1.set_xlabel('Playing time (s)')
            ax1.set_ylabel(to_compare, color=hp.blue_color)
            ax1.tick_params('y', colors=hp.blue_color)

            # Plot Difficulty
            ax2 = ax1.twinx()
            ax2.plot(df_num_resampled['Time'],
                     df_num_resampled['physDifficulty'], hp.green_color)
            ax2.set_ylabel('physDifficulty', color=hp.green_color)
            ax2.tick_params('y', colors=hp.green_color)
            ax2.yaxis.set_major_locator(MaxNLocator(
                integer=True))  # Only show whole numbers as difficulties
            plt.yticks([1, 2, 3], ['Low', 'Medium', 'High'])
            plt.title('Difficulty and ' + to_compare + ' for user ' +
                      sd.names_logfiles[idx])
            hp.save_plot(
                plt, 'Logfiles/',
                to_compare + ' Difficulty Corr/lineplot_' + to_compare +
                '_difficulty_' + str(sd.names_logfiles[idx]) + '.pdf')
def _plot_roc_curve(predicted_probas,  y, filename, title='ROC', plot_thresholds=False):
    """
    Plots roc_curve for a given classifier

    :param predicted_probas: Probabilities of positive label
    :param y: labels
    :param filename: name of the file that the roc plot should be stored in
    :param title: title of the roc plot
    :param plot_thresholds: Also plot thresholds

    """

    # allows to add probability output to classifiers which implement decision_function()
    # clf = CalibratedClassifierCV(classifier)

    fpr_, tpr_, thresholds_ = roc_curve(y, predicted_probas)
    roc_auc = auc(fpr_, tpr_)

    plt.figure()
    plt.title(title)
    plt.plot(fpr_, tpr_, plots_helpers.blue_color, label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], c='gray',  ls='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    if plot_thresholds:
        # create the axis of thresholds (scores)
        ax2 = plt.gca().twinx()
        ax2.plot(fpr_, thresholds_, markeredgecolor='r', linestyle='dashed', color='r')
        ax2.set_ylabel('Threshold', color='r')

        ax2.set_ylim([thresholds_[-1], thresholds_[0]])
        ax2.set_xlim([fpr_[0], fpr_[-1]])

    plots_helpers.save_plot(plt, 'Report/', filename)
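
# Usage sketch (toy data; file name made up): the function takes the positive-class probabilities
# (or decision scores), not the classifier itself, so out-of-fold predictions can be passed directly.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

X_toy, y_toy = make_classification(n_samples=500, random_state=0)
probas = cross_val_predict(RandomForestClassifier(n_estimators=100, random_state=0),
                           X_toy, y_toy, cv=5, method='predict_proba')[:, 1]
_plot_roc_curve(probas, y_toy, 'roc_random_forest.pdf', title='ROC (Random Forest, 5-fold CV)')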
Example no. 22
def _plot_feature_correlation_matrix(reduced_features=True):
    """
    Function plots a heatmap of the correlation matrix for each pair of columns (=features) in the dataframe.

    Source: https://seaborn.pydata.org/examples/many_pairwise_correlations.html

    :param reduced_features: Should we use all features or only the reduced ones?

    Folder:     Report/
    Plot name:  correlation_matrix_all_features.pdf or correlation_matrix_reduced_features.pdf

    """

    print("Plotting correlation matrix...")

    X, _ = f_factory.get_feature_matrix_and_label(False, True, True, False, reduced_features)
    X = pd.DataFrame(X)
    corr = X.corr()
    sb.set(style="white")
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask, k=0)] = True
    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=(len(f_factory.feature_names), len(f_factory.feature_names)))
    # Generate a custom diverging colormap
    cmap = sb.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    ax.tick_params(labelsize=20)
    sb.heatmap(corr, mask=mask, cmap=cmap, center=0, annot=True, xticklabels=f_factory.feature_names,
               yticklabels=f_factory.feature_names, square=True,
               linewidths=0.0, cbar_kws={"shrink": .6}, vmin=-1, vmax=1)
    cax = plt.gcf().axes[-1]
    cax.tick_params(labelsize=20)
    if reduced_features:
        ph.save_plot(plt, 'Report/', 'correlation_matrix_reduced_features.pdf')
    else:
        ph.save_plot(plt, 'Report/', 'correlation_matrix_all_features.pdf')
Example no. 23
def _plot_heartrate_change():
    """
    Plots the number of times the heartrate changed by more than {thresh} percent from one second to the next, per logfile

    Folder:     Report/
    Plot name:  barplot_hr_change_thresh.pdf

    """
    thresh = 10

    bpm_changes_over_thresh = []  # Stores #points where change > thresh per logfile

    for idx, df in enumerate(sd.df_list):
        if not (df['Heartrate'] == -1).all():
            resampled = ph.resample_dataframe(df, 1)
            percentage_change = np.diff(resampled['Heartrate']) / resampled['Heartrate'][:-1] * 100.
            x = percentage_change[np.logical_not(np.isnan(percentage_change))]
            bpm_changes_over_thresh.append(len([i for i in x if i > thresh]))

    fig, ax = plt.subplots()

    # plt.title('Number of times the heartrate changed more than ' + str(thresh) + '%')
    plt.ylabel('Number of times')
    plt.xlabel('Logfile')
    index = np.arange(len(bpm_changes_over_thresh))
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))  # Only show whole numbers on the y-axis
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # Only show whole numbers on the x-axis
    plt.xticks(index, np.arange(1, 20), rotation='horizontal')

    plt.bar(index, bpm_changes_over_thresh, color=ph.blue_color, width=0.25)

    ax.yaxis.grid(True, zorder=0, color='grey',  linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(0.3) for i in ax.spines.values()]

    ph.save_plot(plt, 'Report/', 'barplot_hr_change_thresh.pdf')
def test_all_windows():
    """
    Keeps one window fixed and changes the other two. Calculates the roc_auc of a classifier with
    pre-tuned parameters (model_name, here 'Nearest Neighbor') for each window combination and plots it.

    """
    print("\n################# Testing all window sizes #################\n")

    const_window = 'cw'

    const_w = 10
    list_1 = [5, 10, 20, 30, 50, 60]
    list_2 = list_1[::-1]

    if const_window == 'hw':
        name1 = 'Crash window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_hw.pdf'
    elif const_window == 'cw':
        name1 = 'Default window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_cw.pdf'
    else:
        name1 = 'Crash window'
        name2 = 'Default window'
        filename = 'windows_const_gradient_w.pdf'

    mean_scores = np.zeros((len(list_1), len(list_2)))
    model_name = 'Nearest Neighbor'
    for idx_w1, w1 in enumerate(list_1):
        for idx_w2, w2 in enumerate(list_2):
            if const_window == 'hw':
                X, y = f_factory.get_feature_matrix_and_label(
                    verbose=True,
                    use_cached_feature_matrix=True,
                    save_as_pickle_file=True,
                    h_window=const_w,
                    c_window=w1,
                    gradient_window=w2,
                    reduced_features=False)
                model = classifiers.get_cclassifier_with_name(
                    model_name, X, y).tuned_clf

                roc_auc_mean, roc_auc_std, _, _, _, _, _, _, _, _, _, _ = model_factory. \
                    get_performance(model, model_name,  X, y, tuned_params_keys=None, verbose=False,
                                    create_curves=False)

                mean_scores[idx_w1][idx_w2] = roc_auc_mean
            elif const_window == 'cw':
                X, y = f_factory.get_feature_matrix_and_label(
                    verbose=True,
                    use_cached_feature_matrix=True,
                    save_as_pickle_file=True,
                    h_window=w1,
                    c_window=const_w,
                    gradient_window=w2,
                    reduced_features=False)
                model = classifiers.get_cclassifier_with_name(
                    model_name, X, y).tuned_clf

                roc_auc_mean, roc_auc_std, _, _, _, _, _, _, _, _, _, _ = model_factory. \
                    get_performance(model, model_name, X, y, tuned_params_keys=None, verbose=False,
                                    create_curves=False)

                mean_scores[idx_w1][idx_w2] = roc_auc_mean
            else:
                X, y = f_factory.get_feature_matrix_and_label(
                    verbose=True,
                    use_cached_feature_matrix=True,
                    save_as_pickle_file=True,
                    h_window=w1,
                    c_window=w2,
                    gradient_window=const_w,
                    reduced_features=False)

                model = classifiers.get_cclassifier_with_name(
                    model_name, X, y).tuned_clf

                roc_auc_mean, roc_auc_std, _, _, _, _, _, _, _, _, _, _ = model_factory. \
                    get_performance(model, model_name, X, y, tuned_params_keys=None, verbose=False,
                                    create_curves=False)

                mean_scores[idx_w1][idx_w2] = roc_auc_mean

    mean_scores = np.fliplr(
        np.flipud(mean_scores))  # Flip to plot it correctly

    # Plot elements
    plt.subplot()
    plt.imshow(mean_scores, cmap='RdYlGn')
    plt.title('Average classifier performance when using constant ' +
              const_window)
    ax = plt.gca()
    ax.set_xticks(np.arange(0, len(list_1), 1))
    ax.set_yticks(np.arange(0, len(list_2), 1))
    ax.set_xticklabels(list_1)
    ax.set_yticklabels(list_2)
    ax.set_ylabel(name1)
    ax.set_xlabel(name2)
    plt.colorbar()
    plots_helpers.save_plot(plt, 'Performance/Windows/', filename)
def _plot_timedeltas_and_crash_per_logfile(do_normalize=True):
    """
    Plots, for each logfile, the mean and std of timedelta_to_last_obst at each obstacle, split by whether a crash happened

    :param do_normalize: Whether to normalize timedelta_feature over time

    Folder:     Features/Timedelta vs Crash Detailed
    Plot name:  crash_logfile_{logfile_name}.pdf

    """

    for idx, df in enumerate(sd.obstacle_df_list):
        timedelta_crash = []
        timedelta_no_crash = []
        computed_timedeltas = []
        for i in range(0, len(df.index)):
            current_obstacle_row = df.iloc[i]
            previous_obstacle_row = df.iloc[
                i - 1] if i > 0 else current_obstacle_row
            timedelta = current_obstacle_row['Time'] - previous_obstacle_row[
                'Time']

            # Clamp outliers (e.g. caused by tutorials). If timedelta > 3 it is most likely an outlier
            # (e.g. 33 seconds), so clamp it to roughly the average
            if timedelta > 3 or timedelta < 1:
                timedelta = 2

            if do_normalize:
                # Normalization (since timedelta over time decreases slightly)
                if len(computed_timedeltas) >= 1:
                    normalized = timedelta / computed_timedeltas[-1]
                else:
                    normalized = 1

                if current_obstacle_row['crash']:
                    timedelta_crash.append(normalized)
                else:
                    timedelta_no_crash.append(normalized)
            else:
                if current_obstacle_row['crash']:
                    timedelta_crash.append(timedelta)
                else:
                    timedelta_no_crash.append(timedelta)

            computed_timedeltas.append(timedelta)

        # Rescale values
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(
            np.array(timedelta_crash + timedelta_no_crash).reshape(-1, 1))

        # Evaluation
        mean_when_crash = np.mean(timedelta_crash)
        mean_when_no_crash = np.mean(timedelta_no_crash)
        std_when_crash = np.std(timedelta_crash)
        std_when_no_crash = np.std(timedelta_no_crash)

        _, _ = plt.subplots()
        plt.ylim(0, 1.2)
        plt.ylabel('Feature value')
        plt.bar(1, mean_when_no_crash, width=0.5, yerr=std_when_no_crash)
        plt.bar(2,
                mean_when_crash,
                width=0.5,
                yerr=std_when_crash,
                label='Crash')
        plt.xticks([1, 2], ['No crash', 'Crash'])
        plt.title('Average timedelta value for logfile ' + str(idx) +
                  ' when crash or not crash')

        filename = 'crash_logfile_' + sd.names_logfiles[idx] + '.pdf'
        hp.save_plot(plt, 'Features/Timedelta vs Crash Detailed/', filename)
def _plot_scores_with_different_feature_selections():
    """
    After trying different feature selections, I plot the scores for each classifier in a bar chart.
    Note: The numbers were collected by analysing the performances!

    1. timedelta_to_last_obst only
    2. timedelta_to_last_obst + last_obstacle_crash
    3. all features
    4. old features (=all features without timedelta_to_last_obst)

    Folder:     Performance
    Plot name:  clf_performance_with_different_features.pdf

    """

    scores_timedelta_only = [0.69, 0.69, 0.84, 0.69, 0.86, 0.86, 0.8, 0.69]
    scores_timedelta_and_last_obst_crash = [
        0.745, 0.726, 0.99, 0.73, 0.99, 0.994, 0.96, 0.73
    ]
    scores_all_features = [0.68, 0.68, 0.61, 0.64, 0.96, 0.95, 0.965, 0.65]
    scores_old_features = [0.62, 0.63, 0.57, 0.622, 0.53, 0.6, 0.64, 0.74]

    fig, ax = plt.subplots()
    bar_width = 0.2
    line_width = 0.3

    index = np.arange(len(scores_timedelta_only))
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(line_width) for i in ax.spines.values()]

    plt.bar(
        index,
        scores_timedelta_and_last_obst_crash,
        bar_width,
        color=hp.red_color,
        label='timedelta_to_last_obst + last_obstacle_crash',
    )

    plt.bar(
        index + bar_width,
        scores_timedelta_only,
        bar_width,
        color=hp.blue_color,
        label='timedelta_to_last_obst',
    )

    plt.bar(
        index + 2 * bar_width,
        scores_all_features,
        bar_width,
        color=hp.green_color,
        label='all features',
    )

    plt.bar(
        index + 3 * bar_width,
        scores_old_features,
        bar_width,
        color=hp.yellow_color,
        label='all features, but without timedelta_to_last_obst',
    )

    plt.ylabel('roc_auc')
    plt.title('roc_auc when selecting different features')
    plt.xticks(index + 1.5 * bar_width, classifiers.names, rotation='vertical')  # center labels under the four bars
    ax.set_ylim([0, 1.2])
    plt.legend(prop={'size': 6})

    plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])

    plt.tight_layout()

    hp.save_plot(plt, 'Performance/',
                 'clf_performance_with_different_features.pdf')
def _plot_timedelta_vs_obstacle_scatter(X, y):
    """
    Plots timedelta-feature and labels in a scatter plot and the histogram on top

    :param X: Feature matrix
    :param y: labels

    Folder:     Features/Timedelta vs crash/
    Plot name:  scatter_timedelta_crash_mean_over_all_users.pdf or scatter_timedelta_crash_{logfile_name}.pdf

    """

    # Split up feature matrix into one matrix for each logfile
    feature_matrices = []
    label_lists = []
    obstacles_so_far = 0
    for df in sd.obstacle_df_list:
        num_obstacles = len(df.index)
        feature_matrices.append(
            X.take(range(obstacles_so_far, obstacles_so_far + num_obstacles),
                   axis=0))
        label_lists.append(y[obstacles_so_far:obstacles_so_far +
                             num_obstacles])
        obstacles_so_far += num_obstacles

    X_old = X
    y_old = y

    for i in range(0, len(sd.df_list) + 1):
        plt.subplot()
        if i == len(sd.df_list):  # Do the plot with the entire feature matrix
            X = X_old
            y = y_old
            plt.title('Timedelta vs crash plot aggregated over all logfiles')

        else:
            X = feature_matrices[i]
            y = label_lists[i]
            plt.title('Timedelta vs crash plot for logfile ' +
                      sd.names_logfiles[i])

        g = sb.jointplot(X[:, 9], X[:, 8], kind='reg')

        g.ax_joint.cla()
        plt.sca(g.ax_joint)

        colors = [hp.red_color if i == 1 else hp.green_color for i in y]
        plt.scatter(X[:, 9], X[:, 8], c=colors, alpha=0.3, s=150)
        plt.xticks([0, 1], ['False', 'True'])
        plt.ylim([
            np.mean(X[:, 8]) - 3 * np.std(X[:, 8]),
            np.mean(X[:, 8]) + 3 * np.std(X[:, 8])
        ])  # Fix the y-axis range
        plt.ylabel('Time to last obstacle')
        plt.xlabel('Crash at last obstacle')
        green_patch = mpatches.Patch(color=hp.green_color, label='no crash')
        red_patch = mpatches.Patch(color=hp.red_color, label='crash')

        plt.legend(handles=[green_patch, red_patch])

        if i == len(sd.df_list):
            hp.save_plot(plt, 'Features/Timedelta vs crash/',
                         'scatter_timedelta_crash_mean_over_all_users.pdf')
        else:
            hp.save_plot(
                plt, 'Features/Timedelta vs crash/',
                'scatter_timedelta_crash_' + sd.names_logfiles[i] + '.pdf')
def _plot_crashes_vs_timedelta(X):
    """
    Plots the percentage of crashes happening depending on the timedelta-feature in a barchart

    :param X:  Feature matrix

    Folder:     Features/
    Plot name:  barplot_%crashes_vs_timedelta.pdf

    """

    print("Plotting percentage crashes vs timedelta...")
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    timedelta_values_at_crashes = []
    timedelta_values_at_non_crashes = []
    timedelta_feature_index = f_factory.feature_names.index(
        'timedelta_to_last_obst')

    obst_conc = pd.concat(sd.obstacle_df_list, ignore_index=True)  # reset the index so idx lines up with the rows of X

    for idx, row in obst_conc.iterrows():
        if row['crash']:
            timedelta_values_at_crashes.append(X[idx, timedelta_feature_index])
        else:
            timedelta_values_at_non_crashes.append(X[idx,
                                                     timedelta_feature_index])

    def get_percentage_crashes_for_bin(i):
        """
        Returns percentage of crashes when timedelta is in a certain bin, where bin i: [i/10 , i/10 + 0.1]

        :param i: Bin
        :return: tuple with (percentage, #occurrences)

        """

        conc = timedelta_values_at_crashes + timedelta_values_at_non_crashes
        try:
            return (len([
                x for x in timedelta_values_at_crashes
                if i / 10 <= x <= i / 10 + 0.1
            ]) / len([x for x in conc if i / 10 <= x <= i / 10 + 0.1]),
                    len([
                        x for x in timedelta_values_at_crashes
                        if i / 10 <= x <= i / 10 + 0.1
                    ]))

        except ZeroDivisionError:
            return 0, 0

    x_tick_labels = [
        '[0.0, 0.1]', '[0.1, 0.2]', '[0.2, 0.3]', '[0.3, 0.4]', '[0.4, 0.5]',
        '[0.5, 0.6]', '[0.6, 0.7]', '[0.7, 0.8]', '[0.8, 0.9]', '[0.9, 1.0]'
    ]
    tuples = [get_percentage_crashes_for_bin(i) for i in range(0, 10)]
    value_list = [t[0] for t in tuples]
    occurences_list = [t[1] for t in tuples]

    bar_width = 0.2
    fig, ax = plt.subplots()

    plt.title('Percentage of crashes depending on timedelta')
    plt.ylabel('crashes (%)')
    plt.xlabel('timedelta to previous obstacle (s, normalized)')
    plt.xticks(np.arange(len(value_list)) + bar_width / 2, rotation='vertical')
    ax.set_xticklabels(x_tick_labels)
    # ax.set_ylim(0, ceil(max(value_list) * 10) / 10.0)
    plt.bar(np.arange(len(value_list)),
            value_list,
            color=hp.blue_color,
            width=bar_width,
            label='Crashes (%)')

    ax2 = ax.twinx()
    plt.bar(np.arange(len(value_list)) + bar_width,
            occurences_list,
            color=hp.red_color,
            width=bar_width,
            label='Occurences')
    ax2.set_ylabel('Occurences', color=hp.red_color)
    ax2.tick_params('y', colors=hp.red_color)

    # Add legend with two axis
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(0.3) for i in ax.spines.values()]

    hp.save_plot(plt, 'Features/', 'barplot_%crashes_vs_timedelta.pdf')
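
# Tiny worked example of get_percentage_crashes_for_bin (illustrative, toy values): with normalized
# timedeltas crashes = [0.12, 0.55] and non-crashes = [0.15, 0.18, 0.57], bin 1 covers [0.1, 0.2].
crashes = [0.12, 0.55]
non_crashes = [0.15, 0.18, 0.57]

def in_bin_1(v):
    return 0.1 <= v <= 0.2

percentage = len([v for v in crashes if in_bin_1(v)]) / len([v for v in crashes + non_crashes if in_bin_1(v)])
print(percentage)  # 1/3 -> the '[0.1, 0.2]' bar would show ~33% crashes and 1 crash occurrence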
Example no. 29
def _plot_scores_normal_cv_vs_leaveone_group_out_cv(names,
                                                    auc_scores_scenario_1,
                                                    auc_stds_scenario_1,
                                                    auc_scores_scenario_2,
                                                    auc_stds_scenario_2):
    """
    Plots the roc_auc score and the standard deviation for each classifier for both scenarios next to each other

    :param names: names of the classifiers
    :param auc_scores_scenario_1: list of roc_auc scores when doing normal cv
    :param auc_stds_scenario_1: list of roc_auc_std scores when doing normal cv
    :param auc_scores_scenario_2: list of roc_auc scores when doing leave_one_user_out cv
    :param auc_stds_scenario_2: list of roc_auc_std scores when doing leave_one_user_out cv

    """

    fig, ax = plt.subplots()
    bar_width = 0.3
    line_width = 0.3

    index = np.arange(len(auc_scores_scenario_1))
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    [i.set_linewidth(line_width) for i in ax.spines.values()]

    plt.bar(
        index,
        auc_scores_scenario_1,
        bar_width,
        color=plots_helpers.blue_color,
        label='10-fold cross-validation',
        yerr=auc_stds_scenario_1,
        error_kw={
            'elinewidth': line_width,
            'capsize': 1.4,
            'markeredgewidth': line_width
        },
    )

    plt.bar(
        index + bar_width,
        auc_scores_scenario_2,
        bar_width,
        color=plots_helpers.red_color,
        label='Leave One Group Out cross-validation',
        yerr=auc_stds_scenario_2,
        error_kw={
            'elinewidth': line_width,
            'capsize': 1.4,
            'markeredgewidth': line_width
        },
    )

    plt.ylabel('AUC')
    plt.title('Performance when leaving one user out in training phase')
    plt.xticks(index + bar_width / 2, names, rotation='vertical')
    ax.set_ylim([0, 1.0])
    plt.legend(prop={'size': 10})
    '''
    def autolabel(rects):
        """
           Attach a text label above each bar displaying its height
           """
        for rect in rects:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2., 1.1 * height,
                    '%0.2f' % height,
                    ha='center', va='bottom', size=5)

    # autolabel(r1)
    # autolabel(r2)
    '''

    plt.tight_layout()

    plots_helpers.save_plot(
        plt, 'Report/', 'clf_performance_with_user_left_out_vs_normal.pdf')
def plot_roc_curves(hyperparameter_tuning=False, pre_set=True, with_lstm=False):
    """
    Plots ROC curves for all classifiers in a single plot

    :param hyperparameter_tuning: Do hyperparameter tuning
    :param pre_set: Some classifiers have pre_tuned parameters (on Euler). Take those instead of tuning
    :param with_lstm: Also include ROC of LSTM network (takes a little time...)

    Folder:     Report/
    Plot name:  roc_curves.pdf

    """

    X, y = f_factory.get_feature_matrix_and_label(
                verbose=False,
                use_cached_feature_matrix=True,
                save_as_pickle_file=True,
                reduced_features=False,
                use_boxcox=False
        )

    clf_names = ['SVM', 'Nearest Neighbor', 'Random Forest', 'Naive Bayes']

    if pre_set:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).tuned_clf for name in clf_names]
    else:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).clf for name in clf_names]

    tprs = []
    fprs = []
    roc_aucs = []

    for idx, classifier in enumerate(clf_list):
        if hyperparameter_tuning:
            classifier, _ = hyperparameter_optimization.get_tuned_clf_and_tuned_hyperparameters(
                X, y, clf_name=clf_names[idx], verbose=False, pre_set=True
            )

        # clf = CalibratedClassifierCV(classifier)
        clf = classifier
        kf = KFold(n_splits=10)
        predicted_probas_list = []
        y = np.array(y)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = MinMaxScaler(feature_range=(0, 1))

            X_train = scaler.fit_transform(X_train)  # Fit and transform on training set, then transform test set too
            X_test = scaler.transform(X_test)

            corr = FindCorrelation(threshold=0.9)
            X_train = corr.fit(X_train).transform(X_train)
            X_test = corr.transform(X_test)

            clf.fit(X_train, y_train)

            predicted_probas = clf.predict_proba(X_test)
            predicted_probas_list.append(predicted_probas[:, 1])

        fpr, tpr, _ = roc_curve(y, list(itertools.chain.from_iterable(predicted_probas_list)))
        roc_auc = auc(fpr, tpr)
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(roc_auc)

    # Also add LSTM scores:
    if with_lstm:
        clf_names.append("LSTM")
        fpr, tpr, roc_auc = LSTM.create_roc_curve(X, y, 130)
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(roc_auc)

    plt.figure()

    for idx, name in enumerate(clf_names):
        plt.plot(fprs[idx], tprs[idx], label=name + ' (AUC = %0.2f)' % roc_aucs[idx])

    plt.title('ROC curves')
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], c='gray', ls='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    plots_helpers.save_plot(plt, 'Report/', 'roc_curves.pdf')