示例#1
0
def boxplots_genres(scores,
                    results_path,
                    filename="boxplots_genres",
                    offset=100):
    """Draw one horizontal box plot per music genre and save the figure.

    ``scores`` is cut into consecutive slices of ``offset`` values, one slice
    per entry of the genre list below and in that order. Each slice is passed
    through ``rand_jitter`` and sorted before being handed to ``boxplot``.

    Args:
        scores: Sliceable sequence of scores, laid out genre after genre.
        results_path: Base output directory; the image is written to
            ``<results_path>/plots/``.
        filename: Name of the image file (passed to ``savefig`` as is).
        offset: Number of consecutive scores that belong to each genre.
    """
    plots_dir = results_path + "/plots/"
    create_missing_folders(plots_dir)

    genres = [
        "blues", "classical", "country", "disco", "hiphop", "jazz", "metal",
        "pop", "reggae", "rock"
    ]
    colors = [
        'darkorange', 'cornflowerblue', 'darkviolet', 'chocolate',
        'yellowgreen', 'lightseagreen', 'forestgreen', 'crimson', 'coral',
        'wheat'
    ]

    # The number of slices follows the genre list instead of a literal 10 so
    # the two cannot drift apart. The data is prepared before the figure is
    # created so a failure here does not leave an open figure behind.
    scores_sorted_lists = [
        sorted(rand_jitter(np.array(scores[i * offset:(i + 1) * offset])))
        for i in range(len(genres))
    ]

    fig2, ax21 = plt.subplots()
    try:
        box = ax21.boxplot(scores_sorted_lists,
                           vert=0,
                           patch_artist=True,
                           labels=genres)
        # patch_artist=True makes the boxes fillable patches.
        for patch, color in zip(box['boxes'], colors):
            patch.set_facecolor(color)

        # NOTE(review): nothing above passes an artist ``label``, so this
        # legend is probably empty -- confirm whether it is still wanted.
        handle, label = ax21.get_legend_handles_labels()
        ax21.legend(handle, label)
        fig2.tight_layout()
        # Save and close through the figure handle rather than pyplot's
        # "current figure", which other code may have changed.
        fig2.savefig(plots_dir + filename)
    finally:
        plt.close(fig2)
示例#2
0
def plot_data_distribution(train_scores,
                           valid_scores,
                           results_path,
                           filename="scores_data_distribution"):
    """Plot the sorted train and validation scores together with their means.

    Each score collection is sorted and drawn as a dashed curve against its
    rank; a horizontal line marks the mean of each collection.

    Args:
        train_scores: Training scores (any sortable collection of numbers).
        valid_scores: Validation scores (any sortable collection of numbers).
        results_path: Base output directory; the image is written to
            ``<results_path>/plots/``.
        filename: Name of the image file (passed to ``savefig`` as is).
    """
    plots_dir = results_path + "/plots/"
    create_missing_folders(plots_dir)

    scores_train = sorted(train_scores)
    scores_valid = sorted(valid_scores)
    # The mean lines span exactly the plotted ranks instead of a fixed 0..900
    # range, which only fitted one particular dataset size.
    last_rank = max(len(scores_train), len(scores_valid), 1) - 1

    fig2, ax21 = plt.subplots()
    try:
        ax21.plot(scores_train, 'b--', label='Train')
        ax21.plot(scores_valid, 'r--', label='Valid')
        ax21.hlines(np.mean(train_scores),
                    xmin=0,
                    xmax=last_rank,
                    colors='b',
                    label='Train mean')
        ax21.hlines(np.mean(valid_scores),
                    xmin=0,
                    xmax=last_rank,
                    colors='r',
                    label='Valid mean')
        # NOTE(review): the x axis is the rank of a sorted score and the y
        # axis is the score itself; these labels look copied from the loss
        # plot -- confirm before renaming them.
        ax21.set_xlabel('epochs')
        ax21.set_ylabel('Loss')
        handle, label = ax21.get_legend_handles_labels()
        ax21.legend(handle, label)
        fig2.tight_layout()
        # Save and close through the figure handle rather than pyplot's
        # "current figure".
        fig2.savefig(plots_dir + filename)
    finally:
        plt.close(fig2)
示例#3
0
def QDA(data_frame,
        images_folder_path,
        dataset_name,
        epoch,
        a=0.5,
        verbose=0,
        info="none",
        show_images=True):
    """Fit LDA and QDA on ``data_frame`` and save the comparison figure.

    The column labels of ``data_frame`` are used as class labels and the
    transposed values as the sample matrix. NaNs are replaced by 0 in a
    private copy, so the caller's frame is left untouched. The image is
    written to ``<images_folder_path>/<dataset_name>/<info>/<info>_<epoch>.png``.

    Args:
        data_frame: ``pandas.DataFrame`` whose column labels are the classes.
        images_folder_path: Base directory for the output image.
        dataset_name: Dataset name, or a list of names that is joined
            with ``_``.
        epoch: Value appended to the image file name.
        a: Unused; kept for interface compatibility.
        verbose: Unused; kept for interface compatibility.
        info: Sub-folder name and file-name prefix.
        show_images: When true, call ``plt.show()`` after saving.

    Raises:
        SystemExit: If ``data_frame`` is not a ``pandas.DataFrame``.
    """
    import pandas as pd
    import numpy as np

    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        print(type(data_frame))
        # Same exception the previous ``exit()`` call raised.
        raise SystemExit
    if isinstance(dataset_name, list):
        dataset_name = "_".join(dataset_name)

    y = np.array(data_frame.columns, dtype=str)

    # Work on a copy: writing through ``data_frame.values`` either mutates
    # the caller's frame or is silently lost when pandas returns a copy.
    X = np.transpose(data_frame.values).copy()
    X[np.isnan(X)] = 0

    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
    # The two adjacent literals previously joined as "DiscriminantAnalysis".
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
                 'Analysis')
    try:
        plt.tight_layout()
    except Exception:
        # Layout tightening is best effort; a failure must not stop the save.
        pass
    type_images_folder_path = "/".join([images_folder_path,
                                        str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"

    create_missing_folders(type_images_folder_path)

    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png",
                dpi=100)
    if show_images:
        plt.show()
    plt.close()
示例#4
0
def performance_per_score(predicted_values,
                          target_values,
                          results_path,
                          n,
                          filename="scores_performance",
                          valid=False,
                          noise=False):
    """Scatter predictions against targets and save the figure.

    On top of the scatter the plot shows the mean of the predictions, the
    mean of the targets, a degree-1 least-squares fit and the identity line
    between 0 and 1.

    Args:
        predicted_values: Predicted scores (converted with ``np.array``).
        target_values: Target scores (converted with ``np.array``).
        results_path: Base output directory; the image is written to
            ``<results_path>/plots/``.
        n: Unused.
        filename: Name of the image file (passed to ``savefig`` as is).
        valid: Unused.
        noise: When false, the targets go through ``rand_jitter`` before
            being used as x positions; when true they are plotted as given.
    """
    plots_dir = results_path + "/plots/"
    create_missing_folders(plots_dir)
    figure, axes = plt.subplots()
    predictions = np.array(predicted_values)
    targets = np.array(target_values)

    # Only the x positions differ between the two modes.
    x_positions = targets if noise else rand_jitter(targets)
    plt.scatter(x_positions,
                predictions,
                facecolors='none',
                edgecolors="r")

    # One horizontal line per series, at that series' mean.
    mean_lines = (
        (predictions, 'b', 'Predicted values average'),
        (targets, 'k', 'Target values average'),
    )
    for series, colour, caption in mean_lines:
        axes.hlines(np.mean(series),
                    xmin=0,
                    xmax=1,
                    colors=colour,
                    label=caption)

    # Straight-line fit, evaluated at the distinct target values.
    distinct_targets = np.unique(targets)
    trend = np.poly1d(np.polyfit(targets, predictions, 1))
    plt.plot(distinct_targets, trend(distinct_targets), label="Best fit")

    diagonal = [0.0, 1.0]
    axes.plot(diagonal, diagonal, color="g", label='Identity line')

    legend_handles, legend_labels = axes.get_legend_handles_labels()
    axes.legend(legend_handles, legend_labels)
    figure.tight_layout()
    pylab.savefig(plots_dir + filename)
    plt.close()
示例#5
0
def ordination2d(data_frame,
                 ORD=PCA,
                 images_folder_path="/home/simon/results/annleukemia/plots/",
                 filenames="NoName",
                 a=0.5):
    """Project ``data_frame`` onto two components and save a class scatter plot.

    The column labels of ``data_frame`` are used as class labels and the
    transposed values as the sample matrix. The image is written to
    ``<images_folder_path><filenames>/<ORD name><filenames>.png``.

    Args:
        data_frame: ``pandas.DataFrame`` whose column labels are the classes.
        ORD: Estimator class accepting ``n_components`` and providing
            ``fit_transform``.
        images_folder_path: Base directory (expected to end with ``/``).
        filenames: Sub-folder name and file-name suffix.
        a: Alpha (opacity) of the scatter points.

    Returns:
        None. A message is printed and nothing is produced when
        ``data_frame`` is not a ``pandas.DataFrame``.
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        return

    type_images_folder_path = images_folder_path + filenames + "/"
    create_missing_folders(type_images_folder_path)
    # ``type_ord`` was referenced when saving but never defined (NameError);
    # it is now derived from the estimator class.
    type_ord = getattr(ORD, "__name__", "ordination")

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    # Not every estimator takes ``verbose`` (the default PCA does not), so
    # fall back to constructing it without that argument.
    try:
        model = ORD(n_components=2, verbose=1)
    except TypeError:
        model = ORD(n_components=2)
    principalComponents = model.fit_transform(np.transpose(data_frame.values))
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    # Column 0 of finalDf holds the class label of each sample.
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111)
        ax.set_xlabel('Principal Component 1', fontsize=15)
        ax.set_ylabel('Principal Component 2', fontsize=15)
        # The title previously claimed "tSNE" whatever ``ORD`` was.
        ax.set_title('2 component ' + type_ord, fontsize=20)
        colors = ['r', 'g', 'b']
        for i, target in enumerate(classes_list):
            # Cycle through the palette so classes beyond the third are
            # still drawn instead of being silently dropped by ``zip``.
            color = colors[i % len(colors)]
            indicesToKeep = finalDf[0] == target
            data1 = finalDf.loc[indicesToKeep, 'principal component 1']
            data2 = finalDf.loc[indicesToKeep, 'principal component 2']
            ellipse_data(data1, data2, ax, color)
            ax.scatter(data1, data2, c=color, s=12, alpha=a,
                       label=str(target))
        # The scatter artists carry their own labels, so one legend call is
        # enough; the former second call replaced the legend with an empty one.
        ax.legend()
        ax.grid(True)

        fig.tight_layout()
        # Save into the folder created above.
        fig.savefig(type_images_folder_path + type_ord + filenames + ".png")
    finally:
        plt.close(fig)
示例#6
0
def plot_performance(running_loss, valid_loss, results_path, filename):
    """Plot training and validation loss curves and save the figure.

    Saving is best effort: a failure to write the image is reported on
    stdout instead of being raised.

    Args:
        running_loss: Sequence of training loss values.
        valid_loss: Sequence of validation loss values.
        results_path: Base output directory; the image is written to
            ``<results_path>/plots/``.
        filename: Name of the image file (passed to ``savefig`` as is).
    """
    plots_dir = results_path + "/plots/"
    # Creating the folder once is enough (it used to be done twice).
    create_missing_folders(plots_dir)

    fig2, ax21 = plt.subplots()
    try:
        ax21.plot(running_loss, 'b-', label='Train')
        ax21.plot(valid_loss, 'r-', label='Valid')
        ax21.set_xlabel('epochs')
        ax21.set_ylabel('Loss')
        handle, label = ax21.get_legend_handles_labels()
        ax21.legend(handle, label)
        fig2.tight_layout()
        try:
            # Save through the figure handle rather than pyplot's
            # "current figure".
            fig2.savefig(plots_dir + filename)
        except Exception as err:
            # Keep the plot optional, but say that it is missing and why
            # instead of swallowing the error silently.
            print("Could not save the performance plot to " + plots_dir +
                  str(filename) + ": " + str(err))
    finally:
        plt.close(fig2)
示例#7
0
def ordination2d(data_frame, ord_type, images_folder_path, dataset_name, epoch, a=0.4, verbose=0, info="none",
                 show_images=True, df_valid=None, df_test=None, n=4):
    """Fit a 2-component ordination on ``data_frame`` and save a class scatter plot.

    The column labels of ``data_frame`` are used as class labels and the
    transposed values as the sample matrix; NaNs are replaced by 0 in a
    private copy. Images are written under
    ``<images_folder_path>/<ord_type>/<dataset_name>/<info>/``.

    For PCA an extra figure built from the fitted ``mean_`` vector is saved
    and the first ``n`` loading vectors are drawn as arrows on the main plot.
    ``df_valid`` / ``df_test``, when given, are projected with the fitted
    estimator (if it provides ``transform``) and added to the plot.

    Args:
        data_frame: ``pandas.DataFrame`` whose column labels are the classes.
        ord_type: Name of the ordination ("pca", "kpca", "tsne", "lda",
            "qda" and the spelling variants listed in the code).
        images_folder_path: Base directory for the output images.
        dataset_name: Dataset name, or a list of names that is joined
            with ``_``.
        epoch: Value appended to the image file names.
        a: Alpha (opacity) of the scatter points.
        verbose: Verbosity passed to ``TSNE``.
        info: Sub-folder name and file-name prefix.
        show_images: When true, call ``plt.show()`` after saving.
        df_valid: Optional validation frame; assumed to share the layout of
            ``data_frame`` (samples as columns) -- TODO confirm.
        df_test: Optional test frame, same assumption as ``df_valid``.
        n: Maximum number of PCA loading arrows to draw.

    Raises:
        SystemExit: On a non-DataFrame input, an unknown ``ord_type`` or
            NaNs in the projected coordinates.
    """
    import pandas as pd
    import numpy as np

    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print("The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)")
        print(type(data_frame))
        raise SystemExit

    # Join list names BEFORE building the path; previously the raw list
    # repr ended up in the folder name.
    if isinstance(dataset_name, list):
        dataset_name = "_".join(str(name) for name in dataset_name)

    pc1 = 'Component_1'
    pc2 = 'Component_2'
    component_columns = ['principal component 1', 'principal component 2']

    type_images_folder_path = "/".join([images_folder_path, str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)

    def samples_matrix(frame):
        # Samples-as-rows copy with NaNs zeroed; the caller's frame is never
        # written to.
        values = np.transpose(frame.values).copy()
        values[np.isnan(values)] = 0
        return values

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    is_pca = ord_type in ["pca", "PCA"]
    needs_labels = False
    if is_pca:
        model = PCA(n_components=2)
    elif ord_type in ["kpca", "KPCA"]:
        model = KernelPCA(n_components=2, kernel="rbf")
    elif ord_type in ["tsne", "tSNE", "TSNE", "t-sne", "T-SNE", "t-SNE"]:
        model = TSNE(n_components=2, verbose=verbose)
    elif ord_type in ["lda", "LDA", "flda", "FLDA"]:
        needs_labels = True
        model = LDA(n_components=2)
    elif ord_type in ["qda", "QDA"]:
        # NOTE(review): this relies on whatever ``QDA`` is bound to in this
        # module providing ``fit_transform`` -- confirm.
        model = QDA()
        needs_labels = True
    else:
        print(ord_type)
        raise SystemExit("No ordination of that name is implemented. Exiting...")

    if needs_labels:
        principal_components = model.fit_transform(samples_matrix(data_frame), y=y)
    else:
        principal_components = model.fit_transform(samples_matrix(data_frame))

    def project(frame):
        # Project an optional held-out frame; returns None when there is
        # nothing to project or the estimator cannot transform new data.
        if frame is None:
            return None
        if not hasattr(model, "transform"):
            print("The fitted ordination cannot project new data; skipping the extra data set.")
            return None
        projected = pd.DataFrame(data=model.transform(samples_matrix(frame)), columns=component_columns)
        frame_labels = pd.DataFrame(np.array(frame.columns, dtype=str))
        return pd.concat([projected, frame_labels], axis=1)

    # Each set is handled independently and carries its OWN labels
    # (the test labels used to be read from df_valid).
    pcs_valid = project(df_valid)
    pcs_test = project(df_test)

    var_fig = None
    coeff = factors = means_ratio = None
    if is_pca:
        ev = model.explained_variance_ratio_
        # NOTE(review): the curve below is derived from the per-feature
        # means (``mean_``), not from variances -- confirm that this is the
        # intended quantity for a "variance explained" plot.
        means = model.mean_
        if sum(means < 0):
            means = means - min(means)
        means_ratio = means / np.sum(means) * 100
        coeff = np.transpose(model.components_)
        order_importance = np.argsort(means)[::-1]
        coeff, means_ratio = coeff[order_importance], means_ratio[order_importance]
        factors = np.array(data_frame.index)[order_importance]

        # Dedicated figure, so nothing is drawn onto a figure that happened
        # to be current; it is closed at the end of the function.
        var_fig = plt.figure()
        var_ax = var_fig.add_subplot(111)
        var_ax.set_xlabel("Initial Features")
        var_ax.set_ylabel("% of varaince explained")
        var_ax.set_title("% of the variance is explained by the initial features (Total:" + str(np.round(np.sum(ev) * 100, 2)) + ")")
        var_ax.set_xticks([0])
        var_ax.set_xticklabels([factors[0]], rotation=45, fontsize=8)
        var_ax.plot(means_ratio)
        var_fig.tight_layout()
        var_fig.savefig(type_images_folder_path + info + "_" + str(epoch) + "_var_exaplined_2D.png", dpi=100)
        print("plot at ", type_images_folder_path)

    principal_df = pd.DataFrame(data=principal_components, columns=component_columns)
    # Column 0 of final_df holds the class label of each sample.
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)

    # Only some estimators expose explained variance; ask the fitted object
    # instead of doing a substring test on ``ord_type``.
    ev = getattr(model, "explained_variance_ratio_", None)
    if ev is not None and len(ev) > 1:
        pc1 = pc1 + ': ' + str(np.round(ev[0] * 100, decimals=2)) + "%"
        pc2 = pc2 + ': ' + str(np.round(ev[1] * 100, decimals=2)) + "%"

    ax.set_xlabel(pc1, fontsize=15)
    ax.set_ylabel(pc2, fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)

    colors = ["g", "b", "k", "r"]

    def plot_groups(points, label_suffix, color_points):
        # Draw one ellipse and one scatter per class found in ``points``.
        group_labels = points[0]
        for t, target in enumerate(classes_list):
            mask = list(group_labels == target)
            if not any(mask):
                # A held-out set may lack some classes.
                continue
            data1 = points.loc[mask, 'principal component 1']
            data2 = points.loc[mask, 'principal component 2']
            if np.sum(np.isnan(data1)) != 0 or np.sum(np.isnan(data2)) != 0:
                print("Nans were detected. Please verify the DataFrame...")
                raise SystemExit
            # Cycle the palette so more than four classes do not overflow it.
            color = colors[t % len(colors)]
            ellipse_data(data1, data2, ax, color)
            scatter_kwargs = {"c": color} if color_points else {}
            ax.scatter(data1, data2, s=10, alpha=a, label=str(target) + label_suffix, **scatter_kwargs)

    plot_groups(final_df, "", True)
    if pcs_valid is not None:
        plot_groups(pcs_valid, " (valid)", False)
    if pcs_test is not None:
        plot_groups(pcs_test, " (test)", False)

    # Loading arrows exist for PCA only and are drawn once, not per class.
    if coeff is not None:
        print("coeff shape", coeff.shape)
        for i in range(min(n, len(coeff))):
            ax.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            ax.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, str(factors[i]) + str(np.round(means_ratio[i], 2)),
                    color='g', ha='center', va='center')

    # The scatter artists carry their own labels, so a single legend call
    # suffices. ``grid(True)`` is explicit because a bare ``grid()`` toggles
    # and repeated calls switched the grid back off.
    ax.legend()
    ax.grid(True)

    try:
        fig.tight_layout()
    except Exception:
        # Layout tightening is best effort; a failure must not stop the save.
        pass
    fig.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
    if var_fig is not None:
        plt.close(var_fig)
示例#8
0
def ordination2d(data_frame,
                 ORD=PCA,
                 images_folder_path="/home/simon/results/annleukemia/plots/",
                 filename="pca",
                 a=0.5):
    """Project ``data_frame`` onto two components and save a class scatter plot.

    The column labels of ``data_frame`` are used as class labels, so the
    estimator is fitted on the transposed values (one row per labelled
    sample); NaNs are replaced by 0 in a private copy. The image is written
    to ``<images_folder_path><filename>/PCA2d<filename>.png``.

    Args:
        data_frame: ``pandas.DataFrame`` whose column labels are the classes.
        ORD: Estimator class accepting ``n_components`` and providing
            ``fit_transform``.
        images_folder_path: Base directory (expected to end with ``/``).
        filename: Sub-folder name and file-name suffix.
        a: Alpha (opacity) of the scatter points.

    Returns:
        None. A message is printed and nothing is produced when
        ``data_frame`` is not a ``pandas.DataFrame``.

    Raises:
        SystemExit: If NaNs show up in the projected coordinates.
    """
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np

    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        return

    type_images_folder_path = images_folder_path + filename + "/"
    create_missing_folders(type_images_folder_path)

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    # Labels come from the columns, so samples must be the rows of the
    # matrix given to the estimator; fitting the untransposed values left
    # coordinates and labels misaligned unless the frame was square.
    # The copy keeps the caller's frame untouched.
    samples = np.transpose(data_frame.values).copy()
    samples[np.isnan(samples)] = 0

    pca = ORD(n_components=2)
    principalComponents = pca.fit_transform(samples)
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    # Column 0 of finalDf holds the class label of each sample.
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111)
        ax.set_xlabel('Principal Component 1', fontsize=15)
        ax.set_ylabel('Principal Component 2', fontsize=15)
        ax.set_title('2 component PCA', fontsize=20)
        for target in classes_list:
            indicesToKeep = finalDf[0] == target
            data1 = finalDf.loc[indicesToKeep, 'principal component 1']
            data2 = finalDf.loc[indicesToKeep, 'principal component 2']
            if np.sum(np.isnan(data1)) != 0 or np.sum(np.isnan(data2)) != 0:
                print("Nans were detected. Please verify the DataFrame...")
                # Same exception the previous ``exit()`` call raised.
                raise SystemExit
            ellipse_data(data1, data2, ax)

            ax.scatter(data1,
                       data2,
                       s=20,
                       alpha=a,
                       linewidths=0,
                       edgecolors='none',
                       label=str(target))
        # The scatter artists carry their own labels, so one legend call is
        # enough; the former second call replaced the legend with an empty one.
        ax.legend()
        ax.grid(True)

        fig.tight_layout()
        fig.savefig(type_images_folder_path + "PCA2d" + filename + ".png")
    finally:
        plt.close(fig)