def boxplots_genres(scores, results_path, filename="boxplots_genres", offset=100):
    """Save a horizontal box plot of jittered scores, one box per genre.

    ``scores`` is read as ten consecutive slices of ``offset`` values each.
    Every slice is passed through ``rand_jitter``, sorted, and drawn as one
    box labelled with the genre name at the same position.

    :param scores: sliceable sequence of scores, grouped genre by genre.
    :param results_path: base folder; the image goes to ``<results_path>/plots/``.
    :param filename: name of the saved image inside the plots folder.
    :param offset: number of consecutive scores belonging to one genre.
    """
    plot_dir = results_path + "/plots/"
    create_missing_folders(plot_dir)
    fig2, ax21 = plt.subplots()

    # One sorted, jittered list of scores per genre.
    scores_sorted_lists = []
    for i in range(10):
        genre_scores = np.array(scores[i * offset:(i + 1) * offset])
        scores_sorted_lists.append(sorted(rand_jitter(genre_scores)))

    genres = [
        "blues", "classical", "country", "disco", "hiphop",
        "jazz", "metal", "pop", "reggae", "rock",
    ]
    colors = [
        'darkorange', 'cornflowerblue', 'darkviolet', 'chocolate', 'yellowgreen',
        'lightseagreen', 'forestgreen', 'crimson', 'coral', 'wheat',
    ]

    box = ax21.boxplot(scores_sorted_lists, vert=0, patch_artist=True, labels=genres)
    # patch_artist=True makes the boxes fillable patches; colour them one by one.
    for patch, color in zip(box['boxes'], colors):
        patch.set_facecolor(color)

    handle, label = ax21.get_legend_handles_labels()
    ax21.legend(handle, label)
    fig2.tight_layout()
    pylab.savefig(plot_dir + filename)
    plt.close()
def plot_data_distribution(train_scores, valid_scores, results_path, filename="scores_data_distribution"):
    """Plot the sorted train and validation scores together with their means.

    Both score collections are sorted and drawn as dashed curves. A horizontal
    line marks the mean of each collection. The figure is written to
    ``<results_path>/plots/<filename>``.

    :param train_scores: sized iterable of training scores.
    :param valid_scores: sized iterable of validation scores.
    :param results_path: base folder; the image goes to ``<results_path>/plots/``.
    :param filename: name of the saved image inside the plots folder.
    """
    plot_dir = results_path + "/plots/"
    create_missing_folders(plot_dir)
    fig2, ax21 = plt.subplots()
    try:
        scores_train = sorted(train_scores)
        scores_valid = sorted(valid_scores)
        ax21.plot(scores_train, 'b--', label='Train')
        ax21.plot(scores_valid, 'r--', label='Valid')

        # The mean lines follow the extent of the plotted curves instead of a
        # fixed x-range of 900, which only suited one particular dataset size.
        last_index = max(len(scores_train), len(scores_valid), 1) - 1
        if scores_train:
            ax21.hlines(np.mean(scores_train), xmin=0, xmax=last_index, colors='b', label='Train mean')
        if scores_valid:
            ax21.hlines(np.mean(scores_valid), xmin=0, xmax=last_index, colors='r', label='Valid mean')

        # NOTE(review): the x axis is the rank of each sorted score and the
        # y axis is the score itself; the labels below are kept unchanged for
        # backward compatibility but look copied from a loss plot - confirm.
        ax21.set_xlabel('epochs')
        ax21.set_ylabel('Loss')
        handle, label = ax21.get_legend_handles_labels()
        ax21.legend(handle, label)
        fig2.tight_layout()
        fig2.savefig(plot_dir + filename)
    finally:
        # Close this figure explicitly so it is released even if plotting fails.
        plt.close(fig2)
def QDA(data_frame, images_folder_path, dataset_name, epoch, a=0.5, verbose=0, info="none", show_images=True):
    """Fit LDA and QDA on ``data_frame`` and save the comparison plot.

    Columns of ``data_frame`` are treated as samples (the values are transposed
    before fitting) and the column names are the class labels. NaNs in
    ``data_frame`` are replaced by 0 in place. The image is saved as
    ``<images_folder_path>/<dataset_name>/<info>/<info>_<epoch>.png``.

    :param data_frame: pandas DataFrame; any other type prints a message and
        terminates the program through ``SystemExit``.
    :param images_folder_path: base folder for the saved image.
    :param dataset_name: name, or list of names (joined with ``_``), of the data.
    :param epoch: value appended to the image file name.
    :param a: unused; kept for signature compatibility.
    :param verbose: unused; kept for signature compatibility.
    :param info: sub-folder name and file-name prefix.
    :param show_images: when true, ``plt.show()`` is called after saving.
    """
    import sys

    import pandas as pd
    import numpy as np

    # Explicit check instead of ``assert``: assertions vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        print(type(data_frame))
        sys.exit()

    if isinstance(dataset_name, list):
        dataset_name = "_".join(str(name) for name in dataset_name)

    y = np.array(data_frame.columns, dtype=str)
    # ``data_frame.values`` can be a copy, in which case writing zeros into it
    # is lost; ``fillna`` reliably replaces the NaNs in the frame itself.
    data_frame.fillna(0, inplace=True)
    X = np.transpose(data_frame.values)

    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred)
    plot_qda_cov(qda, splot)
    plt.axis('tight')

    # The two adjacent literals previously concatenated without a space.
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
    try:
        plt.tight_layout()
    except Exception:
        # Layout tightening is cosmetic; a failure here must not lose the plot.
        pass

    type_images_folder_path = "/".join([images_folder_path, str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)
    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close()
def performance_per_score(predicted_values, target_values, results_path, n, filename="scores_performance",
                          valid=False, noise=False):
    """Save a scatter plot of predicted values against target values.

    Besides the scatter, the plot shows the mean of the predictions, the mean
    of the targets, a degree-1 least-squares fit and the identity line between
    0 and 1. The figure is written to ``<results_path>/plots/<filename>``.

    :param predicted_values: array-like of predictions (y axis).
    :param target_values: array-like of targets (x axis).
    :param results_path: base folder; the image goes to ``<results_path>/plots/``.
    :param n: not used by this function.
    :param filename: name of the saved image inside the plots folder.
    :param valid: not used by this function.
    :param noise: when false, the targets are passed through ``rand_jitter``
        before being scattered; when true they are plotted as given.
    """
    plot_dir = results_path + "/plots/"
    create_missing_folders(plot_dir)
    fig2, ax21 = plt.subplots()

    predicted_values = np.array(predicted_values)
    target_values = np.array(target_values)

    scatter_x = target_values if noise else rand_jitter(target_values)
    plt.scatter(scatter_x, predicted_values, facecolors='none', edgecolors="r")

    ax21.hlines(np.mean(predicted_values), xmin=0, xmax=1, colors='b', label='Predicted values average')
    ax21.hlines(np.mean(target_values), xmin=0, xmax=1, colors='k', label='Target values average')

    # Straight-line fit of predictions on targets, evaluated at each distinct target.
    distinct_targets = np.unique(target_values)
    best_fit = np.poly1d(np.polyfit(target_values, predicted_values, 1))
    plt.plot(distinct_targets, best_fit(distinct_targets), label="Best fit")

    ident = [0.0, 1.0]
    ax21.plot(ident, ident, color="g", label='Identity line')

    handle, label = ax21.get_legend_handles_labels()
    ax21.legend(handle, label)
    fig2.tight_layout()
    pylab.savefig(plot_dir + filename)
    plt.close()
def ordination2d(data_frame, ORD=PCA, images_folder_path="/home/simon/results/annleukemia/plots/", filenames="NoName",
                 a=0.5):
    """Project the samples of ``data_frame`` onto two components and save a scatter plot.

    Columns of ``data_frame`` are treated as samples (the values are transposed
    before fitting) and the column names are used as class labels. The image
    is saved inside ``images_folder_path + filenames + "/"`` under the name
    ``<ORD name><filenames>.png``.

    NOTE(review): this source contains other definitions named
    ``ordination2d``; if they live in the same module, the last one wins.

    :param data_frame: pandas DataFrame; any other type prints a message and
        makes the function return without plotting.
    :param ORD: ordination class, instantiated with ``n_components=2``.
    :param images_folder_path: base output path.
    :param filenames: name of the output sub-folder, also part of the file name.
    :param a: not used by this function.
    :return: None.
    """
    type_images_folder_path = images_folder_path + filenames + "/"
    create_missing_folders(type_images_folder_path)

    # Explicit check instead of ``assert``: assertions vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        return

    # ``type_ord`` was referenced but never defined; derive it from the class.
    type_ord = getattr(ORD, "__name__", str(ORD))

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    try:
        model = ORD(n_components=2, verbose=1)
    except TypeError:
        # Not every ordination class accepts ``verbose``.
        model = ORD(n_components=2)
    principalComponents = model.fit_transform(np.transpose(data_frame.values))
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111)
        ax.set_xlabel('Principal Component 1', fontsize=15)
        ax.set_ylabel('Principal Component 2', fontsize=15)
        ax.set_title('2 component ' + type_ord, fontsize=20)

        base_colors = ['r', 'g', 'b']
        handles = []
        for position, target in enumerate(classes_list):
            # Classes beyond the third fall back to matplotlib's colour cycle
            # instead of being dropped silently.
            if position < len(base_colors):
                color = base_colors[position]
            else:
                color = "C" + str(position % 10)
            indicesToKeep = finalDf[0] == target
            data1 = finalDf.loc[indicesToKeep, 'principal component 1']
            data2 = finalDf.loc[indicesToKeep, 'principal component 2']
            ellipse_data(data1, data2, ax, color)
            handles.append(ax.scatter(data1, data2, c=color, s=12))

        # Pair the scatter handles with the class names directly; the scatters
        # carry no labels, so querying the axes for labelled handles finds none.
        ax.legend(handles, classes_list)
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(type_images_folder_path + type_ord + filenames + ".png")
    finally:
        plt.close(fig)
def plot_performance(running_loss, valid_loss, results_path, filename):
    """Plot the training and validation loss curves and save the figure.

    The image is written to ``<results_path>/plots/<filename>``. A failure to
    save is reported on stdout but does not raise, so a training run is not
    interrupted by a plotting problem.

    :param running_loss: sequence of training losses, one per epoch.
    :param valid_loss: sequence of validation losses, one per epoch.
    :param results_path: base folder; the image goes to ``<results_path>/plots/``.
    :param filename: name of the saved image inside the plots folder.
    """
    plot_dir = results_path + "/plots/"
    create_missing_folders(plot_dir)
    fig2, ax21 = plt.subplots()
    try:
        ax21.plot(running_loss, 'b-', label='Train')
        ax21.plot(valid_loss, 'r-', label='Valid')
        ax21.set_xlabel('epochs')
        ax21.set_ylabel('Loss')
        handle, label = ax21.get_legend_handles_labels()
        ax21.legend(handle, label)
        fig2.tight_layout()
        try:
            fig2.savefig(plot_dir + filename)
        except Exception as err:
            # Saving stays best-effort, but the failure is no longer silent.
            print("Could not save plot " + plot_dir + str(filename) + ": " + str(err))
    finally:
        # Release the figure whether or not plotting and saving succeeded.
        plt.close(fig2)
def ordination2d(data_frame, ord_type, images_folder_path, dataset_name, epoch, a=0.4, verbose=0, info="none",
                 show_images=True, df_valid=None, df_test=None, n=4):
    """Fit a 2-component ordination on ``data_frame`` and save a scatter plot of it.

    Columns of ``data_frame`` are treated as samples (the values are transposed
    before fitting) and the column names are the class labels. NaNs in
    ``data_frame`` are replaced by 0 in place. Images are written to
    ``<images_folder_path>/<ord_type>/<dataset_name>/<info>/``:
    ``<info>_<epoch>.png`` for the ordination and, for PCA only,
    ``<info>_<epoch>_var_exaplined_2D.png`` for the per-feature plot.

    NOTE(review): this source contains other definitions named
    ``ordination2d``; if they live in the same module, the last one wins.

    :param data_frame: pandas DataFrame; any other type prints a message and
        terminates the program through ``SystemExit``.
    :param ord_type: ordination name: pca, kpca, tsne / t-sne, lda / flda or
        qda, in any letter case. Other names terminate the program.
    :param images_folder_path: base folder for the saved images.
    :param dataset_name: name, or list of names (joined with ``_``), of the data.
    :param epoch: value appended to the image file names.
    :param a: alpha (opacity) of the scatter points.
    :param verbose: verbosity forwarded to t-SNE.
    :param info: sub-folder name and file-name prefix.
    :param show_images: when true, ``plt.show()`` is called after saving.
    :param df_valid: optional held-out frame drawn with triangle markers;
        assumed to have the same layout as ``data_frame`` - TODO confirm.
    :param df_test: optional held-out frame drawn with square markers; same
        layout assumption as ``df_valid``.
    :param n: maximum number of feature-loading arrows drawn for PCA.
    """
    import sys

    import pandas as pd
    import numpy as np

    pc1 = 'Component_1'
    pc2 = 'Component_2'
    component_columns = ['principal component 1', 'principal component 2']

    # Explicit check instead of ``assert``: assertions vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print("The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)")
        print(type(data_frame))
        sys.exit()

    # Join a list of names before the output path is built from it.
    if isinstance(dataset_name, list):
        dataset_name = "_".join(str(name) for name in dataset_name)

    type_images_folder_path = "/".join([images_folder_path, str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    # ``data_frame.values`` can be a copy, in which case writing zeros into it
    # is lost; ``fillna`` reliably replaces the NaNs in the frame itself.
    data_frame.fillna(0, inplace=True)

    # Normalise the spelling once so every later test agrees on the type.
    kind = str(ord_type).lower().replace("-", "")
    if kind == "pca":
        supervised = False
        model = PCA(n_components=2)
    elif kind == "kpca":
        supervised = False
        model = KernelPCA(n_components=2, kernel="rbf")
    elif kind == "tsne":
        supervised = False
        model = TSNE(n_components=2, verbose=verbose)
    elif kind in ("lda", "flda"):
        supervised = True
        model = LDA(n_components=2)
    elif kind == "qda":
        # NOTE(review): the model is used through ``fit_transform`` below;
        # verify that whatever ``QDA`` resolves to in this module supports it.
        supervised = True
        model = QDA()
    else:
        print(ord_type)
        sys.exit("No ordination of that name is implemented. Exiting...")

    train_values = np.transpose(data_frame.values)
    if supervised:
        principal_components = model.fit_transform(train_values, y=y)
    else:
        principal_components = model.fit_transform(train_values)

    def project(split_frame):
        """Return the 2-D projection of a held-out frame with its labels in column 0."""
        if split_frame is None:
            return None
        if not hasattr(model, "transform"):
            print(str(ord_type) + " cannot project new samples; held-out data is not plotted")
            return None
        split_values = np.transpose(split_frame.values)
        split_values = np.where(np.isnan(split_values), 0, split_values)
        projected = pd.DataFrame(data=model.transform(split_values), columns=component_columns)
        split_labels = pd.DataFrame(np.array(split_frame.columns, dtype=str))
        return pd.concat([projected, split_labels], axis=1)

    pcs_valid = project(df_valid)
    pcs_test = project(df_test)

    # Feature loadings exist only for PCA; other types skip the arrows.
    coeff = None
    means_ratio = None
    factors = None
    variance_fig = None
    if kind == "pca":
        ev = model.explained_variance_ratio_
        # NOTE(review): the curve plotted below is derived from the per-feature
        # means (``mean_``), although its labels talk about explained variance.
        means = model.mean_
        if sum(means < 0):
            means = means - min(means)
        means_ratio = means / np.sum(np.sum(means, axis=0)) * 100
        coeff = np.transpose(model.components_)
        order_importance = list(reversed(np.argsort(means)))
        coeff, means_ratio = coeff[order_importance], means_ratio[order_importance]
        factors = np.array(data_frame.index)[order_importance]
        # Dedicated figure, so a figure the caller has open is not drawn over.
        variance_fig = plt.figure()
        plt.xlabel("Initial Features")
        plt.ylabel("% of varaince explained")
        plt.title("% of the variance is explained by the initial features (Total:" + str(np.round(np.sum(ev) * 100, 2)) + ")")
        plt.xticks([0], [factors[0]], rotation=45, fontsize=8)
        plt.plot(means_ratio)
        plt.tight_layout()
        plt.savefig(type_images_folder_path + info + "_" + str(epoch) + "_var_exaplined_2D.png", dpi=100)
        print("plot at ", type_images_folder_path)

    principal_df = pd.DataFrame(data=principal_components, columns=component_columns)
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)

    # Annotate the axes only when the fitted model exposes explained variance.
    ev = getattr(model, "explained_variance_ratio_", None)
    if ev is not None and len(ev) > 1:
        pc1 = pc1 + ': ' + str(np.round(ev[0] * 100, decimals=2)) + "%"
        pc2 = pc2 + ': ' + str(np.round(ev[1] * 100, decimals=2)) + "%"
    ax.set_xlabel(pc1, fontsize=15)
    ax.set_ylabel(pc2, fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)

    colors = ["g", "b", "k", "r"]

    def draw_classes(points, marker):
        """Draw one ellipse and one scatter per class; return (handle, label) pairs."""
        drawn = []
        for t, target in enumerate(classes_list):
            # Classes beyond the fourth use matplotlib's default colour cycle.
            color = colors[t] if t < len(colors) else "C" + str(t % 10)
            indices_to_keep = list(points[0] == target)
            data1 = points.loc[indices_to_keep, component_columns[0]]
            data2 = points.loc[indices_to_keep, component_columns[1]]
            if len(data1) == 0:
                # A held-out split may not contain every class.
                continue
            if np.isnan(data1).any() or np.isnan(data2).any():
                print("Nans were detected. Please verify the DataFrame...")
                sys.exit()
            ellipse_data(data1, data2, ax, color)
            handle = ax.scatter(data1, data2, s=10, alpha=a, c=color, marker=marker)
            drawn.append((handle, target))
        return drawn

    train_drawn = draw_classes(final_df, "o")

    if coeff is not None:
        print("coeff shape", coeff.shape)
        for i in range(min(n, len(coeff))):
            ax.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            ax.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, str(factors[i]) + str(np.round(means_ratio[i], 2)),
                    color='g', ha='center', va='center')

    # Held-out samples are selected with their own labels, in the class colour.
    if pcs_valid is not None:
        draw_classes(pcs_valid, "^")
    if pcs_test is not None:
        draw_classes(pcs_test, "s")

    # Pair the training scatter handles with their class names directly.
    ax.legend([handle for handle, _ in train_drawn], [label for _, label in train_drawn])
    # Switch the grid on once; repeated bare ``ax.grid()`` calls toggle it.
    ax.grid(True)

    try:
        fig.tight_layout()
    except Exception:
        # Layout tightening is cosmetic; a failure here must not lose the plot.
        pass
    fig.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
    if variance_fig is not None:
        plt.close(variance_fig)
def ordination2d(data_frame, ORD=PCA, images_folder_path="/home/simon/results/annleukemia/plots/", filename="pca",
                 a=0.5):
    """Project the samples of ``data_frame`` onto two components and save a scatter plot.

    Columns of ``data_frame`` are treated as samples (the values are transposed
    before fitting) and the column names are used as class labels. NaNs in
    ``data_frame`` are replaced by 0 in place. The image is saved as
    ``<images_folder_path><filename>/PCA2d<filename>.png``.

    NOTE(review): this source contains other definitions named
    ``ordination2d``; if they live in the same module, the last one wins.

    :param data_frame: pandas DataFrame; any other type prints a message and
        makes the function return without plotting.
    :param ORD: ordination class, instantiated with ``n_components=2``.
    :param images_folder_path: base output path.
    :param filename: name of the output sub-folder, also part of the file name.
    :param a: alpha (opacity) of the scatter points.
    :return: None.
    """
    import sys

    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np

    type_images_folder_path = images_folder_path + filename + "/"
    create_missing_folders(type_images_folder_path)

    # Explicit check instead of ``assert``: assertions vanish under ``python -O``.
    if not isinstance(data_frame, pd.DataFrame):
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        return

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    pca = ORD(n_components=2)
    # ``data_frame.values`` can be a copy, in which case writing zeros into it
    # is lost; ``fillna`` reliably replaces the NaNs in the frame itself.
    data_frame.fillna(0, inplace=True)
    # One label per column means one sample per column, so the values are
    # transposed to put one sample on each row of the fitted matrix.
    principalComponents = pca.fit_transform(np.transpose(data_frame.values))
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)

    handles = []
    for target in classes_list:
        indicesToKeep = finalDf[0] == target
        data1 = finalDf.loc[indicesToKeep, 'principal component 1']
        data2 = finalDf.loc[indicesToKeep, 'principal component 2']
        if np.isnan(data1).any() or np.isnan(data2).any():
            print("Nans were detected. Please verify the DataFrame...")
            sys.exit()
        ellipse_data(data1, data2, ax)
        handles.append(ax.scatter(data1, data2, s=20, alpha=a, linewidths=0, edgecolors='none'))

    # Pair the scatter handles with the class names directly; the scatters
    # carry no labels, so querying the axes for labelled handles finds none.
    ax.legend(handles, classes_list)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(type_images_folder_path + "PCA2d" + filename + ".png")
    plt.close(fig)