def extract_features(extraction_type, n_components):
    """Build the feature extractor selected by *extraction_type*.

    Parameters
    ----------
    extraction_type
        ``'pca'`` selects ``PCA``; ``'lda'`` selects ``LDA``.
    n_components
        Forwarded as ``n_components`` to ``PCA`` or as ``n_discriminants``
        to ``LDA``.

    Returns
    -------
    The unfitted extractor, or ``None`` (after printing a hint) when
    *extraction_type* is neither ``'pca'`` nor ``'lda'``.
    """
    if extraction_type == 'pca':
        return PCA(n_components=n_components)
    if extraction_type == 'lda':
        return LDA(n_discriminants=n_components)

    # Unknown method: tell the user and fall through to None.
    print("Input a valid method for (PCA or LDA)\n")
    return None
def extract_features(tipo, n):
    """Return the feature extractor named by *tipo*.

    Parameters
    ----------
    tipo
        Extraction method: ``'pca'`` or ``'lda'``.
    n
        Passed to ``PCA`` as ``n_components`` or to ``LDA`` as
        ``n_discriminants``.

    Returns
    -------
    The unfitted extractor, or ``None`` when *tipo* is not a supported
    method (a message is printed in that case).
    """
    # Reject unsupported methods up front; the message text is user-facing.
    if tipo not in ('pca', 'lda'):
        print("Ingrese un método válido (pca o lda)\n")
        return None

    extractor = PCA(n_components=n) if tipo == 'pca' else LDA(n_discriminants=n)
    return extractor
# Keys of ``holder`` that plot_projections projects, in panel order.
_PROJECTION_FINGERPRINTS = (
    'fps_e3fp_1024bit',
    'fps_morgan_1024bit',
    'fps_topo_1024bit',
    'fps_infomax_new',
    'fps_VAE_256bit_new',
    'fps_VAE_16bit_new',
    'fps_transformer_1024bit_new',
    'fps_transformer_64bit_new',
    'fps_gae_64bit_new',
)

# Panel titles, index-aligned with _PROJECTION_FINGERPRINTS.
# NOTE(review): 'Morgan_300' is paired with 'fps_morgan_1024bit' -- confirm
# the title is intended.
_PROJECTION_PANEL_NAMES = (
    'E3FP',
    'Morgan_300',
    'Topo_1024',
    'Infomax',
    'VAE_256',
    'VAE_16',
    'Trans_1024',
    'Trans_64',
    'GAE_64',
)


def _labelled_fingerprints(holder, key, labels):
    """Copy ``holder[key]``, keeping labelled rows and the last of each duplicated index."""
    table = holder[key].copy()
    table = table.loc[table.index.isin(labels.keys())]
    return table[~table.index.duplicated(keep='last')]


def _pca_projection(table, labels, standardize):
    """Project *table* onto two principal components and attach a 'classes' column."""
    from sklearn.decomposition import PCA

    # Remember the row ids: standardisation below replaces the index.
    ids = table.index.copy()
    if standardize:
        from mlxtend.preprocessing import standardize as st
        table = st(table.reset_index(drop=True))

    coords = PCA(n_components=2).fit_transform(table)
    projected = pd.DataFrame(index=table.index, data=coords)
    projected['classes'] = ids
    projected['classes'] = projected['classes'].map(labels)
    return projected


def _lda_projection(table, labels, class_name, binarize_class, standardize,
                    preprocess_lda):
    """Project *table* onto two linear discriminants and attach a 'classes' column."""
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
    from sklearn.preprocessing import LabelEncoder

    # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366
    ids = table.index.copy()
    if standardize:
        from sklearn.preprocessing import MinMaxScaler
        table = table.reset_index(drop=True)
        # Column names come from the table being scaled (the original read
        # them from the result dict, which has no ``columns`` attribute).
        table = pd.DataFrame(data=MinMaxScaler().fit_transform(table),
                             index=table.index,
                             columns=table.columns)

    table['classes'] = ids
    table['classes'] = table['classes'].map(labels)
    if binarize_class:
        # Collapse every other class into a single "not <class_name>" label.
        table.loc[table.classes != class_name, 'classes'] = 'not ' + class_name

    # Keep an independent copy of the string labels for plotting; the
    # estimator gets integer-encoded labels instead.
    real_classes = table['classes'].copy()
    target = LabelEncoder().fit_transform(table.pop('classes'))

    features = table.values
    if preprocess_lda == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        reduced = pls.fit_transform(features, target)[0]
    elif preprocess_lda == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        reduced = pca.fit_transform(features)
    elif preprocess_lda == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        pca = KernelPCA(kernel="rbf", gamma=5)
        reduced = pca.fit_transform(features)
    elif preprocess_lda == 'NONE':
        reduced = features
    else:
        raise ValueError(
            "preprocess_lda must be one of 'PLS', 'PCA', 'kernelPCA' or "
            f"'NONE', got {preprocess_lda!r}")

    lda = LDA(n_discriminants=2)
    lda.fit(reduced, target)
    coords = lda.transform(reduced)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            'Casting complex values to real discards the imaginary part')
        # The projection may come back complex; keep only the real part.
        coords = coords.astype(float)

    projected = pd.DataFrame(index=table.index, columns=[0, 1], data=coords)
    projected['classes'] = real_classes
    return projected


def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''Plot 2-D PCA or LDA projections of nine fingerprint tables in a 3x3 grid.

    Parameters
    ----------
    holder
        Dictionary with DataFrames as values and fingerprint file names as
        keys; every key in the fixed fingerprint list must be present.
    labels
        Mapping of DrugCombID: ATC_class. Only rows whose index is a key of
        this mapping are projected.
    preprocess_lda
        Reduction applied before LDA: 'PLS', 'PCA', 'kernelPCA' or 'NONE'.
        Ignored when ``only_pca`` is true.
    class_name
        Class highlighted in every panel and used to split the points for
        the centroid distance.
    only_pca
        If true, project with a two-component PCA instead of LDA.
    binarize_class
        If true, the LDA path relabels every class other than ``class_name``
        as ``'not ' + class_name`` and the panels draw ``class_name``
        against all points; otherwise each class is drawn separately.
    standardize
        If true, rescale the features first (mlxtend ``standardize`` for the
        PCA path, ``MinMaxScaler`` for the LDA path).
    cluster
        If true, mark the centroid of ``class_name`` and of the remaining
        points in each panel and append the centroid distance, silhouette
        score and Calinski-Harabasz score to the panel title.
    return_distances
        If true, also return the per-panel scores (populated only when
        ``cluster`` is true).

    Returns
    -------
    ``fig`` when ``return_distances`` is false, otherwise
    ``(fig, distances, sil_scores, chs_scores)`` where the three dicts are
    keyed by ``class_name + ' ' + panel name``.

    Raises
    ------
    ValueError
        If the LDA path is used with an unknown ``preprocess_lda``.
    '''
    df = dict()
    for ind, key in enumerate(_PROJECTION_FINGERPRINTS):
        table = _labelled_fingerprints(holder, key, labels)
        if only_pca:
            df[ind] = _pca_projection(table, labels, standardize)
        else:
            df[ind] = _lda_projection(table, labels, class_name,
                                      binarize_class, standardize,
                                      preprocess_lda)
    title = 'PCA' if only_pca else 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, axes = plt.subplots(3, 3, figsize=(13, 14))
    axes = list(axes.flat)

    # One colour per class; sized for the panel with the most classes so the
    # per-class lookup below cannot run past the end.
    n_colors = max(len(np.unique(p['classes'])) for p in df.values())
    cm = plt.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, n_colors), alpha=0.6)

    distances = dict()
    sil_scores = dict()
    chs_scores = dict()

    if cluster:
        from scipy.spatial.distance import pdist
        from sklearn.cluster import KMeans
        from sklearn.metrics import calinski_harabasz_score as chs
        from sklearn.metrics import silhouette_score as sil

    for ax_n, x, name in zip(axes, df.values(), _PROJECTION_PANEL_NAMES):
        if not binarize_class:
            for color_ind, cls in enumerate(np.unique(x['classes'])):
                color = my_cmap[color_ind]
                marker = '.'
                if cls == class_name:
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == cls, 0],
                    x.loc[x.classes == cls, 1],
                    marker=marker,
                    label=cls +
                    f' (n={str(len(x.loc[x.classes==cls, 0]))}) vs Rest ({str(len(x.loc[x.classes!=cls, 0]))})',
                    color=color)
            ax_n.title.set_text(name)
        else:
            # All points first, then the target class drawn on top.
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name +
                f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)

        if cluster:
            # A single-cluster KMeans is used to obtain each group's centroid.
            km = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km.fit(x.loc[x.classes != class_name, [0, 1]])
            km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km1.fit(x.loc[x.classes == class_name, [0, 1]])
            ax_n.scatter(km.cluster_centers_[:, 0],
                         km.cluster_centers_[:, 1],
                         marker='X',
                         color='darkblue',
                         s=100,
                         linewidth=3)
            ax_n.scatter(km1.cluster_centers_[:, 0],
                         km1.cluster_centers_[:, 1],
                         marker='X',
                         color='red',
                         s=100,
                         linewidth=3)
            d = round(
                pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                      metric='euclidean')[0], 3)
            d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
            d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
            if return_distances:
                cl_name = class_name + ' ' + name
                distances[cl_name] = d
                sil_scores[cl_name] = d_sc
                chs_scores[cl_name] = d_chs
            name = name + '\n|d:' + str(d) + '|sil:' + str(
                d_sc) + '|chs:' + str(d_chs)
            ax_n.title.set_text(name)

    for ax in axes:
        ax.set_xticks([])
        ax.set_yticks([])

    # The figure title reuses the first legend label of the last panel.
    legend_labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()

    if not return_distances:
        return fig
    return fig, distances, sil_scores, chs_scores
def test_fail_array_transform():
    """Fit on the full data, then call ``transform`` on the single item ``X[1]``."""
    model = LDA()
    model.fit(X, y)
    # NOTE(review): nothing here asserts that an exception is raised --
    # presumably a raising decorator wraps this test; confirm.
    exp = model.transform(X[1])
def test_fail_array_fit():
    """Call ``fit`` with the single items ``X[1]`` and ``y[1]``."""
    model = LDA()
    # NOTE(review): nothing here asserts that an exception is raised --
    # presumably a raising decorator wraps this test; confirm.
    model.fit(X[1], y[1])
def test_evals():
    """Check the eigenvalues stored in ``e_vals_`` after a two-discriminant fit."""
    model = LDA(n_discriminants=2)
    # The projection itself is not inspected; only the eigenvalues are.
    res = model.fit(X, y).transform(X)

    # Print the eigenvalues without scientific notation.
    np.set_printoptions(suppress=True)
    print('%s' % model.e_vals_)

    expected = [20.90, 0.14, 0.0, 0.0]
    assert_almost_equal(model.e_vals_, expected, decimal=2)
def test_default_components():
    """Build an LDA with ``n_discriminants=0``, then fit and transform ``X``."""
    # NOTE(review): no assertion follows and the second fit() omits ``y`` --
    # presumably this test expects an exception via a decorator; confirm.
    model = LDA(n_discriminants=0)
    model.fit(X, y)
    refit = model.fit(X)
    res = refit.transform(X)
def test_default_2components():
    """An LDA built with ``n_discriminants=2`` projects ``X`` onto two columns."""
    model = LDA(n_discriminants=2)
    model.fit(X, y)
    # Fit again and project in one chained call.
    projected = model.fit(X, y).transform(X)
    n_columns = projected.shape[1]
    assert n_columns == 2
def test_default_components():
    """With no ``n_discriminants`` given, the projection of ``X`` has four columns."""
    lda = LDA()
    # LDA is supervised, so fit() must receive the class labels; the
    # original second call omitted ``y``. One fit is enough.
    res = lda.fit(X, y).transform(X)
    assert res.shape[1] == 4
def test_fail_array_transform():
    """``transform`` must raise ``ValueError`` when given the single item ``X[1]``."""
    model = LDA()
    model.fit(X, y)
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        model.transform(X[1])
def test_fail_array_fit():
    """``fit`` must raise ``ValueError`` when given the single items ``X[1]``, ``y[1]``."""
    model = LDA()
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        model.fit(X[1], y[1])
def test_default_0components():
    """Constructing an LDA with ``n_discriminants=0`` must raise ``AttributeError``."""
    expect_attribute_error = pytest.raises(AttributeError)
    with expect_attribute_error:
        LDA(n_discriminants=0)