def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"):
    """Project the model's topics into 2-D with PCA and save a scatter plot.

    Each topic is vectorized, reduced to two principal components, and
    plotted with a label containing the topic index and the first word of
    that topic found in `query` (empty string when nothing matches).

    Args:
        query: iterable of words to look for in each topic.
        n_topics: number of topics to export.
        n_words: number of words per topic passed to the vectorizer.
        title: plot title.
        fname: output filename for the saved figure.
    """
    vec = DictVectorizer()
    rows = topics_to_vectorspace(self.model, n_topics, n_words)
    X = vec.fit_transform(rows)
    pca = skPCA(n_components=2)
    X_pca = pca.fit(X.toarray()).transform(X.toarray())

    # For every topic record the first word that appears in `query`.
    # BUG FIX: the original appended nothing when no word matched, which
    # left `match` shorter than n_topics and crashed at match[i] below.
    match = []
    for i in range(n_topics):
        topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))]
        match.append(next((word for word in topic if word in query), ''))

    pyplot.figure()
    for i in range(X_pca.shape[0]):
        pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
        pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]]))
    pyplot.title(title)
    pyplot.savefig(fname)
    pyplot.close()
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fit cuML and scikit-learn PCA on the same data and compare attributes."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500, n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    # Reference fit with scikit-learn.
    skpca = skPCA(n_components=2)
    skpca.fit(X)

    # cuML fit on the requested handle/stream.
    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    attrs = ['singular_values_', 'components_',
             'explained_variance_', 'explained_variance_ratio_']
    for attr in attrs:
        # Component signs are solver-dependent, so ignore them there.
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def pca_comparison_kropt():
    """Plot side-by-side 2-D projections of the Kropt dataset using the
    custom PCA, scikit-learn PCA and scikit-learn IncrementalPCA."""
    X, y = datasets.load_kropt()

    # Three transformations of the same data, in fixed order.
    custom = PCA(n_components=2, verbose=True)
    X_trans1 = custom.fit_transform(X)
    sk = skPCA(n_components=2)
    X_trans2 = sk.fit_transform(X)
    incremental = IncrementalPCA(n_components=2, batch_size=5000)
    X_trans3 = incremental.fit_transform(X)

    projections = [X_trans1, X_trans2, X_trans3]
    titles = ['Custom PCA Kropt', 'Sklearn PCA Kropt', 'Sklearn IPCA Kropt']

    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    for axis, points, label in zip(ax, projections, titles):
        axis.scatter(points[:, 0], points[:, 1])
        axis.title.set_text(label)
        axis.set_xlabel('PC1')
        axis.set_ylabel('PC2')
    plt.show()
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter cuML PCA should mirror scikit-learn's defaults."""
    if sparse:
        X = cupyx.scipy.sparse.random(n_samples, n_features, density=0.03,
                                      dtype=cp.float32, random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2, n_labels=1,
                                              random_state=1)

    cu_model = cuPCA()
    cu_model.fit(X)
    cu_out = cu_model.transform(X)
    cu_model.handle.sync()

    if sparse:
        # scikit-learn needs a dense host array.
        X = X.toarray().get()
    sk_model = skPCA()
    sk_model.fit(X)
    sk_out = sk_model.transform(X)

    assert sk_model.svd_solver == cu_model.svd_solver
    assert cu_model.components_.shape[0] == sk_model.components_.shape[0]
    assert cu_out.shape == sk_out.shape
    assert array_equal(cu_out, sk_out, 1e-3, with_sign=False)
def pca_comparison_satimage():
    """Compare custom PCA against scikit-learn's PCA and IncrementalPCA on SatImage."""
    X, y = datasets.load_satimage()

    # Fit/transform in the same order as the subplots below.
    transforms = [
        ('SatImage PCA', PCA(2, verbose=True).fit_transform(X)),
        ('SatImage sklearn PCA', skPCA(2).fit_transform(X)),
        ('SatImage sklearn IncrementalPCA', IncrementalPCA(2).fit_transform(X)),
    ]

    fig = plt.figure(figsize=(15, 5))
    for pos, (label, points) in enumerate(transforms, start=1):
        ax = fig.add_subplot(1, 3, pos)
        ax.set_title(label)
        ax.plot(points[:, 0], points[:, 1], 'o')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
    plt.show()
def pca_comparison_credita():
    """Show 2-component projections of Credit-A from three PCA variants."""
    X, y = datasets.load_credita()

    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    plt.subplots_adjust(bottom=.10, left=.05, top=.90, right=.95)

    # Our implementation.
    X_trans1 = PCA(2, verbose=True).fit_transform(X)
    ax[0].scatter(X_trans1[:, 0], X_trans1[:, 1])
    ax[0].title.set_text('2-component PCA on Credit-A')

    # scikit-learn's batch PCA.
    X_trans2 = skPCA(2).fit_transform(X)
    ax[1].scatter(X_trans2[:, 0], X_trans2[:, 1])
    ax[1].title.set_text('2-component PCA (sklearn) on Credit-A')

    # scikit-learn's incremental variant.
    X_trans3 = IncrementalPCA(2).fit_transform(X)
    ax[2].scatter(X_trans3[:, 0], X_trans3[:, 1])
    ax[2].title.set_text('2-component IncrementalPCA (sklearn) on Credit-A')

    plt.show()
def test_pca_fit(datatype, input_type):
    """cuML PCA attributes should match scikit-learn on a tiny 2-D dataset."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        # Same data as X, delivered column by column as a cuDF frame.
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cupca.fit(gdf)
    else:
        cupca.fit(X)

    attrs = ('singular_values_', 'components_', 'explained_variance_',
             'explained_variance_ratio_', 'noise_variance_')
    for attr in attrs:
        # Sign of components is arbitrary.
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        # Bring device-side results back to host.
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """cuML fit_transform should agree with scikit-learn (blobs is cuML-only)."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X, Y = make_multilabel_classification(n_samples=500, n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    compare = name != 'blobs'  # blobs is too large for the sklearn reference
    if compare:
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if compare:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def run_pca(X, n_components, svd_solver, whiten, random_state, model):
    """Fit and apply a PCA implementation selected by `model`.

    Args:
        X: input data matrix.
        n_components: number of principal components to keep.
        svd_solver: solver name (not used by the h2o4gpu backend).
        whiten: whether to whiten the output.
        random_state: seed (not used by the h2o4gpu backend).
        model: one of 'sklearn', 'h2o4gpu' or 'cuml'.

    Returns:
        The fitted estimator, with the transformed data attached as the
        `transformed_result` attribute.

    Raises:
        NotImplementedError: for an unknown `model` name.
    """
    if model == 'sklearn':
        pca = skPCA(n_components=n_components, svd_solver=svd_solver,
                    whiten=whiten, random_state=random_state)
    elif model == 'h2o4gpu':
        from h2o4gpu.solvers.pca import PCAH2O as h2oPCA
        # h2o4gpu's PCA does not accept a random_state.
        pca = h2oPCA(n_components=n_components, whiten=whiten)
    elif model == 'cuml':
        from cuSKL import PCA as cumlPCA
        pca = cumlPCA(n_components=n_components, svd_solver=svd_solver,
                      whiten=whiten, random_state=random_state)
    else:
        raise NotImplementedError

    @timer
    def fit_(pca, X, model):
        pca.fit(X)
        return pca

    @timer
    def transform_(pca, X, model):
        return pca.transform(X)

    pca = fit_(pca, X, model=model)
    Xpca = transform_(pca, X, model=model)
    # FIX: removed the pointless `pca.transformed_result = lambda: None`,
    # which was immediately overwritten by the setattr below.
    setattr(pca, 'transformed_result', Xpca)
    return pca
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """Check cuML fit_transform against scikit-learn for several inputs."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    compare = name != 'blobs'  # blobs is too big to cross-check
    if compare:
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        # Route the data through pandas into a cuDF DataFrame.
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if compare:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter cuML PCA should mirror scikit-learn, dense or sparse."""
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if sparse:
        X = cupyx.scipy.sparse.random(n_samples, n_features, density=0.03,
                                      dtype=cp.float32, random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2, n_labels=1,
                                              random_state=1)

    cu_model = cuPCA()
    cu_model.fit(X)
    cu_out = cu_model.transform(X)
    cu_model.handle.sync()

    if sparse:
        # scikit-learn works on dense host arrays only.
        X = X.toarray().get()
    sk_model = skPCA()
    sk_model.fit(X)
    sk_out = sk_model.transform(X)

    assert sk_model.svd_solver == cu_model.svd_solver
    assert cu_model.components_.shape[0] == sk_model.components_.shape[0]
    assert cu_out.shape == sk_out.shape
    assert array_equal(cu_out, sk_out, 1e-3, with_sign=False)
def runPCA(log_name):
    """Run PCA followed by repeated k-means and log evaluation metrics.

    Standardises the globally loaded `df`, projects it onto `numClass`
    principal components, clusters the projection NUM_RUN times with
    randomly initialised k-means, keeps the better half of the runs by
    inertia, and writes evaluation results to '<log_name>_PCA.txt'.
    """
    log_name = log_name + '_PCA.txt'
    # FIX: manage the log file with a context manager so it is closed
    # even if clustering or evaluation raises.
    with open(log_name, 'w') as f:
        # Preprocess: split off the label column, standardise features.
        X = df.iloc[:, 0:-1]
        y = df.iloc[:, -1]
        X_scaled = skPreprocessing.scale(X)

        # Problem dimensions.
        numClass = len(np.unique(y))
        numDataPnt = X_scaled.shape[0]
        numDimen = X_scaled.shape[1]

        # PCA down to one component per class, then repeated k-means.
        start_time = time.time()
        pca = skPCA(n_components=numClass)
        X_pca = pca.fit(X_scaled).transform(X_scaled)
        costs = np.zeros(NUM_RUN)
        labels_pred = np.zeros((NUM_RUN, numDataPnt))
        labels_half = np.zeros((int(NUM_RUN / 2), numDataPnt))
        for i in range(NUM_RUN):
            kmeans_model = skCluster.KMeans(n_clusters=numClass,
                                            init='random').fit(X_pca)
            costs[i] = kmeans_model.inertia_
            labels_pred[i] = kmeans_model.labels_
        end_time = time.time()

        # Keep only the runs with below-median cost.
        labels_half = removeHalfUpperCosts(costs, labels_pred, numDataPnt)

        # Evaluate and log.
        runEvalMetrics(X_pca, labels_true=y, labels_pred=labels_half, f=f)
        f.write("\n")
        f.write("# of Class : %d, # of Data Points : %d, # of Dimensions : %d \n"
                % (numClass, numDataPnt, numDimen))
        f.write("Shape of X [%d %d]" % (X_pca.shape[0], X_pca.shape[1]))
        f.write('\n')
        f.write("Clustering took %.2f s\n" % (end_time - start_time))
def WJldaTest(images, labels, j, R, P, k, I, pcaAcc, name=''):
    """Fit a PCA (retaining `pcaAcc` of the signal) followed by LDA.

    Feature vectors come from FJ(...), are reduced with scikit-learn's
    PCA, then discriminated with scikit-learn's LDA.

    Returns:
        (ipca, ilda): the fitted PCA and LDA models, or (None, None) when
        PCA leaves a single dimension or the LDA fit fails.
    """
    fj = FJ(images, j, R, P, k, I, name)
    ipca = skPCA(n_components=pcaAcc)
    ipca.fit(fj)
    fj2 = ipca.transform(fj)
    print fj2.shape  # Python 2 print statement
    # LDA needs more than one input dimension.
    if fj2.shape[1] == 1:
        return (None, None)
    ilda = skLDA()
    #ilda = skLDA(solver='eigen')
    try:
        ilda.fit(fj2, labels)
    except Exception as e:
        # NOTE(review): failure is reported only by printing the index j.
        print j
        return (None, None)
    fj3 = ilda.transform(fj2)
    print fj3.shape
    return (ipca, ilda)
def test(self):
    """Validate our PCA against sklearn.decomposition.PCA on random data."""
    from sklearn.decomposition import PCA as skPCA
    X = np.random.normal(3.2, 5.1, size=(20, 8))
    ours = PCA(3).fit(X)
    reference = skPCA(3).fit(X).transform(X)
    # Components may differ in sign, so compare absolute values.
    self.assertTrue(
        np.allclose(np.abs(ours.transform(X)), np.abs(reference)),
        "Should be equal")
def main():
    # Demo: compare our PCA implementation with scikit-learn's on random data.
    set_printoptions(precision=3, suppress=True)
    X = randn(5, 3)
    pca = PCA(n_components=2)
    print pca.fit_transform(X)  # Python 2 print statements
    print pca.fit(X)
    skpca = skPCA(n_components=2)
    print skpca.fit_transform(X)
    print skpca.components_
def phase3(self, data, trueLabels):
    """Reduce `data` with PCA and IncrementalPCA and save labeled 3-D scatters.

    Returns the two reduced datasets (batch PCA first, incremental second).
    """
    out_dir = Path(self.config["resultsDir"]) / "phase3"
    out_dir.mkdir(exist_ok=True, parents=True)

    batch = skPCA(N_COMPONENTS)
    incremental = IncrementalPCA(N_COMPONENTS)
    reducedData = batch.fit_transform(data)
    iReducedData = incremental.fit_transform(data)

    # One scatter plot per reducer, named after the component count.
    Visualizer.labeledScatter3D(
        reducedData, trueLabels,
        path=out_dir / f"{N_COMPONENTS}_dims_pcaScatter.png")
    Visualizer.labeledScatter3D(
        iReducedData, trueLabels,
        path=out_dir / f"{N_COMPONENTS}_dims_ipcaScatter.png")
    return reducedData, iReducedData
def PCA(feature_array, n_components=None, whiten=True, svd_solver='auto',
        tol=0.0, random_state=None):
    """Principal Component Analysis of a feature set.

    Deduces the orthogonal axes of highest variance in the original
    feature space (each component is guaranteed orthogonal — boon and bane).

    Args:
        feature_array: 2D array (#Sample x #Feature), should be float64
        n_components : How many components are desired, None = all (# input features)
        whiten : Only set to False if you have already whitened the data
        tol : Tolerance on update at each iteration
        svd_solver : Which solver to use, 'auto' intelligently selects
        random_state : Random seed to fix output

    Output:
        components : PCA components (#features, #components)
        weights : Sample weights (#samples , #components)

    >>> import numpy as N
    >>> N.random.seed(0)
    >>> feature_array = N.random.random((1000,10))
    >>> components, weights = PCA(feature_array, n_components = 3)
    >>> components.shape
    (10, 3)
    >>> weights.shape
    (1000, 3)
    """
    from sklearn.decomposition import PCA as skPCA

    model = skPCA(n_components=n_components, whiten=whiten,
                  svd_solver=svd_solver, tol=tol, copy=True,
                  random_state=random_state)
    weights = model.fit(feature_array).transform(feature_array)
    return model.components_.T, weights
def select_components_above_background(expression_values: np.ndarray,
                                       n_permutations: int,
                                       path_name: str = 'pathway'):
    """Count PCA components whose explained variance exceeds a permutation null.

    Fits PCA to the transposed expression matrix, then builds a null
    distribution of explained-variance ratios by shuffling the flattened
    matrix `n_permutations` times and refitting. Each component gets an
    empirical p-value against the null; the cutoff is the first component
    whose p-value is not significant (>= 0.05).

    Args:
        expression_values: 2-D array; assumes rows are features and columns
            are observations (PCA is fit on the transpose) — TODO confirm.
        n_permutations: number of random shuffles for the null distribution.
        path_name: column label for the returned summary frame.

    Returns:
        (n_significant_components, var_df): the component count and a
        one-column DataFrame with the count and percent explained variance.
    """
    pca = skPCA().fit(expression_values.T)

    # Null distribution: shuffle the (copied) flattened matrix and refit.
    expr_flat = expression_values.flatten()
    explained_var_df = pd.DataFrame(
        index=list(range(n_permutations)),
        columns=list(range(expression_values.shape[1])))
    for i in range(n_permutations):
        np.random.shuffle(expr_flat)
        expr_permuted = expr_flat.reshape(expression_values.shape[0],
                                          expression_values.shape[1])
        pca_permuted = skPCA().fit(expr_permuted.T)
        explained_var_df.loc[i] = pca_permuted.explained_variance_ratio_

    # Empirical p-value per component.
    # FIX: 'explaiend_variance_ratio_' was a typo that raised
    # AttributeError on every call.
    pval = list()
    for j in range(expression_values.shape[1]):
        pval.append(
            np.sum(explained_var_df.iloc[:, j]
                   >= pca.explained_variance_ratio_[j]) / n_permutations)

    # First non-significant component marks the cutoff.
    n_significant_components = np.where(np.array(pval) >= 0.05)[0][0]
    explained_var_sign_comp = pca.explained_variance_ratio_[
        0:n_significant_components] * 100
    var_df = pd.DataFrame.from_dict(
        {
            'PCs': int(n_significant_components),
            'explained_var': explained_var_sign_comp
        },
        orient='index',
        columns=[path_name])
    return n_significant_components, var_df
def _pca(mat, dim):
    """
    Wrapper to PCA method, use sklearn (cosine-kernel KernelPCA).
    See: http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

    >>> mat = [[1., 1., 0.], [0., 1., 0.], [1., 0., 0.]]
    >>> ReducePCA._pca(mat, 2)
    array([[ 0.        ,  0.19526215],
           [-0.70710678, -0.09763107],
           [ 0.70710678, -0.09763107]])
    """
    from sklearn.decomposition import KernelPCA as skPCA
    reducer = skPCA(n_components=dim, kernel="cosine")
    return reducer.fit_transform(mat)
def test_pca_defaults(n_samples, n_features):
    """Default cuML PCA should pick the same solver and shapes as scikit-learn."""
    X, Y = make_multilabel_classification(n_samples=n_samples,
                                          n_features=n_features,
                                          n_classes=2, n_labels=1,
                                          random_state=1)

    sk_model = skPCA()
    sk_model.fit(X)

    cu_model = cuPCA()
    cu_model.fit(X)
    cu_model.handle.sync()

    assert sk_model.svd_solver == cu_model.svd_solver
    assert cu_model.components_.shape[0] == sk_model.components_.shape[0]
def test_pca_fit_transform(datatype):
    """fit_transform on a pygdf frame should match sklearn on the ndarray twin."""
    # Same six points, once as a GPU dataframe and once as a numpy array.
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit_transform")
    Xcupca = cuPCA(n_components=2).fit_transform(gdf)
    Xskpca = skPCA(n_components=2).fit_transform(X)
    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=False)
def test_pca(eng):
    """Direct and EM PCA solvers should reproduce scikit-learn up to sign."""
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import PCA as skPCA
    reference = skPCA(n_components=2)
    t1 = reference.fit_transform(x.toarray())
    w1_T = reference.components_

    # Exact (direct SVD) solver: tight agreement expected.
    t2, w2_T = PCA(k=2, svd_method='direct').fit(x)
    assert allclose_sign(w1_T.T, w2_T.T)
    assert allclose_sign(t1, t2)

    # Iterative EM solver: looser tolerance.
    t2, w2_T = PCA(k=2, svd_method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(w1_T.T, w2_T.T, atol=tol)
    assert allclose_sign(t1, t2, atol=tol)
def test_pca_fit(datatype):
    """Fitted attributes of cuPCA on a pygdf frame should match sklearn."""
    # Same six points, once as a GPU dataframe and once as a numpy array.
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cupca = cuPCA(n_components=2)
    cupca.fit(gdf)
    skpca = skPCA(n_components=2)
    skpca.fit(X)

    attrs = ('singular_values_', 'components_', 'explained_variance_',
             'explained_variance_ratio_', 'noise_variance_')
    for attr in attrs:
        # Sign of components is arbitrary.
        with_sign = attr not in ['components_']
        assert array_equal(getattr(cupca, attr), getattr(skpca, attr),
                           1e-3, with_sign=with_sign)
def test_pca_fit(datatype, input_type, name, use_handle):
    """cuML PCA attributes should track scikit-learn within a loose tolerance."""
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    if input_type == 'dataframe':
        # Round-trip the array through pandas into cuDF.
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)
    cupca.handle.sync()

    attrs = ('singular_values_', 'components_', 'explained_variance_',
             'explained_variance_ratio_', 'noise_variance_')
    for attr in attrs:
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        # Move device-side results to host before comparing.
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_fit_transform(datatype, input_type):
    """fit_transform should agree between cuML and sklearn on a tiny dataset."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)
    Xskpca = skPCA(n_components=2).fit_transform(X)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        # Same values, delivered as a cuDF DataFrame.
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        Xcupca = cupca.fit_transform(gdf)
    else:
        Xcupca = cupca.fit_transform(X)

    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=True)
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    """fit followed by transform should match scikit-learn (blobs is cuML-only).

    The blobs dataset is scaled down on smaller GPUs when stress-test
    adaptation is enabled, and skipped entirely otherwise.
    """
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
        else:
            # FIX: the two message fragments concatenated without a space
            # ("...this test.Re-run...").
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, y = make_blobs(n_samples=blobs_n_samples, n_features=1000,
                          random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def return_top_pca_gene(by_cell_matrix, range_genes=None):
    """Rank genes by their combined loading on the first three PCs.

    Args:
        by_cell_matrix: pandas DataFrame; index is genes, columns are cells
            (PCA is fit on the transpose) — assumed from usage, confirm.
        range_genes: optional (start, end) slice into the ranked gene list;
            defaults to the top 100 genes (or fewer for small matrices).

    Returns:
        (cells x selected-genes DataFrame, list of the selected gene names)
    """
    gene_number = 100
    gene_pca = skPCA(n_components=3)
    np_by_gene = np.asarray(by_cell_matrix.transpose())
    gene_index = by_cell_matrix.index.tolist()

    if range_genes is not None:
        start_num = range_genes[0]
        end_num_genes = range_genes[1]
    else:
        start_num = 0
        end_num_genes = min(gene_number, len(gene_index))

    # Fit only — the projected samples themselves were never used.
    gene_pca.fit(np_by_gene)
    Pc_df = pd.DataFrame(gene_pca.components_.T,
                         columns=['PC-1', 'PC-2', 'PC-3'],
                         index=gene_index)

    # Rank genes by summed absolute loadings across the three PCs.
    pca_rank_df = Pc_df.abs().sum(axis=1)
    Pc_sort_df = pca_rank_df.nlargest(len(gene_index))
    top_pca_list = Pc_sort_df.index.tolist()

    # FIX: DataFrame.ix was removed from pandas; use label-based .loc.
    new_cell_matrix = by_cell_matrix.loc[top_pca_list[start_num:end_num_genes], :]
    return new_cell_matrix.transpose(), top_pca_list[start_num:end_num_genes]
def test_pca(self):
    """Distributed PCA should match scikit-learn up to component sign."""
    dataLocal = [
        array([1.0, 1.0, 1.0, 5.0]),
        array([2.0, 3.0, 4.0, 1.0]),
        array([6.0, 0.0, 6.0, 6.0])
    ]
    data = self.sc.parallelize(zip(range(1, 4), dataLocal))
    mat = RowMatrix(data)

    # Distributed fit + transform.
    pca1 = PCA(k=1, svdMethod='direct')
    pca1.fit(mat)
    out1_comps = pca1.comps
    out1_scores = pca1.scores.collectValuesAsArray() * pca1.latent
    out1_transform_scores = pca1.transform(mat).collectValuesAsArray() * pca1.latent

    # Local reference via scikit-learn.
    from sklearn.decomposition import PCA as skPCA
    pca2 = skPCA(n_components=1)
    pca2.fit(array(dataLocal))
    out2_comps = pca2.components_
    out2_scores = pca2.transform(array(dataLocal))

    def same_up_to_sign(a, b):
        # PCA output is defined only up to a global sign flip.
        return allclose(a, b) | allclose(a, -b)

    assert(same_up_to_sign(out1_comps, out2_comps))
    assert(same_up_to_sign(out1_scores, out2_scores))
    assert(allclose(out1_scores, out1_transform_scores))
def _fit_local(self, X):
    """Fit PCA with self.k components on local data.

    Returns:
        (t, w_T): projected scores and the component matrix.
    """
    from sklearn.decomposition import PCA as skPCA
    model = skPCA(n_components=self.k)
    scores = model.fit_transform(X)
    return scores, model.components_
def plot_PCA(df_by_gene, num_genes=100, gene_list_filter=False, title='', plot=False, label_map=False, gene_map = False, annotate=False):
    """Rank genes by 3-component PCA loadings and plot 2-D cell/gene scatters.

    The top `num_genes` genes (by summed absolute loadings on PC1-3) are
    re-projected with two separate 2-component PCAs — one over cells, one
    over genes — and drawn as two stacked scatter plots saved as a PDF
    (into the module-level `filename` directory — assumed; confirm).

    Args:
        df_by_gene: DataFrame with cells as rows, genes as columns —
            assumed from the transposes below; confirm with caller.
        num_genes: number of top-ranked genes to keep.
        gene_list_filter: optional subset of genes to rank within.
        title: plot/file title; empty string selects a default file name.
        plot: when True, also show the figure interactively.
        label_map: optional cell -> (color, marker, label) mapping.
        gene_map: optional gene -> color mapping.
        annotate: when True, annotate each cell point with its name.

    Returns:
        The full ranked gene list, or [] when the cell projection has NaNs.
    """
    gene_list = df_by_gene.columns.tolist()
    sns.set_palette("RdBu_r", 10, 1)
    # Optionally restrict ranking to a gene subset.
    if gene_list_filter:
        sig_by_gene = df_by_gene[gene_list_filter]
        sig_by_cell = sig_by_gene.transpose()
    else:
        sig_by_gene = df_by_gene
        sig_by_cell = sig_by_gene.transpose()
    # Rank all genes by summed absolute loadings on the first 3 PCs.
    gene_pca = skPCA(n_components=3)
    np_by_gene = np.asarray(sig_by_gene)
    by_gene_trans = gene_pca.fit_transform(np_by_gene)
    Pc_df = pd.DataFrame(gene_pca.components_.T, columns=['PC-1', 'PC-2', 'PC-3'], index=sig_by_gene.columns.tolist())
    pca_rank_df = Pc_df.abs().sum(axis=1)
    Pc_sort_df = pca_rank_df.nlargest(len(sig_by_gene.columns.tolist()))
    top_pca_list = Pc_sort_df.index.tolist()
    print(top_pca_list[0:num_genes], 'top_pca_list')
    top_by_gene = df_by_gene[top_pca_list[0:num_genes]]
    # Two independent 2-component projections: one over cells, one over genes.
    gene_top = skPCA(n_components=2)
    cell_pca = skPCA(n_components=2)
    top_by_cell = top_by_gene.transpose()
    np_top_gene = np.asarray(top_by_cell)
    np_top_cell = np.asarray(top_by_gene)
    top_cell_trans = cell_pca.fit_transform(np_top_cell)
    top_gene_trans = gene_top.fit_transform(np_top_gene)
    if not np.isnan(top_cell_trans).any():
        fig, (ax_cell, ax_gene) = plt.subplots(2, 1, figsize=(15, 30), sharex=False)
        rect_cell = ax_cell.patch
        rect_gene = ax_gene.patch
        rect_cell.set_facecolor('white')
        rect_gene.set_facecolor('white')
        ax_cell.grid(b=True, which='major', color='grey', linestyle='--', linewidth=0.3)
        ax_gene.grid(b=True, which='major', color='grey', linestyle='--', linewidth=0.3)
        if label_map:
            # Per-cell styling; each legend label is shown only once.
            X = [x for x in top_cell_trans[:, 0]]
            Y = [y for y in top_cell_trans[:, 1]]
            labels = [label_map[cell][2] for cell in top_by_cell.columns.tolist()]
            markers = [label_map[cell][1] for cell in top_by_cell.columns.tolist()]
            colors = [label_map[cell][0] for cell in top_by_cell.columns.tolist()]
            label_done = []
            for X_pos, Y_pos, m, color, l in zip(X, Y, markers, colors, labels):
                if l in label_done:
                    lab = ''
                else:
                    lab= l
                label_done.append(l)
                ax_cell.scatter(X_pos, Y_pos, marker=m, c=color, label=lab, s=30)
        else:
            ax_cell.scatter(top_cell_trans[:, 0], top_cell_trans[:, 1], alpha=0.75)
        ax_cell.set_xlim([min(top_cell_trans[:, 0])-1, max(top_cell_trans[:, 0]+1)])
        ax_cell.set_ylim([min(top_cell_trans[:, 1])-1, max(top_cell_trans[:, 1]+2)])
        ax_cell.set_title(title+'_cell')
        ax_cell.legend(loc='best', ncol=1, prop={'size':12}, markerscale=1.5, frameon=True)
        ax_cell.set_xlabel('PC1')
        ax_cell.set_ylabel('PC2')
        if annotate:
            for label, x, y in zip(top_by_cell.columns, top_cell_trans[:, 0], top_cell_trans[:, 1]):
                ax_cell.annotate(label, (x+0.1, y+0.1))
        if gene_map:
            # Per-gene colors from the supplied mapping.
            X = [x for x in top_gene_trans[:, 0]]
            Y = [y for y in top_gene_trans[:, 1]]
            labels = top_by_gene.columns.tolist()
            colors = [gene_map[gene] for gene in top_by_gene.columns.tolist()]
            for X_pos, Y_pos, color, l in zip(X, Y, colors, labels):
                ax_gene.scatter(X_pos, Y_pos, marker='o', c=color, label = l, s=30)
        else:
            ax_gene.scatter(top_gene_trans[:, 0], top_gene_trans[:, 1], alpha=0.75)
        ax_gene.set_xlim([min(top_gene_trans[:, 0])-1, max(top_gene_trans[:, 0])+1])
        ax_gene.set_ylim([min(top_gene_trans[:, 1])-1, max(top_gene_trans[:, 1])+2])
        ax_gene.set_title(title+'_gene')
        ax_gene.set_xlabel('PC1')
        ax_gene.set_ylabel('PC2')
        # Gene points are always annotated with their names.
        for label, x, y in zip(top_by_gene.columns, top_gene_trans[:, 0], top_gene_trans[:, 1]):
            ax_gene.annotate(label, (x+.5, y+.5))
        if plot:
            plt.show()
        if title != '':
            save_name = '_'.join(title.split(' ')[0:2])
            plt.savefig(os.path.join(filename,save_name+'_skpca.pdf'), bbox_inches='tight')
        else:
            plt.savefig(os.path.join(filename,'non_group_skpca.pdf'), bbox_inches='tight')
        plt.close()
        return top_pca_list
    else:
        return []
# 4.3 Restoring original dataset and computing mean relative error
X_tilde = pca.inv_transform(W)

# Mean relative (column-wise L2) reconstruction error.
# NOTE(review): the sum runs over shape[1] columns but is divided by
# shape[0] rows — possibly intended to be shape[1]; confirm.
MRE = lambda Xreal, Xpred: np.sum([
    la.norm(Xreal[:, j] - Xpred[:, j], 2) / la.norm(Xreal[:, j], 2)
    for j in range(Xreal.shape[1])
]) / Xreal.shape[0]
mean_relative_error = MRE(X, X_tilde)
print(
    f"\n Dataset approximated with the first {pca.n_components} principal components. Mean Relative Error = {mean_relative_error}"
)

# 4.3.1 Comparison with sklearn PCA implementation
from sklearn.decomposition import PCA as skPCA

skpca = skPCA(n_components=0.9)
skW = skpca.fit_transform(X)
skX_tilde = skpca.inverse_transform(skW)
error = MRE(X, skX_tilde)
print(
    f" Dataset approximated with sklearn implementation ({skpca.n_components} var explained/{len(skpca.singular_values_)} components). Mean Relative Error = {error}"
)

# Repeat the custom PCA with standardised features.
pca_std = PCA(n_components=0.9, use_std=True)
W_std = pca_std.fit_transform(X)
X_tilde_std = pca_std.inv_transform(W_std)
error = MRE(X, X_tilde_std)
print(
    f" Dataset approximated with use of std ({pca_std.var_explained} var explained/{pca_std.n_components} components). Mean Relative Error = {error}"
)  # FIX: this print's closing parenthesis was missing (syntax error)
def PCA(mixed, state: dict, options: dict):
    """Decorrelate the mixed signals with PCA; `state` is passed through unchanged."""
    channels_last = mixed.T
    # Keep every component: one per input channel.
    unmixed = skPCA(n_components=channels_last.shape[1]).fit_transform(channels_last)
    return unmixed.T, state
def __init__(self, config):
    """PCA transform constructor: a sklearn reducer of self.dimension components."""
    Transform.__init__(self, config)
    reducer = skPCA(self.dimension)
    self.transformer = reducer
    # Training fits the projection; testing only applies it.
    self.process_func_train = reducer.fit_transform
    self.process_func_test = reducer.transform