def test_OptSpace(self):
    """Check that the MatrixCompletion wrapper reproduces OptSpace output.

    The attributes set by ``fit`` and the values returned by
    ``fit_transform`` are both compared, column by column and in
    absolute value, against the factors returned by
    ``OptSpace(...).solve`` on ``self.test_rclr``.
    """
    # one shared configuration so all three solvers are built alike
    settings = dict(n_components=self.rank,
                    max_iterations=self.iteration,
                    tol=self.tol)

    def assert_same_magnitude(expected, observed):
        # compare absolute values only (see the note before the loop)
        np.testing.assert_array_almost_equal(abs(expected), abs(observed))

    # the wrapper, exercised through both of its entry points
    fitted = MatrixCompletion(**settings).fit(self.test_rclr)
    U_res, s_res, V_res = MatrixCompletion(**settings).fit_transform(
        self.test_rclr)
    # the base OptSpace helper gives the reference factors, to check
    # that the wrapper is not changing outcomes
    U_exp, s_exp, V_exp = OptSpace(**settings).solve(self.test_rclr)

    # More exact testing of directionality is done in test_method.py;
    # here only absolute values are compared
    # (c/o @cameronmartino's comment in #29).
    for col in range(self.rank):
        # reference vs. attributes stored by fit()
        assert_same_magnitude(U_exp[:, col], fitted.sample_weights[:, col])
        assert_same_magnitude(s_exp[:, col], fitted.s[:, col])
        assert_same_magnitude(V_exp[:, col], fitted.feature_weights[:, col])
        # reference vs. values returned by fit_transform()
        assert_same_magnitude(U_exp[:, col], U_res[:, col])
        assert_same_magnitude(s_exp[:, col], s_res[:, col])
        assert_same_magnitude(V_exp[:, col], V_res[:, col])
def test_OptSpace_iter_raises(self):
    """Tests that fitting with ``max_iterations=0`` raises ValueError."""
    raised = False
    try:
        # iteration count too low
        MatrixCompletion(max_iterations=0).fit(self.test_rclr)
    except ValueError:
        raised = True
    if not raised:
        raise AssertionError("ValueError was not raised")
def test_OptSpace_illformatted_raises(self):
    """Tests that fitting ``clr(self.test_table)`` raises ValueError.

    Per the original test description this is the "no infs" check;
    presumably the clr output contains non-finite entries -- confirm
    against the fixture.
    """
    raised = False
    try:
        MatrixCompletion().fit(clr(self.test_table))
    except ValueError:
        raised = True
    if not raised:
        raise AssertionError("ValueError was not raised")
def test_OptSpace_rank_raises(self):
    """Tests that out-of-range ``n_components`` values raise ValueError."""
    # 1 is the "rank too low" case, 10000 the "rank way too high" case;
    # 100 is checked as well.  Each value must raise, in this order.
    for bad_rank in (1, 10000, 100):
        try:
            MatrixCompletion(n_components=bad_rank).fit(self.test_rclr)
        except ValueError:
            continue
        raise AssertionError("ValueError was not raised")
def rpca(adata):
    """Embed ``adata`` with DEICODE matrix completion followed by PCA.

    Genes are filtered in place on ``adata``, ``adata.raw.X`` is
    rclr-transformed and completed with ``MatrixCompletion``, the
    reconstructed matrix is centred along both axes, and the result of
    ``sc.tl.pca`` on it is stored in ``adata.obsm['X_deicode']``.

    NOTE(review): ``n_samples`` and ``n_comps`` are not defined in this
    function; they are presumably module-level names -- confirm.

    Returns the same ``adata`` object.
    """
    from deicode.matrix_completion import MatrixCompletion
    from deicode.preprocessing import rclr

    # a gene must be seen in at least 10% of n_samples, and never in
    # fewer than three cells
    cell_floor = max(3, np.floor(n_samples * 0.1))
    sc.pp.filter_genes(adata, min_cells=cell_floor)

    transformed = rclr(adata.raw.X)
    fitted = MatrixCompletion(n_components=n_comps,
                              max_iterations=10).fit(transformed)
    # NOTE(review): this value is not used below -- confirm whether the
    # PCA call was meant to use it instead of n_comps.
    n_components = fitted.s.shape[0]

    # rebuild the completed matrix from its factors
    completed = fitted.sample_weights @ fitted.s @ fitted.feature_weights.T
    # centre the columns first, then the rows
    col_centred = completed - completed.mean(axis=0)
    centred = col_centred - col_centred.mean(axis=1).reshape(-1, 1)

    adata.obsm['X_deicode'] = sc.tl.pca(centred,
                                        svd_solver='arpack',
                                        n_comps=n_comps)
    return adata
def rpca(table: biom.Table,
         n_components: int = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         max_iterations: int = DEFAULT_ITERATIONS) -> (
        skbio.OrdinationResults,
        skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE.

    Parameters
    ----------
    table : biom.Table
        Input table.  It is filtered on a copy, so the caller's object
        is left unmodified.
    n_components : int
        Rank handed to ``MatrixCompletion`` and number of axes kept in
        the ordination.
    min_sample_count : int
        Samples are kept only if their summed values are strictly
        greater than this.
    min_feature_count : int
        Features (observations) are kept only if their summed values
        are strictly greater than this.
    max_iterations : int
        Handed to ``MatrixCompletion``.

    Returns
    -------
    skbio.OrdinationResults
        Sample and feature loadings, eigenvalues and proportion
        explained, labelled ``PC1`` .. ``PCn``.
    skbio.DistanceMatrix
        Built from ``opt.distance`` with the sample ids as labels.

    Raises
    ------
    ValueError
        If the filtered table contains duplicate sample or feature ids.
    """

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # biom's Table.filter works in place by default.  Ask for a copy on
    # the first pass so the caller's table is never mutated; the second
    # pass can then filter that private copy in place.
    table = table.filter(observation_filter, axis='observation',
                         inplace=False)
    table = table.filter(sample_filter, axis='sample')
    # after the transpose: rows are samples, columns are features
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))

    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # completed matrix rebuilt from the fitted factors
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center the columns, then the rows
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the centered matrix and keep the leading n_components
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # proportions are taken over ALL singular values before truncating
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
def rpca(table: biom.Table,
         n_components: Union[int, str] = DEFAULT_RANK,
         min_sample_count: int = DEFAULT_MSC,
         min_feature_count: int = DEFAULT_MFC,
         min_feature_frequency: float = DEFAULT_MFF,
         max_iterations: int = DEFAULT_ITERATIONS) -> (
        skbio.OrdinationResults,
        skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE.

    Parameters
    ----------
    table : biom.Table
        Input table.  It is filtered on a copy, so the caller's object
        is left unmodified.
    n_components : int or str
        Handed to ``MatrixCompletion``.  After fitting, the number of
        components actually used is re-read from ``opt.s.shape[0]``.
    min_sample_count : int
        Samples are kept only if their summed values are strictly
        greater than this.
    min_feature_count : int
        Features (observations) are kept only if their summed values
        are strictly greater than this.
    min_feature_frequency : float
        Percentage (it is divided by 100).  A feature is kept only if
        the fraction of samples in which it is non-zero is strictly
        greater than this; the sample count used is that of the table
        before any filtering.
    max_iterations : int
        Handed to ``MatrixCompletion``.

    Returns
    -------
    skbio.OrdinationResults
        Sample and feature loadings, eigenvalues and proportion
        explained, labelled ``PC1`` .. ``PCn``.
    skbio.DistanceMatrix
        Built from ``opt.distance`` with the sample ids as labels.
    pd.DataFrame
        The completed, double-centered matrix (samples x features).
        NOTE(review): the return annotation lists only the first two
        values; the signature is left unchanged here -- confirm which
        of the two is intended.

    Raises
    ------
    ValueError
        If the filtered table contains duplicate sample or feature ids.
    """
    # get shape of table (before filtering)
    n_features, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # biom's Table.filter works in place by default.  Ask for a copy on
    # the first pass so the caller's table is never mutated; the later
    # passes can then filter that private copy in place.
    table = table.filter(observation_filter, axis='observation',
                         inplace=False)
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    # after the transpose: rows are samples, columns are features
    table = table.to_dataframe().T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # Robust-clr (rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection; proportions are taken
    # over ALL singular values before truncating
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    # save the loadings
    robust_clr = pd.DataFrame(X, index=table.index,
                              columns=table.columns)
    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res, robust_clr