def rclr_test_table():
    """Build the dense count matrix and its rclr transform for the tests.

    Returns
    -------
    tuple
        ``(counts, counts_rclr)``: the dense array exported from the table
        produced by ``create_test_table()`` and the result of applying
        ``matrix_rclr`` to it.
    """
    # matrix_rclr is tested in other places; here it is only used to
    # produce input for the OptSpace tests
    source_table = create_test_table()
    # export the sparse matrix data as a dense numpy array
    counts = np.array(source_table.matrix_data.toarray())
    return counts, matrix_rclr(counts)
def test_errors(self):
    """Check that invalid tensor/matrix inputs raise and duplicates warn."""
    # flatten the reference tensor into a (9, 2) matrix
    flat_counts = self.tensor_true.transpose([0, 2, 1]).reshape(9, 2)
    # mapping and table dataframes from which the tensor is rebuilt
    id_condition_pairs = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                   [0, 1, 2, 0, 1, 2, 0, 1, 2]])
    mapping = pd.DataFrame(id_condition_pairs.T,
                           columns=['ID', 'conditional'])
    table = pd.DataFrame(flat_counts.T)
    # rebuild the tensor
    tensor = build()
    tensor.construct(table, mapping, 'ID', ['conditional'])

    # --- tensor_rclr input validation ---
    # fewer than 2 dimensions -> ValueError
    with self.assertRaises(ValueError):
        tensor_rclr(np.array(range(3)))
    # negative values -> ValueError
    with self.assertRaises(ValueError):
        tensor_rclr(tensor.counts * -1)
    # inf, then nan, entries -> ValueError
    for invalid_value in (np.inf, np.nan):
        corrupted_tensor = self.tensor_true.astype(float)
        corrupted_tensor[corrupted_tensor <= 10] = invalid_value
        with self.assertRaises(ValueError):
            tensor_rclr(corrupted_tensor)

    # --- matrix_rclr input validation ---
    # an already-built tensor is not accepted
    with self.assertRaises(ValueError):
        matrix_rclr(self.tensor_true)
    # negative values -> ValueError
    with self.assertRaises(ValueError):
        matrix_rclr(self.tensor_true * -1)

    # --- tensor.construct input validation ---
    # mapping without the ID column -> ValueError
    with self.assertRaises(ValueError):
        tensor.construct(table,
                         mapping.drop(['ID'], axis=1),
                         'ID',
                         ['conditional'])
    # mapping without the conditional column -> ValueError
    with self.assertRaises(ValueError):
        tensor.construct(table,
                         mapping.drop(['conditional'], axis=1),
                         'ID',
                         ['conditional'])
    # negative counts -> ValueError
    with self.assertRaises(ValueError):
        tensor.construct(table * -1, mapping, 'ID', ['conditional'])
    # inf, then nan, counts -> ValueError
    for invalid_value in (np.inf, np.nan):
        corrupted_table = table.astype(float)
        corrupted_table[corrupted_table <= 10] = invalid_value
        with self.assertRaises(ValueError):
            tensor.construct(corrupted_table,
                             mapping,
                             'ID',
                             ['conditional'])

    # --- repeated samples are summed, with a warning ---
    # add a tenth sample duplicating the (ID, conditional) of sample 8
    table[9] = table[8] - 1
    mapping.loc[9, ['ID', 'conditional']] = mapping.loc[8, ['ID',
                                                            'conditional']]
    with self.assertWarns(Warning):
        tensor.construct(table, mapping, 'ID', ['conditional'])
    expected_counts = self.tensor_true.copy()
    expected_counts[2, :, 2] -= 1
    npt.assert_allclose(tensor.counts, expected_counts.astype(float))
def test_rclr_nan_raises(self):
    """matrix_rclr must raise ValueError for the ``bad3`` fixture (nan)."""
    # missing values (nan) are rejected
    self.assertRaises(ValueError, matrix_rclr, self.bad3)
def test_rclr_inf_raises(self):
    """matrix_rclr must raise ValueError for the ``bad2`` fixture (inf)."""
    # undefined values are rejected
    self.assertRaises(ValueError, matrix_rclr, self.bad2)
def test_rclr_negative_raises(self):
    """matrix_rclr must raise ValueError for the ``bad1`` fixture."""
    # negative values are rejected
    self.assertRaises(ValueError, matrix_rclr, self.bad1)
def test_rclr_sparse(self):
    """matrix_rclr on data containing zeros matches the stored result."""
    # cdata2 is the case with zeros; true2 is its expected transform
    observed = matrix_rclr(self.cdata2)
    expected = self.true2
    npt.assert_allclose(observed, expected)
def test_rclr_dense(self):
    """matrix_rclr agrees with clr when the data contain no zeros."""
    observed = matrix_rclr(self.cdata1)
    # clr is given a copy so the fixture itself is left untouched
    expected = clr(self.cdata1.copy())
    npt.assert_allclose(observed, expected)
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with a matrix_rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    gemelli.

    Parameters
    ----------
    table : biom.Table
        Count table (observations x samples). The caller's table is NOT
        modified; filtering is applied to a copy.
    n_components : int or str
        Passed to ``MatrixCompletion``. The rank actually used is read
        back from the fitted model (``opt.s.shape[0]``).
    min_sample_count : int
        Samples are kept only if their total count is strictly greater
        than this value.
    min_feature_count : int
        Features are kept only if their total count is strictly greater
        than this value.
    min_feature_frequency : float
        Percentage (0-100). Features are kept only if the fraction of
        samples in which they are non-zero is strictly greater than
        ``min_feature_frequency / 100``. The fraction is computed against
        the number of samples in the table before any filtering.
    max_iterations : int
        Passed to ``MatrixCompletion``.

    Returns
    -------
    ord_res : skbio.OrdinationResults
        Biplot ordination with sample and feature loadings, eigenvalues
        and proportion explained. When two components are used, a ``PC3``
        of zeros is appended.
    dist_res : skbio.DistanceMatrix
        Sample-by-sample distances taken from ``opt.distance``.

    Raises
    ------
    ValueError
        If the filtered table contains duplicate sample or feature IDs.
    """
    # get shape of table (taken before filtering on purpose: the
    # frequency filter below is relative to the original sample count)
    n_features, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above.
    # biom.Table.filter works in place by default, which would silently
    # strip samples/features from the caller's table. The first filter
    # therefore requests a new table; the following ones may then safely
    # operate in place on that private copy.
    table = table.filter(observation_filter, axis='observation',
                         inplace=False)
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    # table to dataframe (samples as rows, features as columns)
    table = pd.DataFrame(table.matrix_data.toarray(),
                         table.ids('observation'),
                         table.ids('sample')).T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')
    # Robust-clr (matrix_rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(
        matrix_rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    # (normalised over ALL singular values before truncating)
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in gemelli -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res