def setUp(self):
    """Build two ordinations over samples A-D plus matching metadata.

    ``self.other`` is constructed from copies of the eigenvalues and
    explained proportions used for ``self.pcoa``; its coordinates are
    those of ``self.pcoa`` shifted by 1.01.
    """
    axis_labels = ['PC1', 'PC2', 'PC3']
    sample_ids = ['A', 'B', 'C', 'D']

    eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=axis_labels)
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=axis_labels)
    coords = np.array([[0.1, 0.2, 0.3],
                       [0.2, 0.3, 0.4],
                       [0.3, 0.4, 0.5],
                       [0.4, 0.5, 0.6]])

    # first ordination: the raw coordinates
    base_df = pd.DataFrame(coords, index=sample_ids, columns=axis_labels)
    self.pcoa = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', eigvals, base_df,
        proportion_explained=proportion_explained)

    # second ordination: same layout, every coordinate offset by 1.01
    shifted_df = pd.DataFrame(coords + 1.01, index=sample_ids,
                              columns=axis_labels)
    self.other = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', eigvals.copy(),
        shifted_df, proportion_explained=proportion_explained.copy())

    # two string-valued columns, indexed by the same sample IDs
    metadata_df = pd.DataFrame(
        {
            'val1': ['1.0', '2.0', '3.0', '4.0'],
            'val2': ['3.3', '3.5', '3.6', '3.9']
        },
        index=pd.Index(sample_ids, name='id'))
    self.metadata = qiime2.Metadata(metadata_df)
def setUp(self):
    """Create the six-axis input ordinations and five-axis expectations.

    ``self.reference`` and ``self.other`` hold the inputs;
    ``self.expected_ref`` and ``self.expected_other`` hold hard-coded
    coordinates restricted to the first five axes (presumably the
    expected outputs of the code under test -- confirm against the
    test methods).
    """
    axes = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
    sample_ids = ['A', 'B', 'C', 'D']
    short_name = 'PCoA'
    long_name = 'Principal Coordinate Analysis'

    def coords_frame(rows, columns):
        # all fixtures share the same four sample IDs
        return pd.DataFrame(np.array(rows), index=sample_ids,
                            columns=columns)

    eigvals = pd.Series(np.array([1.5, 0.75, 0.3, 0.15, 0.15, 0.15]),
                        index=axes)
    proportion_explained = pd.Series(
        [0.50, 0.25, 0.10, 0.05, 0.05, 0.05], index=axes)

    # reference input (integer coordinates)
    reference_df = coords_frame([[0, 3, 4, 4, 0, 0],
                                 [1, 2, 1, 4, 3, 3],
                                 [2, 3, 1, 0, 0, 1],
                                 [0, 3, 2, 4, 3, 0]], axes)
    self.reference = skbio.OrdinationResults(
        short_name, long_name, eigvals, reference_df,
        proportion_explained=proportion_explained)

    # second input; note the 30 in the first axis of sample D
    other_df = coords_frame([[0.7, 3.7, 4.7, 4.7, 0.7, 0.7],
                             [1.7, 2.7, 1.7, 4.7, 3.7, 3.7],
                             [2.7, 3.7, 1.7, 0.7, 0.7, 1.7],
                             [30, 3.7, 2.7, 4.7, 3.7, 0.7]], axes)
    self.other = skbio.OrdinationResults(
        short_name, long_name, eigvals.copy(), other_df.copy(),
        proportion_explained=proportion_explained.copy())

    # hard-coded five-axis coordinates for the reference
    expected_ref_df = coords_frame(
        [[-0.1358036, 0.0452679, 0.3621430, 0.1810715, -0.2716072],
         [0.0452679, -0.1358036, -0.1810715, 0.1810715, 0.2716072],
         [0.2263394, 0.0452679, -0.1810715, -0.5432145, -0.2716072],
         [-0.1358036, 0.0452679, 0.0000000, 0.1810715, 0.2716072]],
        axes[:5])
    self.expected_ref = skbio.OrdinationResults(
        short_name, long_name, eigvals[:5].copy(),
        expected_ref_df.copy(),
        proportion_explained=proportion_explained[:5].copy())

    # hard-coded five-axis coordinates for the other ordination
    expected_other_df = coords_frame(
        [[0.0482731, -0.0324317, 0.0494312, -0.0316828, -0.1584374],
         [0.0803620, -0.0718115, -0.0112234, -0.0171011, -0.1101209],
         [0.0527554, -0.0042753, -0.0126739, -0.0969602, -0.0964822],
         [-0.1813905, 0.1085184, -0.0255339, 0.1457440, 0.3650405]],
        axes[:5])
    self.expected_other = skbio.OrdinationResults(
        short_name, long_name, eigvals[:5].copy(),
        expected_other_df.copy(),
        proportion_explained=proportion_explained[:5].copy())
def rpca(
    table: biom.Table,
    rank: int = 3,
    min_sample_count: int = 500,
    min_feature_count: int = 10,
    iterations: int = 5
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    The input table is left untouched; all filtering happens on copies.

    Parameters
    ----------
    table
        Count table to decompose.
    rank
        Rank passed to OptSpace; also the number of 'PC<i>' columns
        produced.
    min_sample_count
        Samples whose total count is not strictly greater than this value
        are removed.
    min_feature_count
        Features whose total count (after sample filtering) is not
        strictly greater than this value are removed.
    iterations
        Iteration count passed to OptSpace.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination (sample and feature loadings) and the
        distance matrix taken from the fitted OptSpace object, labelled
        with the retained sample IDs.
    """
    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # biom's Table.filter works in place by default; request a new table so
    # that the caller's object is not silently filtered
    table = table.filter(sample_filter, axis='sample', inplace=False)
    # samples become rows; rows with identical counts are collapsed
    table = table.to_dataframe().T.drop_duplicates()
    # keep features (columns) whose total exceeds the minimum count
    table = table.T[table.sum() > min_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank, iteration=iterations).fit(
        rclr().fit_transform(table.copy()))
    # positional column label -> 'PC<i>'
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}
    pc_names = list(rename_cols.values())

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading = feature_loading.sort_values('PC1', ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=pc_names)
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues, index=pc_names)

    # if the rank is two add PC3 of zeros
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
def test_remove_empty_nothing_to_remove_with_ordination(self, mock_stdout):
    """Already-filtered inputs pass through unchanged and print nothing."""
    # an ordination without Sample4 and without feature "e"
    kept_samples = self.samples_df.drop(labels="Sample4", axis="index")
    kept_features = self.features_df.drop(labels="e", axis="index")
    good_pcoa = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', self.eigvals,
        kept_samples,
        features=kept_features,
        proportion_explained=self.proportion_explained)

    result = remove_empty_samples_and_features(
        self.table_ef, self.sm_ef, good_pcoa)
    filtered_table, filtered_metadata = result

    self.assertEqual(filtered_table, self.table_ef)
    assert_frame_equal(filtered_metadata, self.sm_ef)
    # nothing should have been written to the patched stdout
    self.assertEqual(mock_stdout.getvalue(), "")
def test_remove_empty_with_empty_feature_in_ordination(self):
    """An ordination that still contains feature "e" raises ValueError."""
    # Sample4 is dropped, but the full feature frame (including "e") is kept
    samples_without_4 = self.samples_df.drop(labels="Sample4",
                                             axis="index")
    bad_feature_pcoa = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', self.eigvals,
        samples_without_4,
        features=self.features_df,
        proportion_explained=self.proportion_explained)

    expected_message = (
        r"The ordination contains features that are empty \(i.e. all "
        r"0s\) in the table. Problematic feature IDs: e")
    with self.assertRaisesRegex(ValueError, expected_message):
        remove_empty_samples_and_features(
            self.table, self.sm, bad_feature_pcoa)
def setUp(self):
    """Create a three-axis ordination for samples A-D and its metadata.

    ``self.pcoa`` is the ordination; ``self.metadata`` holds a single
    string-valued column ``val1`` indexed by the same sample IDs.
    """
    axes = ['PC1', 'PC2', 'PC3']
    sample_ids = ['A', 'B', 'C', 'D']

    # OrdinationResults documents eigvals as a pd.Series; label it with the
    # same axis names as proportion_explained instead of a bare ndarray
    eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=axes)
    samples = np.array([[0.1, 0.2, 0.3],
                        [0.2, 0.3, 0.4],
                        [0.3, 0.4, 0.5],
                        [0.4, 0.5, 0.6]])
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=axes)

    samples_df = pd.DataFrame(samples, index=sample_ids, columns=axes)
    self.pcoa = skbio.OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals,
        samples_df,
        proportion_explained=proportion_explained)

    # qiime2.Metadata expects the index name to be a recognised ID header
    # (e.g. 'id'); an unnamed index is rejected by current releases
    self.metadata = qiime2.Metadata(
        pd.DataFrame({'val1': ['1.0', '2.0', '3.0', '4.0']},
                     index=pd.Index(sample_ids, name='id')))
def test_remove_empty_with_empty_sample_and_feature_in_ordination(self):
    """The empty-sample error wins when a feature is empty as well."""
    # The ordination below keeps both the empty sample and the empty
    # feature. The code under test is currently arranged so that the
    # empty-sample error is reported in preference to the empty-feature
    # one; that was judged the likelier scenario of the two, though the
    # combination is probably a rare edge case either way.
    extremely_funky_pcoa = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', self.eigvals,
        self.samples_df,
        features=self.features_df,
        proportion_explained=self.proportion_explained)

    expected_message = (
        r"The ordination contains samples that are empty \(i.e. all "
        r"0s\) in the table. Problematic sample IDs: Sample4")
    with self.assertRaisesRegex(ValueError, expected_message):
        remove_empty_samples_and_features(
            self.table, self.sm, extremely_funky_pcoa)
def setUp(self):
    """Set up the table, metadata, taxonomy and ordination fixtures.

    Attributes ending in ``_ef`` hold the versions of the table and the
    sample metadata with Sample4 and feature "e" removed (both are all
    zeros in ``self.table``).
    """
    all_samples = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
    pc_axes = ["PC1", "PC2", "PC3"]

    # --- feature table (features x samples after the transpose) ---
    raw_counts = np.array([[1, 2, 0, 4],
                           [8, 7, 0, 5],
                           [1, 0, 0, 0],
                           [0, 0, 0, 0]])
    self.table = biom.Table(raw_counts.T, list('abed'), all_samples)

    # Same table once the empty sample/feature are gone:
    nonempty_counts = np.array([[1, 2, 4],
                                [8, 7, 5],
                                [1, 0, 0]])
    self.table_ef = biom.Table(nonempty_counts.T, ['a', 'b', 'd'],
                               ['Sample1', 'Sample2', 'Sample3'])

    # --- sample metadata ---
    self.sm = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        },
        index=list(self.table.ids()))

    # Metadata counterpart of table_ef. "Emptiness" is judged only from
    # the table: a sample whose metadata is all 0, or a metadata field
    # that is 0 for every sample, is not treated as empty.
    self.sm_ef = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0],
            "Metadata2": [0, 0, 0],
            "Metadata3": [1, 2, 3],
            "Metadata4": ["abc", "def", "ghi"]
        },
        index=self.table_ef.ids().copy())

    self.sid2idx = {"Sample1": 0, "Sample2": 1, "Sample3": 2}

    # --- feature metadata frames ---
    self.tm = pd.DataFrame(
        {
            "Level 1": ["k__Bacteria", "k__Bacteria"],
            "Level 2": ["p__Bacteroidetes", "p__Bacteroidetes"],
            "Level 3": ["c__Bacteroidia", "c__Bacteroidia"],
            "Level 4": ["o__Bacteroidales", "o__Bacteroidales"],
            "Level 5": ["f__Bacteroidaceae", "f__Bacteroidaceae"],
            "Level 6": ["g__Bacteroides", "g__Bacteroides"],
            "Level 7": ["s__", "s__uniformis"],
            "Confidence": [0.95, 0]
        },
        index=["e", "a"])
    self.im = pd.DataFrame(
        {
            "Level 1": ["k__Bacteria", "k__Archaea"],
            "Level 2": ["p__Proteobacteria", "Unspecified"],
            "Level 3": ["c__Gammaproteobacteria", "Unspecified"],
            "Level 4": ["o__Pasteurellales", "Unspecified"],
            "Level 5": ["f__Pasteurellaceae", "Unspecified"],
            "Level 6": ["g__", "Unspecified"],
            "Level 7": ["s__", "Unspecified"],
            "Confidence": [0.8, 1]
        },
        index=["h", "m"])

    # --- expected column names and per-feature string values ---
    self.exp_fm_cols = [
        "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6",
        "Level 7", "Confidence"
    ]
    self.exp_ctm = {
        "e": [
            "k__Bacteria", "p__Bacteroidetes", "c__Bacteroidia",
            "o__Bacteroidales", "f__Bacteroidaceae", "g__Bacteroides",
            "s__", "0.95"
        ],
        # "a" ends up with a Confidence of "0.0" rather than "0" because
        # pandas treats that column as numeric. We could try to prevent
        # this, but it is unlikely to matter to anyone, and it depends
        # on whichever tool read the metadata in the first place (with
        # dtype=str throughout it should not occur) -- so it is more of
        # a QIIME 2 concern.
        "a": [
            "k__Bacteria", "p__Bacteroidetes", "c__Bacteroidia",
            "o__Bacteroidales", "f__Bacteroidaceae", "g__Bacteroides",
            "s__uniformis", "0.0"
        ]
    }
    self.exp_cim = {
        "h": [
            "k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
            "o__Pasteurellales", "f__Pasteurellaceae", "g__", "s__", "0.8"
        ],
        "m": [
            "k__Archaea", "Unspecified", "Unspecified", "Unspecified",
            "Unspecified", "Unspecified", "Unspecified", "1.0"
        ]
    }

    # --- ordination pieces (inputs for testing remove_empty...()) ---
    self.eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
    self.proportion_explained = pd.Series([15.5, 12.2, 8.8],
                                          index=pc_axes)

    sample_coords = np.array([[0.1, 0.2, 0.3],
                              [0.2, 0.3, 0.4],
                              [0.3, 0.4, 0.5],
                              [0.4, 0.5, 0.6]])
    self.samples_df = pd.DataFrame(sample_coords, index=all_samples,
                                   columns=pc_axes)

    feature_coords = np.array([[0.9, 0.8, 0.7],
                               [0.6, 0.5, 0.4],
                               [0.3, 0.2, 0.1],
                               [0.0, 0.2, 0.4]])
    self.features_df = pd.DataFrame(feature_coords,
                                    index=["a", "b", "e", "d"],
                                    columns=pc_axes)

    # This default ordination is problematic on purpose: it includes
    # Sample4.
    self.pcoa = skbio.OrdinationResults(
        "PCoA", "Principal Coordinate Analysis", self.eigvals,
        self.samples_df,
        proportion_explained=self.proportion_explained)
def setUp(self):
    """Set up trees, tables, metadata and ordination/biplot fixtures."""
    all_samples = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
    pc_axes = ['PC1', 'PC2', 'PC3']
    short_name = 'PCoA'
    long_name = 'Principal Coordinate Analysis'

    # --- trees ---
    self.tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2):1;')
    pruned_newick = StringIO(
        '(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;')
    self.pruned_tree = TreeNode.read(pruned_newick)

    # --- tables; table/metadata are (mostly) adapted from Qurro ---
    main_counts = np.array([[1, 2, 0, 4],
                            [8, 7, 0, 5],
                            [1, 0, 0, 0],
                            [0, 0, 1, 0]])
    self.table = biom.Table(main_counts.T, list('abed'), all_samples)

    unrelated_counts = np.array([[5, 2, 0, 2],
                                 [2, 3, 0, 1],
                                 [5, 2, 0, 0],
                                 [4, 5, 0, 4]])
    self.unrelated_table = biom.Table(unrelated_counts.T, list("hijk"),
                                      all_samples)

    # --- metadata ---
    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        },
        index=list(self.table.ids()))
    self.feature_metadata = pd.DataFrame(
        {
            "fmdcol1": ["asdf", "ghjk"],
            "fmdcol2": ["qwer", "tyui"]
        },
        index=["a", "h"])

    # --- three-sample / three-feature counterparts ---
    filtered_counts = np.array([[1, 2, 4],
                                [8, 7, 5],
                                [1, 0, 0]])
    self.filtered_table = biom.Table(filtered_counts.T, ['a', 'b', 'd'],
                                     ['Sample1', 'Sample2', 'Sample3'])
    self.filtered_sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0],
            "Metadata2": [0, 0, 0],
            "Metadata3": [1, 2, 3],
            "Metadata4": ["abc", "def", "ghi"]
        },
        index=["Sample1", "Sample2", "Sample3"])

    # --- ordination without features ---
    eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_axes)
    coords = np.array([[0.1, 0.2, 0.3],
                       [0.2, 0.3, 0.4],
                       [0.3, 0.4, 0.5],
                       [0.4, 0.5, 0.6]])
    samples_df = pd.DataFrame(coords, index=all_samples, columns=pc_axes)
    self.pcoa = skbio.OrdinationResults(
        short_name, long_name, eigvals, samples_df,
        proportion_explained=proportion_explained)

    # --- biplot whose feature IDs are the first two sample IDs with an
    # 'f.' prefix ---
    prefixed_features = np.abs(samples_df.copy() / 2.0).iloc[:2, :]
    prefixed_features.index = 'f.' + prefixed_features.index
    self.biplot_no_matches = skbio.OrdinationResults(
        short_name, long_name, eigvals, samples_df,
        features=prefixed_features,
        proportion_explained=proportion_explained)

    # --- biplot whose feature IDs are 'a' and 'h' ---
    named_features = np.abs(samples_df / 2.0).iloc[:2, :]
    named_features.index = pd.Index(['a', 'h'])
    self.biplot = skbio.OrdinationResults(
        short_name, long_name, eigvals, samples_df,
        features=named_features,
        proportion_explained=proportion_explained)

    # --- tree and table using tips 'y' and 'z' ---
    self.biplot_tree = parse_newick(
        '(((y:1,z:2):1,b:2)g:1,(:1,d:3)h:2):1;')
    biplot_counts = np.array([[1, 2], [8, 7], [1, 0], [0, 3]])
    self.biplot_table = biom.Table(biplot_counts.T, ['y', 'z'],
                                   all_samples)

    self.files_to_remove = []
    self.maxDiff = None
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE. The input table is left untouched; all filtering happens on
    copies.

    Parameters
    ----------
    table
        Count table to decompose.
    n_components
        Number of components passed to the matrix completion and kept
        from the final SVD.
    min_sample_count
        Samples whose total count is not strictly greater than this value
        are removed.
    min_feature_count
        Features whose total count is not strictly greater than this
        value are removed (applied before the sample filter).
    max_iterations
        Iteration limit passed to the matrix completion.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination (sample and feature loadings) and the
        distance matrix taken from the fitted completion object, labelled
        with the retained sample IDs.

    Raises
    ------
    ValueError
        If the filtered table contains duplicate sample or feature IDs.
    """
    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total count
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter and import table
    # biom's Table.filter works in place by default; request new tables so
    # that the caller's object is not silently filtered
    table = table.filter(observation_filter, axis='observation',
                         inplace=False)
    table = table.filter(sample_filter, axis='sample', inplace=False)
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))

    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]

    # rebuild the completed matrix and centre it along both axes
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the centred matrix and keep the leading components
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # proportions are taken over all singular values before truncation
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with a matrix_rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    gemelli. The input table is left untouched; all filtering happens on
    copies.

    Parameters
    ----------
    table
        Count table to decompose.
    n_components
        Number of components requested from the matrix completion. The
        number actually used afterwards is read back from the fitted
        object, so it may differ from the value passed in.
    min_sample_count
        Samples whose total count is not strictly greater than this value
        are removed.
    min_feature_count
        Features whose total count is not strictly greater than this
        value are removed.
    min_feature_frequency
        Percentage (0-100). Features present in not more than this
        percentage of the samples of the unfiltered table are removed.
    max_iterations
        Iteration limit passed to the matrix completion.

    Returns
    -------
    (skbio.OrdinationResults, skbio.DistanceMatrix)
        The biplot ordination (sample and feature loadings) and the
        distance matrix taken from the fitted completion object, labelled
        with the retained sample IDs.

    Raises
    ------
    ValueError
        If the filtered table contains duplicate sample or feature IDs.
    """
    # number of samples before any filtering; the frequency filter below
    # is expressed relative to this count
    _, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above
    # biom's Table.filter works in place by default; request new tables so
    # that the caller's object is not silently filtered
    table = table.filter(observation_filter, axis='observation',
                         inplace=False)
    table = table.filter(frequency_filter, axis='observation',
                         inplace=False)
    table = table.filter(sample_filter, axis='sample', inplace=False)
    # table to dataframe (samples as rows after the transpose)
    table = pd.DataFrame(table.matrix_data.toarray(),
                         table.ids('observation'),
                         table.ids('sample')).T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # Robust-clr (matrix_rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(
                               matrix_rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]

    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    # (proportions are taken over all singular values before truncation)
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    # save the loadings
    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in gemelli -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)

    return ord_res, dist_res
def setUp(self):
    """Set up the tree, DataFrame tables, metadata and ordination."""
    pc_axes = ['PC1', 'PC2', 'PC3']

    # --- trees ---
    self.tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2):1;')
    pruned_newick = StringIO(
        '(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;')
    self.pruned_tree = TreeNode.read(pruned_newick)

    # --- tables; table/metadata are (mostly) adapted from Qurro ---
    # Each frame is built with features as rows and then transposed, to
    # match what QIIME2 expects.
    features_by_samples = pd.DataFrame(
        {
            "Sample1": [1, 2, 0, 4],
            "Sample2": [8, 7, 0, 5],
            "Sample3": [1, 0, 0, 0],
            "Sample4": [0, 0, 0, 0]
        },
        index=["a", "b", "e", "d"])
    self.table = features_by_samples.T

    unrelated_by_samples = pd.DataFrame(
        {
            "Sample1": [5, 2, 0, 2],
            "Sample2": [2, 3, 0, 1],
            "Sample3": [5, 2, 0, 0],
            "Sample4": [4, 5, 0, 4]
        },
        index=["h", "i", "j", "k"])
    self.unrelated_table = unrelated_by_samples.T

    # --- metadata ---
    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        },
        index=list(self.table.index))
    self.feature_metadata = pd.DataFrame(
        {
            "fmdcol1": ["asdf", "ghjk"],
            "fmdcol2": ["qwer", "tyui"]
        },
        index=["a", "h"])

    # --- table without feature "e" (all four samples are kept) ---
    filtered_by_samples = pd.DataFrame(
        {
            "Sample1": [1, 2, 4],
            "Sample2": [8, 7, 5],
            "Sample3": [1, 0, 0],
            "Sample4": [0, 0, 0]
        },
        index=["a", "b", "d"])
    self.filtered_table = filtered_by_samples.T

    # --- ordination ---
    eigvals = pd.Series(np.array([0.50, 0.25, 0.25]), index=pc_axes)
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_axes)
    coords = np.array([[0.1, 0.2, 0.3],
                       [0.2, 0.3, 0.4],
                       [0.3, 0.4, 0.5],
                       [0.4, 0.5, 0.6]])
    samples_df = pd.DataFrame(
        coords,
        index=['Sample1', 'Sample2', 'Sample3', 'Sample4'],
        columns=pc_axes)
    self.pcoa = skbio.OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', eigvals, samples_df,
        proportion_explained=proportion_explained)

    self.files_to_remove = []
    self.maxDiff = None
def setUp(self):
    """Create six-axis inputs, five-axis expectations and scalar values.

    ``self.reference`` and ``self.other`` hold the inputs.
    ``self.expected_ref``, ``self.expected_other`` and
    ``self.expected_noise`` hold hard-coded coordinates restricted to the
    first five axes; ``self.expected_m2`` and ``self.expected_p`` are
    hard-coded scalars (presumably the expected outputs of the code
    under test -- confirm against the test methods).
    """
    axes = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
    sample_ids = ['A', 'B', 'C', 'D']
    short_name = 'PCoA'
    long_name = 'Principal Coordinate Analysis'

    def coords_frame(rows, columns):
        # all fixtures share the same four sample IDs
        return pd.DataFrame(np.array(rows), index=sample_ids,
                            columns=columns)

    eigvals = pd.Series(np.array([1.5, 0.75, 0.3, 0.15, 0.15, 0.15]),
                        index=axes)
    proportion_explained = pd.Series(
        [0.50, 0.25, 0.10, 0.05, 0.05, 0.05], index=axes)

    # reference input (integer coordinates)
    reference_df = coords_frame([[0, 3, 4, 4, 0, 0],
                                 [1, 2, 1, 4, 3, 3],
                                 [2, 3, 1, 0, 0, 1],
                                 [0, 3, 2, 4, 3, 0]], axes)
    self.reference = skbio.OrdinationResults(
        short_name, long_name, eigvals, reference_df,
        proportion_explained=proportion_explained)

    # second input; note the 30 in the first axis of sample D
    other_df = coords_frame([[0.7, 3.7, 4.7, 4.7, 0.7, 0.7],
                             [1.7, 2.7, 1.7, 4.7, 3.7, 3.7],
                             [2.7, 3.7, 1.7, 0.7, 0.7, 1.7],
                             [30, 3.7, 2.7, 4.7, 3.7, 0.7]], axes)
    self.other = skbio.OrdinationResults(
        short_name, long_name, eigvals.copy(), other_df.copy(),
        proportion_explained=proportion_explained.copy())

    # hard-coded five-axis coordinates for the reference
    expected_ref_df = coords_frame(
        [[-0.1358036, 0.0452679, 0.3621430, 0.1810715, -0.2716072],
         [0.0452679, -0.1358036, -0.1810715, 0.1810715, 0.2716072],
         [0.2263394, 0.0452679, -0.1810715, -0.5432145, -0.2716072],
         [-0.1358036, 0.0452679, 0.0000000, 0.1810715, 0.2716072]],
        axes[:5])
    self.expected_ref = skbio.OrdinationResults(
        short_name, long_name, eigvals[:5].copy(),
        expected_ref_df.copy(),
        proportion_explained=proportion_explained[:5].copy())

    # hard-coded five-axis coordinates for the other ordination
    expected_other_df = coords_frame(
        [[0.0482731, -0.0324317, 0.0494312, -0.0316828, -0.1584374],
         [0.0803620, -0.0718115, -0.0112234, -0.0171011, -0.1101209],
         [0.0527554, -0.0042753, -0.0126739, -0.0969602, -0.0964822],
         [-0.1813905, 0.1085184, -0.0255339, 0.1457440, 0.3650405]],
        axes[:5])
    self.expected_other = skbio.OrdinationResults(
        short_name, long_name, eigvals[:5].copy(),
        expected_other_df.copy(),
        proportion_explained=proportion_explained[:5].copy())

    # hard-coded five-axis coordinates for the "noise" case
    expected_noise_df = coords_frame(
        [[0.04988341, -0.03234447, 0.03177641, -0.03507789, -0.13564394],
         [0.09117347, -0.08318546, -0.02249053, -0.01597601, -0.10901541],
         [0.05077765, -0.003994, -0.00984688, -0.09356729, -0.09648388],
         [-0.19183453, 0.11952393, 0.000561, 0.14462118, 0.34114323]],
        axes[:5])
    self.expected_noise = skbio.OrdinationResults(
        short_name, long_name, eigvals[:5].copy(),
        expected_noise_df.copy(),
        proportion_explained=proportion_explained[:5].copy())

    self.expected_m2 = 0.72240956
    self.expected_p = 0.5