def _map_observations(table: biom.Table) -> biom.Table:
    """Return a copy of ``table`` whose observation (feature) ids have
    every underscore replaced with a space.

    Parameters
    ----------
    table : biom.Table
        The table whose observation ids should be remapped.

    Returns
    -------
    biom.Table
        A new table; the input table is not modified (``inplace=False``).
    """
    renamed = {
        feature_id: feature_id.replace('_', ' ')
        for feature_id in table.ids('observation')
    }
    return table.update_ids(id_map=renamed, axis='observation',
                            inplace=False)
def rename_ids(table: biom.Table,
               metadata: qiime2.CategoricalMetadataColumn,
               axis: str = 'sample',
               strict: bool = False) -> biom.Table:
    """Rename the ids of ``table`` along ``axis`` using a metadata column.

    Parameters
    ----------
    table : biom.Table
        The table whose ids will be renamed.
    metadata : qiime2.CategoricalMetadataColumn
        Maps current ids to their replacement names.
    axis : str
        Either 'sample' or 'feature' (QIIME 2 vocabulary).
    strict : bool
        Passed through to ``_generate_new_names``; presumably controls
        whether every id must have a replacement — confirm in the helper.

    Returns
    -------
    biom.Table
        A new table with updated ids; the input is left untouched.
    """
    replacements = metadata.to_series()
    # biom uses 'observation' where the QIIME 2 API says 'feature'
    if axis == 'feature':
        axis = 'observation'
    current_ids = table.ids(axis=axis)
    id_map = _generate_new_names(current_ids, replacements, strict, False)
    return table.update_ids(id_map, axis=axis, inplace=False)
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need:
    # (1) the samples and artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is on the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists for sure, so
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            # Keep only the samples that belong to this analysis and that
            # are actually present in the source table
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table:
        # if the rarefied table's ids are not a subset of ours, the samples
        # were prefixed with the artifact id at analysis time
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        # Insert the new artifact row in the database
        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need:
    # (1) the samples and artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is on the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists for sure, so
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            # Keep only the samples that belong to this analysis and that
            # are actually present in the source table
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid)
                     for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table:
        # if the rarefied table's ids are not a subset of ours, the samples
        # were prefixed with the artifact id at analysis time
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        # Insert the new artifact row in the database
        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [
            basename(new_table_fp), 7, compute_checksum(new_table_fp), 1,
            dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
class TestSculptor(TestCase):
    """Unit tests for the ``Sculptor`` object.

    Fix: ``pd.util.testing.assert_frame_equal`` is a deprecated private
    alias that was removed in modern pandas; the public API is
    ``pd.testing.assert_frame_equal``. Also replaces ``assertTrue(x is
    None)`` / ``assertTrue(a == b)`` with ``assertIsNone`` /
    ``assertIsNotNone`` / ``assertEqual`` so failures report the actual
    values instead of just ``False is not true``.
    """

    def setUp(self):
        # small synthetic dataset
        sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9',
                      's10', 's11']
        self.mf = pd.DataFrame(data=[['fasting', '8', 'A'],
                                     ['fasting', '-1', 'A'],
                                     ['control', '1', 'B'],
                                     ['control', '2', 'B'],
                                     ['control', '3', 'B'],
                                     ['fasting', '2', 'A'],
                                     ['fasting', '11', 'A'],
                                     ['control', '4', 'B'],
                                     ['control', '5', 'B'],
                                     ['control', '90', 'B'],
                                     ['fasting', '19.9', 'A'],
                                     ],
                               columns=['Treatment', 'Day', 'Host'],
                               index=sample_ids)
        self.mf['Day'] = pd.to_numeric(self.mf['Day'], errors='coerce')

        otu_ids = [str(i) for i in range(1, 8)]
        data = np.array([[0.0, 2.0, 5.0, 5.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 6.0, 9.0, 0.0, 4.0, 0.0],
                         [2.0, 6.0, 0.0, 0.0, 5.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0],
                         [1.0, 0.0, 8.0, 9.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0],
                         [0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 0.0],
                         [9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
        # biom stores observations as rows, hence the transpose
        self.bt = Table(data.T, otu_ids, sample_ids)

        tree_string = ("((1:0.2, 2:0.1)3P:0.3, (((7:0.1, 8:0.1)7P:0.8, (5:0.2,"
                       " 6:0.2)8P:0.1)5P:0.1, (3:0.2, 4:0.7)6P:0.9)4P:0.3)"
                       "root;")
        self.tree = TreeNode.read(StringIO(tree_string))

        # assumes to be only directories
        self.to_delete = []

    def tearDown(self):
        for element in self.to_delete:
            shutil.rmtree(element, ignore_errors=True)

        # delete the directory only if it is empty
        try:
            os.rmdir('roc-curves')
        except (OSError, FileNotFoundError):
            pass

    def test_constructor(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'test-name')

        self.assertIsNone(obs.mapping_file)
        self.assertIsNone(obs.biom_table)
        self.assertEqual(obs.name, 'test-name')

        self.assertIsNone(obs._alpha_diversity_values)
        self.assertIsNone(obs._beta_diversity_matrices)

        # public pandas testing API (pd.util.testing was removed)
        pd.testing.assert_frame_equal(self.mf, obs._original_mf)
        np.testing.assert_equal(obs._original_bt.ids(), self.bt.ids())
        np.testing.assert_equal(obs._original_bt.ids('observation'),
                                self.bt.ids('observation'))
        a = [self.bt.data(i) for i in self.bt.ids()]
        b = [obs._original_bt.data(i) for i in obs._original_bt.ids()]
        np.testing.assert_allclose(a, b)

        # needed to allow for phylogenetic metrics
        for node in obs.tree.postorder():
            self.assertIsNotNone(node.length)

    def test_constructor_errors(self):
        with self.assertRaisesRegex(ValueError, 'The gradient category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'XXX', 'Host')

        with self.assertRaisesRegex(ValueError, 'The trajectory category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'XXX')

        with self.assertRaisesRegex(ValueError, 'numeric dtype'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Treatment', 'Host')

        # create fake metadata
        self.bt.update_ids({i: i + 'xx' for i in self.bt.ids()}, inplace=True)
        with self.assertRaisesRegex(ValueError, 'without metadata'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host')

    def test_random_select(self):
        np.random.seed(0)
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select')

        self.assertIsNone(obs.mapping_file)
        self.assertIsNone(obs.biom_table)

        obs.randomly_select(3)

        # if we randomly select three samples there should be 6 in total
        self.assertEqual(len(obs.mapping_file), 6)
        self.assertEqual(obs.biom_table.shape, (7, 6))

    def test_random_select_errors(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.alpha_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.beta_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.microbes_over_time()

    def test_alpha(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'test-alpha')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.alpha_table(['faith_pd', 'observed_otus'])

        self.assertIsNotNone(skl._alpha_diversity_values)

        columns = ['faith_pd_absolute_sum_of_diff', 'faith_pd_abs_mean_diff',
                   'faith_pd_variance_larger_than_standard_deviation',
                   'faith_pd_abs_energy',
                   'observed_otus_absolute_sum_of_diff',
                   'observed_otus_abs_mean_diff',
                   'observed_otus_variance_larger_than_standard_deviation',
                   'observed_otus_abs_energy']
        data = [[2.1999999999999993, 0.5499999999999998, 0.0,
                 23.919999999999995, 2, 0.5, False, 32],
                [2.200000000000001, 0.5500000000000003, 0.0,
                 6.760000000000001, 3, 0.75, False, 22]]
        exp = pd.DataFrame(data=data,
                           index=pd.Index(['A', 'B'], name='Host'),
                           columns=columns)
        pd.testing.assert_frame_equal(obs, exp)

    def test_alpha_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.alpha_table(metrics=['does_not_exist'])

    def test_beta(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-test-beta')
        path = 'roc-curves/%s/cached-matrices/' % skl.name

        # avoid any unwanted accidents
        self.to_delete.append('roc-curves/%s/' % skl.name)

        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.beta_table(['unweighted_unifrac', 'jaccard'])

        data = [[0.3927777777777778, 0.4126532637086283, 0.9375,
                 0.12499999999999999],
                [0.6557886557886559, 0.1365522219610505, 1.0, 0.0]]
        columns = ['unweighted_unifrac_mean', 'unweighted_unifrac_std',
                   'jaccard_mean', 'jaccard_std']
        exp = pd.DataFrame(data=data, columns=columns,
                           index=pd.Index(['A', 'B'], name='Host'))
        pd.testing.assert_frame_equal(obs, exp)

        self.assertTrue(os.path.exists(path))
        self.assertTrue(os.path.exists(os.path.join(
            path, 'unweighted_unifrac.full.txt')))
        self.assertTrue(os.path.exists(os.path.join(path,
                                                    'jaccard.full.txt')))

    def test_beta_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-beta-errors')
        self.to_delete.append('roc-curves/%s' % skl.name)
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.beta_table(metrics=['does_not_exist'])

    def test_microbes_over_time(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'microbes-over-time')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.microbes_over_time()

        metrics = ['mean', 'abs_energy', 'non_zero_samples', 'abs_mean_diff']
        columns = ['%s_%s' % (a, b)
                   for a, b in product(range(1, 8), metrics)]
        index = ['A', 'B']

        self.assertEqual(obs.columns.tolist(), columns)
        self.assertEqual(obs.index.tolist(), index)
        self.assertEqual(obs.values.shape, (2, 28))
def _update_table_sample_ids(mapping: dict, table: biom.Table) -> biom.Table:
    """Return a copy of ``table`` with its sample ids renamed via ``mapping``.

    The input table is left untouched (``inplace=False``).
    """
    renamed = table.update_ids(mapping, axis='sample', inplace=False)
    return renamed
def _update_table_feature_ids(mapping: dict, table: biom.Table) -> biom.Table:
    """Return a copy of ``table`` with its observation (feature) ids renamed
    via ``mapping``.

    The input table is left untouched (``inplace=False``).
    """
    renamed = table.update_ids(mapping, axis='observation', inplace=False)
    return renamed