def test_biom_to_table(self):
    # Round trip: pack the four table components into a BIOM table, unpack
    # it again, and expect every component to come back unchanged.
    counts = [
        [4, 2, 0],
        [5, 0, 3],
        [8, 0, 0],
        [0, 3, 0],
        [0, 7, 5],
    ]
    feature_ids = ['G1', 'G2', 'G3', 'G4', 'G5']
    sample_ids = ['S1', 'S2', 'S3']

    # (Name, Rank, Lineage) per feature; the last feature carries empty
    # strings for all three fields
    taxa = [
        ('Actinobacteria', 'phylum', '2;72;74'),
        ('Firmicutes', 'phylum', '2;72'),
        ('Bacteroidetes', 'phylum', '2;70'),
        ('Cyanobacteria', 'phylum', '2;72'),
        ('', '', ''),
    ]
    annotations = [
        {'Name': name, 'Rank': rank, 'Lineage': lineage}
        for name, rank, lineage in taxa
    ]

    biota = table_to_biom(counts, feature_ids, sample_ids, annotations)
    recovered = biom_to_table(biota)

    # compare data, observation IDs, sample IDs and metadata, in that order
    originals = (counts, feature_ids, sample_ids, annotations)
    for position, component in enumerate(originals):
        self.assertListEqual(recovered[position], component)
def gotu(alignment: str) -> biom.Table:
    """Build a gOTU feature table from a sequence alignment file.

    The alignment is passed through the classification workflow (`cwf`)
    using the plain mapper and the single rank 'none'; the profile returned
    for that rank is then converted into a BIOM table stamped with this
    module's name and version.
    """
    rank = 'none'
    profiles = cwf(mapper=plain_mapper,
                   files=[alignment],
                   demux=True,
                   ranks=[rank],
                   chunk=1000,
                   zippers={})
    components = prep_table(profiles[rank])
    biota = table_to_biom(*components)
    biota.generated_by = f'{__name__}-{__version__}'
    return biota
def coverage(table: biom.Table,
             mapping: str,
             threshold: int = None,
             count: bool = False) -> biom.Table:
    """Compute the coverage of a feature table over feature groups.

    The file at `mapping` is read with `read_map_all` and turned into a
    dict, which is handed to `calc_coverage` together with the table and the
    `threshold` / `count` options. The result is converted into a BIOM table
    stamped with this module's name and version.
    """
    with open(mapping, 'r') as handle:
        groups = dict(read_map_all(handle))
    components = calc_coverage(table, groups, threshold, count)
    biota = table_to_biom(*components)
    biota.generated_by = f'{__name__}-{__version__}'
    return biota
def test_write_table(self):
    # The same table in both supported in-memory forms: a 4-tuple of
    # (data, observation IDs, sample IDs, metadata) and a BIOM table.
    counts = [
        [4, 2, 0],
        [5, 0, 3],
        [8, 0, 0],
        [0, 3, 0],
        [0, 7, 5],
    ]
    feature_ids = ['G1', 'G2', 'G3', 'G4', 'G5']
    sample_ids = ['S1', 'S2', 'S3']
    names = ('Actinobacteria', 'Firmicutes', 'Bacteroidetes',
             'Cyanobacteria', '')
    as_tuple = (counts, feature_ids, sample_ids,
                [{'Name': name} for name in names])
    as_biom = table_to_biom(*as_tuple)

    expected_lines = [
        '#FeatureID\tS1\tS2\tS3\tName',
        'G1\t4\t2\t0\tActinobacteria',
        'G2\t5\t0\t3\tFirmicutes',
        'G3\t8\t0\t0\tBacteroidetes',
        'G4\t0\t3\t0\tCyanobacteria',
        'G5\t0\t7\t5\t',
    ]

    # tuple to TSV, then BIOM to TSV: both must yield the same text
    tsv_path = join(self.tmpdir, 'output.tsv')
    for source in (as_tuple, as_biom):
        write_table(source, tsv_path)
        with open(tsv_path, 'r') as handle:
            written = handle.read().splitlines()
        self.assertListEqual(written, expected_lines)
    remove(tsv_path)

    # BIOM to BIOM, then tuple to BIOM: both must reload as an equal table
    biom_path = join(self.tmpdir, 'output.biom')
    for source in (as_biom, as_tuple):
        write_table(source, biom_path)
        reloaded = load_table(biom_path)
        self.assertEqual(reloaded.descriptive_equality(as_biom),
                         'Tables appear equal')
    remove(biom_path)
def test_table_to_biom(self):
    # Convert table components into a BIOM table and compare both its
    # counts and its observation metadata against a reference data frame.
    counts = [
        [4, 2, 0],
        [5, 0, 3],
        [8, 0, 0],
        [0, 3, 0],
        [0, 7, 5],
    ]
    feature_ids = ['G1', 'G2', 'G3', 'G4', 'G5']
    sample_ids = ['S1', 'S2', 'S3']
    taxa = [
        ('Actinobacteria', 'phylum', '2;72;74'),
        ('Firmicutes', 'phylum', '2;72'),
        ('Bacteroidetes', 'phylum', '2;70'),
        ('Cyanobacteria', 'phylum', '2;72'),
        ('', '', ''),
    ]
    annotations = [
        {'Name': name, 'Rank': rank, 'Lineage': lineage}
        for name, rank, lineage in taxa
    ]

    biota = table_to_biom(counts, feature_ids, sample_ids, annotations)

    # reference frame: three sample columns followed by three metadata
    # columns; na_filter=False keeps the empty metadata cells as ''
    reference_text = ''.join([
        '\tS1\tS2\tS3\tName\tRank\tLineage\n',
        'G1\t4\t2\t0\tActinobacteria\tphylum\t2;72;74\n',
        'G2\t5\t0\t3\tFirmicutes\tphylum\t2;72\n',
        'G3\t8\t0\t0\tBacteroidetes\tphylum\t2;70\n',
        'G4\t0\t3\t0\tCyanobacteria\tphylum\t2;72\n',
        'G5\t0\t7\t5\t\t\t\n',
    ])
    reference = pd.read_csv(StringIO(reference_text), sep='\t',
                            index_col=0, na_filter=False)

    # counts: first three columns of the reference
    observed_counts = biota.to_dataframe(dense=True).astype(int)
    assert_frame_equal(observed_counts, reference.iloc[:, :3])

    # metadata: last three columns of the reference
    metadata_columns = ['Name', 'Rank', 'Lineage']
    observed_metadata = biota.metadata_to_dataframe(
        'observation')[metadata_columns]
    assert_frame_equal(observed_metadata, reference.iloc[:, -3:])
def test_write_biom(self):
    # Write a BIOM table to disk and make sure the reloaded table is
    # reported as equal to the one that was written.
    sample_counts = {
        'S1': {'G1': 4, 'G2': 5, 'G3': 8},
        'S2': {'G1': 2, 'G4': 3, 'G5': 7},
        'S3': {'G2': 3, 'G5': 5},
    }
    expected = table_to_biom(*prep_table(sample_counts))

    path = join(self.tmpdir, 'tmp.biom')
    write_biom(expected, path)
    reloaded = load_table(path)

    verdict = reloaded.descriptive_equality(expected)
    self.assertEqual(verdict, 'Tables appear equal')
    remove(path)
def test_merge_tables(self):
    # --- just data -------------------------------------------------------
    first = prep_table({
        'S1': {'G1': 4, 'G2': 5, 'G3': 8},
        'S2': {'G1': 2, 'G4': 3, 'G5': 7},
        'S3': {'G2': 3, 'G5': 5},
    })
    second = prep_table({
        'S3': {'G3': 1, 'G5': 1},
        'S4': {'G2': 5, 'G3': 3, 'G6': 9},
        'S5': {'G5': 2, 'G6': 4},
    })
    third = prep_table({
        'S2': {'G3': 2, 'G5': 2, 'G6': 8},
        'S6': {'G3': 1, 'G6': 6},
    })
    merged = merge_tables([first, second, third])

    # expected merged counts, one row per sample over features G1..G6
    feature_ids = ['G1', 'G2', 'G3', 'G4', 'G5', 'G6']
    merged_counts = {
        'S1': [4, 5, 8, 0, 0, 0],
        'S2': [2, 0, 2, 3, 9, 8],
        'S3': [0, 3, 1, 0, 6, 0],
        'S4': [0, 5, 3, 0, 0, 9],
        'S5': [0, 0, 0, 0, 2, 4],
        'S6': [0, 0, 1, 0, 0, 6],
    }
    expected = prep_table({
        sample: dict(zip(feature_ids, row))
        for sample, row in merged_counts.items()
    })

    def check_components(result):
        # compare the four table components one by one
        for idx in range(4):
            self.assertListEqual(result[idx], expected[idx])

    check_components(merged)

    # --- with metadata ---------------------------------------------------
    names = dict(zip(feature_ids, [
        'Actinobacteria', 'Firmicutes', 'Bacteroidetes',
        'Cyanobacteria', 'Proteobacteria', 'Fusobacteria',
    ]))
    for tab in (first, second, third, expected):
        # refill the metadata list in place so the tuples keep pointing at
        # the same list objects
        tab[3][:] = [{'Name': names[feature]} for feature in tab[1]]
    merged = merge_tables([first, second, third])
    check_components(merged)

    # --- some biom tables ------------------------------------------------
    merged = merge_tables([first, table_to_biom(*second), third])
    check_components(merged)

    # --- all biom tables -------------------------------------------------
    merged = merge_tables(
        [table_to_biom(*tab) for tab in (first, second, third)])
    self.assertTrue(isinstance(merged, Table))
    expected_biom = table_to_biom(*expected)
    self.assertEqual(merged.descriptive_equality(expected_biom),
                     'Tables appear equal')

    # --- inconsistent metadata -------------------------------------------
    third[3][1]['Name'] = 'This is not right.'
    with self.assertRaises(ValueError) as ctx:
        merge_tables([first, second, third])
    self.assertEqual(str(ctx.exception),
                     'Conflicting metadata found in tables.')
def classify(alignment: str,
             target_rank: str,
             reference_taxonomy: Series = None,
             reference_tree: TreeNode = None,
             reference_nodes: str = None,
             taxon_map: str = None,
             trim_subject: bool = False,
             gene_coordinates: str = None,
             overlap_threshold: int = 80,
             unique_assignment: bool = False,
             majority_threshold: int = None,
             above_given_rank: bool = False,
             subject_is_okay: bool = False,
             report_unassigned: bool = False) -> biom.Table:
    """Classify sequences based on their alignments to references through a
    hierarchical classification system.

    Parameters
    ----------
    alignment : str
        Alignment file, passed to the classification workflow as its only
        input file.
    target_rank : str
        Rank at which the profile is generated. The value 'none' is the only
        one accepted without a reference classification system.
    reference_taxonomy : Series, optional
        Taxonomy; serialized to headerless tab-separated text and parsed
        with `read_lineage`.
    reference_tree : TreeNode, optional
        Phylogeny; converted with `str()` and parsed with `read_newick`.
    reference_nodes : str, optional
        Path to a taxdump nodes file, parsed with `read_nodes`.
    taxon_map : str, optional
        Path to a mapping file; the result of `read_map_1st` is merged into
        the hierarchy after the reference has been read.
    trim_subject : bool, optional
        When true, '_' is passed to the workflow as `trimsub`.
    gene_coordinates : str, optional
        Forwarded to `build_mapper` as `coords_fp`.
    overlap_threshold : int, optional
        Forwarded to `build_mapper` as `overlap`.
    unique_assignment : bool, optional
        Forwarded to the workflow as `uniq`.
    majority_threshold : int, optional
        Forwarded to the workflow as `major`.
    above_given_rank : bool, optional
        Forwarded to the workflow as `above`.
    subject_is_okay : bool, optional
        Forwarded to the workflow as `subok`.
    report_unassigned : bool, optional
        Forwarded to the workflow as `unasgd`.

    Returns
    -------
    biom.Table
        Feature table built from the profile at `target_rank`, stamped with
        this module's name and version.

    Raises
    ------
    ValueError
        If more than one reference classification system is given, or if
        none is given while `target_rank` is not 'none'.
    """
    # validate classification system
    # Count the references with an explicit identity test. The previous
    # `filter(None.__ne__, ...)` depended on the truthiness of the
    # NotImplemented value that `None.__ne__(x)` returns for non-None `x`;
    # that is deprecated since Python 3.9 and a TypeError from 3.14 on.
    references = (reference_taxonomy, reference_tree, reference_nodes)
    num_ref = sum(ref is not None for ref in references)
    if num_ref > 1:
        raise ValueError('Only one reference classification system can be '
                         'specified.')
    elif num_ref == 0 and target_rank != 'none':
        raise ValueError('A reference classification system must be specified '
                         f'for classification at the rank "{target_rank}".')

    # build classification hierarchy
    tree, rankdic, namedic = {}, {}, {}

    # read taxonomy
    if reference_taxonomy is not None:
        tree, rankdic = read_lineage(StringIO(reference_taxonomy.to_csv(
            sep='\t', header=False)))

    # read phylogeny
    if reference_tree is not None:
        tree = read_newick(StringIO(str(reference_tree)))

    # read taxdump
    if reference_nodes is not None:
        with open(reference_nodes, 'r') as fh:
            tree, rankdic = read_nodes(fh)

    # read taxon mapping
    if taxon_map is not None:
        with open(taxon_map, 'r') as fh:
            tree.update(read_map_1st(fh))

    # fill root
    root = fill_root(tree)

    # build mapping module
    mapper, chunk = build_mapper(
        coords_fp=gene_coordinates, overlap=overlap_threshold)

    # classify query sequences
    # `trim_subject and '_'` yields '_' when trimming is requested and the
    # falsy flag itself otherwise
    profile = cwf(mapper=mapper,
                  files=[alignment],
                  demux=True,
                  trimsub=trim_subject and '_',
                  tree=tree,
                  rankdic=rankdic,
                  namedic=namedic,
                  root=root,
                  ranks=[target_rank],
                  uniq=unique_assignment,
                  major=majority_threshold,
                  above=above_given_rank,
                  subok=subject_is_okay,
                  unasgd=report_unassigned,
                  chunk=chunk,
                  zippers={})[target_rank]

    # generate feature table
    table = table_to_biom(*prep_table(
        profile, rankdic=rankdic, namedic=namedic))
    table.generated_by = f'{__name__}-{__version__}'
    return table