def scaffold_hybrid_tree_foundation_tree(
        otu_map: OtuMapFormat,
        extension_taxonomy: TSVTaxonomyFormat,
        extension_sequences: DNAFASTAFormat,
        foundation_tree: NewickFormat,
        foundation_taxonomy: TSVTaxonomyFormat,
        graft_level: str = _ghost_tree_defaults['graft_level'],
        ) -> NewickFormat:
    """Build a hybrid tree by grafting extensions onto a foundation tree.

    Opens every input format, hands the open handles to
    ``extensions_onto_foundation`` and writes the first element of its
    result to a Newick file, which is returned wrapped in ``NewickFormat``.

    Parameters
    ----------
    otu_map, extension_taxonomy, extension_sequences, foundation_tree
        Input formats; each is opened and its handle forwarded as-is.
    foundation_taxonomy
        Optional; when falsy, ``None`` is forwarded instead of a handle.
    graft_level
        Forwarded unchanged to ``extensions_onto_foundation``.

    Returns
    -------
    NewickFormat
        Read-mode format pointing at the written tree file.
    """
    # Local import: keeps this function self-contained regardless of what
    # the module imports at the top.
    from contextlib import ExitStack

    # Every handle registered on the stack is closed when the block exits,
    # including when grafting or writing raises.
    with ExitStack() as inputs:
        otu_map_fh = inputs.enter_context(otu_map.open())
        extension_taxonomy_fh = inputs.enter_context(
            extension_taxonomy.open())
        extension_sequences_fh = inputs.enter_context(
            extension_sequences.open())
        foundation_alignment_fh = inputs.enter_context(
            foundation_tree.open())
        if foundation_taxonomy:
            foundation_taxonomy_fh = inputs.enter_context(
                foundation_taxonomy.open())
        else:
            foundation_taxonomy_fh = None

        with tempfile.TemporaryDirectory() as tmp:
            # Scratch location inside the temporary directory that is
            # handed to extensions_onto_foundation.
            gt_path = os.path.join(tmp, 'ghost_tree')
            thetree = extensions_onto_foundation(otu_map_fh,
                                                 extension_taxonomy_fh,
                                                 extension_sequences_fh,
                                                 foundation_alignment_fh,
                                                 gt_path,
                                                 graft_level,
                                                 foundation_taxonomy_fh)[0]

            # NOTE(review): this is plain string concatenation, so the file
            # is a sibling of the temporary directory rather than inside
            # it, and is NOT removed when the directory is cleaned up.
            # Kept as-is because the returned format must outlive this
            # block; confirm whether the stray file is cleaned up elsewhere.
            tree_path = tmp + 'ghost_tree'
            with open(tree_path, 'w') as gt_temp_file:
                gt_temp_file.write(thetree)

            return NewickFormat(tree_path, 'r')
def test_tsv_taxonomy_format_validate_positive(self):
    """The listed taxonomy fixtures are expected to validate cleanly."""
    fixture_names = ('2-column.tsv', '3-column.tsv',
                     'valid-but-messy.tsv', 'many-rows.tsv')
    # Resolve every fixture path up front, then validate each in turn.
    relative_paths = [os.path.join('taxonomy', name)
                      for name in fixture_names]
    absolute_paths = [self.get_data_path(rel) for rel in relative_paths]

    for path in absolute_paths:
        TSVTaxonomyFormat(path, mode='r').validate()
def test_tsv_taxonomy_format_validate_negative(self):
    """Each listed fixture must fail validation with a TSVTaxonomy error."""
    fixture_names = ('empty', 'blanks', '1-column.tsv',
                     'headerless.tsv', 'header-only.tsv', 'jagged.tsv')
    # Resolve every fixture path up front, then check each in turn.
    relative_paths = [os.path.join('taxonomy', name)
                      for name in fixture_names]
    absolute_paths = [self.get_data_path(rel) for rel in relative_paths]

    for path in absolute_paths:
        # Construct outside the assertion so that only validate() is
        # expected to raise.
        candidate = TSVTaxonomyFormat(path, mode='r')
        with self.assertRaisesRegex(ValidationError, 'TSVTaxonomy'):
            candidate.validate()
def test_tsv_taxonomy_format_validate_negative(self):
    """Each listed fixture must fail validation with a TSVTaxonomy error."""
    fixture_names = ('empty', 'blanks-and-comments', '1-column.tsv',
                     'headerless.tsv', 'header-only.tsv', 'jagged.tsv')
    # Resolve every fixture path up front, then check each in turn.
    relative_paths = [os.path.join('taxonomy', name)
                      for name in fixture_names]
    absolute_paths = [self.get_data_path(rel) for rel in relative_paths]

    for path in absolute_paths:
        # Construct outside the assertion so that only validate() is
        # expected to raise.
        candidate = TSVTaxonomyFormat(path, mode='r')
        with self.assertRaisesRegex(ValidationError, 'TSVTaxonomy'):
            candidate.validate()
def test_tsv_taxonomy_format_column_header_lengths(self):
    """Both fixtures must fail validation with the line-2 count message."""
    fixture_names = ('greater-column-length.tsv',
                     'greater-header-length.tsv')
    # The same message pattern is expected for both fixtures.
    expected_message = 'line 2.*3 values.*expected 2'
    relative_paths = [os.path.join('taxonomy', name)
                      for name in fixture_names]
    absolute_paths = [self.get_data_path(rel) for rel in relative_paths]

    for path in absolute_paths:
        # Construct outside the assertion so that only validate() is
        # expected to raise.
        candidate = TSVTaxonomyFormat(path, mode='r')
        with self.assertRaisesRegex(ValidationError, expected_message):
            candidate.validate()
def _0(ff: TSVTaxonomyFormat) -> skbio.TreeNode:
    """Convert a tab-separated taxonomy file into an ``skbio.TreeNode``.

    The first line is skipped as a header. For every remaining row, the
    first column is taken as the identifier and the second is split on
    ``';'`` into a path of taxon names below a node named ``'root'``.
    Existing children are reused by name; the identifier is appended as a
    child of the last taxon on the path.
    """
    tree = skbio.TreeNode('root', length=0)

    with ff.open() as handle:
        rows = csv.reader(handle, delimiter='\t')
        next(rows)  # discard the header line

        for record in rows:
            feature_id, lineage = record[:2]
            cursor = tree

            for taxon_name in lineage.split(';'):
                # Reuse the first existing child with this name, if any.
                existing = next(
                    (kid for kid in cursor.children
                     if kid.name == taxon_name),
                    None)
                if existing is None:
                    existing = skbio.TreeNode(taxon_name, length=1)
                    cursor.append(existing)
                cursor = existing

            cursor.append(skbio.TreeNode(feature_id, length=1))

    return tree
def setUp(self):
    """Load the taxonomy and replacement fixtures and build expectations.

    Sets ``self.taxonomy``, the three replacement metadata columns, and
    the four expected-result dictionaries ``self.exp_dict_00`` through
    ``self.exp_dict_03``.
    """
    super().setUp()

    # Taxonomy that will be edited.
    taxonomy_path = self.get_data_path('escherichia_shigella_taxonomy.txt')
    self.taxonomy = TSVTaxonomyFormat(
        taxonomy_path, mode='r').view(pd.Series)

    def replacement_column(fixture_name):
        # Load a metadata fixture and pull out its 'replacements' column.
        fixture_path = self.get_data_path(fixture_name)
        return Metadata.load(fixture_path).get_column('replacements')

    # Full-string, substring, and substring-regex replacements.
    self.md_replc_col = replacement_column(
        'taxonomy-replacement-full-strings.txt')
    self.md_ss_replc_col = replacement_column(
        'taxonomy-replacement-pass.txt')
    self.md_ssr_replc_col = replacement_column(
        'taxonomy-replacement-regex.txt')

    # Ranks from class through genus are identical in every expectation.
    shared_ranks = ('c__Gammaproteobacteria; '
                    'o__Enterobacterales; '
                    'f__Enterobacteriaceae; '
                    'g__Escherichia-Shigella; ')

    def expected_lineages(domain_and_phylum, uncultured_species,
                          unnamed_species):
        # Build one expected dictionary; only the leading two ranks and
        # the species labels vary between the expectations.
        stem = domain_and_phylum + shared_ranks
        return {
            'Sal01': stem + 's__Salmonella_enterica',
            'Sal02': stem + 's__Salmonella_enterica',
            'UncultSal': stem + uncultured_species,
            'Esch01': stem + unnamed_species,
            'Shig01': stem + unnamed_species,
        }

    # Reusable expected dictionaries.
    self.exp_dict_00 = expected_lineages(
        'd__SUPER_DUPER_BACTERIA; p__Proteobacteria; ',
        's__uncultured_Salmonella',
        's__')
    self.exp_dict_01 = expected_lineages(
        'd__Bacteria; p__LAME-PYHLA; ',
        's__UNCIVILIZED_Salmonella',
        's__')
    self.exp_dict_02 = expected_lineages(
        'd__Bacteria; p__Proteobacteria; ',
        's__uncultured_Salmonella',
        's__unknown_Escherichia-Shigella')
    self.exp_dict_03 = expected_lineages(
        'd__Bacteria; p__Proteobacteria; ',
        's__UNCIVIL_Salmonella',
        's__unknown_Escherichia-Shigella')