def setUp(self):
    """Load the BIOM tables, Newick trees and expected values for the tests."""
    super().setUp()

    def biom_fixture(filename):
        # Wrap a bundled data file as a read-only BIOMV210Format.
        return BIOMV210Format(self.get_data_path(filename), mode='r')

    def newick_fixture(filename):
        # Wrap a bundled data file as a read-only NewickFormat.
        return NewickFormat(self.get_data_path(filename), mode='r')

    # Feature tables.
    self.empty_table_as_BIOMV210Format = biom_fixture('empty_table.biom')
    self.input_table_as_BIOMV210Format = biom_fixture(
        'faith_test_table.biom')
    self.rf_table_as_BIOMV210Format = biom_fixture(
        'faith_test_table_rf.biom')
    self.pa_table_as_BIOMV210Format = biom_fixture(
        'faith_test_table_pa.biom')

    # Trees.
    self.empty_tree_as_NewickFormat = newick_fixture('empty.tree')
    self.input_tree_as_NewickFormat = newick_fixture('faith_test.tree')
    self.root_only_tree_as_NewickFormat = newick_fixture('root_only.tree')
    self.missing_tip_tree_as_NewickFormat = newick_fixture(
        'missing_tip.tree')

    # Expected per-sample values, keyed by sample ID.
    expected_values = {
        'S1': 0.5,
        'S2': 0.7,
        'S3': 1.0,
        'S4': 100.5,
        'S5': 101,
    }
    self.expected = pd.Series(expected_values, name='faith_pd')
def scaffold_hybrid_tree_foundation_tree(
        otu_map: OtuMapFormat,
        extension_taxonomy: TSVTaxonomyFormat,
        extension_sequences: DNAFASTAFormat,
        foundation_tree: NewickFormat,
        foundation_taxonomy: TSVTaxonomyFormat,
        graft_level: str = _ghost_tree_defaults['graft_level'],
) -> NewickFormat:
    """Graft extension trees onto a foundation tree and return the result.

    Opens every input format, hands the file handles to
    ``extensions_onto_foundation`` and stores the first element of its
    return value (the tree text) in a new ``NewickFormat``.

    Parameters
    ----------
    otu_map, extension_taxonomy, extension_sequences, foundation_tree
        Input formats; each is opened and passed on as a file handle.
    foundation_taxonomy
        Optional; when falsy, ``None`` is passed instead of a file handle.
    graft_level
        Forwarded unchanged to ``extensions_onto_foundation``.

    Returns
    -------
    NewickFormat
        A newly created format holding the tree text.
    """
    result = NewickFormat()

    # Opened outside the ``with`` because it is optional; closed in
    # ``finally`` so it never leaks.
    foundation_taxonomy_fh = (
        foundation_taxonomy.open() if foundation_taxonomy else None)
    try:
        with otu_map.open() as otu_map_fh, \
                extension_taxonomy.open() as extension_taxonomy_fh, \
                extension_sequences.open() as extension_sequences_fh, \
                foundation_tree.open() as foundation_tree_fh, \
                tempfile.TemporaryDirectory() as tmp:
            # Scratch location handed to ghost-tree; it is discarded
            # together with the temporary directory.
            gt_path = os.path.join(tmp, 'ghost_tree')
            thetree = extensions_onto_foundation(otu_map_fh,
                                                 extension_taxonomy_fh,
                                                 extension_sequences_fh,
                                                 foundation_tree_fh,
                                                 gt_path,
                                                 graft_level,
                                                 foundation_taxonomy_fh)[0]
    finally:
        if foundation_taxonomy_fh is not None:
            foundation_taxonomy_fh.close()

    # Write into the result format's own path rather than next to the
    # temporary directory, so no stray file is left behind.
    with open(str(result), 'w') as tree_fh:
        tree_fh.write(thetree)
    return result
def setUp(self):
    """Prepare the expected distance matrix, tables, tree and action."""
    super().setUp()

    def biom_fixture(filename):
        return BIOMV210Format(self.get_data_path(filename), mode='r')

    # expected computed with skbio.diversity.beta_diversity
    sample_ids = ['S1', 'S2', 'S3']
    distances = [[0.00, 0.25, 0.25],
                 [0.25, 0.00, 0.00],
                 [0.25, 0.00, 0.00]]
    self.expected = skbio.DistanceMatrix(distances, ids=sample_ids)

    # The same table as frequency, relative-frequency and
    # presence/absence fixture files.
    self.table_as_BIOMV210Format = biom_fixture('two_feature_table.biom')
    self.rf_table_as_BIOMV210Format = biom_fixture(
        'two_feature_rf_table.biom')
    self.p_a_table_as_BIOMV210Format = biom_fixture(
        'two_feature_p_a_table.biom')
    self.table_as_artifact = Artifact.import_data(
        'FeatureTable[Frequency]', self.table_as_BIOMV210Format)

    self.tree_as_NewickFormat = NewickFormat(
        self.get_data_path('three_feature.tree'), mode='r')
    self.tree_as_artifact = Artifact.import_data(
        'Phylogeny[Rooted]', self.tree_as_NewickFormat)

    plugin_actions = self.plugin.actions
    self.unweighted_unifrac_thru_framework = plugin_actions[
        'unweighted_unifrac']
def classify_paths(representative_sequences: DNASequencesDirectoryFormat,
                   tree: NewickFormat) -> pd.DataFrame:
    """Assign each fragment the labels of its ancestors in the tree.

    For every representative sequence, the tree is walked from the
    fragment's node towards the root and the names of all ancestors whose
    name contains ``'__'`` are collected. They are joined root-first with
    ``'; '``. Fragments that are not in the tree get ``NaN``.

    Parameters
    ----------
    representative_sequences
        Sequences whose ``metadata['id']`` is looked up in the tree.
    tree
        Tree to read with ``skbio.TreeNode.read``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``'Feature ID'`` with a single ``'Taxon'`` column.

    Raises
    ------
    ValueError
        If there are no sequences at all, or none of them is found in
        the tree.
    """
    tree = skbio.TreeNode.read(str(tree))

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        feature_id = fragment.metadata['id']
        try:
            node = tree.find(feature_id)
        except skbio.tree.MissingNodeError:
            lineage_str = np.nan
        else:
            # Ancestors come tip-to-root; reverse for a root-first lineage.
            lineage = [ancestor.name for ancestor in node.ancestors()
                       if (ancestor.name is not None)
                       and ('__' in ancestor.name)]
            lineage_str = '; '.join(reversed(lineage))
        taxonomy.append({'Feature ID': feature_id, 'Taxon': lineage_str})

    pd_taxonomy = pd.DataFrame(taxonomy)
    # An empty list yields a frame without columns, so test it first;
    # otherwise the column lookup fails with an unrelated KeyError.
    if (not taxonomy) or (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ('None of the representative-sequences can be found in the '
             'insertion tree. Please double check that both inputs match up, '
             'i.e. are results from the same \'sepp\' run.'))
    return pd_taxonomy.set_index('Feature ID')
def fasttree(alignment: AlignedDNAFASTAFormat) -> NewickFormat:
    """Run ``FastTree -nt`` on the alignment and return the resulting tree.

    The alignment path and the path of a new ``NewickFormat`` are handed
    to ``run_command``; the new format is returned.
    """
    tree = NewickFormat()
    fasttree_cmd = ['FastTree', '-nt', str(alignment)]
    run_command(fasttree_cmd, str(tree))
    return tree
def setUp(self):
    """Prepare the expected distance matrix and the crawford fixtures."""
    super().setUp()

    # expected computed with diversity.beta_phylogenetic (weighted_unifrac)
    sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                  '10084.PC.355', '10084.PC.354', '10084.PC.636',
                  '10084.PC.635', '10084.PC.607', '10084.PC.634')
    condensed_distances = np.array([
        0.44656238, 0.23771096, 0.30489123, 0.23446002, 0.65723575,
        0.44911772, 0.381904, 0.69144829, 0.39611776, 0.36568012,
        0.53377975, 0.48908025, 0.35155196, 0.28318669, 0.57376916,
        0.23395746, 0.24658122, 0.60271637, 0.39802552, 0.36567394,
        0.68062701, 0.36862049, 0.48350632, 0.33024631, 0.33266697,
        0.53464744, 0.74605075, 0.53951035, 0.49680733, 0.79178838,
        0.37109012, 0.52629343, 0.22118218, 0.32400805, 0.43189708,
        0.59705893])
    self.expected = skbio.DistanceMatrix(condensed_distances,
                                         ids=sample_ids)

    def fixture(format_cls, filename):
        # Open a bundled data file read-only as the given format.
        return format_cls(self.get_data_path(filename), mode='r')

    self.table_as_BIOMV210Format = fixture(BIOMV210Format,
                                           'crawford.biom')
    self.rf_table_as_BIOMV210Format = fixture(BIOMV210Format,
                                              'crawford_rf.biom')
    self.tree_as_NewickFormat = fixture(NewickFormat, 'crawford.nwk')
def raxml_rapid_bootstrap(alignment: AlignedDNAFASTAFormat,
                          seed: int = None,
                          rapid_bootstrap_seed: int = None,
                          bootstrap_replicates: int = 100,
                          n_threads: int = 1,
                          raxml_version: str = 'Standard',
                          substitution_model: str = 'GTRGAMMA'
                          ) -> NewickFormat:
    """Run a RAxML rapid-bootstrap analysis and return the resulting tree.

    Seeds left as ``None`` are drawn with ``randint(1000, 10000)``. RAxML
    runs inside a temporary directory and its
    ``RAxML_bipartitions.<runname>`` output is moved into a new
    ``NewickFormat``.
    """
    tree = NewickFormat()
    cmd = _set_raxml_version(raxml_version=raxml_version,
                             n_threads=n_threads)

    # Draw the seeds in this order so the random stream is consumed
    # consistently.
    seed = randint(1000, 10000) if seed is None else seed
    rapid_bootstrap_seed = (randint(1000, 10000)
                            if rapid_bootstrap_seed is None
                            else rapid_bootstrap_seed)

    runname = 'q2bootstrap'
    output_name = 'RAxML_bipartitions.%s' % runname
    with tempfile.TemporaryDirectory() as work_dir:
        cmd += _build_rapid_bootstrap_command(alignment, seed,
                                              rapid_bootstrap_seed,
                                              bootstrap_replicates,
                                              substitution_model,
                                              work_dir, runname)
        run_command(cmd)
        # Move the tree out before the temporary directory is removed.
        os.rename(os.path.join(work_dir, output_name), str(tree))

    return tree
def setUp(self):
    """Load the ASV table, phylogeny and sample metadata fixtures."""
    super().setUp()

    table_fp = self.get_data_path('asv_table.tsv')
    with open(table_fp) as table_fh:
        self.table = biom.Table.from_tsv(table_fh, None, None, None)

    tree_fp = self.get_data_path('tree.nwk')
    self.phylogeny = NewickFormat(tree_fp, mode='r')

    metadata_fp = self.get_data_path('metadata.tsv')
    self.metadata = qiime2.Metadata.load(metadata_fp)
def raxml(alignment: AlignedDNAFASTAFormat,
          seed: int = None,
          n_searches: int = 1,
          n_threads: int = 1,
          raxml_version: str = 'Standard',
          substitution_model: str = 'GTRGAMMA') -> NewickFormat:
    """Run RAxML on the alignment and return the resulting tree.

    A seed left as ``None`` is drawn with ``randint(1000, 10000)``. RAxML
    runs inside a temporary directory and its ``RAxML_bestTree.<runname>``
    output is moved into a new ``NewickFormat``.
    """
    tree = NewickFormat()
    cmd = _set_raxml_version(raxml_version=raxml_version,
                             n_threads=n_threads)

    seed = randint(1000, 10000) if seed is None else seed

    runname = 'q2'
    output_name = 'RAxML_bestTree.%s' % runname
    with tempfile.TemporaryDirectory() as work_dir:
        raxml_options = [
            '-m', str(substitution_model),
            '-p', str(seed),
            '-N', str(n_searches),
            '-s', str(alignment),
            '-w', work_dir,
            '-n', runname,
        ]
        cmd += raxml_options
        run_command(cmd)
        # Move the tree out before the temporary directory is removed.
        os.rename(os.path.join(work_dir, output_name), str(tree))

    return tree
def iqtree_ultrafast_bootstrap(
        alignment: AlignedDNAFASTAFormat,
        seed: int = _iqtree_defaults['seed'],
        n_cores: int = _iqtree_defaults['n_cores'],
        n_cores_max: int = _iqtree_defaults['n_cores_max'],
        n_runs: int = _iqtree_defaults['n_runs'],
        substitution_model: str = _iqtree_defaults['substitution_model'],
        bootstrap_replicates: int = _iqtree_defaults['bootstrap_replicates'],
        n_init_pars_trees: int = _iqtree_defaults['n_init_pars_trees'],
        n_top_init_trees: int = _iqtree_defaults['n_top_init_trees'],
        n_best_retain_trees: int = _iqtree_defaults['n_best_retain_trees'],
        stop_iter: int = _iqtree_defaults['stop_iter'],
        perturb_nni_strength: float = _iqtree_defaults['perturb_nni_strength'],
        spr_radius: int = _iqtree_defaults['spr_radius'],
        n_max_ufboot_iter: int = _iqtree_defaults['n_max_ufboot_iter'],
        n_ufboot_steps: int = _iqtree_defaults['n_ufboot_steps'],
        min_cor_ufboot: float = _iqtree_defaults['min_cor_ufboot'],
        ep_break_ufboot: float = _iqtree_defaults['ep_break_ufboot'],
        allnni: bool = _iqtree_defaults['allnni'],
        alrt: int = _iqtree_defaults['alrt'],
        abayes: bool = _iqtree_defaults['abayes'],
        lbp: int = _iqtree_defaults['lbp'],
        bnni: bool = _iqtree_defaults['bnni'],
        safe: bool = _iqtree_defaults['safe']) -> NewickFormat:
    """Run IQ-TREE with ultrafast bootstrap and return the resulting tree.

    All options are forwarded to ``_build_iqtree_ufbs_command``. IQ-TREE
    runs with an output prefix inside a temporary directory, and the
    ``<prefix>.treefile`` it produces is moved into a new
    ``NewickFormat``.
    """
    # NOTE: the IQ-TREE commands `-n` (called as `n_iter` in the `iqtree`
    # method) and `-fast` are not compatible with ultrafast_bootstrap `-bb`.
    result = NewickFormat()

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtreeufboot')
        cmd = _build_iqtree_ufbs_command(
            alignment,
            seed=seed,
            n_cores=n_cores,
            n_cores_max=n_cores_max,
            n_runs=n_runs,
            substitution_model=substitution_model,
            bootstrap_replicates=bootstrap_replicates,
            run_prefix=run_prefix,
            n_init_pars_trees=n_init_pars_trees,
            n_top_init_trees=n_top_init_trees,
            n_best_retain_trees=n_best_retain_trees,
            stop_iter=stop_iter,
            perturb_nni_strength=perturb_nni_strength,
            spr_radius=spr_radius,
            n_max_ufboot_iter=n_max_ufboot_iter,
            n_ufboot_steps=n_ufboot_steps,
            min_cor_ufboot=min_cor_ufboot,
            ep_break_ufboot=ep_break_ufboot,
            allnni=allnni,
            alrt=alrt,
            abayes=abayes,
            lbp=lbp,
            bnni=bnni,
            safe=safe)
        run_command(cmd)

        # run_prefix already includes temp_dir. Joining it with temp_dir
        # again only worked because an absolute second component makes
        # os.path.join drop the first one.
        tree_tmp_fp = '%s.treefile' % run_prefix
        os.rename(tree_tmp_fp, str(result))

    return result
def sepp(representative_sequences: DNASequencesDirectoryFormat,
         reference_database: SeppReferenceDirFmt,
         alignment_subset_size: int = 1000,
         placement_subset_size: int = 5000,
         threads: int = 1,
         debug: bool = False,
         ) -> (NewickFormat, PlacementsFormat):
    """Run SEPP via ``_run`` and return the tree and the placements.

    ``_run`` writes into a temporary directory. The tree output is passed
    through ``_add_missing_branch_length``, then both outputs are copied
    into newly created result formats.
    """
    placements_name = 'q2-fragment-insertion_placement.json'
    tree_name = 'q2-fragment-insertion_placement.tog.relabelled.tre'

    placements_result = PlacementsFormat()
    tree_result = NewickFormat()

    with tempfile.TemporaryDirectory() as work_dir:
        sequences_fp = str(
            representative_sequences.file.view(DNAFASTAFormat))
        ref_alignment_fp = str(reference_database.alignment.path_maker())
        ref_phylogeny_fp = str(reference_database.phylogeny.path_maker())
        ref_info_fp = str(reference_database.raxml_info.path_maker())

        _run(sequences_fp,
             str(threads),
             work_dir,
             str(alignment_subset_size),
             str(placement_subset_size),
             ref_alignment_fp,
             ref_phylogeny_fp,
             ref_info_fp,
             debug)

        tree_fp = os.path.join(work_dir, tree_name)
        placements_fp = os.path.join(work_dir, placements_name)

        _add_missing_branch_length(tree_fp)

        # Copy the outputs out before the temporary directory is removed.
        shutil.copyfile(tree_fp, str(tree_result))
        shutil.copyfile(placements_fp, str(placements_result))

    return tree_result, placements_result
def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):
    """Split a table into features present in and absent from the tree.

    Parameters
    ----------
    table
        Table whose observation IDs are compared with the tree's tip names.
    tree
        Tree to read with ``skbio.TreeNode.read``.

    Returns
    -------
    (biom.Table, biom.Table)
        First the table restricted to observations that are tips of the
        tree, then the table restricted to the remaining observations.

    Raises
    ------
    ValueError
        If no observation of the table is a tip of the tree.
    """
    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # collect all named tips (inserted fragments + reference taxa)
    fragments_tree = {
        str(tip.name) for tip in tree.tips() if tip.name is not None}

    # collect all fragments/features from table
    fragments_table = set(map(str, table.ids(axis='observation')))

    shared_fragments = fragments_table & fragments_tree
    if not shared_fragments:
        raise ValueError(('Not a single fragment of your table is part of your'
                          ' tree. The resulting table would be empty.'))

    tbl_positive = table.filter(shared_fragments,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # The per-sample kept/removed read summary formerly built here was
    # never printed or returned, so it has been dropped.
    return (tbl_positive, tbl_negative)
def test_failed_run_not_verbose(self):
    """An invalid FastTree flag must raise ``CalledProcessError``."""
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')
    tree = NewickFormat()

    cmd = ['FastTree', '-nt', '-not-a-real-parameter', str(alignment)]
    output_fp = str(tree)

    # stderr is redirected to os.devnull while the command runs.
    with self.assertRaises(subprocess.CalledProcessError), \
            redirected_stdio(stderr=os.devnull):
        run_command(cmd, output_fp, verbose=False)
def setUp(self):
    """Load the table and tree fixtures used by the tests."""
    super().setUp()

    def biom_fixture(filename):
        return BIOMV210Format(self.get_data_path(filename), mode='r')

    def newick_fixture(filename):
        return NewickFormat(self.get_data_path(filename), mode='r')

    self.valid_table_as_BIOMV210Format = biom_fixture(
        'two_feature_table.biom')

    # empty table fp generated from self.empty_table with biom v2.1.7
    self.empty_table = biom.Table(np.array([]), [], [])
    self.empty_table_as_BIOMV210Format = biom_fixture('empty_table.biom')

    self.empty_tree_as_NewickFormat = newick_fixture('empty.tree')
    self.root_only_tree_as_NewickFormat = newick_fixture('root_only.tree')
    self.missing_tip_tree_as_NewickFormat = newick_fixture(
        'missing_tip.tree')
    self.two_feature_tree_as_NewickFormat = newick_fixture(
        'two_feature.tree')
    self.extra_tip_tree_as_NewickFormat = newick_fixture('extra_tip.tree')
    self.valid_tree_as_NewickFormat = newick_fixture('three_feature.tree')
def tip_to_tip_distances(output_dir: str, tree_1: NewickFormat,
                         tree_2: NewickFormat,
                         method: str = _ghost_tree_defaults['method']):
    """Compare two trees and write the statistics to ``index.html``.

    Parameters
    ----------
    output_dir
        Directory in which ``index.html`` is written.
    tree_1, tree_2
        Trees whose open file handles are passed to
        ``compare_tip_to_tip_distances``.
    method
        Forwarded unchanged to ``compare_tip_to_tip_distances``.
    """
    # Open both trees in a ``with`` so the handles are closed once the
    # comparison is done.
    with tree_1.open() as tree1_fh, tree_2.open() as tree2_fh:
        stats_results = compare_tip_to_tip_distances(
            tree1_fh, tree2_fh, method)

    data_dict = {
        'Correlation Coefficient': str(round(stats_results[0], 5)),
        'p-value': str(stats_results[1]),
        'Number of Overlapping Tips': str(stats_results[2]),
    }
    df = pd.Series(data=data_dict).to_frame()
    df.columns = ['Tree Comparison Statistics']

    index = os.path.join(output_dir, 'index.html')
    with open(index, 'w') as fh:
        fh.write(df.to_html())
def iqtree(
    alignment: AlignedDNAFASTAFormat,
    seed: int = _iqtree_defaults['seed'],
    n_cores: int = _iqtree_defaults['n_cores'],
    n_cores_max: int = _iqtree_defaults['n_cores_max'],
    n_runs: int = _iqtree_defaults['n_runs'],
    substitution_model: str = _iqtree_defaults['substitution_model'],
    n_init_pars_trees: int = _iqtree_defaults['n_init_pars_trees'],
    n_top_init_trees: int = _iqtree_defaults['n_top_init_trees'],
    n_best_retain_trees: int = _iqtree_defaults['n_best_retain_trees'],
    n_iter: int = _iqtree_defaults['n_iter'],
    stop_iter: int = _iqtree_defaults['stop_iter'],
    perturb_nni_strength: float = _iqtree_defaults['perturb_nni_strength'],
    spr_radius: int = _iqtree_defaults['spr_radius'],
    allnni: bool = _iqtree_defaults['allnni'],
    fast: bool = _iqtree_defaults['fast'],
    alrt: int = _iqtree_defaults['alrt'],
    abayes: bool = _iqtree_defaults['abayes'],
    lbp: int = _iqtree_defaults['lbp'],
    safe: bool = _iqtree_defaults['safe'],
) -> NewickFormat:
    """Run IQ-TREE on the alignment and return the resulting tree.

    All options are forwarded to ``_build_iqtree_command``. IQ-TREE runs
    with an output prefix inside a temporary directory, and the
    ``<prefix>.treefile`` it produces is moved into a new
    ``NewickFormat``.
    """
    result = NewickFormat()

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtree')
        cmd = _build_iqtree_command(
            alignment,
            seed=seed,
            n_cores=n_cores,
            n_cores_max=n_cores_max,
            n_runs=n_runs,
            substitution_model=substitution_model,
            run_prefix=run_prefix,
            n_init_pars_trees=n_init_pars_trees,
            n_top_init_trees=n_top_init_trees,
            n_best_retain_trees=n_best_retain_trees,
            n_iter=n_iter,
            stop_iter=stop_iter,
            perturb_nni_strength=perturb_nni_strength,
            spr_radius=spr_radius,
            allnni=allnni,
            fast=fast,
            alrt=alrt,
            abayes=abayes,
            lbp=lbp,
            safe=safe)
        run_command(cmd)

        # run_prefix already includes temp_dir. Joining it with temp_dir
        # again only worked because an absolute second component makes
        # os.path.join drop the first one.
        tree_tmp_fp = '%s.treefile' % run_prefix
        os.rename(tree_tmp_fp, str(result))

    return result
def setUp(self):
    """Create decorated helper functions and load table/tree fixtures."""
    super().setUp()

    # Functions wrapped by the decorator under test, one per parameter
    # combination.
    @_validate_requested_cpus
    def function_no_params():
        pass

    @_validate_requested_cpus
    def function_w_param(n_jobs=3):
        return n_jobs

    @_validate_requested_cpus
    def function_w_threads(threads=2):
        return threads

    @_validate_requested_cpus
    def function_w_duplicate_params(n_jobs=3, threads=2):
        pass

    self.function_no_params = function_no_params
    self.function_w_n_jobs_param = function_w_param
    self.function_w_threads_param = function_w_threads
    self.function_w_both = function_w_duplicate_params

    plugin_actions = self.plugin.actions
    self.jaccard_thru_framework = plugin_actions['jaccard']
    self.unweighted_unifrac_thru_framework = plugin_actions[
        'unweighted_unifrac']

    # Tables.
    small_table_fp = self.get_data_path('two_feature_table.biom')
    self.two_feature_table = biom.load_table(small_table_fp)
    self.two_feature_table_as_BIOMV210Format = BIOMV210Format(
        small_table_fp, mode='r')
    self.two_feature_table_as_artifact = Artifact.import_data(
        'FeatureTable[Frequency]', small_table_fp)

    self.larger_table_as_artifact = Artifact.import_data(
        'FeatureTable[Frequency]', self.get_data_path('crawford.biom'))

    # Trees.
    small_tree_fp = self.get_data_path('three_feature.tree')
    self.valid_tree_as_NewickFormat = NewickFormat(small_tree_fp, mode='r')
    self.valid_tree_as_artifact = Artifact.import_data(
        'Phylogeny[Rooted]', small_tree_fp)

    self.larger_tree_as_artifact = Artifact.import_data(
        'Phylogeny[Rooted]', self.get_data_path('crawford.nwk'))
def fasttree(alignment: AlignedDNAFASTAFormat,
             n_threads: int = 1) -> NewickFormat:
    """Run FastTree on the alignment and return the resulting tree.

    With ``n_threads == 1`` the ``FastTree`` executable is used with the
    inherited environment. Otherwise ``FastTreeMP`` is used with a copy of
    the environment in which ``OMP_NUM_THREADS`` is set to ``n_threads``.
    """
    tree = NewickFormat()
    alignment_fp = str(alignment)
    output_fp = str(tree)

    executable = 'FastTree'
    env = None
    if n_threads != 1:
        executable = 'FastTreeMP'
        env = dict(os.environ, OMP_NUM_THREADS=str(n_threads))

    run_command([executable, '-quote', '-nt', alignment_fp],
                output_fp, env=env)
    return tree
def sepp(
        representative_sequences: DNASequencesDirectoryFormat,
        threads: int = 1,
        alignment_subset_size: int = 1000,
        placement_subset_size: int = 5000,
        reference_alignment: AlignedDNASequencesDirectoryFormat = None,
        reference_phylogeny: NewickFormat = None,
        debug: bool = False,
) -> (NewickFormat, PlacementsFormat):
    """Run SEPP via ``_run`` and return the tree and the placements.

    Calls ``_sanity()`` first and rejects references for which
    ``_reference_matches`` is falsy. ``_run`` writes into a temporary
    directory. The tree output is passed through
    ``_add_missing_branch_length``, then both outputs are copied into
    newly created result formats.

    Raises
    ------
    ValueError
        If the reference alignment and phylogeny do not match.
    """
    _sanity()

    # check if sequences and tips in reference match
    references_agree = _reference_matches(reference_alignment,
                                          reference_phylogeny)
    if not references_agree:
        raise ValueError(
            ('Reference alignment and phylogeny do not match up. Please ensure'
             ' that all sequences in the alignment correspond to exactly one '
             'tip name in the phylogeny.'))

    placements_name = 'q2-fragment-insertion_placement.json'
    tree_name = 'q2-fragment-insertion_placement.tog.relabelled.tre'

    placements_result = PlacementsFormat()
    tree_result = NewickFormat()

    with tempfile.TemporaryDirectory() as work_dir:
        sequences_fp = str(
            representative_sequences.file.view(DNAFASTAFormat))
        _run(sequences_fp,
             str(threads),
             work_dir,
             str(alignment_subset_size),
             str(placement_subset_size),
             reference_alignment,
             reference_phylogeny,
             debug)

        tree_fp = os.path.join(work_dir, tree_name)
        placements_fp = os.path.join(work_dir, placements_name)

        _add_missing_branch_length(tree_fp)

        # Copy the outputs out before the temporary directory is removed.
        shutil.copyfile(tree_fp, str(tree_result))
        shutil.copyfile(placements_fp, str(placements_result))

    return tree_result, placements_result
def setUp(self):
    """Build table fixtures, table lists and decorated helper functions."""
    super().setUp()

    def fixture(format_cls, filename):
        # Open a bundled data file read-only as the given format.
        return format_cls(self.get_data_path(filename), mode='r')

    self.empty_table = biom.Table(np.array([]), [], [])
    # empty table generated from self.empty_table with biom v2.1.7
    self.empty_table_as_BIOMV210Format = fixture(BIOMV210Format,
                                                 'empty_table.biom')
    self.valid_table_as_BIOMV210Format = fixture(BIOMV210Format,
                                                 'crawford.biom')
    # A Newick tree serves as the "not a table" input.
    self.invalid_view_type = fixture(NewickFormat, 'crawford.nwk')

    valid = self.valid_table_as_BIOMV210Format
    self.valid_table_list = [valid, valid]
    self.invalid_table_list = [valid, self.invalid_view_type]
    self.has_empty_table_list = [self.empty_table_as_BIOMV210Format,
                                 valid]

    @_disallow_empty_tables
    def f1(table: biom.Table):
        pass

    @_disallow_empty_tables
    def f2():
        pass

    self.function_with_table_param = f1
    self.function_without_table_param = f2
def test_newick_format_validate_positive(self):
    """``validate()`` must not raise for the ``tree.nwk`` fixture."""
    tree_fp = self.get_data_path('tree.nwk')
    NewickFormat(tree_fp, mode='r').validate()
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame) -> pd.DataFrame:
    """Assign each inserted fragment the lineage of its closest OTU tips.

    For every fragment found in the tree, the tree is climbed from the
    fragment's node towards the root until a sub-tree contains at least
    one node whose name is in ``reference_taxonomy``. The fragment is
    given the rank-wise longest common prefix of the lineages of all such
    nodes. Fragments that are not in the tree are skipped.

    Parameters
    ----------
    representative_sequences
        Sequences whose ``metadata['id']`` is looked up in the tree.
    tree
        Tree to read with ``skbio.TreeNode.read``.
    reference_taxonomy
        Frame indexed by OTU ID with a ``'Taxon'`` column of
        ``;``-separated lineage strings. It is not modified.

    Returns
    -------
    pd.DataFrame
        Indexed by ``'Feature ID'`` with a single ``'Taxon'`` column.

    Raises
    ------
    ValueError
        If some tip that is not a fragment has no entry in
        ``reference_taxonomy``, or if no fragment could be assigned a
        lineage.
    """
    # Convert the feature IDs to str (depending on pandas type inference
    # they might come as integers) so they have the same type as in the
    # tree. Work on a copy so the caller's frame is left untouched.
    reference_taxonomy = reference_taxonomy.copy()
    reference_taxonomy.index = [str(idx) for idx in reference_taxonomy.index]

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # ensure that all reference tips in the tree (those without the inserted
    # fragments) have a mapping in the user provided taxonomy table
    names_tips = {node.name for node in tree.tips()}
    names_fragments = {fragment.metadata['id']
                       for fragment
                       in representative_sequences.file.view(DNAIterator)}
    missing_features = (names_tips - names_fragments) -\
        set(reference_taxonomy.index)
    if len(missing_features) > 0:
        # Sorted so the message is the same on every run.
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy. "
                         "Taxonomy missing for the following %i feature(s):"
                         "\n%s" % (len(missing_features),
                                   "\n".join(sorted(missing_features))))

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        # for every inserted fragment we now try to find the closest OTU tip
        # in the tree and available mapping from the OTU-ID to a lineage
        # string:
        lineage_str = np.nan

        # first, let us check if the fragment has been inserted at all ...
        try:
            curr_node = tree.find(fragment.metadata['id'])
        except skbio.tree.MissingNodeError:
            continue

        # if yes, we start from the inserted node and climb towards the
        # root as little as possible, checking at every level whether one
        # or several OTU-tips are within the sub-tree.
        if curr_node is not None:
            foundOTUs = []
            # Traversal is stopped at a certain level, if one or more OTU-tips
            # have been found in the sub-tree OR ... (see break below)
            while len(foundOTUs) == 0:
                # SEPP insertion - especially for multiple very similar
                # sequences - can result in a rather complex topology change
                # if all those sequences are inserted into the same branch
                # leading to one OTU-tip. Thus, we cannot simply visit only
                # all siblings or descendants and rather need to traverse
                # the whole sub-tree.
                for node in curr_node.postorder():
                    if (node.name is not None) and \
                       (node.name in reference_taxonomy.index):
                        # a suitable OTU-tip node with a mapping in the
                        # reference taxonomy: remember its OTU-ID
                        foundOTUs.append(node.name)

                # ... if the whole tree has been traversed without success,
                # e.g. if user provided reference_taxonomy did not contain any
                # matching OTU-IDs.
                if curr_node.is_root():
                    break

                # prepare next while iteration, by changing to the parent node
                curr_node = curr_node.parent

            if len(foundOTUs) > 0:
                # With exactly one OTU-tip the lineage is simply the one
                # from reference_taxonomy. If the fragment cannot be placed
                # unambiguously, several OTU-IDs are found whose lineages
                # may agree only up to a certain rank. We therefore take
                # the longest common prefix of all lineages, operating per
                # taxonomic rank rather than per character.
                split_lineages = []
                for otu in foundOTUs:
                    # find lineage string for OTU
                    lineage = reference_taxonomy.loc[otu, 'Taxon']
                    # split into ranks so the common prefix is computed on
                    # whole ranks instead of characters
                    split_lineages.append(list(
                        map(str.strip, lineage.split(';'))))
                # os.path.commonprefix compares element-wise, so on lists
                # of ranks it yields the shared leading ranks
                lineage_str = "; ".join(os.path.commonprefix(split_lineages))
            taxonomy.append({'Feature ID': fragment.metadata['id'],
                             'Taxon': lineage_str})

    pd_taxonomy = pd.DataFrame(taxonomy)
    # test if dataframe is completely empty, or if no lineages could be found
    if (len(taxonomy) == 0) or \
       (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
def test_newick_format_validate_negative(self):
    """``validate()`` must raise ``ValueError`` for ``not-tree.nwk``."""
    newick = NewickFormat(self.get_data_path('not-tree.nwk'), mode='r')

    with self.assertRaisesRegex(ValueError, 'NewickFormat'):
        newick.validate()