Пример #1
0
    def setUp(self):
        """Open the table and tree fixtures read-only and build `expected`."""
        super().setUp()

        # Feature tables, each wrapped read-only as a BIOMV210Format.
        self.empty_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('empty_table.biom'), mode='r')
        self.input_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('faith_test_table.biom'), mode='r')
        self.rf_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('faith_test_table_rf.biom'), mode='r')
        self.pa_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('faith_test_table_pa.biom'), mode='r')

        # Trees, each wrapped read-only as a NewickFormat.
        self.empty_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('empty.tree'), mode='r')
        self.input_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('faith_test.tree'), mode='r')
        self.root_only_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('root_only.tree'), mode='r')
        self.missing_tip_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('missing_tip.tree'), mode='r')

        # Per-sample values the tests compare against.
        per_sample = {'S1': 0.5, 'S2': 0.7, 'S3': 1.0, 'S4': 100.5, 'S5': 101}
        self.expected = pd.Series(per_sample, name='faith_pd')
Пример #2
0
def scaffold_hybrid_tree_foundation_tree(
    otu_map: OtuMapFormat,
    extension_taxonomy: TSVTaxonomyFormat,
    extension_sequences: DNAFASTAFormat,
    foundation_tree: NewickFormat,
    foundation_taxonomy: TSVTaxonomyFormat,
    graft_level: str = _ghost_tree_defaults['graft_level'],
) -> NewickFormat:
    """Run ``extensions_onto_foundation`` and return its tree as Newick.

    Parameters
    ----------
    otu_map, extension_taxonomy, extension_sequences, foundation_tree
        Input formats; each is opened and its handle is passed to
        ``extensions_onto_foundation``.
    foundation_taxonomy
        Optional input; when falsy, ``None`` is passed in place of a handle.
    graft_level
        Forwarded unchanged to ``extensions_onto_foundation``.

    Returns
    -------
    NewickFormat
        A newly created format holding the first element of the value
        returned by ``extensions_onto_foundation``.
    """
    # Local import: the file's import block is not visible from here.
    from contextlib import ExitStack

    result = NewickFormat()

    # ExitStack closes every input handle, even if grafting raises.
    with ExitStack() as stack, tempfile.TemporaryDirectory() as tmp:
        otu_map_fh = stack.enter_context(otu_map.open())
        extension_taxonomy_fh = stack.enter_context(extension_taxonomy.open())
        extension_sequences_fh = stack.enter_context(
            extension_sequences.open())
        foundation_tree_fh = stack.enter_context(foundation_tree.open())
        if foundation_taxonomy:
            foundation_taxonomy_fh = stack.enter_context(
                foundation_taxonomy.open())
        else:
            foundation_taxonomy_fh = None

        # need ghost_tree.nwk here otherwise file exists
        gt_path = os.path.join(tmp, 'ghost_tree')
        thetree = extensions_onto_foundation(otu_map_fh, extension_taxonomy_fh,
                                             extension_sequences_fh,
                                             foundation_tree_fh, gt_path,
                                             graft_level,
                                             foundation_taxonomy_fh)[0]

    # Write into the result format's own file. The previous code wrote to
    # `tmp + 'ghost_tree'`, a sibling of the temporary directory that was
    # never cleaned up.
    with open(str(result), 'w') as tree_fh:
        tree_fh.write(thetree)

    return result
Пример #3
0
    def setUp(self):
        """Build the expected matrix plus table/tree formats and artifacts."""
        super().setUp()

        # expected computed with skbio.diversity.beta_diversity
        expected_rows = [[0.00, 0.25, 0.25],
                         [0.25, 0.00, 0.00],
                         [0.25, 0.00, 0.00]]
        self.expected = skbio.DistanceMatrix(expected_rows,
                                             ids=['S1', 'S2', 'S3'])

        # Tables: raw, "rf" and "p_a" variants, all opened read-only.
        self.table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('two_feature_table.biom'), mode='r')
        self.rf_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('two_feature_rf_table.biom'), mode='r')
        self.p_a_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('two_feature_p_a_table.biom'), mode='r')
        self.table_as_artifact = Artifact.import_data(
            'FeatureTable[Frequency]', self.table_as_BIOMV210Format)

        # Tree, both as a format object and as an imported artifact.
        self.tree_as_NewickFormat = NewickFormat(
            self.get_data_path('three_feature.tree'), mode='r')
        self.tree_as_artifact = Artifact.import_data(
            'Phylogeny[Rooted]', self.tree_as_NewickFormat)

        # Action looked up from the plugin's registry.
        actions = self.plugin.actions
        self.unweighted_unifrac_thru_framework = actions['unweighted_unifrac']
Пример #4
0
def classify_paths(representative_sequences: DNASequencesDirectoryFormat,
                   tree: NewickFormat) -> pd.DataFrame:
    """Label each sequence with the named ancestors of its tip in the tree.

    For every sequence, the tip with the same ID is looked up and the names
    of its ancestors that contain ``'__'`` are joined, root-most first, with
    ``'; '``. Sequences whose ID is not in the tree get ``NaN``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``'Feature ID'`` with a single ``'Taxon'`` column.

    Raises
    ------
    ValueError
        If no sequence could be assigned a lineage, including the case
        where there are no sequences at all.
    """
    # Traverse trees from bottom-up for nodes that are inserted fragments and
    # collect taxonomic labels upon traversal.
    insertion_tree = skbio.TreeNode.read(str(tree))
    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        feature_id = fragment.metadata['id']
        try:
            # Only the lookup can raise MissingNodeError; keep the try narrow.
            tip = insertion_tree.find(feature_id)
        except skbio.tree.MissingNodeError:
            lineage_str = np.nan
        else:
            lineage = [ancestor.name
                       for ancestor in tip.ancestors()
                       if (ancestor.name is not None)
                       and ('__' in ancestor.name)]
            lineage_str = '; '.join(reversed(lineage))
        taxonomy.append({
            'Feature ID': feature_id,
            'Taxon': lineage_str
        })

    # Explicit columns keep an empty input from failing with a KeyError in
    # set_index; it then reaches the ValueError below instead.
    pd_taxonomy = pd.DataFrame(
        taxonomy, columns=['Feature ID', 'Taxon']).set_index('Feature ID')
    if pd_taxonomy['Taxon'].dropna().shape[0] == 0:
        raise ValueError(
            ('None of the representative-sequences can be found in the '
             'insertion tree. Please double check that both inputs match up, '
             'i.e. are results from the same \'sepp\' run.'))
    return pd_taxonomy
Пример #5
0
def fasttree(alignment: AlignedDNAFASTAFormat) -> NewickFormat:
    """Run ``FastTree -nt`` on the alignment and return the resulting tree.

    The output path of a fresh ``NewickFormat`` is handed to ``run_command``
    as its second argument.
    """
    tree = NewickFormat()
    fasttree_cmd = ['FastTree', '-nt', str(alignment)]
    run_command(fasttree_cmd, str(tree))
    return tree
Пример #6
0
    def setUp(self):
        """Build the expected distance matrix and open the crawford data."""
        super().setUp()

        # expected computed with diversity.beta_phylogenetic (weighted_unifrac)
        condensed_distances = np.array([
            0.44656238, 0.23771096, 0.30489123, 0.23446002,
            0.65723575, 0.44911772, 0.381904, 0.69144829,
            0.39611776, 0.36568012, 0.53377975, 0.48908025,
            0.35155196, 0.28318669, 0.57376916, 0.23395746,
            0.24658122, 0.60271637, 0.39802552, 0.36567394,
            0.68062701, 0.36862049, 0.48350632, 0.33024631,
            0.33266697, 0.53464744, 0.74605075, 0.53951035,
            0.49680733, 0.79178838, 0.37109012, 0.52629343,
            0.22118218, 0.32400805, 0.43189708, 0.59705893])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        self.expected = skbio.DistanceMatrix(condensed_distances,
                                             ids=sample_ids)

        # Tables: raw and "rf" variant, opened read-only.
        self.table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('crawford.biom'), mode='r')
        self.rf_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('crawford_rf.biom'), mode='r')

        # Tree, opened read-only.
        self.tree_as_NewickFormat = NewickFormat(
            self.get_data_path('crawford.nwk'), mode='r')
Пример #7
0
def raxml_rapid_bootstrap(alignment: AlignedDNAFASTAFormat,
                          seed: int = None, rapid_bootstrap_seed: int = None,
                          bootstrap_replicates: int = 100, n_threads: int = 1,
                          raxml_version: str = 'Standard',
                          substitution_model: str = 'GTRGAMMA'
                          ) -> NewickFormat:
    """Run the rapid-bootstrap command and return its bipartitions tree.

    Either seed left as ``None`` is replaced by ``randint(1000, 10000)``.
    The command runs with a temporary working directory, and the file
    ``RAxML_bipartitions.q2bootstrap`` found there is moved into the
    returned ``NewickFormat``.
    """
    tree = NewickFormat()
    cmd = _set_raxml_version(raxml_version=raxml_version, n_threads=n_threads)

    # Draw the seeds in a fixed order: general seed first, bootstrap second.
    seed = randint(1000, 10000) if seed is None else seed
    if rapid_bootstrap_seed is None:
        rapid_bootstrap_seed = randint(1000, 10000)

    runname = 'q2bootstrap'
    with tempfile.TemporaryDirectory() as workdir:
        bootstrap_args = _build_rapid_bootstrap_command(
            alignment, seed, rapid_bootstrap_seed, bootstrap_replicates,
            substitution_model, workdir, runname)
        cmd += bootstrap_args
        run_command(cmd)

        # Move the output out before the working directory is removed.
        produced_fp = os.path.join(workdir, 'RAxML_bipartitions.%s' % runname)
        os.rename(produced_fp, str(tree))

    return tree
Пример #8
0
 def setUp(self):
     """Load the table, phylogeny and metadata fixtures used by the tests."""
     super().setUp()
     with open(self.get_data_path('asv_table.tsv')) as tsv_handle:
         self.table = biom.Table.from_tsv(tsv_handle, None, None, None)
     tree_fp = self.get_data_path('tree.nwk')
     self.phylogeny = NewickFormat(tree_fp, mode='r')
     metadata_fp = self.get_data_path('metadata.tsv')
     self.metadata = qiime2.Metadata.load(metadata_fp)
Пример #9
0
def raxml(alignment: AlignedDNAFASTAFormat,
          seed: int = None,
          n_searches: int = 1,
          n_threads: int = 1,
          raxml_version: str = 'Standard',
          substitution_model: str = 'GTRGAMMA') -> NewickFormat:
    """Run the RAxML command and return the best tree it writes.

    A ``None`` seed is replaced by ``randint(1000, 10000)``. The command
    runs with a temporary working directory, and the file
    ``RAxML_bestTree.q2`` found there is moved into the returned
    ``NewickFormat``.
    """
    tree = NewickFormat()

    cmd = _set_raxml_version(raxml_version=raxml_version, n_threads=n_threads)

    if seed is None:
        seed = randint(1000, 10000)

    runname = 'q2'
    with tempfile.TemporaryDirectory() as workdir:
        search_args = ['-m', str(substitution_model),
                       '-p', str(seed),
                       '-N', str(n_searches),
                       '-s', str(alignment),
                       '-w', workdir,
                       '-n', runname]
        cmd += search_args
        run_command(cmd)

        # Move the output out before the working directory is removed.
        produced_fp = os.path.join(workdir, 'RAxML_bestTree.%s' % runname)
        os.rename(produced_fp, str(tree))

    return tree
Пример #10
0
def iqtree_ultrafast_bootstrap(
        alignment: AlignedDNAFASTAFormat,
        seed: int = _iqtree_defaults['seed'],
        n_cores: int = _iqtree_defaults['n_cores'],
        n_cores_max: int = _iqtree_defaults['n_cores_max'],
        n_runs: int = _iqtree_defaults['n_runs'],
        substitution_model: str = _iqtree_defaults['substitution_model'],
        bootstrap_replicates: int = _iqtree_defaults['bootstrap_replicates'],
        n_init_pars_trees: int = _iqtree_defaults['n_init_pars_trees'],
        n_top_init_trees: int = _iqtree_defaults['n_top_init_trees'],
        n_best_retain_trees: int = _iqtree_defaults['n_best_retain_trees'],
        stop_iter: int = _iqtree_defaults['stop_iter'],
        perturb_nni_strength: float = _iqtree_defaults['perturb_nni_strength'],
        spr_radius: int = _iqtree_defaults['spr_radius'],
        n_max_ufboot_iter: int = _iqtree_defaults['n_max_ufboot_iter'],
        n_ufboot_steps: int = _iqtree_defaults['n_ufboot_steps'],
        min_cor_ufboot: float = _iqtree_defaults['min_cor_ufboot'],
        ep_break_ufboot: float = _iqtree_defaults['ep_break_ufboot'],
        allnni: bool = _iqtree_defaults['allnni'],
        alrt: int = _iqtree_defaults['alrt'],
        abayes: bool = _iqtree_defaults['abayes'],
        lbp: int = _iqtree_defaults['lbp'],
        bnni: bool = _iqtree_defaults['bnni'],
        safe: bool = _iqtree_defaults['safe']) -> NewickFormat:
    """Run the ultrafast-bootstrap command and return the tree it writes.

    All keyword parameters are forwarded unchanged to
    ``_build_iqtree_ufbs_command``. The command is given a run prefix
    inside a temporary directory, and ``<run_prefix>.treefile`` is moved
    into the returned ``NewickFormat``.
    """
    # NOTE: the IQ-TREE commands `-n` (called as `n_iter` in the `iqtree`
    # method) and `-fast` are not compatible with ultrafast_bootstrap `-bb`.
    result = NewickFormat()

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtreeufboot')
        cmd = _build_iqtree_ufbs_command(
            alignment,
            seed=seed,
            n_cores=n_cores,
            n_cores_max=n_cores_max,
            n_runs=n_runs,
            substitution_model=substitution_model,
            bootstrap_replicates=bootstrap_replicates,
            run_prefix=run_prefix,
            n_init_pars_trees=n_init_pars_trees,
            n_top_init_trees=n_top_init_trees,
            n_best_retain_trees=n_best_retain_trees,
            stop_iter=stop_iter,
            perturb_nni_strength=perturb_nni_strength,
            spr_radius=spr_radius,
            n_max_ufboot_iter=n_max_ufboot_iter,
            n_ufboot_steps=n_ufboot_steps,
            min_cor_ufboot=min_cor_ufboot,
            ep_break_ufboot=ep_break_ufboot,
            allnni=allnni,
            alrt=alrt,
            abayes=abayes,
            lbp=lbp,
            bnni=bnni,
            safe=safe)
        run_command(cmd)

        # run_prefix already contains temp_dir. Joining it onto temp_dir
        # again only worked because os.path.join drops earlier components
        # when a later one is absolute.
        tree_tmp_fp = '%s.treefile' % run_prefix
        os.rename(tree_tmp_fp, str(result))

    return result
Пример #11
0
def sepp(representative_sequences: DNASequencesDirectoryFormat,
         reference_database: SeppReferenceDirFmt,
         alignment_subset_size: int = 1000,
         placement_subset_size: int = 5000,
         threads: int = 1,
         debug: bool = False,
         ) -> (NewickFormat, PlacementsFormat):
    """Run ``_run`` in a temporary directory and collect its two outputs.

    After ``_run`` returns, ``_add_missing_branch_length`` is applied to the
    tree file, and the tree and placements files are copied into freshly
    created result formats.

    Returns
    -------
    (NewickFormat, PlacementsFormat)
        The tree and the placements, in that order.
    """
    # File names expected inside the working directory after the run.
    placements_name = 'q2-fragment-insertion_placement.json'
    tree_name = 'q2-fragment-insertion_placement.tog.relabelled.tre'

    placements_result = PlacementsFormat()
    tree_result = NewickFormat()

    with tempfile.TemporaryDirectory() as workdir:
        run_args = (
            str(representative_sequences.file.view(DNAFASTAFormat)),
            str(threads),
            workdir,
            str(alignment_subset_size),
            str(placement_subset_size),
            str(reference_database.alignment.path_maker()),
            str(reference_database.phylogeny.path_maker()),
            str(reference_database.raxml_info.path_maker()),
            debug,
        )
        _run(*run_args)

        produced_tree = os.path.join(workdir, tree_name)
        produced_placements = os.path.join(workdir, placements_name)

        _add_missing_branch_length(produced_tree)

        # Copy both outputs out before the working directory is removed.
        shutil.copyfile(produced_tree, str(tree_result))
        shutil.copyfile(produced_placements, str(placements_result))

    return tree_result, placements_result
Пример #12
0
def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):
    """Split a table into features that are tips of the tree and the rest.

    Returns
    -------
    (biom.Table, biom.Table)
        First the table restricted to observations whose ID is a tip name
        in the tree, then the table of the remaining observations.

    Raises
    ------
    ValueError
        If no observation ID of the table is a tip name in the tree.
    """
    # load the insertion tree
    insertion_tree = skbio.TreeNode.read(str(tree))
    # collect all tips=inserted fragments+reference taxa names
    fragments_tree = {
        str(tip.name)
        for tip in insertion_tree.tips()
        if tip.name is not None}

    # collect all fragments/features from table
    fragments_table = set(map(str, table.ids(axis='observation')))

    # Compute the overlap once; it is both the guard and the keep-list.
    shared_fragments = fragments_table & fragments_tree
    if not shared_fragments:
        raise ValueError(('Not a single fragment of your table is part of your'
                          ' tree. The resulting table would be empty.'))

    tbl_positive = table.filter(shared_fragments,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # print some information for quality control,
    # which user can request via --verbose
    results = pd.DataFrame(
        data={'kept_reads': tbl_positive.sum(axis='sample'),
              'removed_reads': tbl_negative.sum(axis='sample')},
        index=tbl_positive.ids())
    results['removed_ratio'] = results['removed_reads'] / \
        (results['kept_reads'] + results['removed_reads'])
    # The summary used to be built and then silently discarded.
    print(results)

    return (tbl_positive, tbl_negative)
Пример #13
0
    def test_failed_run_not_verbose(self):
        # An unknown FastTree flag must surface as CalledProcessError even
        # when run_command is asked not to be verbose.
        alignment = AlignedDNAFASTAFormat(
            self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')
        tree = NewickFormat()
        alignment_path = str(alignment)
        tree_path = str(tree)

        bad_cmd = ['FastTree', '-nt', '-not-a-real-parameter', alignment_path]
        # stderr is redirected to os.devnull for the duration of the call.
        with self.assertRaises(subprocess.CalledProcessError), \
                redirected_stdio(stderr=os.devnull):
            run_command(bad_cmd, tree_path, verbose=False)
Пример #14
0
    def setUp(self):
        """Open the table and tree fixtures used by the validation tests."""
        super().setUp()

        self.valid_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('two_feature_table.biom'), mode='r')
        # empty table fp generated from self.empty_table with biom v2.1.7
        self.empty_table = biom.Table(np.array([]), [], [])
        self.empty_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('empty_table.biom'), mode='r')

        # Trees, each opened read-only as a NewickFormat.
        self.empty_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('empty.tree'), mode='r')
        self.root_only_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('root_only.tree'), mode='r')
        self.missing_tip_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('missing_tip.tree'), mode='r')
        self.two_feature_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('two_feature.tree'), mode='r')
        self.extra_tip_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('extra_tip.tree'), mode='r')
        self.valid_tree_as_NewickFormat = NewickFormat(
            self.get_data_path('three_feature.tree'), mode='r')
Пример #15
0
def tip_to_tip_distances(output_dir: str, tree_1: NewickFormat,
                         tree_2: NewickFormat,
                         method: str = _ghost_tree_defaults['method']):
    """Compare two trees and write the statistics to ``index.html``.

    Parameters
    ----------
    output_dir
        Directory in which ``index.html`` is written.
    tree_1, tree_2
        Trees whose opened handles are passed to
        ``compare_tip_to_tip_distances``.
    method
        Forwarded unchanged to ``compare_tip_to_tip_distances``.
    """
    # Close both tree handles as soon as the comparison is done; they were
    # previously left open.
    with tree_1.open() as tree1_fh, tree_2.open() as tree2_fh:
        stats_results = compare_tip_to_tip_distances(
            tree1_fh, tree2_fh, method)

    # Only the first statistic is rounded; all three are shown as strings.
    data_dict = {
        'Correlation Coefficient': str(round(stats_results[0], 5)),
        'p-value': str(stats_results[1]),
        'Number of Overlapping Tips': str(stats_results[2]),
    }

    df = pd.Series(data=data_dict).to_frame()
    df.columns = ['Tree Comparison Statistics']

    index = os.path.join(output_dir, 'index.html')
    with open(index, 'w') as fh:
        fh.write(df.to_html())
Пример #16
0
def iqtree(
    alignment: AlignedDNAFASTAFormat,
    seed: int = _iqtree_defaults['seed'],
    n_cores: int = _iqtree_defaults['n_cores'],
    n_cores_max: int = _iqtree_defaults['n_cores_max'],
    n_runs: int = _iqtree_defaults['n_runs'],
    substitution_model: str = _iqtree_defaults['substitution_model'],
    n_init_pars_trees: int = _iqtree_defaults['n_init_pars_trees'],
    n_top_init_trees: int = _iqtree_defaults['n_top_init_trees'],
    n_best_retain_trees: int = _iqtree_defaults['n_best_retain_trees'],
    n_iter: int = _iqtree_defaults['n_iter'],
    stop_iter: int = _iqtree_defaults['stop_iter'],
    perturb_nni_strength: float = _iqtree_defaults['perturb_nni_strength'],
    spr_radius: int = _iqtree_defaults['spr_radius'],
    allnni: bool = _iqtree_defaults['allnni'],
    fast: bool = _iqtree_defaults['fast'],
    alrt: int = _iqtree_defaults['alrt'],
    abayes: bool = _iqtree_defaults['abayes'],
    lbp: int = _iqtree_defaults['lbp'],
    safe: bool = _iqtree_defaults['safe'],
) -> NewickFormat:
    """Run the IQ-TREE command and return the tree it writes.

    All keyword parameters are forwarded unchanged to
    ``_build_iqtree_command``. The command is given a run prefix inside a
    temporary directory, and ``<run_prefix>.treefile`` is moved into the
    returned ``NewickFormat``.
    """
    result = NewickFormat()

    with tempfile.TemporaryDirectory() as temp_dir:
        run_prefix = os.path.join(temp_dir, 'q2iqtree')
        cmd = _build_iqtree_command(alignment,
                                    seed=seed,
                                    n_cores=n_cores,
                                    n_cores_max=n_cores_max,
                                    n_runs=n_runs,
                                    substitution_model=substitution_model,
                                    run_prefix=run_prefix,
                                    n_init_pars_trees=n_init_pars_trees,
                                    n_top_init_trees=n_top_init_trees,
                                    n_best_retain_trees=n_best_retain_trees,
                                    n_iter=n_iter,
                                    stop_iter=stop_iter,
                                    perturb_nni_strength=perturb_nni_strength,
                                    spr_radius=spr_radius,
                                    allnni=allnni,
                                    fast=fast,
                                    alrt=alrt,
                                    abayes=abayes,
                                    lbp=lbp,
                                    safe=safe)
        run_command(cmd)

        # run_prefix already contains temp_dir. Joining it onto temp_dir
        # again only worked because os.path.join drops earlier components
        # when a later one is absolute.
        tree_tmp_fp = '%s.treefile' % run_prefix
        os.rename(tree_tmp_fp, str(result))

    return result
Пример #17
0
    def setUp(self):
        """Create decorated helper functions and table/tree fixtures."""
        super().setUp()

        # Four functions wrapped by _validate_requested_cpus, differing only
        # in which of the `n_jobs` / `threads` parameters they declare.
        @_validate_requested_cpus
        def function_no_params():
            pass

        @_validate_requested_cpus
        def function_w_param(n_jobs=3):
            return n_jobs

        @_validate_requested_cpus
        def function_w_threads(threads=2):
            return threads

        @_validate_requested_cpus
        def function_w_duplicate_params(n_jobs=3, threads=2):
            pass

        self.function_no_params = function_no_params
        self.function_w_n_jobs_param = function_w_param
        self.function_w_threads_param = function_w_threads
        self.function_w_both = function_w_duplicate_params

        # Actions looked up from the plugin's registry.
        actions = self.plugin.actions
        self.jaccard_thru_framework = actions['jaccard']
        self.unweighted_unifrac_thru_framework = actions['unweighted_unifrac']

        # Small table: as biom.Table, as format object and as artifact.
        small_table_fp = self.get_data_path('two_feature_table.biom')
        self.two_feature_table = biom.load_table(small_table_fp)
        self.two_feature_table_as_BIOMV210Format = BIOMV210Format(
            small_table_fp, mode='r')
        self.two_feature_table_as_artifact = Artifact.import_data(
            'FeatureTable[Frequency]', small_table_fp)

        # Larger table: artifact only.
        self.larger_table_as_artifact = Artifact.import_data(
            'FeatureTable[Frequency]', self.get_data_path('crawford.biom'))

        # Small tree: as format object and as artifact.
        small_tree_fp = self.get_data_path('three_feature.tree')
        self.valid_tree_as_NewickFormat = NewickFormat(small_tree_fp, mode='r')
        self.valid_tree_as_artifact = Artifact.import_data(
            'Phylogeny[Rooted]', small_tree_fp)

        # Larger tree: artifact only.
        self.larger_tree_as_artifact = Artifact.import_data(
            'Phylogeny[Rooted]', self.get_data_path('crawford.nwk'))
Пример #18
0
def fasttree(alignment: AlignedDNAFASTAFormat,
             n_threads: int = 1) -> NewickFormat:
    """Run FastTree (or FastTreeMP) on the alignment and return the tree.

    With ``n_threads == 1`` the ``FastTree`` executable is used with no
    custom environment. Otherwise ``FastTreeMP`` is used with a copy of the
    current environment in which ``OMP_NUM_THREADS`` is set to ``n_threads``.
    """
    tree = NewickFormat()
    alignment_path = str(alignment)
    tree_path = str(tree)

    if n_threads == 1:
        executable, env = 'FastTree', None
    else:
        executable = 'FastTreeMP'
        env = os.environ.copy()
        env['OMP_NUM_THREADS'] = str(n_threads)

    cmd = [executable, '-quote', '-nt', alignment_path]
    run_command(cmd, tree_path, env=env)
    return tree
Пример #19
0
def sepp(
    representative_sequences: DNASequencesDirectoryFormat,
    threads: int = 1,
    alignment_subset_size: int = 1000,
    placement_subset_size: int = 5000,
    reference_alignment: AlignedDNASequencesDirectoryFormat = None,
    reference_phylogeny: NewickFormat = None,
    debug: bool = False,
) -> (NewickFormat, PlacementsFormat):
    """Check the reference, run ``_run`` and collect its two outputs.

    After ``_run`` returns, ``_add_missing_branch_length`` is applied to the
    tree file, and the tree and placements files are copied into freshly
    created result formats.

    Returns
    -------
    (NewickFormat, PlacementsFormat)
        The tree and the placements, in that order.

    Raises
    ------
    ValueError
        If ``_reference_matches`` reports that the reference alignment and
        phylogeny do not match.
    """
    _sanity()
    # check if sequences and tips in reference match
    reference_ok = _reference_matches(reference_alignment, reference_phylogeny)
    if not reference_ok:
        raise ValueError(
            ('Reference alignment and phylogeny do not match up. Please ensure'
             ' that all sequences in the alignment correspond to exactly one '
             'tip name in the phylogeny.'))

    # File names expected inside the working directory after the run.
    placements_name = 'q2-fragment-insertion_placement.json'
    tree_name = 'q2-fragment-insertion_placement.tog.relabelled.tre'

    placements_result = PlacementsFormat()
    tree_result = NewickFormat()

    with tempfile.TemporaryDirectory() as workdir:
        run_args = (
            str(representative_sequences.file.view(DNAFASTAFormat)),
            str(threads),
            workdir,
            str(alignment_subset_size),
            str(placement_subset_size),
            reference_alignment,
            reference_phylogeny,
            debug,
        )
        _run(*run_args)

        produced_tree = os.path.join(workdir, tree_name)
        produced_placements = os.path.join(workdir, placements_name)

        _add_missing_branch_length(produced_tree)

        # Copy both outputs out before the working directory is removed.
        shutil.copyfile(produced_tree, str(tree_result))
        shutil.copyfile(produced_placements, str(placements_result))

    return tree_result, placements_result
Пример #20
0
    def setUp(self):
        """Create table fixtures and two _disallow_empty_tables functions."""
        super().setUp()

        self.empty_table = biom.Table(np.array([]), [], [])
        # empty table generated from self.empty_table with biom v2.1.7
        self.empty_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('empty_table.biom'), mode='r')
        self.valid_table_as_BIOMV210Format = BIOMV210Format(
            self.get_data_path('crawford.biom'), mode='r')
        # A Newick file wrapped as a tree format: deliberately not a table.
        self.invalid_view_type = NewickFormat(
            self.get_data_path('crawford.nwk'), mode='r')

        valid = self.valid_table_as_BIOMV210Format
        self.valid_table_list = [valid, valid]
        self.invalid_table_list = [valid, self.invalid_view_type]
        self.has_empty_table_list = [self.empty_table_as_BIOMV210Format, valid]

        # One decorated function with a `table` parameter, one without.
        @_disallow_empty_tables
        def f1(table: biom.Table):
            pass

        @_disallow_empty_tables
        def f2():
            pass

        self.function_with_table_param = f1
        self.function_without_table_param = f2
Пример #21
0
    def test_newick_format_validate_positive(self):
        # Validation of the tree.nwk fixture is expected not to raise.
        newick = NewickFormat(self.get_data_path('tree.nwk'), mode='r')

        newick.validate()
Пример #22
0
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame) -> pd.DataFrame:
    """Assign lineages to inserted fragments from nearby reference tips.

    For each fragment found in the tree, the search starts at its node and
    moves towards the root until a sub-tree contains at least one node whose
    name is in ``reference_taxonomy``. The fragment is assigned the
    rank-wise longest common prefix of the ``'Taxon'`` strings of those
    nodes. Fragments absent from the tree are skipped.

    Returns
    -------
    pd.DataFrame
        Indexed by ``'Feature ID'`` with a single ``'Taxon'`` column.

    Raises
    ------
    ValueError
        If a non-fragment tip of the tree has no entry in
        ``reference_taxonomy``, or if no fragment could be assigned a
        lineage.
    """
    # convert type of feature IDs to str (depending on pandas type inference
    # they might come as integers), to make sure they are of the same type as
    # in the tree. Work on a copy so the caller's DataFrame is not modified.
    reference_taxonomy = reference_taxonomy.copy()
    reference_taxonomy.index = [str(idx) for idx in reference_taxonomy.index]
    reference_ids = set(reference_taxonomy.index)

    # load the insertion tree
    insertion_tree = skbio.TreeNode.read(str(tree))

    # ensure that all reference tips in the tree (those without the inserted
    # fragments) have a mapping in the user provided taxonomy table
    names_tips = {node.name for node in insertion_tree.tips()}
    names_fragments = {fragment.metadata['id']
                       for fragment
                       in representative_sequences.file.view(DNAIterator)}
    missing_features = (names_tips - names_fragments) - reference_ids
    if missing_features:
        # sorted + str: stable message that also tolerates an unnamed tip
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy. "
                         "Taxonomy missing for the following %i feature(s):"
                         "\n%s" % (len(missing_features),
                                   "\n".join(sorted(map(str,
                                                        missing_features)))))

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        # for every inserted fragment we now try to find the closest OTU tip
        # in the tree and available mapping from the OTU-ID to a lineage
        # string:
        feature_id = fragment.metadata['id']
        # first, let us check if the fragment has been inserted at all ...
        try:
            curr_node = insertion_tree.find(feature_id)
        except skbio.tree.MissingNodeError:
            continue

        # if yes, we start from the inserted node and traverse the tree as
        # little as possible towards the root and check at every level if one
        # or several OTU-tips are within the sub-tree.
        foundOTUs = []
        # Traversal is stopped at a certain level, if one or more OTU-tips
        # have been found in the sub-tree OR ... (see break below)
        while len(foundOTUs) == 0:
            # SEPP insertion - especially for multiple very similar
            # sequences - can result in a rather complex topology change
            # if all those sequences are inserted into the same branch
            # leading to one OTU-tip. Thus, we cannot simply visit only
            # all siblings or descendants and rather need to traverse the
            # whole sub-tree.
            for node in curr_node.postorder():
                # keep every node whose name has a mapping in the user
                # provided reference_taxonomy
                if (node.name is not None) and (node.name in reference_ids):
                    foundOTUs.append(node.name)
            # ... if the whole tree has been traversed without success,
            # e.g. if user provided reference_taxonomy did not contain any
            # matching OTU-IDs.
            if curr_node.is_root():
                break
            # prepare next while iteration, by changing to the parent node
            curr_node = curr_node.parent

        lineage_str = np.nan
        if foundOTUs:
            # With exactly one OTU-tip the lineage is simply the one from
            # reference_taxonomy. With several, their lineage strings may
            # agree only up to a certain rank, so we take the longest common
            # prefix. We operate per taxonomic rank, not per character, so
            # every lineage string is first split into a list of taxa.
            split_lineages = [
                [rank.strip()
                 for rank in reference_taxonomy.loc[otu, 'Taxon'].split(';')]
                for otu in foundOTUs]
            # find the longest common prefix rank-wise and concatenate to
            # one lineage string, separated by ;
            lineage_str = "; ".join(os.path.commonprefix(split_lineages))
        taxonomy.append({'Feature ID': feature_id,
                         'Taxon': lineage_str})

    pd_taxonomy = pd.DataFrame(taxonomy)
    # test if dataframe is completely empty, or if no lineages could be found
    if (len(taxonomy) == 0) or \
       (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
Пример #23
0
    def test_newick_format_validate_negative(self):
        # Validation of the not-tree.nwk fixture must raise a ValueError
        # whose message mentions 'NewickFormat'.
        newick = NewickFormat(self.get_data_path('not-tree.nwk'), mode='r')

        with self.assertRaisesRegex(ValueError, 'NewickFormat'):
            newick.validate()