示例#1
0
文件: _table.py 项目: chuckpr/lca
def _lineage(TreeNode):
    lineage = [node.name for node in TreeNode.ancestors()]
    lineage = lineage[::-1] #lowest to highest node order
    if not TreeNode.is_tip():
        lineage.append(TreeNode.name)
    lineage = ";".join(lineage[1:]) #first node -- the root -- has no name
    return lineage
    def test_defaults(self):
        """Check phylofactor output for the categorical/binomial model.

        Runs ``phylofactor`` with ``formula='Categorical~Data'``,
        ``nfactors=3`` and ``family='binomial'`` and compares each returned
        table and the returned tree with the ``expected/categorical_*``
        fixtures.
        """
        fixture_paths = {
            'basis': 'expected/categorical_basis.tsv',
            'data': 'expected/categorical_data.tsv',
            'groups': 'expected/categorical_groups.tsv',
            'factors': 'expected/categorical_factors.tsv',
        }
        expected = {
            key: pd.read_csv(self.get_data_path(path), sep='\t')
            for key, path in fixture_paths.items()
        }
        expected_tree = TreeNode.read(
            self.get_data_path('expected/categorical_tree.nwk'))

        data, basis, out_tree, groups, factors = phylofactor(
            self.table,
            self.phylogeny,
            self.metadata,
            formula='Categorical~Data',
            nfactors=3,
            family='binomial')

        # compare the frames in the same order as before
        observed = (('basis', basis), ('groups', groups),
                    ('factors', factors), ('data', data))
        for key, frame in observed:
            assert_frame_equal(frame, expected[key])
        # a Robinson-Foulds distance of 0 means identical topologies
        self.assertEqual(TreeNode.compare_rfd(expected_tree, out_tree), 0)
    def test_continous(self):
        """Check phylofactor output for the continuous/poisson model.

        Runs ``phylofactor`` with ``formula='Continuous~Data'``,
        ``nfactors=3`` and ``family='poisson'`` and compares each returned
        table and the returned tree with the ``expected/numeric_*`` fixtures.
        """
        def load_table(relative_path):
            # every expected table is a tab-separated file in the data dir
            return pd.read_csv(self.get_data_path(relative_path), sep='\t')

        want_basis = load_table('expected/numeric_basis.tsv')
        want_data = load_table('expected/numeric_data.tsv')
        want_groups = load_table('expected/numeric_groups.tsv')
        want_factors = load_table('expected/numeric_factors.tsv')
        want_tree = TreeNode.read(
            self.get_data_path('expected/numeric_tree.nwk'))

        result = phylofactor(self.table,
                             self.phylogeny,
                             self.metadata,
                             formula='Continuous~Data',
                             nfactors=3,
                             family='poisson')
        data, basis, out_tree, groups, factors = result

        assert_frame_equal(basis, want_basis)
        assert_frame_equal(groups, want_groups)
        assert_frame_equal(factors, want_factors)
        assert_frame_equal(data, want_data)
        # a Robinson-Foulds distance of 0 means identical topologies
        rfd = TreeNode.compare_rfd(want_tree, out_tree)
        self.assertEqual(rfd, 0)
示例#4
0
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Calculate the distance between a predicted tree and the reference tree.

    A tree is built from the linkage matrix derived from ``lang_set_mat``,
    all of its branch lengths are replaced by 1, and the result is compared
    with ``GT_INDO_EUROPEAN_TREE``.

    Parameters
    ----------
    lang_set_mat
        Input handed to ``get_linkage_matrix``.
    dist_metric : str, optional
        ``"rfd"`` uses ``compare_rfd``; any other value uses the sum of
        squared differences between the two trees' tip-to-tip distances.

    Returns
    -------
    tuple
        ``(tree_dist, pred_tree)`` where ``pred_tree`` is the unit-length
        predicted tree.
    """
    pred_linkage = get_linkage_matrix(lang_set_mat)
    pred_tree = TreeNode.from_linkage_matrix(pred_linkage,
                                             INDO_EURO_LANG_NAMES)

    pred_tree_string_io = StringIO()
    pred_tree.write(pred_tree_string_io)
    pred_tree_string = pred_tree_string_io.getvalue()

    # Replace distances with 1.  Besides plain decimals ("0.25") this also
    # matches exponent notation ("1e-05", "1.5e-05"); the previous pattern
    # left "1e-05" untouched and turned "1.5e-05" into "1e-05", so very
    # short branches kept a non-unit length.
    unweighted_tree_string = re.sub(
        r"\d+(?:\.\d+)?[eE][-+]?\d+|\d+\.\d+", "1", pred_tree_string)
    pred_tree = TreeNode.read(StringIO(unweighted_tree_string))

    if dist_metric == "rfd":
        tree_dist = pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE)
    else:
        gt_distances_struct = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
        gt_distances = gt_distances_struct.data
        gt_ids = gt_distances_struct.ids

        # request the predicted distances in the reference's tip order so
        # the two matrices line up element-wise
        pred_distances = pred_tree.tip_tip_distances(
            endpoints=list(gt_ids)).data
        tree_dist = np.sum((gt_distances - pred_distances)**2)

    return tree_dist, pred_tree
示例#5
0
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    A beta diversity metric that takes into account the relative relatedness of community members.
    Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.

    The counts come from ``beta_counts``; the tree is accumulated over all
    classifications with ``generate_skbio_tree``, wrapped under a fake root
    and pruned with ``prune_to_rank`` before calling
    ``skbio.diversity.beta_diversity``.  With ``strict=True`` every
    classification must share the job id of the first one, otherwise
    ``OneCodexException`` is raised.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    merged_tree = None
    for classification in classifications:
        if strict:
            if classification.job.id != classifications[0].job.id:
                raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged_tree = generate_skbio_tree(classification,
                                          existing_tree=merged_tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    rooted = TreeNode(name='fake root')
    rooted.rank = 'no rank'
    rooted.append(merged_tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(rooted, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=rooted, otu_ids=tax_ids)
示例#6
0
def _newick_to_tree_node(fh, convert_underscores=True):
    """Build a tree of ``TreeNode`` objects from a newick token stream.

    Tokens come from ``_tokenize_newick``.  Nodes are kept on a stack
    together with the parenthesis depth at which they were opened; a ``)``
    pops every node at the current depth and attaches them as children of
    the node left on top of the stack.

    Parameters
    ----------
    fh
        Source handed to ``_tokenize_newick``.
    convert_underscores : bool, optional
        Forwarded unchanged to ``_tokenize_newick``.

    Returns
    -------
    TreeNode
        The root node, returned when ``;`` is reached with only the root
        left on the stack.

    Raises
    ------
    NewickFormatError
        If a length is not numeric, parentheses are unbalanced, a node would
        receive children twice, or the input ends without a complete tree.
    """
    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label
        if last_token not in '(,):':
            if not next_is_distance:
                tree_stack[-1][0].name = last_token if last_token else None
            else:
                next_is_distance = False
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)

        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            # A ')' at depth 0 has no matching '('.  Without the depth check
            # an input such as "a,b);" would pop the root as well and fail
            # with IndexError instead of a format error.
            if len(tree_stack) < 2 or current_depth == 0:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.append(node)
            # Nodes were popped last-to-first; restore input order once
            # instead of inserting at the front on every pop.
            children.reverse()
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root
            # anything still on the stack means the tree is incomplete
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
示例#7
0
def main():
    """Read the two tree files named on the command line and print their distance.

    The paths come from ``parse_args()`` (``tree1_file`` and ``tree2_file``)
    and the distance from ``calc_tree_distance``.
    """
    args = parse_args()

    # use context managers so both file handles are closed, even if parsing
    # fails; the original left them open
    with open(args.tree1_file) as tree1_fh:
        tree1 = TreeNode.read(tree1_fh)
    with open(args.tree2_file) as tree2_fh:
        tree2 = TreeNode.read(tree2_fh)

    tree_dist = calc_tree_distance(tree1, tree2)
    print("Tree distance: %d" % tree_dist)
示例#8
0
    def unifrac(self, weighted=True, rank="auto"):
        """Calculate the UniFrac beta diversity metric.

        UniFrac takes into account the relatedness of community members. Weighted UniFrac considers
        abundances, unweighted UniFrac considers presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If ``self._guess_normalized()`` reports normalized data.
        """
        import skbio.diversity

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        # one list of counts per row of the frame, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]
        tax_ids = df.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

        # there's a bug (?) in skbio where it expects the root to only have
        # one child, so we do a little faking here
        from skbio.tree import TreeNode

        rooted = TreeNode(name="fake root")
        rooted.rank = "no rank"
        rooted.append(pruned)

        # then finally run the calculation and return
        metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
        return skbio.diversity.beta_diversity(
            metric, counts, df.index.tolist(), tree=rooted, otu_ids=tax_ids
        )
示例#9
0
 def testSistersOneIncompleteSister(self):
     """find_sisters for tip B should report only the named clade 'g3'."""
     newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
     tree = TreeNode.read(StringIO(newick))
     print(tree.ascii_art())
     annotator = TreeAnnotator()
     found = annotator.find_sisters(tree, tree.find('B'))
     sister_names = sorted(node.name for node in found)
     assert_equals(sorted(['g3']), sister_names)
示例#10
0
 def testNoPairs(self):
     """find_examples for ranks 'f' and 'g' should yield nothing for this tree."""
     newick = "(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;"
     tree = TreeNode.read(StringIO(newick))
     finder = ThresholdFinder()
     found = finder.find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet([], found)
示例#11
0
    def test__generate_html_summary_phylogeny(self):
        """The placed-fragments section appears only when a tree is supplied."""
        def read_text(path):
            # whole file as one string
            with open(path) as handle:
                return handle.read()

        biom_path = join('qtp_biom', 'support_files', 'sepp.biom')
        tree_path = join('qtp_biom', 'support_files', 'sepp.tre')

        # load metadata
        metadata = self.qclient.get('/qiita_db/analysis/%s/metadata/' % 1)

        # load phylogeny
        phylogeny = TreeNode.read(tree_path)

        index_with_tree, _, _ = _generate_html_summary(
            biom_path, metadata, self.out_dir, True, tree=phylogeny)

        # test if two expected tags show up in the html summary page
        html_with_tree = read_text(index_with_tree)
        self.assertTrue('<th>Number placed fragments</th>' in html_with_tree)
        self.assertTrue('<td>434</td>' in html_with_tree)

        # test that phylogeny specific html content does not show up if no
        # tree is given
        index_without_tree, _, _ = _generate_html_summary(
            biom_path, metadata, self.out_dir, True, tree=None)
        html_without_tree = read_text(index_without_tree)
        self.assertTrue(
            '<th>Number placed fragments</th>' not in html_without_tree)
def build_base_silva_taxonomy(tree_file, tax_dict):
    """Return ``{node name: {rank label: taxonomy string}}`` for a tree.

    Every non-root node of the tree read from ``tree_file`` is looked up in
    ``tax_dict`` (values unpack as ``(rank, taxonomy)``).  The node itself
    and each of its non-root ancestors contribute an entry when their rank
    is in ``allowed_ranks``; the key is ``allowed_ranks_dict[rank]`` and the
    value is the taxonomy passed through ``filter_characters``.  When two
    levels map to the same key, the one visited later (further up the tree)
    wins.
    """
    print("Building base SILVA taxonomy...")
    tree = TreeNode.read(tree_file)
    taxonomy_by_node = {}
    for node in tree.postorder():  # tree.tips():
        if node.is_root():
            break

        ranked = {}
        rank, taxonomy = tax_dict[node.name]
        cleaned = filter_characters(taxonomy)
        if rank in allowed_ranks:
            ranked[allowed_ranks_dict[rank]] = cleaned

        # walk upwards, stopping before the root
        for ancestor in node.ancestors():
            if ancestor.is_root():
                break
            anc_rank, anc_taxonomy = tax_dict[ancestor.name]
            anc_cleaned = filter_characters(anc_taxonomy)
            if anc_rank in allowed_ranks:
                ranked[allowed_ranks_dict[anc_rank]] = anc_cleaned

        taxonomy_by_node[node.name.strip()] = ranked

    return taxonomy_by_node
示例#13
0
 def testSimpleTwice(self):
     """The same threshold given twice should yield the same cluster set twice."""
     expected = [[0.25, [['A', 'B'], ['D']]],
                 [0.25, [['A', 'B'], ['D']]]]
     newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
     tree = TreeNode.read(StringIO(newick))
     finder = Tree2Tax()
     observed = finder.named_clusters_for_several_thresholds(
         tree, [0.25, 0.25])
     self.assertSameClusterSets(expected, observed)
示例#14
0
def _build_trees(clade_counts, edge_lengths, support_attr):
    """Construct the trees with support

    Clades are processed from smallest to largest.  Each processed clade
    becomes a ``TreeNode``; any larger clade still in the queue that contains
    it is rewritten so that the smaller clade appears as a single nested
    member, which is how children are later found for the larger clade.

    Note that ``clade_counts`` and ``edge_lengths`` are modified in place:
    keys of rewritten ancestors are replaced by their nested form.

    Parameters
    ----------
    clade_counts : dict
        Keyed by the frozenset of the clade and valued by the support
    edge_lengths : dict
        Keyed by the frozenset of the clade and valued by the weighted length
    support_attr : str
        The name of the attribute to hold the support value

    Returns
    -------
    list of TreeNode
        A list of the constructed trees
    """
    # maps a clade (frozenset) to the node built for it; entries are removed
    # again once the node has been attached to a parent
    nodes = {}
    queue = [(len(clade), clade) for clade in clade_counts]
    while queue:
        # The values within the queue are updated on each iteration, so it
        # doesn't look like an insertion sort will make sense unfortunately
        queue.sort()
        # smallest remaining clade first
        (clade_size, clade) = queue.pop(0)
        new_queue = []

        # search for ancestors of clade
        for (_, ancestor) in queue:
            if clade.issubset(ancestor):
                # update ancestor such that, in the following example:
                # ancestor == {1, 2, 3, 4}
                # clade == {2, 3}
                # new_ancestor == {1, {2, 3}, 4}
                new_ancestor = (ancestor - clade) | frozenset([clade])

                # update references for counts and lengths
                clade_counts[new_ancestor] = clade_counts.pop(ancestor)
                edge_lengths[new_ancestor] = edge_lengths.pop(ancestor)

                ancestor = new_ancestor

            # sizes are recomputed because a rewritten ancestor shrinks
            new_queue.append((len(ancestor), ancestor))

        # if the clade is a tip, then we have a name
        if clade_size == 1:
            name = list(clade)[0]
        else:
            name = None

        # the clade will not be in nodes if it is a tip
        children = [nodes.pop(c) for c in clade if c in nodes]
        length = edge_lengths[clade]

        node = TreeNode(children=children, length=length, name=name)
        # store the support value under the caller-chosen attribute name
        setattr(node, support_attr, clade_counts[clade])
        nodes[clade] = node

        queue = new_queue

    # whatever was never attached to a parent is the root of a tree
    return list(nodes.values())
示例#15
0
 def testClusterNamingWithBootstraps(self):
     """Cluster names should use 'G' from a node labelled '0.7:G'."""
     newick = "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"
     tree = TreeNode.read(StringIO(newick))
     clusters = Tree2Tax().named_clusters(tree, 40)
     self.assertSameClusters([['F'], _('A B D H')], clusters)
     cluster_names = [cluster.name() for cluster in clusters]
     assert_equals(_('G.2 G.1'), cluster_names)
示例#16
0
    def test_ilr_ordination(self):
        """Check ilr_phylogenetic_ordination on a 3-sample, 4-feature table.

        The table columns are shuffled with a fixed seed, the tree contains
        two extra tips ('e', 'f') that are absent from the table, and the
        returned ordination samples, tree string and feature metadata are
        compared with hard-coded expectations.
        """
        np.random.seed(0)
        balance_names = ['y0', 'y1', 'y2']

        counts = pd.DataFrame([[1, 1, 2, 2],
                               [1, 2, 2, 1],
                               [2, 2, 1, 1]],
                              index=[1, 2, 3],
                              columns=['a', 'b', 'c', 'd'])
        shuffled_columns = np.random.permutation(counts.columns)
        counts = counts.reindex(columns=shuffled_columns)
        phylogeny = TreeNode.read([
            '((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])

        res_ord, res_tree, res_md = ilr_phylogenetic_ordination(
            counts, phylogeny, top_k_var=3)

        # ordination: one row per sample, one column per balance
        want_balances = pd.DataFrame(
            [[0.693147, 0.0, 3.892122e-17],
             [0.0, -4.901291e-01, -4.901291e-01],
             [-0.693147, -5.551115e-17, -3.892122e-17]],
            columns=balance_names,
            index=[1, 2, 3])
        want_balances = want_balances[balance_names]
        want_balances.index.name = 'sampleid'
        pdt.assert_frame_equal(res_ord.samples, want_balances)

        # tree: serialized form of the returned tree
        want_tree_str = ('((b:0.025,a:0.025)y1:0.2,'
                         '(c:0.025,d:0.025)y2:0.2)y0;\n')
        self.assertEqual(str(res_tree), want_tree_str)

        # feature metadata: one row per feature, one column per balance
        want_md = pd.DataFrame([[-0.5, -0.707107, 0.000000],
                                [-0.5, 0.707107, 0.000000],
                                [0.5, 0.000000, -0.707107],
                                [0.5, 0.000000, 0.707107]],
                               columns=balance_names,
                               index=['b', 'a', 'c', 'd'])
        want_md.index.name = 'featureid'
        pdt.assert_frame_equal(res_md, want_md)
示例#17
0
def make_modules(dist, min_dist, obs_ids):
    """Group observations into modules using a complete-linkage tree.

    A tree is built from ``complete(dist)`` with tips named by ``obs_ids``
    and walked in level order.  An internal node becomes a module when not
    all of its tips have been seen yet and no pair of its tips has a
    distance greater than ``min_dist``.  Tips reached individually are
    marked as seen without forming a module.

    Returns
    -------
    list of frozenset
        The modules, largest first, returned as soon as every tip has been
        seen.

    Raises
    ------
    ValueError
        If the traversal finishes without every tip having been seen.
    """
    # create linkage matrix using complete linkage
    linkage = complete(dist)
    # make tree from linkage matrix with names from dist
    tree = TreeNode.from_linkage_matrix(linkage, obs_ids)
    # get all tips so in the end we can check if we are done
    n_tips = sum(1 for node in tree.postorder() if node.is_tip())
    modules = set()
    seen = set()
    # square, label-indexed form for pairwise lookups
    pairwise = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            members = frozenset(
                tip.name for tip in node.postorder() if tip.is_tip())
            if members.issubset(seen):
                continue
            exceeds = any(pairwise.loc[first, second] > min_dist
                          for first, second in combinations(members, 2))
            if exceeds:
                continue
            modules.add(members)
            seen.update(members)
        if len(seen) == n_tips:
            return sorted(modules, key=len, reverse=True)
    raise ValueError("Well, how did I get here?")
 def depth_partition(self, input_tree, percentile, output_tree):
     '''
     Attempt to cluster a tree into partitions: an internal node becomes a
     partition when the median of its distance distribution is <= the nth
     percentile of the whole-tree distance distribution.
     A better description can be found in the citation below.

     Parameters
     ----------
     input_tree:
         Source passed to skbio's TreeNode.read; the tree is re-rooted at
         its midpoint before clustering.
         http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html #skbio.tree.TreeNode

     percentile: float
         The percentile cutoff to use to determine the cutoff from clading
         from a given node.

     output_tree:
         Destination the (renamed) tree is written to in newick format.

     Returns
     -------
     dict
         "partition_N" -> list of tip names (spaces replaced by '_'), plus
         self.UNCLUSTERED -> names of the tips in no partition.

     Clustering method modified from Prosperi et al method:
     Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny
     partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).

     http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
     '''
     tree = TreeNode.read(input_tree).root_at_midpoint()
     partitions_found = 0
     clustered = set()
     clusters = {}
     logging.debug("Calculating %ith percentile cutoff from root" \
                                             % (percentile))
     cutoff = np.percentile(self._node_dist(tree), percentile)
     logging.debug("Cutoff (%ith percentile): %f" % (percentile,
                                                     cutoff))
     for node in tree.preorder():
         # nodes already inside a partition, and tips, are never seeds
         if node in clustered or node.is_tip():
             continue
         median = np.median(self._node_dist(node))
         logging.debug("Median of node: %f" % median)
         if median <= cutoff:
             logging.debug("Cluster found!")
             cluster_name = "partition_%i" % (partitions_found + 1)
             clusters[cluster_name] = [tip.name.replace(' ', '_')
                                       for tip in node.tips()]
             self._rename(node, cluster_name)
             partitions_found += 1
             # mark the whole subtree so it is not partitioned again
             clustered.update(node.traverse())
     logging.info("%i depth cluster(s) found in tree" % (partitions_found))
     tree.write(output_tree, "newick")

     logging.debug("Recording tips that were not partitioned")
     clusters[self.UNCLUSTERED] = [tip.name.replace(' ', '_')
                                   for tip in tree.tips()
                                   if tip not in clustered]
     return clusters
    def depth_partition(self, input_tree, percentile, output_tree):
        '''
        Attempt to cluster a tree into partitions: an internal node becomes
        a partition when the median of its distance distribution is <= the
        nth percentile of the whole-tree distance distribution.
        A better description can be found in the citation below.

        Parameters
        ----------
        input_tree:
            Source passed to skbio's TreeNode.read; the tree is re-rooted at
            its midpoint before clustering.
            http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html #skbio.tree.TreeNode

        percentile: float
            The percentile cutoff to use to determine the cutoff from clading
            from a given node.

        output_tree:
            Destination the (renamed) tree is written to in newick format.

        Returns
        -------
        dict
            "partition_N" -> list of tip names (spaces replaced by '_'),
            plus self.UNCLUSTERED -> names of the tips in no partition.

        Clustering method modified from Prosperi et al method:
        Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny
        partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).

        http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
        '''
        rooted = TreeNode.read(input_tree)
        rooted = rooted.root_at_midpoint()
        next_index = 1
        assigned = set()
        clusters = {}
        logging.debug("Calculating %ith percentile cutoff from root" \
                                                % (percentile))
        tree_distribution = self._node_dist(rooted)

        cutoff = np.percentile(tree_distribution, percentile)
        logging.debug("Cutoff (%ith percentile): %f" % (percentile, cutoff))
        for node in rooted.preorder():
            # skip anything already partitioned and every tip
            if node in assigned or node.is_tip():
                continue
            median = np.median(self._node_dist(node))
            logging.debug("Median of node: %f" % median)
            if median <= cutoff:
                logging.debug("Cluster found!")
                cluster_name = "partition_%i" % (next_index)
                member_names = []
                for tip in node.tips():
                    member_names.append(tip.name.replace(' ', '_'))
                clusters[cluster_name] = member_names
                self._rename(node, cluster_name)
                next_index += 1
                # the whole subtree now belongs to this partition
                assigned.update(node.traverse())
        logging.info("%i depth cluster(s) found in tree" % (next_index - 1))
        rooted.write(output_tree, "newick")

        logging.debug("Recording tips that were not partitioned")
        leftovers = [tip for tip in rooted.tips() if tip not in assigned]
        clusters[self.UNCLUSTERED] = [
            tip.name.replace(' ', '_') for tip in leftovers
        ]
        return clusters
示例#20
0
    def unifrac(self, weighted=True, rank="auto"):
        """A beta diversity metric that takes into account the relative relatedness of community
        members. Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If ``self._guess_normalized()`` reports normalized data.
        """
        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        # one list of counts per row of the frame, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]
        tax_ids = df.keys().tolist()

        full_tree = self.tree_build()
        pruned = self.tree_prune_rank(full_tree, rank=df.ocx_rank)

        # there's a bug (?) in skbio where it expects the root to only have
        # one child, so we do a little faking here
        from skbio.tree import TreeNode

        rooted = TreeNode(name="fake root")
        rooted.rank = "no rank"
        rooted.append(pruned)

        # then finally run the calculation and return
        if weighted:
            metric = "weighted_unifrac"
        else:
            metric = "unweighted_unifrac"
        return skbio.diversity.beta_diversity(
            metric, counts, df.index.tolist(), tree=rooted, otu_ids=tax_ids
        )
示例#21
0
 def testFullTaxonomy(self):
     """full_taxonomy for tip D should join its family and genus labels."""
     newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
     tree = TreeNode.read(StringIO(newick))
     annotator = TreeAnnotator()
     taxonomy = annotator.full_taxonomy(tree, tree.find('D'))
     assert_equals('f__family; g__genus2', taxonomy)
示例#22
0
 def testSistersSelfNoParent(self):
     """find_sisters for tip B should return no sisters in this tree."""
     newick = ("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)"
               "'f__family':10)root;")
     tree = TreeNode.read(StringIO(newick))
     annotator = TreeAnnotator()
     found = annotator.find_sisters(tree, tree.find('B'))
     sister_names = [node.name for node in found]
     assert_equals([], sister_names)
示例#23
0
 def testFindParents(self):
     """find_named_parent should return the nearest named ancestor, or None."""
     annotator = TreeAnnotator()

     def read(newick):
         # parse a newick string into a tree
         return TreeNode.read(StringIO(newick))

     tree = read("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
     parent = annotator.find_named_parent(tree, tree.find('B'))
     assert_equals('g__genus1', parent.name, 'self is named')

     tree = read("(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
     parent = annotator.find_named_parent(tree, tree.find('2475'))
     assert_equals('g__genus1', parent.name, 'parent directly above')

     tree = read("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
     parent = annotator.find_named_parent(tree, tree.find('2475'))
     assert_equals('f__family', parent.name, 'parent 2 above')

     # start from the node above 'f__family'; nothing named lies above it
     tree = read("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);")
     start = tree.find('f__family').parent
     assert_equals(None, annotator.find_named_parent(tree, start), 'parent of root')

     # start from the unnamed node above 'g__genus2'
     tree = read("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);")
     start = tree.find('g__genus2').parent
     assert_equals(None, annotator.find_named_parent(tree, start), 'no parent before root')
示例#24
0
 def testClusterNamingConventionsWithSomeUnnamed(self):
     """With a tiny threshold each tip is its own cluster, named G.1 to G.4."""
     newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
     tree = TreeNode.read(StringIO(newick))
     # i.e. everything is a separate cluster
     clusters = Tree2Tax().named_clusters(tree, 0.05)
     self.assertSameClusters([['A'], ['B'], ['D'], ['F']], clusters)
     cluster_names = [cluster.name() for cluster in clusters]
     assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], cluster_names)
示例#25
0
 def testMultiplyNamedNode(self):
     """A node labelled 'g__genus2; s__spec' should still count as a genus."""
     newick = ("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2; s__spec':6)"
               "'f__family':10)root;")
     tree = TreeNode.read(StringIO(newick))
     finder = ThresholdFinder()
     found = finder.find_examples(tree, 'f', 'g')
     expected = [['f__family', 'g__genus1', 'g__genus2; s__spec', 16.0]]
     self.assertSameCladeDistanceSet(expected, found)
示例#26
0
def generate_html_summary(qclient, job_id, parameters, out_dir):
    """Generates the HTML summary of a BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, None, str
        Whether the job is successful
        Ignored
        The error message, if not successful
    """
    # Step 1: gather file information from qiita using REST api
    qclient_url = "/qiita_db/artifacts/%s/" % parameters['input_data']
    artifact_info = qclient.get(qclient_url)
    files = artifact_info['files']

    # Step 2: get the mapping file, depends if analysis or not
    is_analysis = artifact_info['analysis'] is not None
    if is_analysis:
        qurl = '/qiita_db/analysis/%s/metadata/' % artifact_info['analysis']
        md = qclient.get(qurl)
    else:
        qurl = ('/qiita_db/prep_template/%s/' %
                artifact_info['prep_information'][0])
        md = qclient.get(qurl)['qiime-map']

    # a phylogeny is only read when a 'plain_text' file is attached
    if 'plain_text' in files:
        tree = TreeNode.read(files['plain_text'][0])
    else:
        tree = None

    # Step 3: generate HTML summary
    # if we get to this point of the code we are sure that this is a biom file
    # and that it only has one element
    index_fp, viz_fp, qza_fp = _generate_html_summary(
        files['biom'][0], md, out_dir, is_analysis, tree)

    # Step 4: add the new file to the artifact using REST api
    try:
        payload = dumps({'html': index_fp, 'dir': viz_fp})
        qclient.patch(qclient_url, 'add', '/html_summary/', value=payload)
    except Exception as e:
        # report the failure to the caller instead of raising
        return False, None, str(e)

    return True, None, ""
示例#27
0
 def testOppositeSorting(self):
     """Check the cluster sets and names for thresholds 0.05 and 0.25."""
     newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
     tree = TreeNode.read(StringIO(newick))
     finder = Tree2Tax()
     clusters = finder.named_clusters_for_several_thresholds(
         tree, [0.05, 0.25])
     expected_sets = [[0.05, [['A'], ['B'], ['D']]],
                      [0.25, [['A', 'B'], ['D']]]]
     self.assertSameClusterSets(expected_sets, clusters)
     first_names = [cluster.name() for cluster in clusters[0].clusters]
     assert_equals(_('C.1 C.2 Root'), first_names)
示例#28
0
 def testTreeSubtree2(self):
     '''one genus is a subtree of another, and the longest branch is in both subtrees'''
     newick = ("((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)"
               "'f__family':10)root;")
     tree = TreeNode.read(StringIO(newick))
     finder = ThresholdFinder()
     found = finder.find_examples(tree, 'f', 'g')
     expected = [['f__family', 'g__genus1', 'g__genus2', 105.0]]
     self.assertSameCladeDistanceSet(expected, found)
示例#29
0
 def testSistersSisterWithDescendentNames(self):
     """find_sisters for tip B should report 'g2' and 'g3' but not nested 's1'."""
     newick = ("((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, "
               "(E:1,F:5)'g3':6):10)root;")
     tree = TreeNode.read(StringIO(newick))
     print(tree.ascii_art())
     annotator = TreeAnnotator()
     found = annotator.find_sisters(tree, tree.find('B'))
     sister_names = sorted(node.name for node in found)
     assert_equals(sorted(['g2', 'g3']), sister_names)
def write_tree():
    """Cluster the matrix stored in ``distance_matrix`` and save a Newick tree.

    Reads the tab-separated file ``distance_matrix`` (first column used as the
    index) from the current working directory, squares every value, passes
    the result to ``weighted`` (presumably SciPy's WPGMA linkage -- the name
    is imported elsewhere in the file), converts the linkage matrix to a
    ``TreeNode`` and writes its string form, with single quotes removed, to
    ``bsr_matrix.tree``.
    """
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on both old and new pandas and returns the same ndarray.
    triu = np.square(dmx.values)
    # NOTE(review): the full square array is handed to the linkage function
    # unchanged. SciPy interprets 2-D input as observation vectors, not as a
    # distance matrix -- confirm this is intended (a condensed form may have
    # been meant).
    hclust = weighted(triu)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # Context manager guarantees the handle is closed even if write() fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
示例#31
0
 def testNaming(self):
     # Cluster at thresholds 40 and 25 (given in that order) and verify the
     # cluster membership plus the names assigned within each cluster set.
     newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
     parsed = TreeNode.read(StringIO(newick))
     cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
         parsed, [40, 25])

     expected_sets = [
         [25, [['F'], _('A B'), _('D H')]],
         [40, [['F'], _('A B D H')]],
     ]
     self.assertSameClusterSets(expected_sets, cluster_sets)

     expected_first = _('G.3 G.1 G.2')
     assert_equals(expected_first,
                   [each.name() for each in cluster_sets[0].clusters])
     expected_second = _('G.2 G.1')
     assert_equals(expected_second,
                   [each.name() for each in cluster_sets[1].clusters])
示例#32
0
    def testFindParents(self):
        # Exercise find_named_parent on five trees that differ in where the
        # nearest named ancestor of the queried node sits.
        annotator = TreeAnnotator()

        # B sits directly below the named clade g__genus1
        newick = "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
        parsed = TreeNode.read(StringIO(newick))
        named = annotator.find_named_parent(parsed, parsed.find('B'))
        assert_equals('g__genus1', named.name, 'self is named')

        # same layout, but the tip has a purely numeric name
        newick = "(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
        parsed = TreeNode.read(StringIO(newick))
        named = annotator.find_named_parent(parsed, parsed.find('2475'))
        assert_equals('g__genus1', named.name, 'parent directly above')

        # the immediate parent is unnamed, so the family clade is expected
        newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
        parsed = TreeNode.read(StringIO(newick))
        named = annotator.find_named_parent(parsed, parsed.find('2475'))
        assert_equals('f__family', named.name, 'parent 2 above')

        # querying from the parent of f__family yields nothing
        newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);"
        parsed = TreeNode.read(StringIO(newick))
        start = parsed.find('f__family').parent
        assert_equals(None, annotator.find_named_parent(parsed, start),
                      'parent of root')

        # no named clade exists above the parent of g__genus2
        newick = "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);"
        parsed = TreeNode.read(StringIO(newick))
        start = parsed.find('g__genus2').parent
        assert_equals(None, annotator.find_named_parent(parsed, start),
                      'no parent before root')
示例#33
0
 def test_missing_taxonomy(self):
     # Check missing_taxonomy between tip A and three different second
     # nodes: E, A itself, and G.
     parsed = TreeNode.read(
         StringIO('((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'))

     def missing_between(first_name, second_name):
         # a fresh TaxonomyFunctions instance per query, as in each assertion
         return TaxonomyFunctions().missing_taxonomy(
             parsed, parsed.find(first_name), parsed.find(second_name))

     assert_equals(['C'], missing_between('A', 'E'))
     assert_equals([], missing_between('A', 'A'))
     assert_equals(['E', 'C'], missing_between('A', 'G'))
示例#34
0
def get_clusters(x_original, axis='row'):
    """Return row indices in the tip order of a UPGMA tree.

    Euclidean distances between the rows of ``x_original`` are clustered
    with average linkage, and the integer row positions are returned in the
    order the tips appear in the resulting tree.

    Parameters
    ----------
    x_original
        Object exposing ``copy()``, ``T`` and ``shape``; it is copied and
        never modified.
    axis : str
        ``'column'`` clusters columns (the copy is transposed first); any
        other value clusters rows.

    Returns
    -------
    list of int
        Row (or column) positions in tree tip order.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    # Pass a concrete list of ids: on Python 3 map() is a one-shot iterator
    # without len(), so a list is safe however the callee consumes the ids.
    sample_ids = [str(i) for i in range(nr)]
    row_dissims = pw_distances(x, ids=sample_ids, metric='euclidean')
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
示例#35
0
def get_clusters(x_original, axis='row'):
    """Order rows (or columns) by average-linkage clustering.

    Computes Euclidean distances between the rows of a copy of
    ``x_original``, builds a tree from the average-linkage result and
    returns the integer row positions in tip order.

    Parameters
    ----------
    x_original
        Object exposing ``copy()``, ``T`` and ``shape``; left untouched.
    axis : str
        Pass ``'column'`` to transpose the copy and cluster columns; every
        other value clusters rows.

    Returns
    -------
    list of int
        Positions along the chosen axis, in tree tip order.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    # A real list instead of map(): Python 3's map() returns a lazy iterator
    # that has no length and can only be traversed once.
    sample_ids = [str(i) for i in range(nr)]
    row_dissims = pw_distances(x, ids=sample_ids, metric='euclidean')
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
示例#36
0
    def tree_build(self):
        """Assemble a tree from the taxonomy table held by this object.

        Intended for `ClassificationsDataFrame` or `SampleCollection`.

        Returns
        -------
        `skbio.tree.TreeNode`
            The node stored under tax_id "1". Every tax_id in
            ``self.taxonomy.index`` whose parent is also present is attached
            beneath that parent.
        """
        from skbio.tree import TreeNode

        # Pass 1: create one detached node per tax_id and annotate it with
        # the name, rank and parent id read from the taxonomy table.
        nodes = {}
        for tax_id in self.taxonomy.index:
            fresh = TreeNode(name=tax_id, length=1)
            fresh.tax_name = self.taxonomy["name"][tax_id]
            fresh.rank = self.taxonomy["rank"][tax_id]
            fresh.parent_tax_id = self.taxonomy["parent_tax_id"][tax_id]
            nodes[tax_id] = fresh

        # Pass 2: hook each node under its parent. A missing parent is only
        # reported when the orphan is not tax_id "1".
        for tax_id in self.taxonomy.index:
            try:
                parent = nodes[nodes[tax_id].parent_tax_id]
            except KeyError:
                if tax_id != "1":
                    message = "tax_id={} has parent_tax_id={} which is not in tree".format(
                        tax_id, nodes[tax_id].parent_tax_id)
                    warnings.warn(message)
            else:
                parent.append(nodes[tax_id])

        return nodes["1"]
示例#37
0
    def tree_build(self):
        """Construct a tree out of the taxonomy data in this
        `ClassificationsDataFrame` or `SampleCollection`.

        Returns
        -------
        `skbio.tree.TreeNode`
            The node registered for tax_id "1"; all taxa whose parent id is
            present in the taxonomy index hang below their parent.
        """
        from skbio.tree import TreeNode

        # one unattached node per taxon, keyed by tax_id
        nodes = {}
        for tax_id in self.taxonomy.index:
            taxon = TreeNode(name=tax_id, length=1)
            taxon.tax_name = self.taxonomy["name"][tax_id]
            taxon.rank = self.taxonomy["rank"][tax_id]
            taxon.parent_tax_id = self.taxonomy["parent_tax_id"][tax_id]
            nodes[tax_id] = taxon

        # wire children to parents; taxa with an unknown parent stay detached
        for tax_id in self.taxonomy.index:
            child = nodes[tax_id]
            parent = nodes.get(child.parent_tax_id)
            if parent is not None:
                parent.append(child)
                continue

            # tax_id "1" is the one taxon expected to have no parent
            if tax_id != "1":
                warnings.warn(
                    "tax_id={} has parent_tax_id={} which is not in tree".format(
                        tax_id, child.parent_tax_id
                    )
                )

        return nodes["1"]
示例#38
0
def get_clusters(x_original, axis='row'):
    """Return row positions ordered by UPGMA clustering.

    A Euclidean distance matrix is computed over the rows of a copy of
    ``x_original`` using the metric function returned by
    ``get_nonphylogenetic_metric('euclidean')``, clustered with average
    linkage, and the integer row positions are returned in tree tip order.

    Parameters
    ----------
    x_original
        Object exposing ``copy()``, ``T`` and ``shape``; not modified.
    axis : str
        ``'row'`` (default) or ``'column'``. ``'column'`` transposes the
        copy before clustering; any other value clusters rows.

    Returns
    -------
    list of int
        Positions along the chosen axis, in tree tip order.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    # Hand over a concrete list of ids; Python 3's map() is a single-use
    # iterator without a length.
    sample_ids = [str(i) for i in range(nr)]
    row_dissims = DistanceMatrix(metric_f(x), sample_ids)
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
示例#39
0
    def testTipToCluster(self):
        # Cluster at thresholds 40 and 25, check membership and names, then
        # look up which named cluster the tips F and D fall into at each one.
        newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
        parsed = TreeNode.read(StringIO(newick))
        cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
            parsed, [40, 25])
        expected_sets = [
            [25, [['F'], _('A B'), _('D H')]],
            [40, [['F'], _('A B D H')]],
        ]
        self.assertSameClusterSets(expected_sets, cluster_sets)
        first_set = cluster_sets[0]
        second_set = cluster_sets[1]
        assert_equals(_('G.3 G.1 G.2'),
                      [each.name() for each in first_set.clusters])
        assert_equals(_('G.2 G.1'),
                      [each.name() for each in second_set.clusters])

        tip_f = parsed.find('F')
        assert_equals('G.3', first_set.tip_to_cluster(tip_f).name())
        assert_equals('G.2', second_set.tip_to_cluster(tip_f).name())

        tip_d = parsed.find('D')
        assert_equals('G.2', first_set.tip_to_cluster(tip_d).name())
        assert_equals('G.1', second_set.tip_to_cluster(tip_d).name())
示例#40
0
def load_tree_files(tree_dir):
    """Load every tree file found in a directory.

    File names starting with '.' are ignored. If the remaining names can all
    be parsed by ``parse_rarefaction_fname`` and yield more than one distinct
    base name, a warning is issued, since the trees may come from different
    distance methods. Each file is then parsed with
    ``TreeNode.from_newick`` and the file name is stored on the tree as
    ``filepath``.

    Parameters
    ----------
    tree_dir : str
        Directory containing the tree files.

    Returns
    -------
    list
        The loaded trees, in ``os.listdir`` order.

    Raises
    ------
    RuntimeError
        If no trees are loaded.
    SystemExit
        With status 1 if a file cannot be read (``IOError``).
    """
    # ignore invisible files like .DS_Store
    tree_file_names = [fname for fname in os.listdir(tree_dir)
                       if not fname.startswith('.')]

    # try to warn user if using multiple types of trees; names that cannot
    # be parsed simply disable the check
    try:
        base_names = set([parse_rarefaction_fname(fname)[0]
                          for fname in tree_file_names])
    except ValueError:
        pass
    else:
        if len(base_names) > 1:
            warnstr = """
warning: trees are named differently, please be sure you're not
comparing trees generated in different manners, unless you're quite sure
that's what you intend to do.  types: """ + str(base_names) + """
continuing anyway..."""
            warn(warnstr)

    trees = []
    for fname in tree_file_names:
        try:
            # The context manager closes the handle even when parsing fails.
            # NOTE(review): mode 'U' is kept for Python 2 universal-newline
            # behaviour; it is rejected by Python 3.11+ -- drop it when
            # porting.
            with open(os.path.join(tree_dir, fname), 'U') as f:
                tree = TreeNode.from_newick(f)
        except IOError:
            sys.stderr.write('error loading tree ' + fname + '\n')
            # sys.exit rather than the interactive-only `exit` helper that
            # the site module injects
            sys.exit(1)
        tree.filepath = fname
        trees.append(tree)

    if not trees:
        raise RuntimeError('Error: no trees loaded' +
                           ', check that tree directory has valid trees')
    return trees
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write a Newick tree.

    Reads the tab-separated file ``distance_matrix`` (first column used as
    the index) from the current working directory, squares every value,
    converts the square matrix to condensed form and clusters it with the
    requested method. The resulting tree is written, with single quotes
    removed, to ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method : str
        ``"average"`` or ``"weighted"``; selects the linkage function of the
        same name.

    Raises
    ------
    SystemExit
        With status 1 when ``cluster_method`` is not one of the two
        supported values.
    """
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        # A bare sys.exit() reports success (status 0); signal the failure
        # to the calling shell/script instead.
        sys.exit(1)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # Context manager closes the handle even if write() raises.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
示例#42
0
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it as Newick.

    Parameters
    ----------
    input_file
        Source passed to ``DistanceMatrix.read``.
    output_file : str
        Path the Newick string (with distances) is written to. The file is
        created/truncated before the tree is serialised.

    Raises
    ------
    RuntimeError
        If no tree was produced (``tree`` is ``None``).
    AttributeError
        Re-raised unchanged when serialisation fails for any other reason.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output; the context manager closes the file on every path
    with open(output_file, 'w') as f:
        try:
            f.write(tree.to_newick(with_distances=True))
        except AttributeError:
            # The handler used to test an undefined name `c`, which turned
            # every AttributeError into a NameError; the tree is what needs
            # checking.
            if tree is None:
                raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file),))
            raise
示例#43
0
文件: fundec.py 项目: geronimp/fundec
    def _open_tree(self, tree_path):
        '''
        Open a tree file, determine what decorations are already present. Strip
        Unwanted decoration
        
        Parameters
        ----------
        tree_path: str
            Path to a file containing a phylogenetic tree, in Newick format.
            
        Returns
        -------
        skbio TreeNode object
        '''
        tree_obj=TreeNode.read(open(tree_path))
        bootstrapped = True
        for node in tree_obj.non_tips():
            if node.name:
                try:
                    float(node.name)
                except:
                    logging.debug("Tree is decorated already. Stripping all \
    previous decoration from the tree.")
                    bootstrapped = False
                    tree_obj = self._strip_tree(tree_obj)
                    break
            else:
                if bootstrapped:
                    logging.warning("This tree doesn't appear correctly \
formatted or there is information missing. No boostrap value or decoration \
found for bare node. ")
                    bootstrapped = False
        if bootstrapped:
            logging.debug("Tree is bootstrap or has confidence values \
assigned to the nodes.")
        return tree_obj
示例#44
0
 def test_missing_taxonomy(self):
     # Each case is (expected result, name of first node, name of second node).
     parsed = TreeNode.read(StringIO('((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'))
     cases = [
         (['C'], 'A', 'E'),
         ([], 'A', 'A'),
         (['E', 'C'], 'A', 'G'),
     ]
     for expected, first_name, second_name in cases:
         observed = TaxonomyFunctions().missing_taxonomy(
             parsed, parsed.find(first_name), parsed.find(second_name))
         assert_equals(expected, observed)
示例#45
0
 def result_constructor(x):
     # Parse the Newick text in x into a TreeNode.
     newick_stream = StringIO(x)
     parsed = TreeNode.read(newick_stream, format='newick')
     return parsed
示例#46
0
 def testSimple(self):
     # At threshold 0.25, A and B are expected together and D on its own.
     newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 0.25)
     expected = [['A', 'B'], ['D']]
     self.assertSameClusters(expected, found)
示例#47
0
 def testNoClusering(self):
     # At threshold 0.05 every tip is expected in its own cluster.
     newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 0.05)
     self.assertSameClusters([['A'], ['B'], ['D']], found)
     expected_names = _('C.1 C.2 Root')
     assert_equals(expected_names, [each.name() for each in found])
示例#48
0
 def testClusterEverything(self):
     # At threshold 0.5 all three tips are expected in one cluster named Root.
     newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 0.5)
     self.assertSameClusters([['A', 'B', 'D']], found)
     only_cluster = found[0]
     assert_equals('Root', only_cluster.name())
示例#49
0
 def testClusterOnTwoInternalNodes(self):
     # At threshold 40, A, B, D and H are expected together and F alone.
     newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 40)
     expected = [_('A B D H'), ['F']]
     self.assertSameClusters(expected, found)
示例#50
0
 def testFullTaxonomy(self):
     # The full taxonomy of tip D is expected to list family, then genus.
     annotator = TreeAnnotator()
     newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
     parsed = TreeNode.read(StringIO(newick))
     observed = annotator.full_taxonomy(parsed, parsed.find('D'))
     assert_equals('f__family; g__genus2', observed)
示例#51
0
 def testNamingWithBootstraps(self):
     # The internal node is labelled 0.091 rather than with a name; the
     # cluster names are then expected to derive from Root.
     newick = '((A:0.11, B:0.12)0.091:0.1, D:0.2)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 0.05)
     self.assertSameClusters([['A'], ['B'], ['D']], found)
     expected_names = _('Root.1 Root.2 Root.3')
     assert_equals(expected_names, [each.name() for each in found])
示例#52
0
 def testClusterNamingWithBootstraps(self):
     # The internal node is labelled '0.7:G'; the cluster names are expected
     # to use only the G part.
     newick = "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 40)
     self.assertSameClusters([['F'], _('A B D H')], found)
     expected_names = _('G.2 G.1')
     assert_equals(expected_names, [each.name() for each in found])
示例#53
0
 def testClusterNamingConventionsWithSomeUnnamed(self):
     # Threshold 0.05 makes every tip a separate cluster; with only G named,
     # the clusters are expected to be numbered G.1 to G.4.
     newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 0.05)
     self.assertSameClusters([['A'], ['B'], ['D'], ['F']], found)
     expected_names = ['G.1', 'G.2', 'G.3', 'G.4']
     assert_equals(expected_names, [each.name() for each in found])
示例#54
0
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        # paths of the files the workflow is expected to have written
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out,
                            'pynast_aligned_seqs', '%s_rep_set_aligned.fasta' %
                            input_file_basename)
        failures_fp = join(self.test_out,
                           'pynast_aligned_seqs', '%s_rep_set_failures.fasta' %
                           input_file_basename)
        taxonomy_assignments_fp = join(self.test_out,
                                       'uclust_assigned_taxonomy',
                                       '%s_rep_set_tax_assignments.txt' %
                                       input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs falls within a range that was manually
        # confirmed
        # (read through a context manager so the handle is not leaked)
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        self.assertEqual(
            count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum([v.sum()
                                       for v in otu_table.iter_data()])
        self.assertEqual(
            number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
示例#55
0
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = \
            {'suppress_lane_mask_filter': None,
             'entropy_threshold': '0.10'}

        run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        # paths of the files the workflow is expected to have written
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out,
                            'muscle_aligned_seqs', '%s_rep_set_aligned.fasta' %
                            input_file_basename)
        taxonomy_assignments_fp = join(self.test_out,
                                       'uclust_assigned_taxonomy',
                                       '%s_rep_set_tax_assignments.txt' %
                                       input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs falls within a range that was manually
        # confirmed
        # (read through a context manager so the handle is not leaked)
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertTrue(getsize(tree_fp) > 0)
        self.assertTrue(getsize(otu_table_fp) > 0)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum([v.sum()
                                       for v in otu_table.iter_data()])
        self.assertEqual(
            number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])
示例#56
0
 def testSistersSelfNoParent(self):
     # For tip B in this fully named tree, no sisters are expected.
     annotator = TreeAnnotator()
     newick = "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
     parsed = TreeNode.read(StringIO(newick))
     sister_nodes = annotator.find_sisters(parsed, parsed.find('B'))
     observed_names = [node.name for node in sister_nodes]
     assert_equals([], observed_names)
示例#57
0
 def testSistersOneIncompleteSister(self):
     # Only one of the two candidate sister clades is named (g3), so it is
     # the only sister expected for tip B.
     annotator = TreeAnnotator()
     newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
     parsed = TreeNode.read(StringIO(newick))
     print(parsed.ascii_art())
     sister_nodes = annotator.find_sisters(parsed, parsed.find('B'))
     expected_names = sorted(['g3'])
     observed_names = sorted([node.name for node in sister_nodes])
     assert_equals(expected_names, observed_names)
示例#58
0
 def testClusterNamingOnTwoInternalNodesReverseOrder(self):
     # F is listed first in the tree, yet its cluster is expected to be
     # named G.2 and the four-tip cluster G.1.
     newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 40)
     self.assertSameClusters([['F'], _('A B D H')], found)
     expected_names = _('G.2 G.1')
     assert_equals(expected_names, [each.name() for each in found])
示例#59
0
 def testClusterIntoThree(self):
     # At threshold 25 three clusters are expected: A+B, D+H and F alone.
     newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
     parsed = TreeNode.read(StringIO(newick))
     found = Tree2Tax().named_clusters(parsed, 25)
     expected = [_('A B'), _('D H'), ['F']]
     self.assertSameClusters(expected, found)
示例#60
0
 def testSistersSisterWithDescendentNames(self):
     # The sisters of tip B are expected to be g2 and g3; s1, a named clade
     # nested inside g2, must not appear.
     annotator = TreeAnnotator()
     newick = "((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;"
     parsed = TreeNode.read(StringIO(newick))
     print(parsed.ascii_art())
     query_tip = parsed.find('B')
     sister_nodes = annotator.find_sisters(parsed, query_tip)
     expected_names = sorted(['g2', 'g3'])
     observed_names = sorted([node.name for node in sister_nodes])
     assert_equals(expected_names, observed_names)