Example #1
def main():
    args = parse_args()

    tree1 = TreeNode.read(open(args.tree1_file))
    tree2 = TreeNode.read(open(args.tree2_file))

    tree_dist = calc_tree_distance(tree1, tree2)
    print("Tree distance: %d" % tree_dist)
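
Neither parse_args nor calc_tree_distance is shown in this example. A minimal sketch of the distance helper, assuming a Robinson-Foulds comparison via scikit-bio (the name and behaviour are assumptions, not the original project's code):

from skbio import TreeNode


def calc_tree_distance(tree1, tree2):
    # Hypothetical sketch: Robinson-Foulds distance between the two trees,
    # computed with scikit-bio's TreeNode.compare_rfd.
    return tree1.compare_rfd(tree2)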
Example #2
    def test_continuous(self):
        exp_basis = pd.read_csv(
            self.get_data_path('expected/numeric_basis.tsv'), sep='\t')
        exp_data = pd.read_csv(self.get_data_path('expected/numeric_data.tsv'),
                               sep='\t')
        exp_groups = pd.read_csv(
            self.get_data_path('expected/numeric_groups.tsv'), sep='\t')
        exp_factors = pd.read_csv(
            self.get_data_path('expected/numeric_factors.tsv'), sep='\t')
        exp_tree = (TreeNode.read(
            self.get_data_path('expected/numeric_tree.nwk')))

        pf = phylofactor(self.table,
                         self.phylogeny,
                         self.metadata,
                         formula='Continuous~Data',
                         nfactors=3,
                         family='poisson')

        data, basis, out_tree, groups, factors = pf

        assert_frame_equal(basis, exp_basis)
        assert_frame_equal(groups, exp_groups)
        assert_frame_equal(factors, exp_factors)
        assert_frame_equal(data, exp_data)
        self.assertEqual(TreeNode.compare_rfd(exp_tree, out_tree), 0)
Example #3
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Calculate Tree Distance."""
    pred_linkage = get_linkage_matrix(lang_set_mat)
    pred_tree = TreeNode.from_linkage_matrix(pred_linkage,
                                             INDO_EURO_LANG_NAMES)

    pred_tree_string_io = StringIO()
    pred_tree.write(pred_tree_string_io)
    pred_tree_string = pred_tree_string_io.getvalue()

    # Replace distances with 1
    unweighted_tree_string = re.sub(r"\d+\.\d+", "1", pred_tree_string)
    pred_tree = TreeNode.read(StringIO(unweighted_tree_string))

    if dist_metric == "rfd":
        tree_dist = pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE)
    else:
        gt_distances_struct = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
        gt_distances = gt_distances_struct.data
        gt_ids = gt_distances_struct.ids

        pred_distances = pred_tree.tip_tip_distances(
            endpoints=list(gt_ids)).data
        tree_dist = np.sum((gt_distances - pred_distances)**2)

    return tree_dist, pred_tree
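
Note that the regex above only matches branch lengths written as digits.digits, so integer or exponent-notation lengths would slip through. A hedged alternative sketch that sets the lengths to 1 directly on the TreeNode instead of rewriting the Newick string:

from skbio import TreeNode


def unweight_tree(tree):
    # Sketch of an alternative to the regex step: copy the tree and assign
    # unit length to every branch that has a defined length.
    unweighted = tree.copy()
    for node in unweighted.traverse():
        if node.length is not None:
            node.length = 1.0
    return unweighted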
Example #4
 def testSimpleTwice(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters_for_several_thresholds(
         tree, [0.25, 0.25])
     self.assertSameClusterSets(
         [[0.25, [['A', 'B'], ['D']]], [0.25, [['A', 'B'], ['D']]]],
         clusters)
Example #5
def build_base_silva_taxonomy(tree_file, tax_dict):
    """Returns {TaxonomyID : [(rank, taxonomy), ...]} """
    print("Building base SILVA taxonomy...")
    tree = TreeNode.read(tree_file)
    ml = {}
    for node in tree.postorder():  # tree.tips():
        if node.is_root():
            break

        l = []
        rank, taxonomy = tax_dict[node.name]
        clean_taxonomy_str = filter_characters(taxonomy)

        if rank in allowed_ranks:
            l.append((allowed_ranks_dict[rank], clean_taxonomy_str))

        for ancestor in node.ancestors():
            if ancestor.is_root():
                break
            else:
                arank, ataxonomy = tax_dict[ancestor.name]
                cleaned_ataxonomy = filter_characters(ataxonomy)
                if arank in allowed_ranks:
                    l.append((allowed_ranks_dict[arank], cleaned_ataxonomy))

        #l.reverse()
        ml[node.name.strip()] = dict(l)

    return ml
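
allowed_ranks, allowed_ranks_dict and filter_characters come from the surrounding SILVA-processing module and are not shown here. A hypothetical sketch of what a filter_characters helper might do (an assumption, not the project's actual implementation):

import re


def filter_characters(taxonomy):
    # Hypothetical sketch: strip quotes and brackets that commonly break
    # downstream taxonomy parsers, then trim surrounding whitespace.
    return re.sub(r"[\"'\[\]()]", "", taxonomy).strip()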
Example #6
 def testNoPairs(self):
     tree = TreeNode.read(
         StringIO(
             "(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;")
     )
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet([], examples)
Example #7
 def testClusterNamingWithBootstraps(self):
     tree = TreeNode.read(
         StringIO(
             "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"))
     clusters = Tree2Tax().named_clusters(tree, 40)
     self.assertSameClusters([['F'], _('A B D H')], clusters)
     assert_equals(_('G.2 G.1'), [c.name() for c in clusters])
Example #8
    def test__generate_html_summary_phylogeny(self):
        fp_biom = join('qtp_biom', 'support_files', 'sepp.biom')
        fp_tree = join('qtp_biom', 'support_files', 'sepp.tre')

        # load metadata
        qurl = '/qiita_db/analysis/%s/metadata/' % 1
        md = self.qclient.get(qurl)

        # load phylogeny
        tree = TreeNode.read(fp_tree)

        obs_index_fp, obs_viz_fp, qza_fp = _generate_html_summary(fp_biom,
                                                                  md,
                                                                  self.out_dir,
                                                                  True,
                                                                  tree=tree)

        # test if two expected tags show up in the html summary page
        with open(obs_index_fp) as f:
            obs_html = ''.join(f.readlines())
            self.assertTrue('<th>Number placed fragments</th>' in obs_html)
            self.assertTrue('<td>434</td>' in obs_html)

        # test that phylogeny specific html content does not show up if no
        # tree is given
        obs_index_fp, obs_viz_fp, qza_fp = _generate_html_summary(fp_biom,
                                                                  md,
                                                                  self.out_dir,
                                                                  True,
                                                                  tree=None)
        with open(obs_index_fp) as f:
            obs_html = ''.join(f.readlines())
            self.assertTrue('<th>Number placed fragments</th>' not in obs_html)
Example #9
 def depth_partition(self, input_tree, percentile, output_tree):
     '''
     Attempt to cluster the tree using nodes whose tip-to-tip distance
     distribution falls below an nth-percentile cutoff of the whole-tree
     distance distribution.
     A better description can be found in the citation below.
     
     Parameters
     ----------
     tree: skbio TreeNode obj
         http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html #skbio.tree.TreeNode
             
     percentile: float
         The percentile of the whole-tree distance distribution used as
         the cutoff for forming a clade from a given node.
     
     Clustering method modified from the Prosperi et al. method:
     Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny 
     partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
     
     http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
     '''
     tree = TreeNode.read(input_tree)
     tree = tree.root_at_midpoint()
     cluster_count = 1
     clustered = set()
     clusters = {}
     logging.debug("Calculating %ith percentile cutoff from root" \
                                             % (percentile))
     whole_tree_distribution = self._node_dist(tree)
     
     cutoff = np.percentile(whole_tree_distribution, percentile)
     logging.debug("Cutoff (%ith percentile): %f" % (percentile,
                                                     cutoff))
     for node in tree.preorder():
         if node in clustered:
             continue
         elif node.is_tip():
             continue
         else:
             node_distribution = self._node_dist(node)
             median = np.median(node_distribution)
             logging.debug("Median of node: %f" % median)
             if median <= cutoff:
                 logging.debug("Cluster found!")
                  cluster_name = "partition_%i" % cluster_count
                 clusters[cluster_name] = [x.name.replace(' ','_') 
                                           for x in node.tips()]
                 self._rename(node, cluster_name)
                  cluster_count += 1
                  for descendant in node.traverse():
                      clustered.add(descendant)
     logging.info("%i depth cluster(s) found in tree" % (cluster_count-1))
     tree.write(output_tree, "newick")
     
     logging.debug("Recording tips that were not partitioned")
     clusters[self.UNCLUSTERED] = []
     for tip in tree.tips():
         if tip not in clustered:
             clusters[self.UNCLUSTERED].append(tip.name.replace(' ','_'))
     return clusters
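
_node_dist and _rename are methods of the surrounding class and are not included in the example. As a rough sketch of the intent (an assumption; the project's real helper may differ), _node_dist could return the distribution of node-to-tip distances that the percentile and median are taken over:

import numpy as np


def _node_dist(self, node):
    # Hypothetical sketch: patristic distances from this node to each of
    # its descendant tips, returned as an array for np.percentile/np.median.
    return np.array([node.distance(tip) for tip in node.tips()])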
Example #10
 def testSistersOneIncompleteSister(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(
         StringIO("((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"))
     print(tree.ascii_art())
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals(sorted(['g3']), sorted([s.name for s in sisters]))
Example #11
    def test_ilr_ordination(self):
        np.random.seed(0)
        table = pd.DataFrame([[1, 1, 2, 2],
                              [1, 2, 2, 1],
                              [2, 2, 1, 1]],
                             index=[1, 2, 3],
                             columns=['a', 'b', 'c', 'd'])
        table = table.reindex(columns=np.random.permutation(table.columns))
        tree = TreeNode.read([
            '((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])
        res_ord, res_tree, res_md = ilr_phylogenetic_ordination(
            table, tree, top_k_var=3)
        exp_balances = pd.DataFrame(
            [[0.693147, 0.0, 3.892122e-17],
             [0.0, -4.901291e-01, -4.901291e-01],
             [-0.693147, -5.551115e-17, -3.892122e-17]],
            columns=['y0', 'y1', 'y2'],
            index=[1, 2, 3])

        exp_balances = exp_balances[['y0', 'y1', 'y2']]
        exp_balances.index.name = 'sampleid'
        pdt.assert_frame_equal(res_ord.samples, exp_balances)
        exp_tree_str = ('((b:0.025,a:0.025)y1:0.2,'
                        '(c:0.025,d:0.025)y2:0.2)y0;\n')
        self.assertEqual(str(res_tree), exp_tree_str)

        exp_md = pd.DataFrame([[-0.5, -0.707107, 0.000000],
                               [-0.5, 0.707107, 0.000000],
                               [0.5, 0.000000, -0.707107],
                               [0.5, 0.000000, 0.707107]],
                              columns=['y0', 'y1', 'y2'],
                              index=['b', 'a', 'c', 'd'])
        exp_md.index.name = 'featureid'
        pdt.assert_frame_equal(res_md, exp_md)
Example #12
    def test_defaults(self):

        exp_basis = pd.read_csv(
            self.get_data_path('expected/categorical_basis.tsv'), sep='\t')
        exp_data = pd.read_csv(
            self.get_data_path('expected/categorical_data.tsv'), sep='\t')
        exp_groups = pd.read_csv(
            self.get_data_path('expected/categorical_groups.tsv'), sep='\t')
        exp_factors = pd.read_csv(
            self.get_data_path('expected/categorical_factors.tsv'), sep='\t')
        exp_tree = TreeNode.read(
            self.get_data_path('expected/categorical_tree.nwk'))

        pf = phylofactor(self.table,
                         self.phylogeny,
                         self.metadata,
                         formula='Categorical~Data',
                         nfactors=3,
                         family='binomial')

        data, basis, out_tree, groups, factors = pf

        assert_frame_equal(basis, exp_basis)
        assert_frame_equal(groups, exp_groups)
        assert_frame_equal(factors, exp_factors)
        assert_frame_equal(data, exp_data)
        self.assertEqual(TreeNode.compare_rfd(exp_tree, out_tree), 0)
Example #13
    def depth_partition(self, input_tree, percentile, output_tree):
        '''
        Attempt to cluster the tree using nodes whose tip-to-tip distance
        distribution falls below an nth-percentile cutoff of the whole-tree
        distance distribution.
        A better description can be found in the citation below.
        
        Parameters
        ----------
        tree: skbio TreeNode obj
            http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html #skbio.tree.TreeNode
                
        percentile: float
            The percentile of the whole-tree distance distribution used as
            the cutoff for forming a clade from a given node.
        
        Clustering method modified from the Prosperi et al. method:
        Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny 
        partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
        
        http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
        '''
        tree = TreeNode.read(input_tree)
        tree = tree.root_at_midpoint()
        cluster_count = 1
        clustered = set()
        clusters = {}
        logging.debug("Calculating %ith percentile cutoff from root" \
                                                % (percentile))
        whole_tree_distribution = self._node_dist(tree)

        cutoff = np.percentile(whole_tree_distribution, percentile)
        logging.debug("Cutoff (%ith percentile): %f" % (percentile, cutoff))
        for node in tree.preorder():
            if node in clustered:
                continue
            elif node.is_tip():
                continue
            else:
                node_distribution = self._node_dist(node)
                median = np.median(node_distribution)
                logging.debug("Median of node: %f" % median)
                if median <= cutoff:
                    logging.debug("Cluster found!")
                    cluster_name = "partition_%i" % (cluster_count)
                    clusters[cluster_name] = [
                        x.name.replace(' ', '_') for x in node.tips()
                    ]
                    self._rename(node, cluster_name)
                    cluster_count += 1
                    for descendant in node.traverse():
                        clustered.add(descendant)
        logging.info("%i depth cluster(s) found in tree" % (cluster_count - 1))
        tree.write(output_tree, "newick")

        logging.debug("Recording tips that were not partitioned")
        clusters[self.UNCLUSTERED] = []
        for tip in tree.tips():
            if tip not in clustered:
                clusters[self.UNCLUSTERED].append(tip.name.replace(' ', '_'))
        return clusters
Example #14
 def testMultiplyNamedNode(self):
     tree = TreeNode.read(
         StringIO(
             "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2; s__spec':6)'f__family':10)root;"
         ))
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet(
         [['f__family', 'g__genus1', 'g__genus2; s__spec', 16.0]], examples)
Example #15
 def testFullTaxonomy(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(
         StringIO(
             "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
     )
     assert_equals('f__family; g__genus2',
                   ann.full_taxonomy(tree, tree.find('D')))
Example #16
 def testClusterNamingConventionsWithSomeUnnamed(self):
     tree = TreeNode.read(
         StringIO('((((A:11, B:12):10, D:9):20, F:20)G:30)root;'))
     clusters = Tree2Tax().named_clusters(
         tree, 0.05)  #i.e. everything is a separate cluster
     self.assertSameClusters([['A'], ['B'], ['D'], ['F']], clusters)
     assert_equals(['G.1', 'G.2', 'G.3', 'G.4'],
                   [c.name() for c in clusters])
Example #17
 def testFindParents(self):
     ann = TreeAnnotator()
     
     tree = TreeNode.read(StringIO("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"))
     assert_equals('g__genus1', ann.find_named_parent(tree, tree.find('B')).name, 'self is named')
     
     tree = TreeNode.read(StringIO("(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"))
     assert_equals('g__genus1', ann.find_named_parent(tree, tree.find('2475')).name, 'parent directly above')
     
     tree = TreeNode.read(StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"))
     assert_equals('f__family', ann.find_named_parent(tree, tree.find('2475')).name, 'parent 2 above')
     
     tree = TreeNode.read(StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);"))
     assert_equals(None, ann.find_named_parent(tree, tree.find('f__family').parent), 'parent of root')
     
     tree = TreeNode.read(StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);"))
     assert_equals(None, ann.find_named_parent(tree, tree.find('g__genus2').parent), 'no parent before root')
Example #18
 def testSistersSelfNoParent(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(
         StringIO(
             "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
         ))
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals([], [s.name for s in sisters])
Example #19
def generate_html_summary(qclient, job_id, parameters, out_dir):
    """Generates the HTML summary of a BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, None, str
        Whether the job is successful
        Ignored
        The error message, if not successful
    """
    # Step 1: gather file information from qiita using REST api
    artifact_id = parameters['input_data']
    qclient_url = "/qiita_db/artifacts/%s/" % artifact_id
    artifact_info = qclient.get(qclient_url)

    # Step 2: get the mapping file, depends if analysis or not
    if artifact_info['analysis'] is None:
        is_analysis = False
        qurl = ('/qiita_db/prep_template/%s/' %
                artifact_info['prep_information'][0])
        md = qclient.get(qurl)['qiime-map']
    else:
        is_analysis = True
        qurl = '/qiita_db/analysis/%s/metadata/' % artifact_info['analysis']
        md = qclient.get(qurl)

    tree = None
    if 'plain_text' in artifact_info['files']:
        tree = TreeNode.read(artifact_info['files']['plain_text'][0])

    # Step 3: generate HTML summary
    # if we get to this point of the code we are sure that this is a biom file
    # and that it only has one element
    index_fp, viz_fp, qza_fp = _generate_html_summary(
        artifact_info['files']['biom'][0], md, out_dir, is_analysis, tree)

    # Step 4: add the new file to the artifact using REST api
    success = True
    error_msg = ""
    try:
        qclient.patch(qclient_url, 'add', '/html_summary/',
                      value=dumps({'html': index_fp, 'dir': viz_fp}))
    except Exception as e:
        success = False
        error_msg = str(e)

    return success, None, error_msg
Example #20
 def testSistersSisterWithDescendentNames(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(
         StringIO(
             "((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;"
         ))
     print(tree.ascii_art())
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals(sorted(['g2', 'g3']), sorted([s.name for s in sisters]))
Example #21
 def testTreeSubtree2(self):
     '''one genus is a subtree of another, and the longest branch is in both subtrees'''
     tree = TreeNode.read(
         StringIO(
             "((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)'f__family':10)root;"
         ))
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet(
         [['f__family', 'g__genus1', 'g__genus2', 105.0]], examples)
Example #22
 def testOppositeSorting(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters_for_several_thresholds(
         tree, [0.05, 0.25])
     self.assertSameClusterSets(
         [[0.05, [['A'], ['B'], ['D']]], [0.25, [['A', 'B'], ['D']]]],
         clusters)
     assert_equals(_('C.1 C.2 Root'),
                   [c.name() for c in clusters[0].clusters])
Example #23
 def testNaming(self):
     tree = TreeNode.read(
         StringIO('((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'))
     clusters = Tree2Tax().named_clusters_for_several_thresholds(
         tree, [40, 25])
     self.assertSameClusterSets(
         [[25, [['F'], _('A B'), _('D H')]],
          [40, [['F'], _('A B D H')]]], clusters)
     assert_equals(_('G.3 G.1 G.2'),
                   [c.name() for c in clusters[0].clusters])
     assert_equals(_('G.2 G.1'), [c.name() for c in clusters[1].clusters])
Example #24
    def testFindParents(self):
        ann = TreeAnnotator()

        tree = TreeNode.read(
            StringIO(
                "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
            ))
        assert_equals('g__genus1',
                      ann.find_named_parent(tree, tree.find('B')).name,
                      'self is named')

        tree = TreeNode.read(
            StringIO(
                "(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
            ))
        assert_equals('g__genus1',
                      ann.find_named_parent(tree, tree.find('2475')).name,
                      'parent directly above')

        tree = TreeNode.read(
            StringIO(
                "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
            ))
        assert_equals('f__family',
                      ann.find_named_parent(tree, tree.find('2475')).name,
                      'parent 2 above')

        tree = TreeNode.read(
            StringIO(
                "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);"))
        assert_equals(
            None, ann.find_named_parent(tree,
                                        tree.find('f__family').parent),
            'parent of root')

        tree = TreeNode.read(
            StringIO("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);"))
        assert_equals(
            None, ann.find_named_parent(tree,
                                        tree.find('g__genus2').parent),
            'no parent before root')
Example #25
 def test_missing_taxonomy(self):
     tree = TreeNode.read(
         StringIO('((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'))
     assert_equals(['C'],
                   TaxonomyFunctions().missing_taxonomy(
                       tree, tree.find('A'), tree.find('E')))
     assert_equals([],
                   TaxonomyFunctions().missing_taxonomy(
                       tree, tree.find('A'), tree.find('A')))
     assert_equals(['E', 'C'],
                   TaxonomyFunctions().missing_taxonomy(
                       tree, tree.find('A'), tree.find('G')))
Example #26
    def testTipToCluster(self):
        tree = TreeNode.read(StringIO('((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'))
        clusters = Tree2Tax().named_clusters_for_several_thresholds(tree, [40, 25])
        self.assertSameClusterSets([[25,[['F'], _('A B'), _('D H')]], [40,[['F'], _('A B D H')]]], clusters)
        assert_equals(_('G.3 G.1 G.2'), [c.name() for c in clusters[0].clusters])
        assert_equals(_('G.2 G.1'), [c.name() for c in clusters[1].clusters])
         
        tip = tree.find('F')
        assert_equals('G.3', clusters[0].tip_to_cluster(tip).name())
        assert_equals('G.2', clusters[1].tip_to_cluster(tip).name())
 
        tip = tree.find('D')
        assert_equals('G.2', clusters[0].tip_to_cluster(tip).name())
        assert_equals('G.1', clusters[1].tip_to_cluster(tip).name())
Example #27
File: phylip.py  Project: edgartanaka/mo640
def get_dist_matrix_from_tree(file_path):
    with open(file_path, 'r') as myfile:
        data = myfile.read().replace('\n', '')
        t = TreeNode.read(StringIO(data))
        df = t.tip_tip_distances().to_data_frame()

        #df.index = df.index.astype(int)

        # sort rows and cols
        df.sort_index(inplace=True)
        #df.columns = df.columns.values.astype(np.int32)
        df = df[sorted(df.columns)]
        print(df)
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
        # current equivalent.
        return df.to_numpy()
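
For comparison, a hedged sketch that builds the same sorted matrix without the DataFrame round trip, assuming scikit-bio's DistanceMatrix API (the function name is illustrative):

from skbio import TreeNode


def get_dist_matrix_from_tree_direct(file_path):
    # Sketch: parse the Newick file directly, then sort the tip-to-tip
    # DistanceMatrix by tip id before extracting the underlying array.
    dm = TreeNode.read(file_path).tip_tip_distances()
    return dm.filter(sorted(dm.ids)).data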
Example #28
    def setUp(self):
        self.file_ref_phylo = get_data_path(
            'analyses/sepp/reference_phylogeny_small.qza')
        self.file_ref_aln = get_data_path(
            'analyses/sepp/reference_alignment_small.qza')
        self.fragments = pd.read_csv(
            get_data_path('analyses/sepp/fragments.tsv'),
            sep='\t',
            index_col=0)

        self.exp_taxonomy = pd.read_csv(
            get_data_path('analyses/sepp/exp_taxonomy.tsv'),
            sep='\t',
            index_col=0)
        self.exp_tree = TreeNode.read(
            get_data_path('analyses/sepp/exp_tree.nwk'))
Example #29
def _prune_features_from_phylogeny(table: biom.Table,
                                   phylogeny_fp: NewickFormat) -> NewickFormat:
    print('Will prune the phylogeny')
    tree = TreeNode.read(str(phylogeny_fp))
    obs = table.ids('observation')
    tip_names_set = set([x.name for x in tree.tips()])
    to_delete_names = tip_names_set - set(obs)
    to_delete_set = to_delete_names

    if len(set(obs) - tip_names_set) > 0:
        raise ValueError(
            "There are",  len(set(obs) - tip_names_set),
            "features in the feature table not present "
            "in the phylogeny! Please check your tree"
        )
    else:
        print("All", len(obs), "features present in the "
                               "feature table are also in the phylogeny.")

    if len(to_delete_set) > 0:
        t0 = time()
        print("The set of features in the phylogeny and the table "
              "are not the same.", len(to_delete_set),
              "features will be pruned from the tree.")
        tree_pruned = tree.shear(set(obs))
        print("It takes", time()-t0, "seconds to prune the phylogeny")
        to_delete_set = set([x.name for x in tree_pruned.tips()]) - set(obs)
        to_delete_rev_set = set(obs) - set([x.name for x in tree_pruned.tips()])
        if len(to_delete_set) > 0 or len(to_delete_rev_set):
            raise ValueError(
                "Pruning the phylogeny failed! There are", len(to_delete_set),
                "features in the phylogeny not present in "
                "the feature table, and",
                len(to_delete_rev_set),
                "features in the feature table not available in the "
                "phylogeny! Both should be 0."
            )
        else:
            print("The phylogeny was pruned successfully!")
    else:
        print("The set of features in the phylogeny and the table "
              "are the same. No feature will be pruned from the tree.")
        tree_pruned = tree
    tree_pruned_out = _1(tree_pruned)

    return tree_pruned_out
Example #30
 def shear_tree(self):
     self.project.set_tree_paths(self.config)
     if len(Data.wols):
         i_wol_tree = get_wol_tree(self.config.i_wol_tree)
         wol = TreeNode.read(i_wol_tree)
         for dat, data in self.project.datasets.items():
             if dat in Datasets.filt_raw:
                 continue
             if data.phylo and data.phylo[0] == 'wol':
                 if self.config.force or not isfile(data.tree[1]):
                     wol_features = wol.shear(list(data.features.keys()))
                     for tip in wol_features.tips():
                         tip.name = data.features[tip.name]
                     wol_features.write(data.tree[2])
                     cmd = run_import(data.tree[2], data.tree[1],
                                      "Phylogeny[Rooted]")
                     self.cmds.setdefault(dat, []).append(cmd)
     self.register_command('wol')
Example #31
def read_tree(nwk_path, leaf_names=None, trim_src_tag=False):
    """
    Read a tree in Newick format

    Returns:
        TreeNode object for the root of the tree
    """
    tree = TreeNode.read(nwk_path, format='newick')
    swap_space(tree)
    if leaf_names is not None:
        tree = tree.shear(leaf_names)
    tree = tree.unrooted_copy()
    tree.assign_ids()

    if trim_src_tag:
        for n in tree.tips():
            n.name = check_accession(n.name)
    return tree
Example #32
def read_tree(tree_path):
    """
    Read Newick formatted tree from GTDB.

    Only tips that have an NCBI accession will be kept,
    i.e. GTDB MAGs are pruned from the tree.
    """
    tree = TreeNode.read(tree_path)
    leaves = list()
    for tip in tree.tips():
        if 'GC' in tip.name:
            tip.name = tip.name[3:].replace(' ', '_')
            leaves.append(tip.name)
    tree = tree.shear(leaves)
    tree.prune()
    for node in tree.non_tips():
        node.name = node.name.replace(' ', '_')
    return tree
Example #33
 def test_ilr_phylogenetic(self):
     np.random.seed(0)
     table = pd.DataFrame([[1, 1, 2, 2], [1, 2, 2, 1], [2, 2, 1, 1]],
                          index=[1, 2, 3],
                          columns=['a', 'b', 'c', 'd'])
     table = table.reindex(columns=np.random.permutation(table.columns))
     tree = TreeNode.read(
         ['((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])
     res_balances, res_tree = ilr_phylogenetic(table, tree)
     exp_balances = pd.DataFrame(
         [[0.693147, 0.0, 3.892122e-17], [
             0.0, -4.901291e-01, -4.901291e-01
         ], [-0.693147, -5.551115e-17, -3.892122e-17]],
         columns=['y0', 'y1', 'y2'],
         index=[1, 2, 3])
     pdt.assert_frame_equal(res_balances, exp_balances)
     exp_tree_str = ('((b:0.025,a:0.025)y1:0.2,'
                     '(c:0.025,d:0.025)y2:0.2)y0;\n')
     self.assertEqual(str(res_tree), exp_tree_str)
Example #34
def get_unifrac(
    otu_file_1: pathlib.Path,
    otu_file_2: pathlib.Path,
    tree_file: pathlib.Path,
    weighted: bool,
    threshold: int,
):
    otu_1 = load_table(str(otu_file_1)).to_dataframe(dense=True)
    otu_2 = load_table(str(otu_file_2)).to_dataframe(dense=True)
    tree = TreeNode.read(str(tree_file))
    unifrac_data = dict()
    for u, v, otu_ids, col in get_vectors(otu_1, otu_2, threshold):
        if weighted:
            unifrac_value = weighted_unifrac(
                u, v, otu_ids, tree, normalized=True, validate=True
            )
        else:
            unifrac_value = unweighted_unifrac(u, v, otu_ids, tree, validate=True)
        unifrac_data[col] = unifrac_value
    return pd.Series(unifrac_data), otu_1.shape[0], otu_2.shape[0]
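
The imports this snippet relies on are not shown; get_vectors is project-specific, while the remaining names map onto biom-format and scikit-bio. A sketch of the assumed imports:

# Assumed imports for the snippet above (get_vectors comes from the
# surrounding project and is not part of these libraries).
import pathlib

import pandas as pd
from biom import load_table
from skbio import TreeNode
from skbio.diversity.beta import unweighted_unifrac, weighted_unifrac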
Example #35
File: fundec.py  Project: geronimp/fundec
    def _open_tree(self, tree_path):
        '''
        Open a tree file, determine what decorations are already present,
        and strip unwanted decoration.
        
        Parameters
        ----------
        tree_path: str
            Path to a file containing a phylogenetic tree, in Newick format.
            
        Returns
        -------
        skbio TreeNode object
        '''
        tree_obj = TreeNode.read(open(tree_path))
        bootstrapped = True
        for node in tree_obj.non_tips():
            if node.name:
                try:
                    float(node.name)
                except ValueError:
                    logging.debug("Tree is decorated already. Stripping all "
                                  "previous decoration from the tree.")
                    bootstrapped = False
                    tree_obj = self._strip_tree(tree_obj)
                    break
            else:
                if bootstrapped:
                    logging.warning("This tree doesn't appear correctly "
                                    "formatted or there is information "
                                    "missing. No bootstrap value or "
                                    "decoration found for a bare node.")
                    bootstrapped = False
        if bootstrapped:
            logging.debug("Tree is bootstrapped or has confidence values "
                          "assigned to the nodes.")
        return tree_obj
Example #36
 def testNoClustering(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters(tree, 0.05)
     self.assertSameClusters([['A'],['B'],['D']], clusters)
     assert_equals(_('C.1 C.2 Root'), [c.name() for c in clusters])
Example #37
 def testTreeSubtree2(self):
     '''one genus is a subtree of another, and the longest branch is in both subtrees'''
     tree = TreeNode.read(StringIO("((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)'f__family':10)root;"))
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet([['f__family','g__genus1','g__genus2',105.0]],
                                      examples)
Example #38
 def testClusterEverything(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters(tree, 0.5)
     self.assertSameClusters([['A','B','D']], clusters)
     assert_equals('Root',clusters[0].name())
Example #39
 def testFullTaxonomy(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(StringIO("(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"))
     assert_equals('f__family; g__genus2', ann.full_taxonomy(tree, tree.find('D')))
Example #40
 def testMultiplyNamedNode(self):
     tree = TreeNode.read(StringIO("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2; s__spec':6)'f__family':10)root;"))
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet([['f__family','g__genus1','g__genus2; s__spec',16.0]],
                                      examples)
Example #41
 def testClusterNamingOnTwoInternalNodesReverseOrder(self):
     tree = TreeNode.read(StringIO('((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'))
     clusters = Tree2Tax().named_clusters(tree, 40)
     self.assertSameClusters([['F'], _('A B D H')], clusters)
     assert_equals(_('G.2 G.1'), [c.name() for c in clusters])
Example #42
 def testSistersSisterWithDescendentNames(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(StringIO("((A:1, B:2):3, (((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;"))
     print(tree.ascii_art())
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals(sorted(['g2','g3']), sorted([s.name for s in sisters]))
Example #43
 def testClusterOnTwoInternalNodes(self):
     tree = TreeNode.read(StringIO('((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'))
     clusters = Tree2Tax().named_clusters(tree, 40)
     self.assertSameClusters([_('A B D H'), ['F']], clusters)
Example #44
 def testSistersSelfNoParent(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(StringIO("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"))
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals([], [s.name for s in sisters])  
Example #45
 def testNoPairs(self):
     tree = TreeNode.read(StringIO("(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;"))
     examples = ThresholdFinder().find_examples(tree, 'f', 'g')
     self.assertSameCladeDistanceSet([],
                                      examples)
Example #46
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo, str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None, "Unknown artifact type %s. Supported types: BIOM"
                             % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    if prep_id is not None:
        is_analysis = False
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']

        qurl = ('/qiita_db/prep_template/%s/' % prep_id)
        md = qclient.get(qurl)['qiime-map']
    elif analysis_id is not None:
        is_analysis = True
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/" % analysis_id)

        md = metadata
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validating BIOM file")
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same as in
            # the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the BIOM table do not match '
                             'the ones in the prep information. Please, '
                             'provide the column "run_prefix" in the prep '
                             'information to map the existing sample ids to '
                             'the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            missing = biom_sample_ids - set(id_map)
            error_msg = ('Your prep information is missing samples that are '
                         'present in your BIOM table: %s' % ', '.join(missing))
            return False, None, error_msg

        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observations ids of the biom table should be the same
        # as the representative sequences ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            try:
                observation_ids.remove(rec_id)
            except ValueError:
                extra_ids.append(rec_id)

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file includes "
                             "observations not found in the BIOM table: %s"
                             % ', '.join(extra_ids))
        if observation_ids:
            error_msg.append("The representative set sequence file is missing "
                             "observation ids found in the BIOM table: %s" %
                             ', '.join(observation_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    # Validate the sequence specific phylogenetic tree (e.g. generated
    # by SEPP for Deblur), if it exists
    tree = None
    if 'plain_text' in files:
        phylogeny_fp = files['plain_text'][0]

        try:
            tree = TreeNode.read(phylogeny_fp)
            filepaths.append((phylogeny_fp, 'plain_text'))
        except Exception:
            return False, None, ("Phylogenetic tree cannot be parsed "
                                 "via scikit-bio")

    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta', 'plain_text'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    index_fp, viz_fp, qza_fp = _generate_html_summary(
        new_biom_fp, md, join(out_dir), is_analysis, tree)

    filepaths.append((index_fp, 'html_summary'))
    filepaths.append((viz_fp, 'html_summary_dir'))
    if 'qza' not in files:
        filepaths.append((qza_fp, 'qza'))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""
Example #47
 def testClusterNamingConventionsWithSomeUnnamed(self):
     tree = TreeNode.read(StringIO('((((A:11, B:12):10, D:9):20, F:20)G:30)root;'))
     clusters = Tree2Tax().named_clusters(tree, 0.05) #i.e. everything is a separate cluster
     self.assertSameClusters([['A'],['B'],['D'],['F']], clusters)
     assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], [c.name() for c in clusters])
Example #48
 def testClusterNamingWithBootstraps(self):
     tree = TreeNode.read(StringIO("((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"))
     clusters = Tree2Tax().named_clusters(tree, 40)
     self.assertSameClusters([['F'], _('A B D H')], clusters)
     assert_equals(_('G.2 G.1'), [c.name() for c in clusters])
Example #49
 def testNamingWithBootstraps(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)0.091:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters(tree, 0.05)
     self.assertSameClusters([['A'],['B'],['D']], clusters)
     assert_equals(_('Root.1 Root.2 Root.3'), [c.name() for c in clusters])
Example #50
 def testSimple(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters(tree, 0.25)
     self.assertSameClusters([['A','B'],['D']], clusters)
Example #51
 def testSistersOneIncompleteSister(self):
     ann = TreeAnnotator()
     tree = TreeNode.read(StringIO("((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"))
     print(tree.ascii_art())
     sisters = ann.find_sisters(tree, tree.find('B'))
     assert_equals(sorted(['g3']), sorted([s.name for s in sisters]))
Example #52
 def test_missing_taxonomy(self):
     tree = TreeNode.read(StringIO('((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'))
     assert_equals(['C'], TaxonomyFunctions().missing_taxonomy(tree, tree.find('A'), tree.find('E')))
     assert_equals([], TaxonomyFunctions().missing_taxonomy(tree, tree.find('A'), tree.find('A')))
     assert_equals(['E','C'], TaxonomyFunctions().missing_taxonomy(tree, tree.find('A'), tree.find('G')))
Example #53
 def testClusterIntoThree(self):
     tree = TreeNode.read(StringIO('((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'))
     clusters = Tree2Tax().named_clusters(tree, 25)
     self.assertSameClusters([_('A B'), _('D H'), ['F']], clusters)
Example #54
 def testSimpleTwice(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters_for_several_thresholds(tree, [0.25, 0.25])
     self.assertSameClusterSets([[0.25,[['A','B'],['D']]], [0.25,[['A','B'],['D']]]], clusters)
Example #55
def nj(dm, disallow_negative_branch_length=True, result_constructor=None):
    """ Apply neighbor joining for phylogenetic reconstruction.

    Parameters
    ----------
    dm : skbio.DistanceMatrix
        Input distance matrix containing distances between OTUs.
    disallow_negative_branch_length : bool, optional
        Neighbor joining can result in negative branch lengths, which don't
        make sense in an evolutionary context. If `True`, negative branch
        lengths will be returned as zero, a common strategy for handling this
        issue that was proposed by the original developers of the algorithm.
    result_constructor : function, optional
        Function to apply to construct the result object. This must take a
        newick-formatted string as input. The result of applying this function
        to a newick-formatted string will be returned from this function. This
        defaults to ``lambda x: TreeNode.read(StringIO(x), format='newick')``.

    Returns
    -------
    TreeNode
        By default, the result object is a `TreeNode`, though this can be
        overridden by passing `result_constructor`.

    See Also
    --------
    TreeNode.root_at_midpoint

    Notes
    -----
    Neighbor joining was initially described in Saitou and Nei (1987) [1]_. The
    example presented here is derived from the Wikipedia page on neighbor
    joining [2]_. The Phylip manual also describes the method [3]_ and Phylip
    itself provides an implementation which is useful for comparison.

    Neighbor joining, by definition, creates unrooted trees. One strategy for
    rooting the resulting trees is midpoint rooting, which is accessible as
    ``TreeNode.root_at_midpoint``.

    References
    ----------
    .. [1] Saitou N, and Nei M. (1987) "The neighbor-joining method: a new
       method for reconstructing phylogenetic trees." Molecular Biology and
       Evolution. PMID: 3447015.
    .. [2] http://en.wikipedia.org/wiki/Neighbour_joining
    .. [3] http://evolution.genetics.washington.edu/phylip/doc/neighbor.html

    Examples
    --------
    Define a new distance matrix object describing the distances between five
    OTUs: a, b, c, d, and e.

    >>> from skbio import DistanceMatrix
    >>> from skbio.tree import nj

    >>> data = [[0,  5,  9,  9,  8],
    ...         [5,  0, 10, 10,  9],
    ...         [9, 10,  0,  8,  7],
    ...         [9, 10,  8,  0,  3],
    ...         [8,  9,  7,  3,  0]]
    >>> ids = list('abcde')
    >>> dm = DistanceMatrix(data, ids)

    Construct the neighbor joining tree representing the relationship between
    those OTUs. This is returned as a TreeNode object.

    >>> tree = nj(dm)
    >>> print(tree.ascii_art())
              /-d
             |
             |          /-c
             |---------|
    ---------|         |          /-b
             |          \--------|
             |                    \-a
             |
              \-e

    Again, construct the neighbor joining tree, but instead return the newick
    string representing the tree, rather than the TreeNode object. (Note that
    in this example the string output is truncated when printed to facilitate
    rendering.)

    >>> newick_str = nj(dm, result_constructor=str)
    >>> print(newick_str[:55], "...")
    (d:2.000000, (c:4.000000, (b:3.000000, a:2.000000):3.00 ...

    """
    if dm.shape[0] < 3:
        raise ValueError(
            "Distance matrix must be at least 3x3 to "
            "generate a neighbor joining tree.")

    if result_constructor is None:
        result_constructor = \
            lambda x: TreeNode.read(StringIO(x), format='newick')

    # initialize variables
    node_definition = None

    # while there are still more than three distances in the distance matrix,
    # join neighboring nodes.
    while dm.shape[0] > 3:
        # compute the Q matrix
        q = _compute_q(dm)

        # identify the pair of nodes that have the lowest Q value. if multiple
        # pairs have equally low Q values, the first pair identified (closest
        # to the top-left of the matrix) will be chosen. these will be joined
        # in the current node.
        idx1, idx2 = _lowest_index(q)
        pair_member_1 = dm.ids[idx1]
        pair_member_2 = dm.ids[idx2]
        # determine the distance of each node to the new node connecting them.
        pair_member_1_len, pair_member_2_len = _pair_members_to_new_node(
            dm, idx1, idx2, disallow_negative_branch_length)
        # define the new node in newick style
        node_definition = "(%s:%f, %s:%f)" % (pair_member_1,
                                              pair_member_1_len,
                                              pair_member_2,
                                              pair_member_2_len)
        # compute the new distance matrix, which will contain distances of all
        # other nodes to this new node
        dm = _compute_collapsed_dm(
            dm, pair_member_1, pair_member_2,
            disallow_negative_branch_length=disallow_negative_branch_length,
            new_node_id=node_definition)

    # When there are three distances left in the distance matrix, we have a
    # fully defined tree. The last node is internal, and its distances are
    # defined by these last three values.
    # First determine the distance between the last two nodes to be joined in
    # a pair...
    pair_member_1 = dm.ids[1]
    pair_member_2 = dm.ids[2]
    pair_member_1_len, pair_member_2_len = \
        _pair_members_to_new_node(dm, pair_member_1, pair_member_2,
                                  disallow_negative_branch_length)
    # ...then determine their distance to the other remaining node, but first
    # handle the trivial case where the input dm was only 3 x 3
    node_definition = node_definition or dm.ids[0]
    internal_len = _otu_to_new_node(
        dm, pair_member_1, pair_member_2, node_definition,
        disallow_negative_branch_length=disallow_negative_branch_length)
    # ...and finally create the newick string describing the whole tree.
    newick = "(%s:%f, %s:%f, %s:%f);" % (pair_member_1, pair_member_1_len,
                                         node_definition, internal_len,
                                         pair_member_2, pair_member_2_len)

    # package the result as requested by the user and return it.
    return result_constructor(newick)
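
Since the Notes point to TreeNode.root_at_midpoint for rooting, a short usage sketch reusing the distance matrix from the docstring example:

from skbio import DistanceMatrix
from skbio.tree import nj

# Usage sketch: build the NJ tree from the docstring's example data and
# midpoint-root it, as suggested in the Notes above.
data = [[0, 5, 9, 9, 8],
        [5, 0, 10, 10, 9],
        [9, 10, 0, 8, 7],
        [9, 10, 8, 0, 3],
        [8, 9, 7, 3, 0]]
dm = DistanceMatrix(data, list('abcde'))
rooted_tree = nj(dm).root_at_midpoint()
print(rooted_tree.ascii_art())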
Example #56
 def result_constructor(x):
     return TreeNode.read(StringIO(x), format='newick')
Example #57
 def testOppositeSorting(self):
     tree = TreeNode.read(StringIO('((A:0.11, B:0.12)C:0.1, D:0.2)root;'))
     clusters = Tree2Tax().named_clusters_for_several_thresholds(tree, [0.05, 0.25])
     self.assertSameClusterSets([[0.05,[['A'],['B'],['D']]], [0.25,[['A','B'],['D']]]], clusters)
     assert_equals(_('C.1 C.2 Root'), [c.name() for c in clusters[0].clusters])