Exemplo n.º 1
0
 def test_fast_unifrac(self):
     """fast_unifrac should run over the whole tree for each fixture.

     This is a smoke test only: the returned values are not inspected
     here because the detailed correctness tests live in the fast_tree
     module.
     """
     tree = self.t
     # These three count fixtures must be accepted without raising.
     fast_unifrac(tree, self.env_counts)
     fast_unifrac(tree, self.missing_env_counts)
     fast_unifrac(tree, self.extra_tip_counts)
     # The wrong_tip_counts fixture must be rejected with ValueError.
     self.assertRaises(ValueError, fast_unifrac, tree,
                       self.wrong_tip_counts)
Exemplo n.º 2
0
 def test_fast_unifrac(self):
     """fast_unifrac should run over the whole tree for each fixture.

     This is a smoke test only: the returned values are not inspected
     here because the detailed correctness tests live in the fast_tree
     module.
     """
     tree = self.t
     # These three count fixtures must be accepted without raising.
     fast_unifrac(tree, self.env_counts)
     fast_unifrac(tree, self.missing_env_counts)
     fast_unifrac(tree, self.extra_tip_counts)
     # The wrong_tip_counts fixture must be rejected with ValueError.
     self.assertRaises(ValueError, fast_unifrac, tree,
                       self.wrong_tip_counts)
Exemplo n.º 3
0
def unifrac_distance_matrix(table, sample_ids, otu_ids, tree):
    """
    Weighted UniFrac distances between samples, computed by fast_unifrac

    Parameters
    ----------
    table : np.array
       Contingency table
       samples = rows
       observations = columns
    sample_ids : list, str
       List of sample ids
    otu_ids : list, str
       List of otu ids
    tree : str
       newick tree

    Returns
    -------
    pd.DataFrame :
       Unifrac distance matrix; rows and columns are labelled with the
       names fast_unifrac returns alongside the matrix
    """
    counts = pd.DataFrame(table, index=sample_ids, columns=otu_ids)
    # to_dict() nests by column first: {otu_id: {sample_id: count}}
    result = fast_unifrac(tree, counts.to_dict(), weighted=True)
    distance_entry = result['distance_matrix']
    values = distance_entry[0]
    labels = distance_entry[1]
    return pd.DataFrame(values, index=labels, columns=labels)
Exemplo n.º 4
0
def timing(tree_size, num_trees, num_samples):
    """Time EMDUnifrac (with and without flow) against FastUnifrac.

    ``num_trees`` random trees are generated, each populated with
    ``tree_size``; for every tree, ``num_samples`` simulated environment
    sets are run through the three methods and each call is timed with
    ``timeit.default_timer``.

    Returns a tuple of mean elapsed times ordered as
    (EMDUnifrac, EMDUnifrac with flow, FastUnifrac).
    """
    clock = timeit.default_timer
    emd_times = []
    emd_flow_times = []
    fast_times = []
    for _tree_idx in range(num_trees):
        random_tree = Tree()
        random_tree.populate(tree_size, random_branches=True)
        newick = random_tree.write(format=1)
        cogent_tree = DndParser(newick, UniFracTreeNode)
        T, l, nodes_in_order = EMDU.parse_tree(newick)
        for _sample_idx in range(num_samples):
            # FastUnifrac can only take weight on leaf nodes
            envs = EMDU.simulate_data(random_tree.get_leaf_names())
            envs_prob_dict, samples = EMDU.parse_envs(envs, nodes_in_order)
            P = envs_prob_dict[samples[0]]
            Q = envs_prob_dict[samples[1]]

            # EMDUnifrac with flow
            started = clock()
            _z, _flow, _diffab = EMDU.EMDUnifrac_weighted_flow(
                T, l, nodes_in_order, P, Q)
            emd_flow_times.append(clock() - started)

            # EMDUnifrac no flow
            started = clock()
            _z, _diffab = EMDU.EMDUnifrac_weighted(T, l, nodes_in_order, P, Q)
            emd_times.append(clock() - started)

            # FastUnifrac weighted
            started = clock()
            fast_unifrac(cogent_tree, envs, weighted=True,
                         modes={'distance_matrix'})
            fast_times.append(clock() - started)
    return tuple(np.array(times).mean()
                 for times in (emd_times, emd_flow_times, fast_times))
Exemplo n.º 5
0
def unifrac_distance_rows(data, samples_arg=None, otus_arg=None, tree_arg=None, sample_filter=None, otu_filter=None):
    """Return the UniFrac distance matrix for the rows of ``data``.

    ``samples_arg``, ``otus_arg`` and ``tree_arg`` may each be the value
    itself, a zero-argument callable producing it, or None to fall back to
    ``get_default_samples()``, ``get_default_otus()`` and
    ``get_default_tree(otus)`` respectively.  ``sample_filter`` and
    ``otu_filter`` default to empty lists.

    If a previously calculated matrix exists for ``data`` and the filters
    it is returned directly.  Otherwise the matrix is computed with
    ``fast_unifrac``, reordered by the original samples, adjusted for
    filtered samples, passed through ``np.nan_to_num``, squared
    element-wise when ``SQUARE_UNIFRAC_DISTANCE`` is truthy, saved, and
    returned.
    """
    DEBUG("Starting unifrac_distance_rows...")
    sample_filter = [] if sample_filter is None else sample_filter
    otu_filter = [] if otu_filter is None else otu_filter

    # Import under a warnings filter so anything cogent warns about at
    # import time is suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from cogent.maths.unifrac.fast_unifrac import fast_unifrac

    def _resolve(arg, make_default):
        # None -> default factory; callable -> call it; otherwise use as is.
        if arg is None:
            return make_default()
        return arg() if callable(arg) else arg

    samples = _resolve(samples_arg, lambda: get_default_samples())
    otus = _resolve(otus_arg, lambda: get_default_otus())
    tree = _resolve(tree_arg, lambda: get_default_tree(otus))

    cached = __get_precalculated_unifrac_file_if_exists_for_data(data, sample_filter, otu_filter)
    if cached is not None:
        DEBUG("Found previously calculated Unifrac data")
        return cached

    DEBUG("Preparing data dictionary...")
    data_dict = __unifrac_prepare_dictionary_from_matrix_rows(data, samples, otus, sample_filter, otu_filter)
    DEBUG("Running fast_unifrac...")
    unifrac = fast_unifrac(tree, data_dict, weighted=WEIGHTED_UNIFRAC)
    DEBUG("Unifrac results: {0}".format(unifrac))
    DEBUG("Reordering results...")
    mat = __reorder_unifrac_distance_matrix_by_original_samples(unifrac['distance_matrix'], samples, sample_filter, otu_filter)

    DEBUG("Setting distances for filtered items to large values...")
    filtered_positions = [pos for pos, name in enumerate(samples) if name in sample_filter]
    mat = __increase_distance_for_filtered_samples(mat, filtered_positions)

    DEBUG("Fixing NaN/inf values...")
    mat = np.nan_to_num(mat)

    if SQUARE_UNIFRAC_DISTANCE:
        mat = np.multiply(mat, mat)

    __save_calculated_unifrac_file_and_hash_for_data(data, sample_filter, otu_filter, mat)
    DEBUG("Finished calculating Samples distance matrix.")
    return mat
Exemplo n.º 6
0
def calculate_unifrac(abund, sample_names, taxa_tree):
    """
    computes the unifrac distance between samples twice: first
    unweighted, then weighted
    @param abund: the abundance matrix
    @param sample_names: the sample names
    @param taxa_tree: the tree of data
    @return: (unweighted matrix, row names), (weighted matrix, row names)
    @rtype: tuple
    """
    envs = _create_unifrac_dict(abund, sample_names, taxa_tree)
    cogent_tree = dendropy_to_cogent(taxa_tree)

    def _matrix_and_rows(use_weights):
        # one fast_unifrac run restricted to the distance-matrix mode
        outcome = fast_unifrac(cogent_tree, envs, modes={UNIFRAC_DIST_MATRIX},
                               is_symmetric=True, weighted=use_weights)
        entry = outcome[UNIFRAC_DIST_MATRIX]
        return entry[0], entry[1]

    unweighted_pair = _matrix_and_rows(False)
    weighted_pair = _matrix_and_rows(True)
    return unweighted_pair, weighted_pair
Exemplo n.º 7
0
 def test_make_unifrac_metric(self):
     """ unweighted unifrac metric wrapper should run and agree with fast_unifrac"""
     data = self.l19_data
     taxa = self.l19_taxon_names
     samples = self.l19_sample_names

     tree = parse_newick(self.l19_treestr, PhyloNode)
     metric_fn = make_unifrac_metric(False, unifrac, True)
     observed = metric_fn(data, taxa, tree, samples)

     # Reference: call fast_unifrac directly and reorder to sample order.
     envs = make_envs_dict(data, samples, taxa)
     direct = fast_unifrac(tree, envs, modes=["distance_matrix"])
     ref_matrix, ref_names = direct["distance_matrix"]
     expected = _reorder_unifrac_res([ref_matrix, ref_names], samples)

     self.assertFloatEqual(observed, expected)
     self.assertEqual(observed[0, 0], 0)
     self.assertEqual(observed[0, 3], 0.0)
     self.assertNotEqual(observed[0, 1], 1.0)
Exemplo n.º 8
0
 def unifrac_pycogent(self):
     """Step 3 with Pycogent.

     Reads the newick tree stored at ``self.fasttree_tree``, runs
     ``fast_unifrac`` on it with ``self.tax.otu_table.to_dict()`` as the
     environments, and writes the resulting distance matrix to
     ``self.distances_csv`` as tab-separated text.
     """
     # Use a context manager so the tree file is closed deterministically
     # instead of relying on garbage collection.
     with open(self.fasttree_tree, 'r') as handle:
         tree_newick = handle.read()
     # cogent is imported inside the method, as in the original code.
     from cogent.parse.tree import DndParser
     from cogent.maths.unifrac.fast_tree import UniFracTreeNode
     tree = DndParser(tree_newick, UniFracTreeNode)
     from cogent.maths.unifrac.fast_unifrac import fast_unifrac
     distances = fast_unifrac(tree, self.tax.otu_table.to_dict())
     # Make a dataframe labelled on both axes by the returned names #
     names = distances['distance_matrix'][1]
     df = pandas.DataFrame(distances['distance_matrix'][0], index=names, columns=names)
     df.to_csv(self.distances_csv, sep='\t', float_format='%.5g')
Exemplo n.º 9
0
    def result(data, taxon_names, tree, sample_names, **kwargs):
        """ runs fast_unifrac and returns only the distance matrix, reordered

            sample_names: list of unique strings

            Extra keyword arguments are forwarded to fast_unifrac; weighted,
            metric and is_symmetric come from the enclosing scope.
        """
        env_map = make_envs_dict(data, sample_names, taxon_names)
        distance_entry = fast_unifrac(
            tree,
            env_map,
            weighted=weighted,
            metric=metric,
            is_symmetric=is_symmetric,
            modes=["distance_matrix"],
            **kwargs
        )["distance_matrix"]
        return _reorder_unifrac_res(distance_entry, sample_names)
Exemplo n.º 10
0
    def result(data, taxon_names, tree, sample_names, **kwargs):
        """ runs fast_unifrac and returns only the distance matrix, reordered

            sample_names: list of unique strings

            Extra keyword arguments are forwarded to fast_unifrac; weighted,
            metric and is_symmetric come from the enclosing scope.
        """
        env_map = make_envs_dict(data, sample_names, taxon_names)
        distance_entry = fast_unifrac(
            tree,
            env_map,
            weighted=weighted,
            metric=metric,
            is_symmetric=is_symmetric,
            modes=["distance_matrix"],
            **kwargs
        )['distance_matrix']
        return _reorder_unifrac_res(distance_entry, sample_names)
Exemplo n.º 11
0
    def test_unifrac_explicit(self):
        """unifrac should reproduce the hand-calculated distance matrix.

        environment M contains only tips not in tree, tip j is in no envs
        values were calculated by hand
        """
        # note c,j is len 0 node
        first_tree = DndParser('((a:1,b:2):4,((c:3, j:17),(d:1,e:1):2):3)',
                               UniFracTreeNode)
        #           /-------- /-a
        # ---------|          \-b
        #          |          /-------- /-c
        #           \--------|          \-j
        #                     \-------- /-d
        #                               \-e

        env_str = """
        a   A   1
        a   C   2
        b   A   1
        b   B   1
        c   B   1
        d   B   3
        e   C   1
        m   M   88"""
        env_counts = count_envs(env_str.splitlines())

        # Expected (matrix, env order); shared by both tree variants.
        expected = (
            array([[0, 10/16, 8/13],
                   [10/16, 0, 8/17],
                   [8/13, 8/17, 0]]),
            ['A', 'B', 'C'],
        )
        observed = fast_unifrac(first_tree, env_counts)['distance_matrix']
        self.assertFloatEqual(observed, expected)

        # changing tree topology relative to c,j tips shouldn't change
        # anything
        second_tree = DndParser('((a:1,b:2):4,((c:2, j:16):1,(d:1,e:1):2):3)',
                                UniFracTreeNode)
        observed = fast_unifrac(second_tree, env_counts)['distance_matrix']
        self.assertFloatEqual(observed, expected)
Exemplo n.º 12
0
    def test_unifrac_explicit(self):
        """unifrac should reproduce the hand-calculated distance matrix.

        environment M contains only tips not in tree, tip j is in no envs
        values were calculated by hand
        """
        # note c,j is len 0 node
        first_tree = DndParser('((a:1,b:2):4,((c:3, j:17),(d:1,e:1):2):3)',
                               UniFracTreeNode)
        #           /-------- /-a
        # ---------|          \-b
        #          |          /-------- /-c
        #           \--------|          \-j
        #                     \-------- /-d
        #                               \-e

        env_str = """
        a   A   1
        a   C   2
        b   A   1
        b   B   1
        c   B   1
        d   B   3
        e   C   1
        m   M   88"""
        env_counts = count_envs(env_str.splitlines())

        # Expected (matrix, env order); shared by both tree variants.
        expected = (
            array([[0, 10/16, 8/13],
                   [10/16, 0, 8/17],
                   [8/13, 8/17, 0]]),
            ['A', 'B', 'C'],
        )
        observed = fast_unifrac(first_tree, env_counts)['distance_matrix']
        self.assertFloatEqual(observed, expected)

        # changing tree topology relative to c,j tips shouldn't change
        # anything
        second_tree = DndParser('((a:1,b:2):4,((c:2, j:16):1,(d:1,e:1):2):3)',
                                UniFracTreeNode)
        observed = fast_unifrac(second_tree, env_counts)['distance_matrix']
        self.assertFloatEqual(observed, expected)
Exemplo n.º 13
0
 def test_fast_unifrac_one_sample2(self):
     """fu one sam should match whole weighted unifrac result, for env 'B'"""
     # Reference: row 'B' of the full weighted matrix, sorted by env name.
     full = fast_unifrac(self.t, self.env_counts, weighted=True)
     full_matrix, full_order = full['distance_matrix']
     expected = full_matrix[full_order.index('B')][argsort(full_order)]

     # Candidate: the one-sample vector for 'B', sorted the same way.
     observed, observed_order = fast_unifrac_one_sample(
         'B', self.t, self.env_counts, weighted=True)
     observed = observed[argsort(observed_order)]

     self.assertFloatEqual(observed, expected)
Exemplo n.º 14
0
    def test_fast_unifrac_one_sample2(self):
        """fu one sam should match whole weighted unifrac result, for env 'B'"""
        # Reference: row 'B' of the full weighted matrix, sorted by env name.
        full = fast_unifrac(self.t, self.env_counts, weighted=True)
        full_matrix, full_order = full['distance_matrix']
        expected = full_matrix[full_order.index('B')][argsort(full_order)]

        # Candidate: the one-sample vector for 'B', sorted the same way.
        observed, observed_order = fast_unifrac_one_sample(
            'B', self.t, self.env_counts, weighted=True)
        observed = observed[argsort(observed_order)]

        self.assertFloatEqual(observed, expected)
Exemplo n.º 15
0
 def unifrac_pycogent(self):
     """Step 3 with Pycogent.

     Reads the newick tree stored at ``self.fasttree_tree``, runs
     ``fast_unifrac`` on it with ``self.tax.otu_table.to_dict()`` as the
     environments, and writes the resulting distance matrix to
     ``self.distances_csv`` as tab-separated text.
     """
     # Use a context manager so the tree file is closed deterministically
     # instead of relying on garbage collection.
     with open(self.fasttree_tree, 'r') as handle:
         tree_newick = handle.read()
     # cogent is imported inside the method, as in the original code.
     from cogent.parse.tree import DndParser
     from cogent.maths.unifrac.fast_tree import UniFracTreeNode
     tree = DndParser(tree_newick, UniFracTreeNode)
     from cogent.maths.unifrac.fast_unifrac import fast_unifrac
     distances = fast_unifrac(tree, self.tax.otu_table.to_dict())
     # Make a dataframe labelled on both axes by the returned names #
     names = distances['distance_matrix'][1]
     df = pandas.DataFrame(distances['distance_matrix'][0],
                           index=names,
                           columns=names)
     df.to_csv(self.distances_csv, sep='\t', float_format='%.5g')
Exemplo n.º 16
0
 def test_make_unifrac_metric(self):
     """ unweighted unifrac metric wrapper should run and agree with fast_unifrac"""
     data = self.l19_data
     taxa = self.l19_taxon_names
     samples = self.l19_sample_names

     tree = parse_newick(self.l19_treestr, PhyloNode)
     metric_fn = make_unifrac_metric(False, unifrac, True)
     observed = metric_fn(data, taxa, tree, samples)

     # Reference: call fast_unifrac directly and reorder to sample order.
     envs = make_envs_dict(data, samples, taxa)
     direct = fast_unifrac(tree, envs, modes=['distance_matrix'])
     ref_matrix, ref_names = direct['distance_matrix']
     expected = _reorder_unifrac_res([ref_matrix, ref_names], samples)

     self.assertFloatEqual(observed, expected)
     self.assertEqual(observed[0,0], 0)
     self.assertEqual(observed[0,3], 0.0)
     self.assertNotEqual(observed[0,1], 1.0)
Exemplo n.º 17
0
def unifrac2(sample1, sample2, tree, repetitions=1, subsampleSize='auto'):
    distances = []
    if subsampleSize == 'auto':
        subsampleSize = int(min(sample1.size, sample2.size)*.8) # 80% of the smaller sample
    for i in range(repetitions):
        subsample1  = sample1.subsample(subsampleSize) 
        subsample2  = sample2.subsample(subsampleSize) 
        allOtus = set(subsample1.keys()).union(subsample2.keys())
        envs = dict([(otu, makeOTUdict(otu, subsample1, subsample2, sample1, sample2)) for otu in allOtus])
        #pdb.set_trace()
        res = fast_unifrac(tree, envs, weighted=True)
        try:
            distances.append(res['distance_matrix'][0][0,1])
        except:
            pdb.set_trace()
    print subsampleSize, np.array(distances).mean()
    return np.array(distances)
Exemplo n.º 18
0
def unifrac2(sample1, sample2, tree, repetitions=1, subsampleSize='auto'):
    distances = []
    if subsampleSize == 'auto':
        subsampleSize = int(min(sample1.size, sample2.size)*.8) # 80% of the smaller sample
    for i in range(repetitions):
        subsample1  = sample1.subsample(subsampleSize) 
        subsample2  = sample2.subsample(subsampleSize) 
        allOtus = set(subsample1.keys()).union(subsample2.keys())
        envs = dict([(otu, makeOTUdict(otu, subsample1, subsample2, sample1, sample2)) for otu in allOtus])
        #pdb.set_trace()
        res = fast_unifrac(tree, envs, weighted=True)
        try:
            distances.append(res['distance_matrix'][0][0,1])
        except:
            pdb.set_trace()
    print subsampleSize, np.array(distances).mean()
    return np.array(distances)
Exemplo n.º 19
0
 def test_fast_unifrac_one_sample3(self):
     """fu one sam should match missing env unifrac result for env 'C'; 'B' raises"""
     counts = self.missing_env_counts

     # Reference: row 'C' of the full unweighted matrix, sorted by env name.
     full = fast_unifrac(self.t, counts, weighted=False)
     full_matrix, full_order = full['distance_matrix']
     expected = full_matrix[full_order.index('C')][argsort(full_order)]

     # Candidate: the one-sample vector for 'C', sorted the same way.
     observed, observed_order = fast_unifrac_one_sample(
         'C', self.t, counts, weighted=False)
     observed = observed[argsort(observed_order)]
     self.assertFloatEqual(observed, expected)

     # and should raise valueerror when 'B'
     self.assertRaises(ValueError, fast_unifrac_one_sample, 'B', self.t,
                       counts, weighted=False)
Exemplo n.º 20
0
    def test_fast_unifrac_one_sample3(self):
        """fu one sam should match missing env unifrac result for env 'C'; 'B' raises"""
        counts = self.missing_env_counts

        # Reference: row 'C' of the full unweighted matrix, sorted by env name.
        full = fast_unifrac(self.t, counts, weighted=False)
        full_matrix, full_order = full['distance_matrix']
        expected = full_matrix[full_order.index('C')][argsort(full_order)]

        # Candidate: the one-sample vector for 'C', sorted the same way.
        observed, observed_order = fast_unifrac_one_sample(
            'C', self.t, counts, weighted=False)
        observed = observed[argsort(observed_order)]
        self.assertFloatEqual(observed, expected)

        # and should raise valueerror when 'B'
        self.assertRaises(ValueError, fast_unifrac_one_sample, 'B', self.t,
                          counts, weighted=False)
Exemplo n.º 21
0
def unifrac(p1, p2, sample_ids, otu_ids, tree):
    """
    Creates UniFrac distance between two urns

    Parameters
    ----------
    p1 : np.array
      Urn 1
    p2 : np.array
      Urn 2
    sample_ids : list, str
      Row labels for the two urns, in the order (p1, p2)
    otu_ids : list, str
      Column labels, one per entry of each urn
    tree :
      Tree handed unchanged to fast_unifrac

    Returns
    -------
    scalar :
       Weighted UniFrac distance between the two urns: the entry at
       row 1, column 0 of the distance matrix
    """
    # Build the table first; to_dict() then gives the nested
    # {otu_id: {sample_id: count}} mapping that is passed to fast_unifrac.
    df = pd.DataFrame([p1, p2], index=sample_ids, columns=otu_ids)
    env = df.to_dict()
    res = fast_unifrac(tree, env, weighted=True)
    dist_mat = pd.DataFrame(res['distance_matrix'][0],
                            index=res['distance_matrix'][1],
                            columns=res['distance_matrix'][1])
    # Positional lookup: .ix was removed from pandas, and with two urns the
    # off-diagonal entry is the distance whatever the labels are.
    return dist_mat.iloc[1, 0]
Exemplo n.º 22
0
def unifrac_upgma(table, sample_ids, otu_ids, tree):
    """
    Clusters samples by weighted UniFrac via fast_unifrac's 'cluster_envs' mode

    Parameters
    ----------
    table : np.array
       Contingency table
       samples = rows
       observations = columns
    sample_ids : list, str
       List of sample ids
    otu_ids : list, str
       List of otu ids
    tree : str
       newick tree

    Returns
    -------
    skbio.TreeNode :
       Tree representation of clustering
    """
    # The table comes in as the `table` parameter; the previous code read
    # an undefined name here and raised NameError on every call.
    df = pd.DataFrame(table, index=sample_ids, columns=otu_ids)
    env = df.to_dict()
    res = fast_unifrac(tree, env, weighted=True, modes=['cluster_envs'])
    # Round-trip through the cluster tree's string form to obtain a TreeNode.
    return TreeNode.read(StringIO(str(res['cluster_envs'])))
Exemplo n.º 23
0
    def test_unifrac_make_subtree(self):
        """unifrac result should be the same with and without make_subtree

        environment M contains only tips not in tree, tips j and k are in
        no envs
        one clade is missing entirely
        values were calculated by hand
        we also test that we still have a valid tree at the end
        """
        base_newick = '((a:1,b:2):4,((c:3, (j:1,k:2)mt:17),(d:1,e:1):2):3)'
        t1 = DndParser(base_newick, UniFracTreeNode) # note c,j is len 0 node
        #           /-------- /-a
        # ---------|          \-b
        #          |          /-------- /-c
        #           \--------|          \mt------ /-j
        #                    |                    \-k
        #                     \-------- /-d
        #                               \-e
        #

        env_str = """
        a   A   1
        a   C   2
        b   A   1
        b   B   1
        c   B   1
        d   B   3
        e   C   1
        m   M   88"""
        env_counts = count_envs(env_str.splitlines())

        # Expected (matrix, env order) for every tree/make_subtree combination.
        expected = (
            array([[0, 10/16, 8/13],
                   [10/16, 0, 8/17],
                   [8/13, 8/17, 0]]),
            ['A', 'B', 'C'],
        )

        def check_both_modes(tree):
            # make_subtree=False first, then True, as in the original order
            for subtree_flag in (False, True):
                observed = fast_unifrac(
                    tree, env_counts, make_subtree=subtree_flag)['distance_matrix']
                self.assertFloatEqual(observed, expected)

        check_both_modes(t1)

        # changing tree topology relative to c,j tips shouldn't change anything
        t2 = DndParser('((a:1,b:2):4,((c:2, (j:1,k:2)mt:17):1,(d:1,e:1):2):3)',
                       UniFracTreeNode)
        check_both_modes(t2)

        # ensure we haven't meaningfully changed the tree
        # by passing it to unifrac: compare t1 against a fresh parse
        t3 = DndParser(base_newick, UniFracTreeNode) # note c,j is len 0 node
        t1_tips = sorted(tip.Name for tip in t1.tips())
        t3_tips = sorted(tip.Name for tip in t3.tips())
        self.assertEqual(t1_tips, t3_tips)

        fresh_j = t3.getNodeMatchingName('j')
        fresh_b = t3.getNodeMatchingName('b')
        used_j = t1.getNodeMatchingName('j')
        used_b = t1.getNodeMatchingName('b')
        self.assertFloatEqual(used_j.distance(used_b), fresh_j.distance(fresh_b))
Exemplo n.º 24
0
def table2dict(lines):
    '''Convert an OTU table into a nested dictionary of counts

    lines must be an iterator over tab-separated text lines (an open file
    works).  The first line is the header; every field after the first is a
    sample name.  Each following line holds an OTU name and one integer
    count per sample.  The OTU name is passed through the module-level
    translation table ``tr``.

    Returns {otu: {sample: count}}.  Blank lines are skipped.
    '''
    header_line = next(lines)
    samples = header_line.rstrip().split("\t")[1:]

    dat = {}
    for line in lines:
        # A blank line (e.g. at the end of the file) would otherwise be
        # stored as an OTU named '' with no counts.
        if not line.strip():
            continue
        fields = line.rstrip().split("\t")
        # str.translate does what the Python 2-only string.translate()
        # function did, and also exists on Python 3.
        otu = fields[0].translate(tr)
        counts = [int(x) for x in fields[1:]]
        dat[otu] = dict(zip(samples, counts))

    return dat


# Load the newick tree text and parse it with the UniFrac node class.
with open('tree.newick') as f:
    raw_tree = f.read()
tree = DndParser(raw_tree, UniFracTreeNode)

# Load the OTU count table as {otu: {sample: count}}.
with open('../../../../data/rdp_g.counts') as f:
    envs = table2dict(f)

# write the weighted and unweighted tables, one output file per mode
for weighted, fn in ((True, 'unifrac-w.dat'), (False, 'unifrac-uw.dat')):
    res = fast_unifrac(tree, envs, weighted=weighted)
    matrix, samples = res['distance_matrix']
    df = pd.DataFrame(matrix, index=samples, columns=samples)
    # index=False: only the header row carries the sample names
    df.to_csv(fn, index=False, sep='\t')
Exemplo n.º 25
0
    def test_unifrac_make_subtree(self):
        """unifrac result should be the same with and without make_subtree

        environment M contains only tips not in tree, tips j and k are in
        no envs
        one clade is missing entirely
        values were calculated by hand
        we also test that we still have a valid tree at the end
        """
        base_newick = '((a:1,b:2):4,((c:3, (j:1,k:2)mt:17),(d:1,e:1):2):3)'
        t1 = DndParser(base_newick, UniFracTreeNode) # note c,j is len 0 node
        #           /-------- /-a
        # ---------|          \-b
        #          |          /-------- /-c
        #           \--------|          \mt------ /-j
        #                    |                    \-k
        #                     \-------- /-d
        #                               \-e
        #

        env_str = """
        a   A   1
        a   C   2
        b   A   1
        b   B   1
        c   B   1
        d   B   3
        e   C   1
        m   M   88"""
        env_counts = count_envs(env_str.splitlines())

        # Expected (matrix, env order) for every tree/make_subtree combination.
        expected = (
            array([[0, 10/16, 8/13],
                   [10/16, 0, 8/17],
                   [8/13, 8/17, 0]]),
            ['A', 'B', 'C'],
        )

        def check_both_modes(tree):
            # make_subtree=False first, then True, as in the original order
            for subtree_flag in (False, True):
                observed = fast_unifrac(
                    tree, env_counts, make_subtree=subtree_flag)['distance_matrix']
                self.assertFloatEqual(observed, expected)

        check_both_modes(t1)

        # changing tree topology relative to c,j tips shouldn't change anything
        t2 = DndParser('((a:1,b:2):4,((c:2, (j:1,k:2)mt:17):1,(d:1,e:1):2):3)',
                       UniFracTreeNode)
        check_both_modes(t2)

        # ensure we haven't meaningfully changed the tree
        # by passing it to unifrac: compare t1 against a fresh parse
        t3 = DndParser(base_newick, UniFracTreeNode) # note c,j is len 0 node
        t1_tips = sorted(tip.Name for tip in t1.tips())
        t3_tips = sorted(tip.Name for tip in t3.tips())
        self.assertEqual(t1_tips, t3_tips)

        fresh_j = t3.getNodeMatchingName('j')
        fresh_b = t3.getNodeMatchingName('b')
        used_j = t1.getNodeMatchingName('j')
        used_b = t1.getNodeMatchingName('b')
        self.assertFloatEqual(used_j.distance(used_b), fresh_j.distance(fresh_b))
Exemplo n.º 26
0
			(envs_prob_dict, samples) = EMDU.parse_envs(envs, nodes_in_order)
			P = envs_prob_dict[samples[0]]
			Q = envs_prob_dict[samples[1]]
			#EMDUnifrac with flow
			t0 = timeit.default_timer()
			(Z, Flow, diffab) = EMDU.EMDUnifrac_weighted_flow(T, l, nodes_in_order, P, Q)
			t1 = timeit.default_timer()
			EMDUnifrac_flow_times.append(t1-t0)
			#EMDUnifrac no flow
			t0 = timeit.default_timer()
			(Z, diffab) = EMDU.EMDUnifrac_weighted(T, l, nodes_in_order, P, Q)
			t1 = timeit.default_timer()
			EMDUnifrac_times.append(t1-t0)
			#FastUnifrac_ weighted
			t0 = timeit.default_timer()
			res = fast_unifrac(tr, envs, weighted=True, modes=set(['distance_matrix']))
			t1 = timeit.default_timer()
			FastUnifrac_times.append(t1-t0)
			i = i+1
	#Save means
	mean_EMDUnifrac_times[tree_sizes.index(tree_size)] = np.array(EMDUnifrac_times).mean()
	mean_EMDUnifrac_flow_times[tree_sizes.index(tree_size)] = np.array(EMDUnifrac_flow_times).mean()
	mean_FastUnifrac_times[tree_sizes.index(tree_size)] = np.array(FastUnifrac_times).mean()

# Export all mean times: each array of mean timings (filled in above, one
# slot per tree size) is written to its own text file with np.savetxt.
np.savetxt('EMDU_mean_times.txt', mean_EMDUnifrac_times, delimiter=',')
np.savetxt('EMDU_flow_mean_times.txt', mean_EMDUnifrac_flow_times, delimiter=',')
np.savetxt('FastUnifrac__mean_times.txt', mean_FastUnifrac_times, delimiter=',')


Exemplo n.º 27
0
def unifrac_recursive_test(ref_tree, tree, sample_names,
                           taxon_names, data, permutations=1000):  # , metric=weighted):
    """Performs UniFrac recursively over a tree.

    Specifically, for each node in the tree, performs UniFrac clustering.
    Then compares the UniFrac tree to a reference tree of the same taxa using
    the tip-to-tip distances and the subset distances. Assumption is that if
    the two trees match, the node represents a group in which evolution has
    mirrored the evolution of the reference tree.

    ref_tree: reference tree that the clustering is supposed to match. Its
          tip names are shuffled to build the permuted trees.
    tree: contains the tree on which UniFrac will be performed recursively.
    sample_names, taxon_names: passed, together with data.T, to
          make_envs_dict to build the environments for UniFrac clustering.
    data: OTU table. Its rows are selected with positions taken from
          taxon_names and its columns are counted as hosts, so rows are
          presumably taxa and columns samples -- verify against the caller.
    permutations: number of label-permuted copies of ref_tree to compare
          each node against.

    The clustering is always run unweighted (weighted=False); the metric
    argument is commented out of the signature.

    Returns a tuple (results_dict, acc_dict) of parallel lists, with one
    entry per node whose cluster tree has more than two tips:
      results_dict keys: 'p_vals', 's_tips', 'h_tips', 's_nodes', 'h_nodes'
      acc_dict keys:     'lengths', 'dists', 'sets', 'dist_below'
    'p_vals' holds, per node, the fraction of permuted trees whose subset
    distance to the cluster tree was less than or equal to that of ref_tree.

    Typically, will want to estimate significance by comparing the actual
    values from ref_tree to values obtained with one or more shuffled versions
    of ref_tree (can make these with permute_tip_labels).


    Note from Jon:

    I've modified this code a bit to test each node against a set of label-
    permuted host trees, and return some additional information about each node.

    It doesn't appear to give sensible results, not sure why. Almost none of the
    resulting permutations yield any other than zero or the number of permuta-
    tions. In other words, every permutation yields either a better or worse
    match than the true tree.
    """
    UNIFRAC_CLUST_ENVS = "cluster_envs"

    # Parallel accumulators; a node contributes to all of them or to none.
    lengths, dists, sets, s_nodes, h_nodes, dist_below, sets_below, h_tips, s_tips = [
    ], [], [], [], [], [], [], [], []

    # Permute host tips, store permuted trees in a list of tree objects
    # print "Permuting host tree..."

    permuted_trees = []
    host_names = ref_tree.getTipNames()
    random_names = ref_tree.getTipNames()
    # Earlier variant that stored the permuted trees as strings:
    # for i in range(permutations):
    #   shuffle(random_names)
    #   permute_dict = dict(zip(host_names,random_names))
    #   permuted_subtree = ref_tree.copy()
    #   permuted_subtree.reassignNames(permute_dict)
    #   permuted_trees.append(str(permuted_subtree))
    #
    # alt:
    for i in range(permutations):
        # random_names is shuffled in place, so each permutation starts from
        # the previous shuffle rather than from the original order.
        shuffle(random_names)
        permute_dict = dict(zip(host_names, random_names))
        permuted_subtree = ref_tree.copy()
        permuted_subtree.reassignNames(permute_dict)
        permuted_trees.append(permuted_subtree)

    # Values clipped to the range [0, 1]; for counts this is a
    # presence/absence matrix.
    interaction = data.clip(0, 1)
    # Parse OTU table data into Unifrac-compatible envs

    envs = make_envs_dict(data.T, sample_names, taxon_names)

    # Pass host tree, new OTU tree, and envs to recursive unifrac
    # print "Performing recursive Unifrac analysis..."

    for node in tree.traverse(self_before=True, self_after=False):

        #pause = raw_input("pause!")
        # print node
        try:
            result = fast_unifrac(
                node, envs, weighted=False, modes=set([UNIFRAC_CLUST_ENVS]))
            curr_tree = result[UNIFRAC_CLUST_ENVS]
        except ValueError:
            # hit a single node?
            continue
        except AttributeError:
            # hit a zero branch length
            continue
        if curr_tree is None:
            # hit single node?
            continue
        try:
            l = len(curr_tree.tips())
            d = curr_tree.compareByTipDistances(ref_tree)
            s = curr_tree.compareBySubsets(ref_tree, True)

            # Counters for how many permuted trees the observed value is
            # greater than or equal to; turned into fractions below.
            d_b = 0.0
            s_b = 0.0

            # Earlier variant that re-parsed each permuted tree from a string:
            # for rand_tree_string in permuted_trees:
            #   rand_tree = DndParser(rand_tree_string)
            #   if d >= curr_tree.compareByTipDistances(rand_tree):
            #       d_b += 1
            #   if s >= curr_tree.compareBySubsets(rand_tree):
            #       s_b += 1

            # NOTE(review): `s` above is computed with a second positional
            # argument of True, but the comparison against each permuted tree
            # omits it. The observed and permuted values may therefore not be
            # on the same footing, which could explain the all-or-nothing
            # results described in the docstring -- confirm against the
            # compareBySubsets documentation.
            for rand_tree in permuted_trees:
                if d >= curr_tree.compareByTipDistances(rand_tree):
                    d_b += 1
                if s >= curr_tree.compareBySubsets(rand_tree):
                    s_b += 1

            # NOTE(review): with permutations=0 these divisions raise
            # ZeroDivisionError, which the except clause below does not catch.
            d_b = d_b / float(len(permuted_trees))
            s_b = s_b / float(len(permuted_trees))

            # The following section generates s_tips and h_tips variables
            # get just OTUs in this node
            otu_subset = node.getTipNames()
            s_tips_tmp = 0
            h_tips_tmp = 0
            s_vec = []
            # find positional index (from OTU table) for each cOTU represented
            # in this node:
            for i in range(len(taxon_names)):
                if taxon_names[i] in otu_subset:
                    s_tips_tmp += 1
                    s_vec.append(i)

            # slice interaction matrix down to only cOTUs in this node
            i_s_slice = interaction[numpy.ix_(s_vec)]

            # count the hosts represented in this node: a column whose sum
            # over the selected rows is non-zero counts as one host
            for j in range(i_s_slice.shape[1]):
                if i_s_slice[:, j].sum():
                    h_tips_tmp += 1

            # want to calculate all values before appending so we can bail out
            # if any of the calculations fails: this ensures that the lists
            # remain synchronized.

            """
            print curr_tree.asciiArt()
            print ref_tree.asciiArt()
            print l
            print d
            print d_b
            print s
            print s_b
            print node
            
            pause = raw_input("pause!")
            """

            # Only nodes whose cluster tree has more than two tips are recorded.
            if l > 2:
                lengths.append(l)
                dists.append(d)
                sets.append(s)
                s_nodes.append(node)
                h_nodes.append(curr_tree)
                dist_below.append(d_b)
                sets_below.append(s_b)
                h_tips.append(h_tips_tmp)
                s_tips.append(s_tips_tmp)
        except ValueError:
            # no common taxa
            continue
    results_dict = {'p_vals': sets_below, 's_tips': s_tips,
                    'h_tips': h_tips, 's_nodes': s_nodes, 'h_nodes': h_nodes}

    acc_dict = {'lengths': lengths, 'dists': dists,
                'sets': sets, 'dist_below': dist_below}

    return (results_dict, acc_dict)
Exemplo n.º 28
0
            #EMDUnifrac with flow
            t0 = timeit.default_timer()
            (Z, Flow,
             diffab) = EMDU.EMDUnifrac_weighted_flow(T, l, nodes_in_order, P,
                                                     Q)
            t1 = timeit.default_timer()
            EMDUnifrac_flow_times.append(t1 - t0)
            #EMDUnifrac no flow
            t0 = timeit.default_timer()
            (Z, diffab) = EMDU.EMDUnifrac_weighted(T, l, nodes_in_order, P, Q)
            t1 = timeit.default_timer()
            EMDUnifrac_times.append(t1 - t0)
            #FastUnifrac_ weighted
            t0 = timeit.default_timer()
            res = fast_unifrac(tr,
                               envs,
                               weighted=True,
                               modes=set(['distance_matrix']))
            t1 = timeit.default_timer()
            FastUnifrac_times.append(t1 - t0)
            i = i + 1
    #Save means
    mean_EMDUnifrac_times[tree_sizes.index(tree_size)] = np.array(
        EMDUnifrac_times).mean()
    mean_EMDUnifrac_flow_times[tree_sizes.index(tree_size)] = np.array(
        EMDUnifrac_flow_times).mean()
    mean_FastUnifrac_times[tree_sizes.index(tree_size)] = np.array(
        FastUnifrac_times).mean()

#  Export all mean times
np.savetxt('EMDU_mean_times.txt', mean_EMDUnifrac_times, delimiter=',')
np.savetxt('EMDU_flow_mean_times.txt',