Exemplo n.º 1
0
 def alternate_admixes(self, n=1000):
     for i in xrange(n):
         old_tree = deepcopy(self.tree)
         self.tree = addadmix(
             self.tree,
             new_node_names=['n' + str(i) + a for a in ['o', 'n']])
         if self.no_admixes + 1 == get_number_of_admixes(self.tree):
             print 'INCREASED NUMBER OF ADMIXTURES BY ONE= ' + 'TRUE'
         else:
             print 'INCREASED NUMBER OF ADMIXTURES BY ONE= ' + 'FALSE'
         ad = make_consistency_checks(self.tree, ['s1', 's2', 's3', 's4'])
         if not ad[0]:
             print ad
             plot_graph(old_tree, drawing_name='good.png')
             plot_graph(self.tree, drawing_name='bad.png')
             break
         #plot_graph(self.tree)
         old_tree = deepcopy(self.tree)
         self.tree = deladmix(self.tree)
         if self.no_admixes == get_number_of_admixes(self.tree):
             print 'DECREASED NUMBER OF ADMIXTURES BY ONE= ' + 'TRUE'
         else:
             print 'DECREASED NUMBER OF ADMIXTURES BY ONE= ' + 'FALSE'
         #plot_graph(self.tree)
         print self.tree
         ad = make_consistency_checks(self.tree, ['s1', 's2', 's3', 's4'])
         if not ad[0]:
             print ad
             plot_graph(old_tree, drawing_name='good.png')
             plot_graph(self.tree, drawing_name='bad.png')
             deladmix(old_tree)
             break
Exemplo n.º 2
0
def topological_support(start_tree, n=10000, nodes=None):
    from tree_plotting import plot_as_directed_graph, plot_graph, pretty_print, pretty_string
    tree = start_tree
    score = 0
    for i in range(n):
        prop_index = choice(3, 1)
        if prop_index == 0 and get_number_of_admixes(tree) > 0:
            new_tree = deladmix(tree)[0]
            score -= 1
        elif prop_index == 1:
            new_tree = make_regraft(tree, _get_new_nodes(i, prop_index))[0]
        elif prop_index == 2:
            new_tree = addadmix(tree, _get_new_nodes(i, prop_index))[0]
            score += 1
        else:
            new_tree = tree
        consistent, information = make_consistency_checks(new_tree, nodes)
        if consistent:
            if get_number_of_admixes(new_tree) < 50:
                tree = new_tree
        else:
            print information
            print 'last_good_tree', tree
            print 'new_bad_tree', new_tree
            plot_as_directed_graph(tree, drawing_name='before.png')
            wait(1)
            plot_as_directed_graph(new_tree, drawing_name='after.png')
            wait(1)
        if i % 1000 == 0:
            plot_as_directed_graph(tree,
                                   drawing_name='number_' + str(i) + 'K.png')
            wait(1)
    plot_as_directed_graph(tree, drawing_name='final_tree.png')
    wait(1)
    return score
Exemplo n.º 3
0
def r_correction(x1, x2, r1, r2, p1, p2):
    """Return the two ``no_admixes`` differences obtained by swapping (p, r).

    ``x1`` and ``x2`` are pairs whose first element is a tree. With ``n1`` and
    ``n2`` the admixture counts of those trees, the result is the pair

        (no_admixes(p2, n1, r2) - no_admixes(p1, n1, r1),
         no_admixes(p1, n2, r1) - no_admixes(p2, n2, r2)).
    """
    first_tree, _ = x1
    second_tree, _ = x2
    first_count = get_number_of_admixes(first_tree)
    second_count = get_number_of_admixes(second_tree)

    def prior_at(count, p, r):
        return no_admixes(p=p, admixes=count, r=r)

    own_1 = prior_at(first_count, p1, r1)
    swapped_1 = prior_at(first_count, p2, r2)
    swapped_2 = prior_at(second_count, p1, r1)
    own_2 = prior_at(second_count, p2, r2)

    return swapped_1 - own_1, swapped_2 - own_2
Exemplo n.º 4
0
def rescale_admix_correction(tree,
                             sigma=0.01,
                             pks={},
                             make_correction=True,
                             return_without_correction=False):
    """Propose a tree with all admixture proportions perturbed, optionally corrected.

    The admixture proportions of a deep copy of ``tree`` are updated with an
    ``updater`` of scale ``sigma / sqrt(k)``, where ``k`` is the number of
    admixtures. When ``make_correction`` is true the result is then passed
    through ``getcorrection(tree, new_tree, sigma / 20)``.

    Args:
        tree: the current tree; it is not modified (work is done on a copy).
        sigma: proposal scale; also recorded in
            ``pks['rescale_admix_correction_adap_param']``.
        pks: dictionary receiving diagnostic values. The default dict is
            shared between calls; it is only written to here.
        make_correction: whether to apply ``getcorrection``.
        return_without_correction: if true, the normal return gains a fourth
            element, the proposed tree as it was before the correction.

    Returns:
        ``(new_tree, qforward, qbackward)`` or, with
        ``return_without_correction``, ``(new_tree, qforward, qbackward,
        untouched_tree)``. The early exits always return a 3-tuple:
        ``(tree, 1, 0)`` when a step yields ``None`` (rejection), and
        ``(copy_of_tree, 1, 0.234)`` when the tree has no admixtures.
    """
    k = get_number_of_admixes(tree)
    pks['rescale_admix_correction_adap_param'] = sigma
    new_tree = deepcopy(tree)
    if k > 0:
        updat = updater(sigma / sqrt(k))
        new_tree = update_all_admixtures(new_tree, updat)
        if new_tree is None:
            return tree, 1, 0  # rejecting by setting backward jump probability to 0.
    else:
        # Not to have to deal with the admix=0 case, return 0.234 such that
        # the adaptive parameter is not affected by these special cases.
        return new_tree, 1, 0.234
    if make_correction:
        untouched_tree = deepcopy(new_tree)
        new_tree, qforward, qbackward = getcorrection(tree, new_tree,
                                                      sigma / 20)
    else:
        qforward = qbackward = 1.0
        # No correction was applied, so the uncorrected tree equals new_tree.
        # Bind it here: previously this path left untouched_tree undefined and
        # the 4-tuple return below raised UnboundLocalError.
        untouched_tree = deepcopy(new_tree) if return_without_correction else None
    if new_tree is None:
        return tree, 1, 0
    if return_without_correction:
        return new_tree, qforward, qbackward, untouched_tree
    return new_tree, qforward, qbackward
Exemplo n.º 5
0
 def many_admixes(self, n=100):
     for i in xrange(n):
         self.tree = addadmix(
             self.tree,
             new_node_names=['n' + str(i) + a for a in ['o', 'n']])
         #plot_graph(self.tree)
         if self.no_admixes + 1 == get_number_of_admixes(self.tree):
             print 'INCREASED NUMBER OF ADMIXTURES BY ONE= ' + 'TRUE'
         else:
             print 'INCREASED NUMBER OF ADMIXTURES BY ONE= ' + 'FALSE'
         self.no_admixes = get_number_of_admixes(self.tree)
         ad = make_consistency_checks(self.tree, ['s1', 's2', 's3', 's4'])
         if not ad[0]:
             print ad
             plot_graph(self.tree, drawing_name='bad.png')
             break
     plot_as_directed_graph(self.tree)
 def probability(self, tree=None, admixtures=None):
     """Return ``log(P(A)) - A * log(2)`` for an admixture count ``A``.

     ``A`` is ``admixtures`` when given, otherwise the admixture count of
     ``tree``. ``P(A)`` is read from ``self.admixture_probabilities`` when
     that sequence is long enough, and obtained from
     ``self.get_new_probability(A)`` otherwise.
     """
     count = get_number_of_admixes(tree) if admixtures is None else admixtures
     if count < len(self.admixture_probabilities):
         mass = self.admixture_probabilities[count]
     else:
         mass = self.get_new_probability(count)
     return log(mass) - log(2) * count
Exemplo n.º 7
0
def rescale(tree, sigma=0.01, pks={}):
    """Propose a copy of ``tree`` with every branch updated.

    The updater scale is ``sigma / sqrt(2 * n - 2 + 4 * k)`` with ``n`` leaves
    and ``k`` admixtures. ``sigma`` is recorded in
    ``pks['rescale_adap_param']``.

    Returns:
        ``(new_tree, 1, 1)``, or ``(tree, 1, 0)`` when the update yields
        ``None`` (a backward probability of 0 forces rejection).
    """
    leaf_count = get_number_of_leaves(tree)
    admix_count = get_number_of_admixes(tree)
    pks['rescale_adap_param'] = sigma
    proposal = deepcopy(tree)
    scale = sigma / sqrt(2 * leaf_count - 2 + 4 * admix_count)
    proposal = update_all_branches(proposal, updater(scale))
    if proposal is not None:
        return proposal, 1, 1
    # rejecting by setting backward jump probability to 0.
    return tree, 1, 0
def rescale_admixtures(tree, sigma=0.01, pks={}):
    """Propose a copy of ``tree`` with every admixture proportion updated.

    The updater scale is ``sigma / sqrt(k)`` with ``k`` admixtures. ``sigma``
    is recorded in ``pks['rescale_admixtures_adap_param']``.

    Returns:
        ``(new_tree, 1, 1)`` normally; ``(tree, 1, 0)`` when the update yields
        ``None``; ``(copy_of_tree, 1, 0.234)`` when there are no admixtures.
    """
    admix_count = get_number_of_admixes(tree)
    pks['rescale_admixtures_adap_param'] = sigma
    proposal = deepcopy(tree)
    if not admix_count > 0:
        # Not to have to deal with the admix=0 case, return 0.234 such that
        # the adaptive parameter is not affected by these special cases.
        return proposal, 1, 0.234
    step = updater(sigma / sqrt(admix_count))
    proposal = update_all_admixtures(proposal, step)
    if proposal is None:
        # rejecting by setting backward jump probability to 0.
        return tree, 1, 0
    return proposal, 1, 1
Exemplo n.º 9
0
 def __call__(self, x, pks={}):
     """Draw one proposal from ``self.props`` and apply it to the tree in ``x``.

     ``x`` is a pair whose first element is the tree. A proposal index is
     drawn uniformly; when the tree has no admixtures, indices 0 and 1 both
     map to index 0. The forward/backward jump probabilities are adjusted for
     that case and for index 1 on a tree with exactly one admixture.

     Returns:
         ``(new_tree, forward, backward, 1, forwj, backj)``.
     """
     tree, _unused = x
     index = choice(len(self.props), 1)[0]
     admix_count = get_number_of_admixes(tree)
     if admix_count == 0 and index <= 1:
         index = 0
         forwj, backj = 0.4, 0.2
     elif admix_count == 1 and index == 1:
         forwj, backj = 0.2, 0.4
     else:
         forwj = backj = 0.2

     chosen = self.props[index]
     names = self.node_naming.next_nodes(chosen.new_nodes)
     pks['proposal_type'] = chosen.proposal_name
     args = [names] if names else []
     extra = self.params[index]
     if extra is not None:
         args.extend(extra)
     new_tree, forward, backward = chosen(tree, *args, pks=pks)
     return new_tree, forward, backward, 1, forwj, backj
Exemplo n.º 10
0
def topological_prior(tree):
    """Sum the per-round terms of ``_get_selection_probabilities`` for ``tree``.

    Starting with every leaf as a ready lineage, each round classifies the
    ready lineages by their destination, adds one term to the running total,
    updates the bookkeeping lists and propagates the lineages. The loop ends
    when the only ready lineage has key ``'r'``.

    NOTE(review): the terms are added, so the result is presumably a
    log-probability of the topology -- confirm against
    ``_get_selection_probabilities``.

    Returns:
        The accumulated sum ``res``.
    """
    total_prob = 0  # NOTE(review): assigned but never read in this function.
    no_admix = get_number_of_admixes(tree)  # NOTE(review): never read below either.
    leaves, coalescences_nodes, admixture_nodes = get_categories(tree)
    # Each leaf starts as a ready lineage paired with 0 (presumably a branch index).
    ready_lineages = [(key, 0) for key in leaves]
    # The root key 'r' is counted among the free coalescences.
    totally_free_coalescences = coalescences_nodes + ['r']
    free_admixtures = admixture_nodes

    coalescences_on_hold = []
    res = 0
    while True:
        # Classify where the currently ready lineages lead.
        sames, single_coalescences, admixtures = get_destination_of_lineages(
            tree, ready_lineages)
        # Pair newly arrived single coalescences with those already on hold.
        waiting_coalescences, awaited_coalescences, still_on_hold = matchmake(
            single_coalescences, coalescences_on_hold)
        #print sames, waiting_coalescences, admixtures
        res += _get_selection_probabilities(
            no_sames=len(sames),
            no_waiting_coalescences=len(waiting_coalescences),
            no_awaited_coalescences=len(awaited_coalescences),
            no_admixtures=len(admixtures),
            no_totally_free_coalescences=len(totally_free_coalescences),
            no_free_admixtures=len(free_admixtures),
            no_ready_lineages=len(ready_lineages),
            no_coalescences_on_hold=len(coalescences_on_hold),
            no_admixture_pairs=_get_number_of_admixture_branches(
                ready_lineages))

        #updating lineages
        # list + dict.values() relies on Python 2, where values() returns a list.
        coalescences_on_hold = still_on_hold + waiting_coalescences.values()
        totally_free_coalescences, free_admixtures = _thin_out_frees(
            tree, totally_free_coalescences, free_admixtures, ready_lineages)
        #print 'free_admixture', free_admixtures
        #print 'totally_free_coalescences', totally_free_coalescences
        ready_lineages = propagate_married(tree, awaited_coalescences)
        ready_lineages.extend(propagate_married(tree, sames))
        ready_lineages.extend(propagate_admixtures(tree, admixtures))
        #print 'ready_lineages', ready_lineages

        #stop criteria
        if len(ready_lineages) == 1 and ready_lineages[0][0] == 'r':
            break

    return res
Exemplo n.º 11
0
 def __call__(self, x, pks={}):
     """Draw a proposal with ``draw_proposal`` and apply it to ``x``.

     ``x`` is a ``(tree, add)`` pair. The chosen proposal's ``input``
     attribute decides what it receives: ``'add'`` -> the ``add`` component,
     ``'tree'`` -> the tree, anything else -> the whole pair. The proposal's
     name and index are remembered on ``self`` and its name is stored in
     ``pks['proposal_type']``.

     Returns:
         ``(new_x, forward, backward, 1.0, jforward, jbackward)``.
     """
     tree, add = x
     admix_count = get_number_of_admixes(tree)
     index, jforward, jbackward = draw_proposal(self.props, admix_count, self.proportions)

     chosen = self.props[index]
     names = self.node_naming.next_nodes(chosen.new_nodes)
     pks['proposal_type'] = chosen.proposal_name
     self.recently_called_type = chosen.proposal_name
     self.recently_called_index = index
     target_kind = chosen.input
     args = get_args2(names, self.adaps[index])

     if target_kind == 'add':
         new_add, forward, backward = chosen(add, *args, pks=pks)
         new_x = (tree, new_add)
     elif target_kind == 'tree':
         new_tree, forward, backward = chosen(tree, *args, pks=pks)
         new_x = (new_tree, add)
     else:
         new_x, forward, backward = chosen(x, *args, pks=pks)
     return new_x, forward, backward, 1.0, jforward, jbackward
Exemplo n.º 12
0
def add_sadmixes(tree, final_no_sadmixes):
    """Add admixtures to ``tree`` until it has ``final_no_sadmixes`` of them.

    Every accepted addition must strictly increase ``get_rank`` of the tree;
    candidates from ``addadmix`` that do not are discarded and redrawn. An
    assertion fails when the rank has already reached ``n * (n + 1) / 2``
    for ``n`` leaves.

    Returns:
        The tree after the last accepted addition (the input tree when it
        already has at least ``final_no_sadmixes`` admixtures).
    """
    already_present = get_number_of_admixes(tree)
    leaf_count = get_number_of_leaves(tree)
    maxrank = leaf_count * (leaf_count + 1) / 2
    for event in range(already_present, final_no_sadmixes):
        current_rank = get_rank(tree)
        assert current_rank < maxrank, 'Admixture event number '+str(event+1)+' cant be added because the model is already maxed out'
        stem = 'sad' + str(event)
        names = [stem + 'a', stem + 'b']
        # Keep drawing until a candidate raises the rank.
        while True:
            candidate_tree, _, _ = addadmix(tree, new_node_names=names, preserve_root_distance=False)
            if get_rank(candidate_tree) > current_rank:
                break
        tree = candidate_tree

    return tree
Exemplo n.º 13
0
 def __call__(self, x, pks={}):
     """Draw a proposal and apply it to the matching component of ``x``.

     ``x`` is a ``(tree, add)`` pair. The proposal index and jump
     probabilities come from ``get_random_proposal_without_deleting_empty``.
     A proposal whose name ends in ``'add'`` is applied to ``add``; every
     other proposal is applied to the tree. The name is stored in
     ``pks['proposal_type']`` and ``self.recently_called_type``.

     Returns:
         ``(new_x, forward, backward, 1, forwj, backj)``.
     """
     tree, add = x
     admix_count = get_number_of_admixes(tree)
     index, backj, forwj = get_random_proposal_without_deleting_empty(len(self.props), admix_count)

     chosen = self.props[index]
     names = self.node_naming.next_nodes(chosen.new_nodes)
     pks['proposal_type'] = chosen.proposal_name
     self.recently_called_type = chosen.proposal_name
     args = get_args(names, self.params[index])

     if self.recently_called_type.endswith('add'):
         new_add, forward, backward = chosen(add, *args, pks=pks)
         new_x = (tree, new_add)
     else:
         new_tree, forward, backward = chosen(tree, *args, pks=pks)
         new_x = (new_tree, add)
     return new_x, forward, backward, 1, forwj, backj
Exemplo n.º 14
0
def initialize_posterior2(emp_cov=None,
                          true_tree=None,
                          M=None,
                          use_skewed_distr=False,
                          p=0.5,
                          rescale=False,
                          model_choice=[
                              'empirical covariance', 'true tree covariance',
                              'wishart on true tree covariance',
                              'empirical covariance on true tree',
                              'no likelihood'
                          ],
                          simulate_true_tree=False,
                          true_tree_no_leaves=None,
                          true_tree_no_admixes=None,
                          nodes=None,
                          simulate_true_tree_with_skewed_prior=False,
                          reduce_cov=None,
                          add_outgroup_to_true_tree=False,
                          reduce_true_tree=False):
    """Build a ``posterior(x, pks)`` closure for the selected model.

    ``model_choice`` may be a single string or a sequence; for a sequence the
    first element is used. The closure evaluates ``prior`` and ``likelihood``
    with the ``emp_cov``, ``M``, ``p`` and ``use_skewed_distr`` captured here
    and returns ``(likelihood_value, prior_value)``.

    Returns:
        * ``(initialize_prior_as_posterior(), {})`` for ``'no likelihood'``;
        * ``(posterior, multiplier)`` when ``rescale`` is true;
        * ``posterior`` otherwise.

    NOTE(review): several branches below are ``pass`` stubs, and the values
    ``no_leaves``, ``no_admixes`` and ``cov`` are computed but never used in
    this function. ``add_outgroup_to_true_tree`` is not read at all.
    """

    if not isinstance(model_choice, basestring):
        model_choice = model_choice[0]

    if model_choice == 'no likelihood':
        return initialize_prior_as_posterior(), {}

    if (model_choice == 'true tree covariance'
            or model_choice == 'wishart on true tree covariance'
            or model_choice == 'empirical covariance on true tree'):

        if simulate_true_tree:
            true_tree = generate_phylogeny(
                true_tree_no_leaves, true_tree_no_admixes, nodes,
                simulate_true_tree_with_skewed_prior)

        elif isinstance(true_tree, basestring):
            if ';' in true_tree:  #this means that the true tree is a s_tree
                true_tree_s = true_tree
                true_tree = identifier_to_tree_clean(true_tree_s)
            else:
                # Otherwise the string is a file path; its first line holds the tree.
                with open(true_tree, 'r') as f:
                    true_tree_s = f.readline().rstrip()
                true_tree = identifier_to_tree_clean(true_tree_s)

        true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1(
            true_tree)

        no_leaves = get_number_of_leaves(true_tree)
        no_admixes = get_number_of_admixes(true_tree)

        cov = make_covariance(true_tree)

        if reduce_cov is not None:
            pass
        # NOTE(review): the default for reduce_true_tree is False, not None, so
        # this condition is true by default and remove_outgroup is called with
        # False -- confirm whether ``is not None`` is intended.
        if reduce_true_tree is not None:
            true_tree = Rtree_operations.remove_outgroup(
                true_tree, reduce_true_tree)
            if reduce_true_tree == 's1' or reduce_true_tree == 0:
                pass
        if emp_cov is not None:
            if isinstance(emp_cov, basestring):
                pass

    if M is None:
        M = n_mark(emp_cov)
    if rescale:
        emp_cov, multiplier = rescale_empirical_covariance(emp_cov)
        print 'multiplier is', multiplier

    def posterior(x, pks={}):
        # Returns (likelihood, prior); the likelihood is skipped (reported as
        # -inf) when the prior is already -inf.
        #print tot_branch_length
        prior_value = prior(x, p=p, use_skewed_distr=use_skewed_distr, pks=pks)
        if prior_value == -float('inf'):
            return -float('inf'), prior_value
        likelihood_value = likelihood(x, emp_cov, M=M)
        pks['prior'] = prior_value
        pks['likelihood'] = likelihood_value
        #pks['posterior']=prior_value+likelihood_value
        return likelihood_value, prior_value

    if rescale:
        return posterior, multiplier
    return posterior
Exemplo n.º 15
0
 def __call__(self, **kwargs):
     """Return the admixture count of the tree passed as keyword ``tree``.

     Raises ``KeyError`` when no ``tree`` keyword is supplied.
     """
     return get_number_of_admixes(kwargs['tree'])
Exemplo n.º 16
0
def test_posterior_model_multichain(true_tree=None,
                                    start_tree=None,
                                    sim_lengths=[250] * 800,
                                    summaries=None,
                                    thinning_coef=1,
                                    admixtures_of_true_tree=None,
                                    no_leaves_true_tree=4,
                                    wishart_df=None,
                                    sim_from_wishart=False,
                                    no_chains=8,
                                    result_file='results_mc3.csv',
                                    emp_cov=None,
                                    emp_remove=-1,
                                    rescale_empirical_cov=False):
    """Run an ``MCMCMC`` simulation against a (possibly simulated) true tree.

    Steps, in order:
      1. Generate ``true_tree`` with ``generate_phylogeny`` when none is
         given (the admixture count is drawn as ``geom.rvs(p=0.5) - 1`` if
         unspecified); otherwise read leaf and admixture counts from it.
      2. Build the covariance ``m`` of the true tree, optionally replace it
         by a Wishart draw (``sim_from_wishart``) and then by ``emp_cov``.
      3. Build the posterior with ``initialize_posterior`` and print its
         value at the true tree.
      4. Call ``MCMCMC`` with ``no_chains`` chains, each starting from a
         deep copy of ``(start_tree, 0)``.

    Returns:
        ``true_tree``.

    NOTE(review): ``emp_remove`` is not read in this function, and the
    ``proposal`` local below is assigned but never passed on.
    """
    if true_tree is None:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree,
                                       admixtures_of_true_tree)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    true_x = (true_tree, 0)

    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    # Without an explicit start tree the chains start at the true tree.
    if start_tree is None:
        start_tree = true_tree

    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    # A supplied empirical covariance overrides both of the above.
    if emp_cov is not None:
        m = emp_cov
    # With rescaling, initialize_posterior is unpacked as (posterior, multiplier).
    if rescale_empirical_cov:
        posterior, multiplier = initialize_posterior(
            m,
            wishart_df,
            use_skewed_distr=True,
            rescale=rescale_empirical_cov)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=True,
                                         rescale=rescale_empirical_cov)
        multiplier = None
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    # Evaluate the posterior at the true tree, scaled by 1/multiplier if rescaled.
    if rescale_empirical_cov:
        post_ = posterior(
            (scale_tree_copy(true_x[0],
                             1.0 / multiplier), true_x[1] / multiplier))
    else:
        post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    print 'posterior(true_tree)', sum(post_)
    if summaries is None:
        summaries = [
            s_variable('posterior'),
            s_variable('mhr'),
            s_no_admixes()
        ]
    proposal = basic_meta_proposal()
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    # Every summary gets scheme (1, 0); the first chain's copy switches
    # 'posterior' and 'no_admixes' to (1, 1) when 'posterior' is a summary.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme_first = deepcopy(sample_verbose_scheme)
    if 'posterior' in sample_verbose_scheme:
        sample_verbose_scheme_first['posterior'] = (1, 1)  #(1,1)
        sample_verbose_scheme_first['no_admixes'] = (1, 1)
    #if 'likelihood' in sample_verbose_scheme:
    #sample_verbose_scheme_first['likelihood']=(1,1)
    print sample_verbose_scheme_first
    MCMCMC(starting_trees=[deepcopy(start_x) for _ in range(no_chains)],
           posterior_function=posterior,
           summaries=summaries,
           temperature_scheme=fixed_geometrical(800.0, no_chains),
           printing_schemes=[sample_verbose_scheme_first] +
           [sample_verbose_scheme for _ in range(no_chains - 1)],
           iteration_scheme=sim_lengths,
           overall_thinnings=int(thinning_coef),
           proposal_scheme=[adaptive_proposal() for _ in range(no_chains)],
           cores=no_chains,
           no_chains=no_chains,
           multiplier=multiplier,
           result_file=result_file,
           store_permuts=False)
    print 'finished MC3'
    #save_pandas_dataframe_to_csv(results, result_file)
    #save_permuts_to_csv(permuts, get_permut_filename(result_file))
    return true_tree
Exemplo n.º 17
0
def test_posterior_model(true_tree=None,
                         start_tree=None,
                         sim_length=100000,
                         summaries=None,
                         thinning_coef=19,
                         admixtures_of_true_tree=None,
                         no_leaves_true_tree=4,
                         filename='results.csv',
                         sim_from_wishart=False,
                         wishart_df=None,
                         sap_sim=False,
                         sap_ana=False,
                         resimulate_regrafted_branch_length=False,
                         emp_cov=None,
                         big_posterior=False,
                         rescale_empirical_cov=False):
    """Run a single ``basic_chain`` against a (possibly simulated) true tree.

    Steps, in order:
      1. Generate ``true_tree`` with ``generate_phylogeny`` when none is
         given (the admixture count is drawn as ``geom.rvs(p=0.5) - 1`` if
         unspecified); otherwise read leaf and admixture counts from it.
      2. Build the covariance ``m`` of the true tree, optionally replace it
         by a Wishart draw (``sim_from_wishart``) and then by ``emp_cov``.
      3. Build the posterior (``initialize_big_posterior`` when
         ``big_posterior`` is set, else ``initialize_posterior``) and print
         its value at the true tree.
      4. Run ``basic_chain`` for ``sim_length`` iterations from
         ``(start_tree, 0)`` and write the results with ``save_to_csv``.

    Returns:
        ``true_tree``.

    NOTE(review): unlike ``test_posterior_model_multichain``, the result of
    ``initialize_posterior`` is not unpacked when ``rescale_empirical_cov``
    is true -- confirm that it returns a plain callable in that case.
    """
    if true_tree is None:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree,
                                       admixtures_of_true_tree,
                                       skewed_admixture_prior=sap_sim)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)

    true_x = (true_tree, 0)

    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    # Without an explicit start tree the chain starts at the true tree.
    if start_tree is None:
        start_tree = true_tree

    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    # A supplied empirical covariance overrides both of the above.
    if emp_cov is not None:
        m = emp_cov
    if big_posterior:
        posterior = initialize_big_posterior(m,
                                             wishart_df,
                                             use_skewed_distr=sap_ana)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=sap_ana,
                                         rescale=rescale_empirical_cov)
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    # Only the first two entries are summed; post_ may hold more.
    print 'posterior(true_tree)', sum(post_[:2])
    if summaries is None:
        summaries = [s_posterior(), s_variable('mhr'), s_no_admixes()]
    proposal = adaptive_proposal(
        resimulate_regrafted_branch_length=resimulate_regrafted_branch_length)
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    # Every summary gets scheme (1, 0); 'posterior' and 'no_admixes' are then
    # set to (1, 1) unconditionally.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme['posterior'] = (1, 1)
    sample_verbose_scheme['no_admixes'] = (1, 1)
    # The thinning is at least thinning_coef and grows with sim_length / 60000.
    final_tree, final_posterior, results, _ = basic_chain(
        start_x,
        summaries,
        posterior,
        proposal,
        post=None,
        N=sim_length,
        sample_verbose_scheme=sample_verbose_scheme,
        overall_thinning=int(max(thinning_coef, sim_length / 60000)),
        i_start_from=0,
        temperature=1.0,
        proposal_update=None,
        check_trees=False)
    save_to_csv(results, summaries, filename=filename)
    return true_tree
Exemplo n.º 18
0
def addadmix(tree,
             new_node_names=None,
             pks={},
             fixed_sink_source=None,
             new_branch_length=None,
             new_to_root_length=None,
             check_opposite=False,
             preserve_root_distance=True):
    '''
    This proposal adds an admixture to the tree. There are a lot of free parameters but only 5 are in play here:
        c1: the branch length of the source population
        c2: the branch length of the population genes are migrating into. 
        u1: the position of the admixture source on the source population branch
        u2: the position of the admixture destination on the sink population branch
        w: the admixture proportion.
        The connecting function, h, (see Green & Hastie 2009) is
            h(c1,c2,u1,u2,w)=(c1*u1, c1*(1-u1), c2*u2, c2*(1-u2), 0.5*w)

    Arguments:
        tree: the input tree; the admixture is inserted into a deep copy.
        new_node_names: optional pair (source_name, sink_name) for the new
            nodes; ignored when fixed_sink_source is given.
        pks: dictionary that receives the chosen keys/branches, the densities
            and the numbers of forward/backward choices.
        fixed_sink_source: optional 4-tuple
            (sink_key, sink_branch, source_key, source_branch) overriding the
            random choice of sink and source.
        new_branch_length, new_to_root_length: forwarded to insert_admix only
            when fixed_sink_source is given.
        check_opposite: if true, run deladmix on the result and assert that
            its densities and choice counts mirror this move.
        preserve_root_distance: forwarded to insert_admix (and deladmix).

    Returns:
        (new_tree,
         forward_density / choices_forward,
         backward_density / choices_backward * multip)
    '''

    possible_nodes = _get_possible_starters(tree)

    no_admixtures = get_number_of_admixes(tree)  # NOTE(review): not read below.
    new_tree = deepcopy(tree)
    #print possible_nodes
    # The random sink is always drawn, even if fixed_sink_source overrides it.
    sink_key, sink_branch = possible_nodes[choice(len(possible_nodes), 1)[0]]
    if fixed_sink_source is not None:
        sink_key, sink_branch, source_key, source_branch = fixed_sink_source
    children, other = get_all_branch_descendants_and_rest(
        tree, sink_key, sink_branch)
    # Source candidates: the branches in 'other' plus ('r', 0).
    candidates = other + [('r', 0)]
    ch = choice(len(candidates), 1)[0]
    if fixed_sink_source is None:
        source_key, source_branch = candidates[ch]

    pks['sink_key'] = sink_key
    pks['source_key'] = source_key
    pks['source_branch'] = source_branch
    pks['sink_branch'] = sink_branch
    #print 'children', children
    #print 'candidates', candidates
    #print 'sink', (sink_key, sink_branch)
    #print 'source', (source_key,source_branch)
    #print 'new_tree',new_tree
    # Three call variants: fixed sink/source (explicit lengths), default node
    # names, or caller-supplied node names.
    if fixed_sink_source is not None:
        new_tree, forward_density, backward_density, multip = insert_admix(
            new_tree,
            source_key,
            source_branch,
            sink_key,
            sink_branch,
            pks=pks,
            new_branch_length=new_branch_length,
            new_to_root_length=new_to_root_length,
            preserve_root_distance=preserve_root_distance)
    elif new_node_names is None:
        new_tree, forward_density, backward_density, multip = insert_admix(
            new_tree,
            source_key,
            source_branch,
            sink_key,
            sink_branch,
            pks=pks,
            preserve_root_distance=preserve_root_distance)
    else:
        new_tree, forward_density, backward_density, multip = insert_admix(
            new_tree,
            source_key,
            source_branch,
            sink_key,
            sink_branch,
            pks=pks,
            source_name=new_node_names[0],
            sink_name=new_node_names[1],
            preserve_root_distance=preserve_root_distance)

    # Number of discrete choices for this move and for the reverse (removal) move.
    choices_forward = float(len(possible_nodes) * len(candidates)) * 2
    choices_backward = float(len(_get_removable_admixture_branches(new_tree)))

    pks['forward_density'] = forward_density
    pks['backward_density'] = backward_density
    pks['forward_choices'] = choices_forward
    pks['backward_choices'] = choices_backward

    if check_opposite:
        # Self-test: removing the admixture just added must report mirrored
        # densities and choice counts. 'sink_new_name' / 'sink_new_branch' are
        # not set in this function -- presumably insert_admix writes them to pks.
        pks2 = {}
        t, f, b = deladmix(new_tree,
                           pks=pks2,
                           fixed_remove=(pks['sink_new_name'],
                                         pks['sink_new_branch']),
                           check_opposite=False,
                           preserve_root_distance=preserve_root_distance)
        if (float_equal(forward_density, pks2['backward_density'])
                and choices_forward == pks2['backward_choices']
                and float_equal(backward_density, pks2['forward_density'])
                and choices_backward == pks2['forward_choices']):
            print 'test passed'
        else:
            print 'TEST FAILED'
            print forward_density, "==", pks2[
                'backward_density'], ":", forward_density == pks2[
                    'backward_density']
            print backward_density, "==", pks2[
                'forward_density'], ":", backward_density == pks2[
                    'forward_density']
            print choices_forward, "==", pks2[
                'backward_choices'], ":", choices_forward == pks2[
                    'backward_choices']
            print choices_backward, "==", pks2[
                'forward_choices'], ":", choices_backward == pks2[
                    'forward_choices']
            print pretty_string(tree)
            print pretty_string(new_tree)
            print pretty_string(t)
            for key, val in pks.items():
                print key, ': ', val
            print "-----------"
            for key, val in pks2.items():
                print key, ': ', val
            assert False

    return new_tree, forward_density / choices_forward, backward_density / choices_backward * multip
Exemplo n.º 19
0
 def summary_of_phylogeny(self, tree):
     """Summarise ``tree`` by its number of admixtures."""
     admix_count = get_number_of_admixes(tree)
     return admix_count
def main(args):
    """Plot and/or rank summaries of a posterior-distribution file.

    ``args`` is the list of command line tokens handed to ``argparse``.
    The file named by ``--posterior_distribution_file`` is read with pandas
    and, depending on ``--plot``:

    * ``consensus_trees``: for every ``--consensus_threshold`` the node
      combinations seen in more than ``int(N * threshold)`` rows are turned
      into a node structure and plotted as ``consensus_<percent>.png``.
    * ``top_node_trees``: the most common values of the ``pops`` column are
      plotted as ``minimal_topology_<rank>.png``.
    * ``top_trees``: the most common values of the ``topology`` column are
      plotted as ``topology_<rank>.png``.

    When ``--write_ranking_to_file`` is given, the ranked items and their
    relative frequencies are written to that file as ``item,frequency`` lines.
    ``--suppress_plot`` disables all plotting.

    The function always finishes by calling ``sys.exit()``, so it raises
    ``SystemExit`` instead of returning.
    """
    parser = ArgumentParser(
        usage='pipeline for plotting posterior distribution summaries.')
    # ``ArgumentParser(version=...)`` is deprecated in Python 2.7 and rejected
    # by Python 3. An explicit version action keeps the same -v/--version flags.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='1.0.0')

    parser.add_argument(
        '--posterior_distribution_file',
        required=True,
        type=str,
        help=
        'The file containing posterior distributions from the "AdmixtureBayes posterior" command. It needs the two columns "pops" and topology.'
    )
    parser.add_argument(
        '--plot',
        choices=['consensus_trees', 'top_node_trees', 'top_trees'],
        required=True,
        help='The type of plot to make. Choose between: 1) consensus_trees. '
        'It plots an admixture graph based on all nodes that have a higher (marginal) posterior probability of X. '
        'Different X\'s can be supplied with the command --consensus_threshold \n'
        '2) top_node_trees. It plots the X highest posterior combinations of node types '
        'and creates the corresponding minimal topologies.  X can be supplied through the command --top_node_trees_to_plot \n'
        '3) top_trees. It plots the X highest posterior topologies. X can be supplied by the command --top_trees_to_plot'
    )
    parser.add_argument('--outgroup',
                        default='outgroup',
                        help='name of the outgroup to plot')
    parser.add_argument(
        '--consensus_threshold',
        default=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        type=float,
        nargs='+',
        help=
        'The posterior thresholds for which to draw different consensus trees.'
    )
    parser.add_argument(
        '--top_node_trees_to_plot',
        type=int,
        default=3,
        help='The number of node trees (or minimal topologies) to plot')
    parser.add_argument('--top_trees_to_plot',
                        type=int,
                        default=3,
                        help='The number of trees (or topologies) to plot ')
    parser.add_argument(
        '--write_ranking_to_file',
        type=str,
        default='',
        help=
        'if a file is supplied here, the natural rankings for each of the plots is written here.'
    )
    parser.add_argument(
        '--rankings_to_write_to_file',
        type=int,
        default=1000,
        help=
        'the number of rankings(nodes, min topology or topology depending on --plot) to write to the ranking file.'
    )
    parser.add_argument(
        '--dont_annotate_node_posterior',
        default=False,
        action='store_true',
        help=
        'This will not color the nodes according to their posterior probability.'
    )
    parser.add_argument('--nodes',
                        default='',
                        type=str,
                        help='file where the first line is the leaf nodes')
    parser.add_argument('--suppress_plot', default=False, action='store_true')
    parser.add_argument(
        '--no_sort',
        default=False,
        action='store_true',
        help=
        'often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to '
    )
    parser.add_argument('--sep',
                        default=',',
                        type=str,
                        help='the separator used in the input file')
    parser.add_argument(
        '--consensus_method',
        choices=['descendant_frequencies'],
        default='descendant_frequencies',
        help='Which method should be used to calculate the consensus tree?')

    options = parser.parse_args(args)

    def combine_nodes(node_structure, new_node, seen_sets):
        """Attach already-built nodes below ``new_node``.

        ``seen_sets`` is walked from its last list to its first. A set is
        attached when it is a subset of ``new_node.name`` and is either not
        yet covered by the leaves collected so far or still has no parent.
        """
        candidate = new_node.name
        seen = []
        for lists_of_fixed_size in seen_sets[::-1]:
            for attached_branch in lists_of_fixed_size:
                if (attached_branch.issubset(candidate) and
                    ((not attached_branch.issubset(seen)) or
                     (not node_structure[attached_branch].has_parent()))):
                    seen.extend(list(attached_branch))
                    new_node.add_child(node_structure[attached_branch])
                    node_structure[attached_branch].add_parent(new_node)
        return node_structure

    def node_combinations_to_node_structure(node_combinations):
        """Build a ``{frozenset(leaves): Node}`` dict from '.'-joined names.

        Nodes are created in order of increasing leaf count so that every
        node can be linked to the smaller nodes it contains. An empty input
        gives an empty dict.
        """
        length_sorted = {}
        for node_combination in node_combinations:
            leaves = frozenset(node_combination.split('.'))
            length_sorted.setdefault(len(leaves), []).append(leaves)
        if not length_sorted:
            # max() below would raise on an empty sequence.
            return {}
        #length_sorted_list is of the form [[[A],[B],[C]],[[A,B],[B,C]],...,[[A,B,C]]]
        length_sorted_list = [
            length_sorted.get(k, [])
            for k in range(1,
                           max(length_sorted.keys()) + 1)
        ]
        node_structure = {}
        for leaf_node in length_sorted_list[0]:
            node_structure[leaf_node] = Node(leaf_node)
        added_sets = [length_sorted_list[0]]
        for lists_of_fixed_size in length_sorted_list[1:]:
            for branch_set in lists_of_fixed_size:
                new_node = Node(branch_set)
                combine_nodes(node_structure, new_node, added_sets)
                node_structure[branch_set] = new_node
            added_sets.append(lists_of_fixed_size)
        return node_structure

    def write_ranking(counts, sample_size):
        """Write the most common items of ``counts`` as 'item,frequency' lines.

        Does nothing when --write_ranking_to_file was not supplied.
        """
        if not options.write_ranking_to_file:
            return
        with open(options.write_ranking_to_file, 'w') as f:
            for item, frequency in counts.most_common(
                    options.rankings_to_write_to_file):
                f.write(item + ',' + str(float(frequency) / sample_size) +
                        '\n')

    if options.plot == 'consensus_trees' or options.plot == 'top_node_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops'])
        nodes_list = df['pops'].tolist()
        # Number of rows in which each '-'-separated node combination occurs.
        seen_combinations = Counter(node for nodes in nodes_list
                                    for node in nodes.split('-'))
        N = len(nodes_list)
        if options.plot == 'consensus_trees':
            if not options.dont_annotate_node_posterior:
                node_count_dic = {
                    frozenset(k.split('.')): float(v) / N
                    for k, v in seen_combinations.items()
                }
            else:
                node_count_dic = None
            if not options.suppress_plot:
                from tree_plotting import plot_node_structure_as_directed_graph
            for threshold in options.consensus_threshold:
                total_threshold = int(N * threshold)
                final_node_combinations = [
                    k for k, v in seen_combinations.items()
                    if v > total_threshold
                ]
                if not final_node_combinations:
                    # Nothing passes this threshold (e.g. threshold 1.0), so
                    # there is no graph to build or draw.
                    sys.stderr.write(
                        'No node exceeds the consensus threshold ' +
                        str(threshold) + '; skipping it.\n')
                    continue
                final_node_structure = node_combinations_to_node_structure(
                    final_node_combinations)
                if not options.suppress_plot:
                    # round() before int(): 100 * 0.29 is 28.999999999999996
                    # and plain truncation would name the file consensus_28.
                    plot_node_structure_as_directed_graph(
                        final_node_structure,
                        drawing_name='consensus_' +
                        str(int(round(100 * threshold))) + '.png',
                        node_dic=node_count_dic)
            write_ranking(seen_combinations, N)
        elif options.plot == 'top_node_trees':
            c = Counter(nodes_list)
            to_plots = c.most_common(options.top_node_trees_to_plot)
            write_ranking(c, N)
            if not options.dont_annotate_node_posterior:
                node_count_dic = {
                    frozenset(key.split('.')): float(count) / N
                    for key, count in seen_combinations.most_common(1000)
                }
            else:
                node_count_dic = None
            if not options.suppress_plot:
                from tree_plotting import plot_node_structure_as_directed_graph
                for i, (to_plot, count) in enumerate(to_plots):
                    node_structure = node_combinations_to_node_structure(
                        to_plot.split('-'))
                    plot_node_structure_as_directed_graph(
                        node_structure,
                        drawing_name='minimal_topology_' + str(i + 1) + '.png',
                        node_dic=node_count_dic)
    elif options.plot == 'top_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops', 'topology'])
        trees_list = df['topology'].tolist()
        # Leaf count is taken from the first '-'-separated field of the first
        # topology string.
        no_leaves = len(trees_list[0].split('-')[0].split('.'))
        N = len(trees_list)
        c = Counter(trees_list)
        to_plots = c.most_common(options.top_trees_to_plot)

        #obtaining nodes:
        if not options.nodes:
            nodes = df['pops'].tolist()[0].split('-')
            leaves = list({leaf for node in nodes for leaf in node.split('.')})
            if len(leaves) == no_leaves - 1:
                #adding outgroup
                leaves.append(options.outgroup)
            elif len(leaves) != no_leaves:
                assert False, 'The number of leaves could not be obtained'
            assert not options.no_sort, 'When nodes are not specified, they will always be sorted'
            leaves = sorted(leaves)
        else:
            leaves = read_one_line(options.nodes)
            if not options.no_sort:
                leaves = sorted(leaves)

        write_ranking(c, N)

        if not options.suppress_plot:
            from tree_plotting import plot_as_directed_graph
            for i, (to_plot, count) in enumerate(to_plots):
                tree = topological_identifier_to_tree_clean(
                    to_plot,
                    leaves=generate_predefined_list_string(deepcopy(leaves)))
                plot_as_directed_graph(tree,
                                       drawing_name='topology_' + str(i + 1) +
                                       '.png')
    # The legacy pipeline that used to follow this call was unreachable and
    # relied on options the parser no longer defines. It has been removed.
    sys.exit()
Exemplo n.º 21
0
 def __init__(self, tree):
     """Keep *tree* and record the admixture count reported for it.

     The count comes from ``get_number_of_admixes`` and is stored in
     ``self.no_admixes``.
     """
     self.tree = tree
     admixture_count = get_number_of_admixes(self.tree)
     self.no_admixes = admixture_count
Exemplo n.º 22
0
    updates=org.dot(normal(scale=0.01, size=org.shape[1]))
    #print pretty_string(update_specific_branch_lengths(tree_good, branches_determined, updates, add=True))
    
    #print make_covariance(tree_good, node_keys= nodes_determined)
    
    #print org.T.dot(coef)
    
    import sys
    
    sys.exit()
    
    from generate_prior_trees import generate_phylogeny
    from numpy.linalg import matrix_rank
    
    
    from Rtree_operations import get_number_of_admixes
    from tree_plotting import plot_as_directed_graph
    

    
    for _ in xrange(3):
        tree=generate_phylogeny(3,2)
        mat=make_coefficient_matrix(tree)[0]
        rank=matrix_rank(mat, tol=0.001)
        print rank, get_number_of_admixes(tree)
        print mat
        plot_as_directed_graph(tree, drawing_name='tmp'+str(_)+'.png')
        
    

Exemplo n.º 23
0
def admixes_are_sadmixes(tree):
    """Tell whether the rank of *tree* equals its base rank plus its admixes.

    The base rank is ``get_rank`` applied to ``tree_to_0tree(tree)`` and the
    number of admixes comes from ``get_number_of_admixes(tree)``.
    """
    full_rank = get_rank(tree)
    base_rank = get_rank(tree_to_0tree(tree))
    admixture_count = get_number_of_admixes(tree)
    return full_rank == base_rank + admixture_count