def get_nodes(arguments, input_file, outgroup_name, reduce_node, backup_number=8):
    '''
    Work out the list of node names and return three variants of it.

    If arguments is empty/None or its first entry is falsy, the names are derived from input_file:
        - it contains ';': the number of '.'-separated entries before the first '-' is the number of trivial nodes,
        - else it contains '.': it is handed to read_one_line,
        - else it contains ',': the integer between the first character and the first ',' is the number of trivial nodes,
        - otherwise: input_file itself is parsed as the number of trivial nodes.
    Otherwise a copy of arguments is used as the node list, so the caller's list is never modified.

    Returns the tuple (before_added_outgroup, nodes, reduced_nodes) where
        - before_added_outgroup is the node list without outgroup_name,
        - nodes is the node list with outgroup_name and reduce_node appended when they are truthy and missing,
        - reduced_nodes is the node list (as it was before anything was appended) without reduce_node.

    backup_number is accepted for interface compatibility; it is not used here.
    '''
    if not arguments or not arguments[0]:
        # No explicit node names were given, so fall back to the input file.
        if ';' in input_file:
            first_level = input_file.split('-')[0]
            nodes = get_trivial_nodes(len(first_level.split('.')))
        elif '.' in input_file:
            nodes = read_one_line(input_file)
        elif ',' in input_file:
            nodes = get_trivial_nodes(int(input_file[1:].split(',')[0]))
        else:
            nodes = get_trivial_nodes(int(input_file))
    else:
        # Copy: the appends below must not leak into the caller's list.
        nodes = list(arguments)

    # Both snapshots are taken before the outgroup / reduce node are appended.
    before_added_outgroup = deepcopy(nodes)
    reduced_nodes = deepcopy(nodes)

    if outgroup_name in nodes:
        before_added_outgroup.remove(outgroup_name)
    elif outgroup_name:
        nodes.append(outgroup_name)

    if reduce_node in reduced_nodes:
        reduced_nodes.remove(reduce_node)
    if reduce_node and reduce_node not in nodes:
        nodes.append(reduce_node)

    return before_added_outgroup, nodes, reduced_nodes
def emp_cov_to_file(m, filename='emp_cov.txt', nodes=None):
    '''
    Write the matrix m to filename as space-separated text.

    The first line holds the node names. Every following line starts with a node name
    followed by the entries of the corresponding row of m. When nodes is None, the names
    come from get_trivial_nodes(m.shape[0]).
    '''
    if nodes is None:
        nodes = get_trivial_nodes(m.shape[0])
    with open(filename, 'w') as out:
        header = ' '.join(nodes)
        out.write(header + '\n')
        for row_number, label in enumerate(nodes):
            entries = [str(value) for value in m[row_number]]
            out.write(label + ' ' + ' '.join(entries) + '\n')
def ms_to_treemix3(filename='tmp.txt', samples_per_pop=20, no_pops=4, n_reps=1, filename2='tmp.treemix_in', nodes=None, convert_to_gz=True):
    '''
    Convert a file of digit strings into a file of per-population allele counts.

    Every line of filename is read as one sample: a string of single-digit characters, one per site.
    Each run of samples_per_pop consecutive lines forms one population, and each run of no_pops
    populations forms one replicate. At most n_reps replicates are converted; trailing lines that
    do not complete a population or replicate are ignored.

    filename2 gets a header line with the node names (get_trivial_nodes(no_pops) when nodes is None)
    followed by one line per site, holding 'count,samples_per_pop-count' for each population.

    Returns filename2 when convert_to_gz is falsy, otherwise the result of gzip(filename2, overwrite=True).
    '''
    if nodes is None:
        nodes = get_trivial_nodes(no_pops)

    with open(filename, 'r') as source, open(filename2, 'w') as target:
        target.write(' '.join(nodes) + '\n')
        pops_done = 0
        reps_done = 0
        samples = []  # rows of the population currently being read
        pop_counts = []  # per-site count vectors of the populations of the current replicate
        for line in source:
            # A list (not map) keeps array() working on both Python 2 and Python 3.
            samples.append([int(ch) for ch in line.rstrip()])
            if len(samples) != samples_per_pop:
                continue

            # A full population has been read: collapse it to per-site counts.
            pop_counts.append(sum(array(samples), axis=0))
            samples = []
            pops_done += 1
            if pops_done != no_pops:
                continue

            # A full replicate has been read: write one line per site.
            for site in zip(*pop_counts):
                pairs = [str(count) + ',' + str(samples_per_pop - count) for count in site]
                target.write(' '.join(pairs) + '\n')
            pop_counts = []
            pops_done = 0
            reps_done += 1
            if reps_done >= n_reps:
                break

    if not convert_to_gz:
        return filename2
    return gzip(filename2, overwrite=True)
def ms_to_treemix(filename='tmp.txt', samples_per_pop=20, no_pops=4, n_reps=1, filename2='tmp.treemix_in', treemix_files='tmp'):
    '''
    Turn a file of digit strings into a gzipped allele-count file and pass it to read_data.

    Every line of filename becomes one row of single-digit integers. With n_reps>1 the rows are
    split into n_reps equal vertical blocks that are placed side by side, so the replicates
    contribute extra columns (sites) instead of extra rows (samples). Rows are then summed in
    consecutive groups of samples_per_pop, one group per population.

    filename2 receives a header of get_trivial_nodes(no_pops) and one line per site with
    'count,samples_per_pop-count' for each population; it is then compressed with 'gzip -f'.

    Returns read_data(filename2+'.gz', blocksize=10000, outgroup='s3', noss=True, outfile=treemix_files).
    '''
    with open(filename, 'r') as source:
        rows = [[int(ch) for ch in line.rstrip()] for line in source.readlines()]
    m = array(rows)

    if n_reps > 1:
        # Put the replicates next to each other: more sites, same number of samples.
        m = hstack(vsplit(m, n_reps))

    per_pop_counts = []
    for pop in range(no_pops):
        first_row = pop * samples_per_pop
        per_pop_counts.append(sum(m[first_row:first_row + samples_per_pop, :], axis=0))

    with open(filename2, 'w') as target:
        target.write(' '.join(get_trivial_nodes(no_pops)) + '\n')
        for site in zip(*per_pop_counts):
            pairs = [str(count) + ',' + str(samples_per_pop - count) for count in site]
            target.write(' '.join(pairs) + '\n')

    subprocess.call(['gzip', '-f', filename2])
    compressed = filename2 + '.gz'
    return read_data(compressed, blocksize=10000, outgroup='s3', noss=True, outfile=treemix_files)
def test_posterior_model_multichain(true_tree=None,
                                    start_tree=None,
                                    sim_lengths=None,
                                    summaries=None,
                                    thinning_coef=1,
                                    admixtures_of_true_tree=None,
                                    no_leaves_true_tree=4,
                                    wishart_df=None,
                                    sim_from_wishart=False,
                                    no_chains=8,
                                    result_file='results_mc3.csv',
                                    emp_cov=None,
                                    emp_remove=-1,
                                    rescale_empirical_cov=False):
    '''
    Build a posterior around a (given or generated) true tree and run MCMCMC on it.

    - true_tree: when None, a tree is made with generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree),
      drawing admixtures_of_true_tree as geom.rvs(p=0.5)-1 if it is None as well. When given,
      no_leaves_true_tree and admixtures_of_true_tree are recomputed from the tree.
    - start_tree: starting tree of every chain; defaults to true_tree.
    - sim_lengths: passed to MCMCMC as iteration_scheme; None means [250]*800 (a fresh list per call).
    - summaries: defaults to [s_variable('posterior'), s_variable('mhr'), s_no_admixes()].
    - thinning_coef: passed (as int) to MCMCMC as overall_thinnings.
    - wishart_df: defaults to n_mark(m) where m is the covariance matrix of the true tree.
    - sim_from_wishart: when truthy, m is replaced by a Wishart draw around m.
    - no_chains: number of chains, starting trees, proposals and cores.
    - result_file: passed on to MCMCMC.
    - emp_cov: when not None, replaces m (also after a Wishart draw).
    - emp_remove: accepted but not used in this function.
    - rescale_empirical_cov: passed as rescale to initialize_posterior; when truthy, initialize_posterior
      is expected to return (posterior, multiplier) and the true tree is rescaled by 1/multiplier
      before its posterior is printed.

    Prints the covariance matrices (Wishart case), the true tree and its likelihood/prior/posterior.
    Returns true_tree.
    '''
    # The single-argument print(...) form used below behaves the same as the Python 2 print statement.
    if sim_lengths is None:
        # Built per call so a callee changing the list cannot affect later calls.
        sim_lengths = [250] * 800

    if true_tree is None:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    true_x = (true_tree, 0)

    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))

    if start_tree is None:
        start_tree = true_tree
    start_x = (start_tree, 0)

    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print(m)
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print(m)
    if emp_cov is not None:
        m = emp_cov

    if rescale_empirical_cov:
        posterior, multiplier = initialize_posterior(m,
                                                     wishart_df,
                                                     use_skewed_distr=True,
                                                     rescale=rescale_empirical_cov)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=True,
                                         rescale=rescale_empirical_cov)
        multiplier = None

    print(' '.join(('true_tree=', str(unique_identifier_and_branch_lengths(true_tree)))))
    if rescale_empirical_cov:
        # Evaluate the true tree on the same scale as the rescaled covariance.
        post_ = posterior((scale_tree_copy(true_x[0], 1.0 / multiplier), true_x[1] / multiplier))
    else:
        post_ = posterior(true_x)
    print(' '.join(('likelihood(true_tree)', str(post_[0]))))
    print(' '.join(('prior(true_tree)', str(post_[1]))))
    print(' '.join(('posterior(true_tree)', str(sum(post_)))))

    if summaries is None:
        summaries = [s_variable('posterior'), s_variable('mhr'), s_no_admixes()]

    # NOTE(review): this object is never used below (each chain gets its own adaptive_proposal());
    # the call is kept in case constructing it has side effects - confirm and remove.
    proposal = basic_meta_proposal()

    # Every summary is sampled in all chains; only the first chain also prints some of them.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme_first = deepcopy(sample_verbose_scheme)
    if 'posterior' in sample_verbose_scheme:
        sample_verbose_scheme_first['posterior'] = (1, 1)
        sample_verbose_scheme_first['no_admixes'] = (1, 1)
    print(sample_verbose_scheme_first)

    MCMCMC(starting_trees=[deepcopy(start_x) for _ in range(no_chains)],
           posterior_function=posterior,
           summaries=summaries,
           temperature_scheme=fixed_geometrical(800.0, no_chains),
           printing_schemes=[sample_verbose_scheme_first] + [sample_verbose_scheme for _ in range(no_chains - 1)],
           iteration_scheme=sim_lengths,
           overall_thinnings=int(thinning_coef),
           proposal_scheme=[adaptive_proposal() for _ in range(no_chains)],
           cores=no_chains,
           no_chains=no_chains,
           multiplier=multiplier,
           result_file=result_file,
           store_permuts=False)
    print('finished MC3')
    return true_tree
def test_posterior_model(true_tree=None,
                         start_tree=None,
                         sim_length=100000,
                         summaries=None,
                         thinning_coef=19,
                         admixtures_of_true_tree=None,
                         no_leaves_true_tree=4,
                         filename='results.csv',
                         sim_from_wishart=False,
                         wishart_df=None,
                         sap_sim=False,
                         sap_ana=False,
                         resimulate_regrafted_branch_length=False,
                         emp_cov=None,
                         big_posterior=False,
                         rescale_empirical_cov=False):
    '''
    Build a posterior around a (given or generated) true tree, run a single chain and save the results.

    Without a true_tree one is generated by generate_phylogeny (sap_sim is passed as
    skewed_admixture_prior); the number of admixtures is drawn as geom.rvs(p=0.5)-1 unless given.
    The covariance matrix of the true tree is optionally replaced by a Wishart draw
    (sim_from_wishart) and then by emp_cov when that is not None. The posterior comes from
    initialize_big_posterior when big_posterior is truthy, otherwise from initialize_posterior;
    sap_ana is passed as use_skewed_distr in both cases.

    The chain starts in start_tree (true_tree by default), runs for sim_length iterations with
    thinning int(max(thinning_coef, sim_length/60000)), and its results are written to filename
    with save_to_csv.

    Returns true_tree.
    '''
    # The single-argument print(...) form used below behaves the same as the Python 2 print statement.
    if true_tree is not None:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    else:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree,
                                       admixtures_of_true_tree,
                                       skewed_admixture_prior=sap_sim)
    true_x = (true_tree, 0)

    leaf_names = get_trivial_nodes(no_leaves_true_tree)
    m = make_covariance(true_tree, leaf_names)

    start_x = (true_tree if start_tree is None else start_tree, 0)

    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print(m)
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print(m)
    if emp_cov is not None:
        m = emp_cov

    if big_posterior:
        posterior = initialize_big_posterior(m, wishart_df, use_skewed_distr=sap_ana)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=sap_ana,
                                         rescale=rescale_empirical_cov)

    print(' '.join(('true_tree=', str(unique_identifier_and_branch_lengths(true_tree)))))
    post_ = posterior(true_x)
    print(' '.join(('likelihood(true_tree)', str(post_[0]))))
    print(' '.join(('prior(true_tree)', str(post_[1]))))
    print(' '.join(('posterior(true_tree)', str(sum(post_[:2])))))

    if summaries is None:
        summaries = [s_posterior(), s_variable('mhr'), s_no_admixes()]

    proposal = adaptive_proposal(resimulate_regrafted_branch_length=resimulate_regrafted_branch_length)

    # Sample every summary; 'posterior' and 'no_admixes' are additionally printed.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    for printed_summary in ('posterior', 'no_admixes'):
        sample_verbose_scheme[printed_summary] = (1, 1)

    thinning = int(max(thinning_coef, sim_length / 60000))
    _, _, chain_results, _ = basic_chain(start_x,
                                         summaries,
                                         posterior,
                                         proposal,
                                         post=None,
                                         N=sim_length,
                                         sample_verbose_scheme=sample_verbose_scheme,
                                         overall_thinning=thinning,
                                         i_start_from=0,
                                         temperature=1.0,
                                         proposal_update=None,
                                         check_trees=False)
    save_to_csv(chain_results, summaries, filename=filename)
    return true_tree
def identifier_to_tree(identifier, leaves=None, inner_nodes=None, branch_lengths=None, admixture_proportions=None):
    '''
    Build a dictionary tree from an identifier such as qwert-uio-asdfg-jk.

    The identifier is split on '-' into levels and every level on '.' into one entry per
    currently traced lineage. An entry is one of
        'c'    : the lineage coalesces into a new inner node, which replaces it,
        'w'    : the lineage is left untouched at this level,
        'a'    : the lineage passes through a new admixture node, which replaces it and adds a second lineage,
        number : the lineage joins the node created by the 'c' entry at that position and stops being traced.

    leaves, inner_nodes, branch_lengths and admixture_proportions are callables that supply new
    values. Defaults: sorted trivial node names, generate_numbered_nodes('n'), a constant 1.0 and
    a constant 0.4.

    The entry of the last key that was created or looked up is treated as the root: it is deleted
    and passed to rename_root. The result is returned through insert_children_in_tree.
    '''
    levels = identifier.split('-')
    n_leaves = len(levels[0].split('.'))

    if leaves is None:
        leaf_values = sorted(get_trivial_nodes(n_leaves))
    else:
        leaf_values = [leaves() for _ in range(n_leaves)]
    tree = dict((leaf, [None] * 5) for leaf in leaf_values)
    # Each traced lineage is (node key, branch number of that node).
    trace_lineages = [(leaf, 0) for leaf in leaf_values]

    if inner_nodes is None:
        inner_nodes = generate_numbered_nodes('n')
    if branch_lengths is None:
        branch_lengths = lambda: 1.0
    if admixture_proportions is None:
        admixture_proportions = lambda: 0.4

    def attach(graph, lineage, parent_key):
        # Hang the lineage under parent_key with a freshly drawn branch length.
        child_key, child_branch = lineage
        return update_parent_and_branch_length(graph, child_key, child_branch, parent_key, branch_lengths())

    for level in levels:
        markers = level.split('.')
        assert len(trace_lineages) == len(markers), (
            'the number of traced lineages did not match the number of lineages in the identifier '
            '\n\ntrace_lineages:\n' + str(trace_lineages) +
            '\n\nidentifier_lineages:\n' + str(markers))
        parent_index = {}  # position of a 'c' entry -> key of the node created there
        finished = set()  # positions of lineages that stop being traced after this level
        for position, marker in enumerate(markers):
            if marker == 'w':
                continue
            if marker == 'c':
                # Coalescence: this lineage is replaced by the new node.
                new_key = inner_nodes()
                tree = attach(tree, trace_lineages[position], new_key)
                tree[new_key] = [None] * 5
                parent_index[position] = new_key
                trace_lineages[position] = (new_key, 0)
            elif marker == 'a':
                # Admixture: the new node continues along two branches.
                new_key = inner_nodes(admixture=True)
                tree = attach(tree, trace_lineages[position], new_key)
                tree[new_key] = [None, None, admixture_proportions(), None, None]
                trace_lineages[position] = (new_key, 0)
                trace_lineages.append((new_key, 1))
            else:
                # Coalescence in which this lineage disappears into the node made at another position.
                try:
                    new_key = parent_index[int(marker)]
                except KeyError as e:
                    # NOTE(review): after printing the diagnostics, execution continues with the
                    # previous value of new_key - confirm that this is intended.
                    print(e)
                    print(' '.join(('new_key', str(new_key))))
                    print(' '.join(('parent_index', str(parent_index))))
                    print(' '.join(('identifier_lineage', str(marker))))
                    print(pretty_string(insert_children_in_tree(tree)))
                tree = attach(tree, trace_lineages[position], new_key)
                finished.add(position)
        trace_lineages = [lineage for position, lineage in enumerate(trace_lineages) if position not in finished]

    root_key = new_key
    del tree[root_key]
    tree = rename_root(tree, new_key)
    return insert_children_in_tree(tree)