def mcmcmc(observed_covariance, df , outgroup=False, chains=8, its=[50]*100):
    """Run ``MCMCMC`` on ``observed_covariance`` and return the layer-0 samples.

    :param observed_covariance: handed to ``posterior_class``; its ``shape[0]``
        fixes how many leaf names ``s1 .. sK`` are generated.
    :param df: forwarded to ``posterior_class`` as ``M``.
    :param outgroup: forwarded to ``options_object``.
    :param chains: number of chains; also passed as ``cores`` and ``no_chains``.
    :param its: iteration scheme forwarded unchanged to ``MCMCMC``.
    :return: rows of the ``MCMCMC`` result with ``layer == 0``, restricted to the
        columns ``iteration``, ``posterior``, ``tree`` and ``no_admixes``.
    """
    no_rows = observed_covariance.shape[0]
    nodes = ['s' + str(number) for number in range(1, no_rows + 1)]
    # Not used further down; kept because it still calls simulate_tree once.
    start_x = (identifier_to_tree_clean(simulate_tree(4, 0)), 0)
    summaries = [
        summary.s_posterior(),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_variable('add', output='double'),
        summary.s_no_admixes(),
    ]
    options = options_object(outgroup, chains=chains)
    proposal = make_proposal(options)
    posterior_function = posterior_class(observed_covariance,
                                         M=df,
                                         nodes=nodes)

    # The first chain gets its own printing scheme; the remaining chains all
    # share one scheme in which every summary maps to (1, 0).
    first_chain_scheme = {
        'posterior': (1, 200),
        'tree': (1, 0),
        'add': (1, 200),
        'no_admixes': (1, 200)
    }
    other_chain_scheme = {s.name: (1, 0) for s in summaries}
    sample_verbose_scheme = ([first_chain_scheme] +
                             [other_chain_scheme] * (chains - 1))

    starting_trees = [(identifier_to_tree_clean(simulate_tree(4, 0)), 0)
                      for _ in range(chains)]
    temperature_scheme = fixed_geometrical(800, chains)
    res = MCMCMC(starting_trees=starting_trees,
                 posterior_function=posterior_function,
                 summaries=summaries,
                 temperature_scheme=temperature_scheme,
                 printing_schemes=sample_verbose_scheme,
                 iteration_scheme=its,
                 overall_thinnings=40,
                 proposal_scheme=proposal,
                 cores=chains,
                 no_chains=chains,
                 multiplier=None,
                 result_file=None,
                 store_permuts=False,
                 stop_criteria=None)
    wanted_columns = ['iteration', 'posterior', 'tree', 'no_admixes']
    return res.loc[res.layer == 0, wanted_columns]
def read_tree(input, nodes):
    """Return a tree for ``input``, converting it when it is a string.

    A non-string ``input`` is returned untouched. A string without ``';'`` is
    treated as a filename and replaced by ``read_one_line_skip(filename=input)``
    before conversion. The string is converted with ``identifier_to_tree_clean``
    using leaves generated from a deep copy of ``nodes``.
    """
    if not isinstance(input, basestring):
        return input
    if ';' not in input:
        input = read_one_line_skip(filename=input)
    leaves = generate_predefined_list_string(deepcopy(nodes))
    return identifier_to_tree_clean(input, leaves=leaves)
def get_most_likely_subgraphs_list(strees, nodes, subgraph_keys, sort_nodes=True):
    """Group the sub-graphs of ``strees`` by their topology string.

    Each entry of ``strees`` is converted with ``identifier_to_tree_clean``,
    reduced with ``get_subtree(tree, subgraph_keys)`` and serialised with
    ``get_unique_plottable_tree``. The serialised string is split on ``';'``
    into topology, branch lengths and admixture proportions.

    :param strees: sequence of tree strings (``len`` is taken of it).
    :param nodes: leaf names; sorted first when ``sort_nodes`` is true.
    :param subgraph_keys: forwarded to ``get_subtree``.
    :param sort_nodes: whether to sort ``nodes`` before use.
    :return: dict mapping each sub-topology string to
        ``[list_of_branch_length_lists, list_of_admixture_proportion_lists]``,
        in the order the trees were seen.
    """
    if sort_nodes:
        nodes = sorted(nodes)
    topologies = {}
    n = len(strees)
    # Progress is printed roughly every tenth of the input. The stride is
    # clamped to 1 because ``n // 10`` is 0 for fewer than ten trees, which
    # would make the modulo below divide by zero.
    report_every = max(1, n // 10)
    for i, stree in enumerate(strees):
        if i % report_every == 0:
            print(float(i) / n)
        tree = identifier_to_tree_clean(
            stree, leaves=generate_predefined_list_string(deepcopy(nodes)))
        sub_tree = get_subtree(tree, subgraph_keys)
        sub_stree = get_unique_plottable_tree(sub_tree)
        sub_topology, sbranch_lengths, sadmixture_proportions = sub_stree.split(
            ';')
        # Real lists (not lazy ``map`` objects) so the stored values can be
        # indexed and re-read by callers.
        branch_lengths = [float(b) for b in sbranch_lengths.split('-')]
        if sadmixture_proportions:
            admixture_proportions = [
                float(a) for a in sadmixture_proportions.split('-')
            ]
        else:
            admixture_proportions = []
        entry = topologies.setdefault(sub_topology, [[], []])
        entry[0].append(branch_lengths)
        entry[1].append(admixture_proportions)
    return topologies
def get_empirical_matrix(stree, factor=1.0, pop_size=20, reps=400):
    """Build an empirical covariance for ``stree`` through an ms run.

    The tree obtained from ``identifier_to_tree_clean(stree)`` is scaled with
    ``scale_tree_copy(tree, factor)`` and turned into an ms command. The
    command is run with its output going to ``tmp.txt``, which is then read by
    ``ms_to_treemix2`` (second file ``tmp.treemix_in``).

    :return: ``reduce_covariance(empirical_covariance, 0)``.
    """
    tree = identifier_to_tree_clean(stree)
    scaled_tree = scale_tree_copy(tree, factor)
    ms_command = tree_to_ms_command(scaled_tree, pop_size, reps)
    call_ms_string(ms_command, 'tmp.txt')
    no_pops = get_number_of_leaves(tree)
    empirical_covariance = ms_to_treemix2(filename='tmp.txt',
                                          samples_per_pop=pop_size,
                                          no_pops=no_pops,
                                          n_reps=reps,
                                          filename2='tmp.treemix_in')
    return reduce_covariance(empirical_covariance, 0)
def main(args):
    """Write one qpgraph file per most-frequent posterior topology.

    Reads the columns ``string_tree`` and ``topology`` from the posterior
    distribution file, counts the topologies, and for each of the
    ``--no_topologies_to_plot`` most common ones converts the first row carrying
    that topology with ``identifier_to_tree_clean`` and hands it to ``ab2qpg``.
    Output files are named ``<outfile_prefix>qp<rank>.graph`` with rank starting
    at 1.

    :param args: list of command-line arguments (without the program name).
    :raises IndexError: if the input file has no data rows (the leaf names are
        taken from the first ``string_tree`` entry).
    """
    parser = ArgumentParser(
        usage='pipeline for plotting posterior distribution summaries.')
    # ``ArgumentParser(version=...)`` is deprecated and rejected by newer
    # argparse versions; the explicit action provides the same -v/--version.
    parser.add_argument('-v', '--version', action='version', version='1.0.0')
    parser.add_argument(
        '--posterior_distribution_file',
        required=True,
        type=str,
        help='The file containing posterior distributions from the '
        '"AdmixtureBayes posterior" command. It needs the two columns '
        '"string_tree" and "topology".')
    parser.add_argument(
        '--no_topologies_to_plot',
        default=10,
        type=int,
        help='The number of the most posterior topologies to transform to qpgraphs')
    # Accepted for command-line compatibility; not read by this function.
    parser.add_argument(
        '--consensus_threshold',
        default=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        type=float,
        nargs='+',
        help='The posterior thresholds for which to draw different consensus trees.')
    parser.add_argument('--sep',
                        default=',',
                        type=str,
                        help='the separator used in the input file')
    parser.add_argument(
        '--outfile_prefix',
        default='',
        type=str,
        help='beginning of all files where the qp graphs are saved')
    options = parser.parse_args(args)

    df = pd.read_csv(options.posterior_distribution_file,
                     sep=options.sep,
                     usecols=['string_tree', 'topology'])
    stree_list = df['string_tree'].tolist()
    # Every '='-separated field except the last one is a leaf name; the last
    # field is the tree string itself.
    nodes = stree_list[0].split('=')[:-1]
    topologies = df['topology'].tolist()

    # Row of the first occurrence of every topology, built in a single pass
    # instead of rescanning the list for each plotted topology.
    first_row = {}
    for row, topology in enumerate(topologies):
        first_row.setdefault(topology, row)

    most_common = Counter(topologies).most_common(
        options.no_topologies_to_plot)
    for n, (string_topology, _count) in enumerate(most_common):
        stree = stree_list[first_row[string_topology]]
        Rtree = identifier_to_tree_clean(
            stree.split('=')[-1],
            leaves=generate_predefined_list_string(deepcopy(nodes)))
        ab2qpg(Rtree, options.outfile_prefix + 'qp' + str(n + 1) + '.graph')
def visualize_topology(stree):
    """Plot ``stree`` to a randomly numbered PNG and return the file name.

    A string containing ``';'`` is converted with ``identifier_to_tree_clean``;
    any other string goes through ``topological_identifier_to_tree_clean``.
    The file is named ``tree<number>.png`` with the number drawn from
    ``np.random.randint(0, 1000000)``.
    """
    numba = str(np.random.randint(0, 1000000))
    filename = 'tree' + numba + '.png'
    if ';' in stree:
        tree = identifier_to_tree_clean(stree)
    else:
        tree = topological_identifier_to_tree_clean(stree)
    plot_as_directed_graph(tree, drawing_name=filename, popup=False)
    return filename
def read_tree_file(filename):
    """Read leaf names and a tree string from ``filename``.

    The first line holds the whitespace-separated leaf names and the second
    line the tree string. Both the raw lines and the leaf names are printed.

    :return: ``(tree, nodes)`` where ``tree`` comes from
        ``identifier_to_tree_clean`` and ``nodes`` is the list of leaf names.
    """
    with open(filename, 'r') as f:
        lines = f.readlines()
    print(lines)
    nodes = lines[0].rstrip().split()
    print(nodes)
    tree_string = lines[1].rstrip()
    leaves = generate_predefined_list_string(deepcopy(nodes))
    tree = identifier_to_tree_clean(tree_string, leaves=leaves)
    return tree, nodes
def get_posterior_A_matrices(outfile, add_multiplier=1, nodes=None, outgroup='out', thinning=100):
    """Return the reduced covariance matrices of thinned layer-0 samples.

    Reads the columns ``tree``, ``add`` and ``layer`` from ``outfile``, keeps
    the rows with ``layer == 0``, drops the first half of them and then takes
    every ``thinning``-th row. For each remaining row the tree is converted
    with ``identifier_to_tree_clean``, extended with ``add_outgroup`` (root
    length ``float(add) * add_multiplier``) and turned into a covariance with
    ``make_covariance(tree, node_keys=nodes)``.

    :param outfile: path of the CSV file to read.
    :param add_multiplier: factor applied to the ``add`` column.
    :param nodes: forwarded to ``make_covariance`` as ``node_keys``.
    :param outgroup: name given to the added outgroup.
    :param thinning: step between kept rows.
    :return: list with ``Areduce(cov)`` for every kept row (empty if no row
        survives the filtering).
    """
    samples = pd.read_csv(outfile, usecols=['tree', 'add', 'layer'])
    cold_chain = samples.loc[samples.layer == 0, :]
    # Floor division keeps the slice start an integer even when ``/`` is true
    # division; a float start is not a valid slice bound.
    first_kept = int(cold_chain.shape[0]) // 2
    kept = cold_chain[first_kept::thinning]
    AmatricesA = []
    for stree, add in zip(kept['tree'], kept['add']):
        tree = identifier_to_tree_clean(stree)
        tree = add_outgroup(tree,
                            inner_node_name='new_node',
                            to_new_root_length=float(add) * add_multiplier,
                            to_outgroup_length=0,
                            outgroup_name=outgroup)
        cov = make_covariance(tree, node_keys=nodes)
        AmatricesA.append(Areduce(cov))
    return AmatricesA
def mcmc(observed_covariance, df, outgroup=False):
    """Run a single ``basic_chain`` on ``observed_covariance``.

    One leaf name ``s1 .. sK`` is generated per row of ``observed_covariance``
    and the chain starts from a ``simulate_tree(4, 0)`` tree. The chain runs
    with ``N=5000``, ``overall_thinning=100`` and ``temperature=1.0``.

    :param observed_covariance: handed to ``posterior_class``.
    :param df: forwarded to ``posterior_class`` as ``M``.
    :param outgroup: forwarded to ``options_object``.
    :return: element ``[2]`` of the value returned by ``basic_chain``.
    """
    no_rows = observed_covariance.shape[0]
    nodes = ['s' + str(number) for number in range(1, no_rows + 1)]
    start_x = (identifier_to_tree_clean(simulate_tree(4, 0)), 0)
    summaries = [
        summary.s_posterior(),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_variable('add', output='double'),
        summary.s_no_admixes(),
    ]
    options = options_object(outgroup)
    # Only the first element of what make_proposal returns is used here.
    proposal = make_proposal(options)[0]
    posterior_function = posterior_class(observed_covariance,
                                         M=df,
                                         nodes=nodes)
    sample_verbose_scheme = {
        'posterior': (1, 200),
        'tree': (1, 0),
        'add': (1, 200),
        'no_admixes': (1, 200)
    }
    chain_output = basic_chain(start_x,
                               summaries,
                               posterior_function,
                               proposal,
                               post=None,
                               N=5000,
                               sample_verbose_scheme=sample_verbose_scheme,
                               overall_thinning=100,
                               i_start_from=0,
                               temperature=1.0,
                               proposal_update=None,
                               multiplier=None,
                               check_trees=False,
                               appending_result_file=None,
                               appending_result_frequency=10)
    return chain_output[2]
def __call__(self, Rtree=None, add=None, **kwargs):
    """Produce the ``full_tree`` entry and a flag for the caller.

    With an ``Rtree`` the tree is deep-copied and extended by ``add_outgroup``
    (root length ``float(add) * self.add_multiplier``, outgroup named
    ``self.outgroup_name``). Without one, ``kwargs`` must carry ``sfull_tree``
    and ``full_nodes``, from which the tree is built with
    ``identifier_to_tree_clean``. In both cases the tree is reduced with
    ``get_subtree`` when ``self.subnodes`` is truthy.

    :return: ``({'full_tree': tree}, flag)``. ``flag`` is ``True`` only on the
        ``sfull_tree`` path, when ``self.remove_sadtrees`` is truthy and
        ``admixes_are_sadmixes(tree)`` is falsy.
    """
    if Rtree is not None:
        grafted = add_outgroup(deepcopy(Rtree),
                               inner_node_name='new_node',
                               to_new_root_length=float(add) *
                               self.add_multiplier,
                               to_outgroup_length=0,
                               outgroup_name=self.outgroup_name)
        if self.subnodes:
            grafted = get_subtree(grafted, self.subnodes)
        return {'full_tree': grafted}, False

    assert 'sfull_tree' in kwargs, 'sfull_tree not specified'
    leaf_names = sorted(kwargs['full_nodes'])
    tree_string = kwargs['sfull_tree']
    leaves = generate_predefined_list_string(deepcopy(leaf_names))
    parsed = identifier_to_tree_clean(tree_string, leaves=leaves)
    if self.subnodes:
        parsed = get_subtree(parsed, self.subnodes)
    flag = False
    if self.remove_sadtrees and (not admixes_are_sadmixes(parsed)):
        flag = True
    return {'full_tree': parsed}, flag
def __call__(self, tree, **not_needed):
    """Produce the ``Rtree`` entry for the string ``tree`` and a flag.

    ``tree`` is converted with ``identifier_to_tree_clean`` using leaves
    generated from a deep copy of ``self.nodes`` and, when ``self.subnodes`` is
    truthy, reduced with ``get_subtree``. If that reduction raises an
    ``AssertionError`` the tree is printed and plotted, the inputs are printed,
    and a new ``AssertionError`` is raised.

    :return: ``({'Rtree': tree}, flag)``. ``flag`` is ``True`` when
        ``self.remove_sadtrees`` is truthy and ``admixes_are_sadmixes(tree)`` is
        falsy (a message is printed in that case), otherwise ``False``.
    """
    leaves = generate_predefined_list_string(deepcopy(self.nodes))
    Rtree = identifier_to_tree_clean(tree, leaves=leaves)
    if self.subnodes:
        # THIS DOES NOT PROPERLY TAKE THE COVARIANCE MATRICES INTO ACCOUNT,
        # WHICH END UP WRONG.
        try:
            Rtree = get_subtree(Rtree, self.subnodes)
        except AssertionError:
            print(pretty_string(Rtree))
            from tree_plotting import plot_as_directed_graph
            plot_as_directed_graph(Rtree)
            print('input_tree %s' % (tree,))
            print('nodes %s' % (self.nodes,))
            print('subnodes %s' % (self.subnodes,))
            assert False
    flag = False
    if self.remove_sadtrees and (not admixes_are_sadmixes(Rtree)):
        print('returned true because adtrees are not sad')
        flag = True
    return {'Rtree': Rtree}, flag
def analyze_tree(topology, branches, admixtures):
    """Map every permuted topology of a tree to a canonical one.

    Branch lengths are replaced by their positions ``0..B-1`` and admixture
    proportions by ``1..A`` (counts taken from the ``'-'``-separated inputs),
    so that the values found in each string returned by
    ``get_possible_permutation_strees`` tell where each entry ended up.

    :return: dict mapping each permuted topology string to
        ``(top_topology, branches_permutation, admixtures_permutation)``, where
        ``top_topology`` is the topology of the first permuted string in sorted
        order.
    """
    no_branches = len(branches.split('-'))
    no_admixtures = len(admixtures.split('-'))
    id_branches = '-'.join(str(k) for k in range(no_branches))
    id_admixtures = '-'.join(str(k) for k in range(1, no_admixtures + 1))
    id_stree = ';'.join([topology, id_branches, id_admixtures])
    id_tree = identifier_to_tree_clean(id_stree.strip())
    strees = sorted(get_possible_permutation_strees(id_tree))
    top_topology = strees[0].split(';')[0]
    res = {}
    for stree in strees:
        lookup_topology, branches_sperm, admixtures_sperm = stree.split(';')
        # The positions come back as float strings; round before truncating.
        branches_permutation = [
            int(round(float(position)))
            for position in branches_sperm.split('-')
        ]
        admixtures_permutation = get_admixtures_permutation(admixtures_sperm)
        res[lookup_topology] = (top_topology, branches_permutation,
                                admixtures_permutation)
    return res
def see_covariance_matrix(stree, reduce=None, factor=1.0):
    """Return the covariance of ``stree`` multiplied by ``factor``.

    The covariance is ``make_covariance(identifier_to_tree_clean(stree))``.
    When ``reduce`` is anything other than ``None`` it is first passed through
    ``reduce_covariance(..., 0)``; the value of ``reduce`` itself is not used.
    """
    covariance = make_covariance(identifier_to_tree_clean(stree))
    if reduce is not None:
        covariance = reduce_covariance(covariance, 0)
    return covariance * factor
def identifier_to_tree_clean_wrapper(stree):
    """Call ``identifier_to_tree_clean`` with ``stree`` as its only argument."""
    tree = identifier_to_tree_clean(stree)
    return tree
def add_random_admix(stree, *kwargs):
    """Apply ``addadmix`` to ``stree`` and return the new tree as a string.

    Extra positional arguments are forwarded to ``addadmix`` after the tree
    (despite its name, ``kwargs`` collects positional arguments). The new nodes
    are named ``x1`` and ``x2``.

    :return: ``unique_identifier_and_branch_lengths`` of the first element
        returned by ``addadmix``.
    """
    tree = identifier_to_tree_clean(stree)
    admixed_tree = addadmix(tree, new_node_names=['x1', 'x2'], *kwargs)[0]
    return unique_identifier_and_branch_lengths(admixed_tree)
def plot_big_tree(stree):
    """Convert ``stree`` with ``identifier_to_tree_clean`` and plot the result."""
    tree = identifier_to_tree_clean(stree)
    plot_as_directed_graph(tree)
def plot_minimal_topology(stree):
    """Plot the node structure of ``stree`` under the name ``'minimal'``.

    The tree from ``identifier_to_tree_clean`` is turned into node combinations
    and then into a node structure, which is passed to ``plot_node_structure``.
    """
    combinations = tree_to_node_combinations(identifier_to_tree_clean(stree))
    structure = node_combination_to_node_structure(combinations)
    plot_node_structure(structure, 'minimal')
'n6': ['n15', None, None, 0.002455554, None, 's8', 'a2'] } #print plot_as_directed_graph(tree) sub_tree = get_subtree(tree, ['s1', 's2', 's3']) #print plot_as_directed_graph(sub_tree) print pretty_string(sub_tree) #plots=get_unique_plottable_tree(sub_tree) #print 'gotten unique_plottable' #print plots stree_difficult = 'a.w.w.c.c.w.c.4.3.6-c.w.0.w.c.w.w.4-c.w.w.w.w.0-c.w.w.w.0-c.0.w.w-c.0.w-c.0;0.014843959-0.003602704-0.002128203-0.027030132-0.008484730-0.067616899-0.021207056-0.027455759-0.011647297-0.009065170-0.053386961-0.001718477-0.009310923-0.010471979-0.036314546-0.004808845-0.055956235-0.004694887-0.003482668-0.039323330-0.014821628;1.000' from tree_statistics import (identifier_to_tree_clean, generate_predefined_list_string, identifier_file_to_tree_clean, unique_identifier_and_branch_lengths) from Rtree_to_covariance_matrix import make_covariance nodes = sorted(['s' + str(i + 1) for i in range(10)]) tree_difficult = identifier_to_tree_clean( stree_difficult, leaves=generate_predefined_list_string(deepcopy(nodes))) cov1 = make_covariance(tree_difficult) tree_difficult2 = remove_non_mixing_admixtures(deepcopy(tree_difficult)) cov2 = make_covariance(tree_difficult2) print cov1 print cov2 print cov1 - cov2 print pretty_string(tree_difficult) print get_branches_to_keep(tree_difficult, ['s1', 's2', 's3']) sub_tree = get_subtree(tree_difficult, ['s1', 's2', 's3']) print pretty_string(sub_tree)
def plot_string_tree(stree):
    """Convert ``stree`` with ``identifier_to_tree_clean`` and pass it to ``plot_graph``."""
    tree = identifier_to_tree_clean(stree)
    plot_graph(tree)
def run_posterior_multichain(wishart_df=1000, true_tree_as_identifier=None, result_file='result_mc3.csv', emp_cov_file=None, emp_remove=-1, remove_outgroup=False, make_emp_cov_file=True):
    """Run the multichain posterior sanity test and write a summary CSV.

    :param wishart_df: forwarded to ``test_posterior_model_multichain``.
    :param true_tree_as_identifier: only tested against ``None``. ``None``
        selects ``Rcatalogue_of_trees.tree_good``; any other value selects the
        tree identifier hard-coded below (the argument's value is not read).
    :param result_file: forwarded to ``test_posterior_model_multichain``.
    :param emp_cov_file: file the empirical covariance is written to (when
        ``make_emp_cov_file``) and read back from; ``None`` means no empirical
        covariance is read.
    :param emp_remove: when non-negative, passed as second argument to
        ``tree_to_data.file_to_emp_cov``.
    :param remove_outgroup: remove the outgroup from the true tree and reorder
        its leaves before sampling.
    :param make_emp_cov_file: simulate an empirical covariance with
        ``tree_to_data.get_empirical_matrix`` and store it in ``emp_cov_file``.
    """
    hard_coded_identifier = 'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
    if true_tree_as_identifier is None:
        true_tree = Rcatalogue_of_trees.tree_good
        true_tree_s = None
    else:
        true_tree_s = hard_coded_identifier
        true_tree = tree_statistics.identifier_to_tree_clean(true_tree_s)

    # The string handed to get_empirical_matrix is bound explicitly; it used to
    # be read from a name that was never assigned before this point.
    # NOTE(review): the string describes the tree *before* outgroup removal,
    # on the assumption that emp_remove later drops the outgroup from the
    # stored matrix - confirm.
    if make_emp_cov_file and true_tree_s is None:
        true_tree_s = tree_statistics.unique_identifier_and_branch_lengths(
            true_tree)

    if remove_outgroup:
        true_tree = Rtree_operations.remove_outgroup(true_tree)
        true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1(
            true_tree)
    if make_emp_cov_file:
        cov = tree_to_data.get_empirical_matrix(true_tree_s,
                                                factor=0.01,
                                                reps=400)
        tree_to_data.emp_cov_to_file(cov, filename=emp_cov_file)
    print('true_tree %s' %
          (tree_statistics.unique_identifier_and_branch_lengths(true_tree),))
    no_leaves = Rtree_operations.get_no_leaves(true_tree)
    s_tree = Rtree_operations.create_burled_leaved_tree(no_leaves, 1.0)
    print('no_leaves %s' % (no_leaves,))
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_basic_tree_statistics(tree_statistics.majority_tree,
                                        'majority_tree',
                                        output='string'),
        summary.s_variable('add', output='double'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_likelihood(),
        summary.s_prior(),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(prior_name, output='double_missing')
        for prior_name in
        ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    if emp_cov_file is not None:
        if emp_remove < 0:
            emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file)
        else:
            emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file, emp_remove)
    else:
        emp_cov = None
    print('emp_cov %s' % (emp_cov,))
    r = simulation_sanity.test_posterior_model_multichain(
        true_tree,
        s_tree, [50] * 20000,
        summaries=summaries,
        thinning_coef=24,
        wishart_df=wishart_df,
        result_file=result_file,
        emp_cov=emp_cov,
        rescale_empirical_cov=False)
    # The label says 'true_tree' but the value printed is derived from the
    # object returned by the sampler run.
    print('true_tree %s' %
          (tree_statistics.unique_identifier_and_branch_lengths(r),))
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def initialize_posterior2(emp_cov=None,
                          true_tree=None,
                          M=None,
                          use_skewed_distr=False,
                          p=0.5,
                          rescale=False,
                          model_choice=[
                              'empirical covariance', 'true tree covariance',
                              'wishart on true tree covariance',
                              'empirical covariance on true tree',
                              'no likelihood'
                          ],
                          simulate_true_tree=False,
                          true_tree_no_leaves=None,
                          true_tree_no_admixes=None,
                          nodes=None,
                          simulate_true_tree_with_skewed_prior=False,
                          reduce_cov=None,
                          add_outgroup_to_true_tree=False,
                          reduce_true_tree=False):
    """Build the ``posterior(x, pks)`` function used by the sampler.

    ``model_choice`` may be a string or a sequence; for a sequence only its
    first element is used.

    Return value (the shape differs between paths):

    * ``'no likelihood'``: ``(initialize_prior_as_posterior(), {})``.
    * ``rescale`` truthy: ``(posterior, multiplier)``.
    * otherwise: ``posterior`` alone.

    ``posterior(x, pks)`` returns ``(likelihood_value, prior_value)``, or
    ``(-inf, prior_value)`` when the prior is ``-inf``, and stores both values
    in ``pks`` under ``'prior'`` and ``'likelihood'``.

    ``add_outgroup_to_true_tree`` is not read anywhere in this function.

    NOTE(review): the statement nesting below was reconstructed from the
    statement order; confirm which statements belong inside the true-tree
    branch.
    """
    if not isinstance(model_choice, basestring):
        model_choice = model_choice[0]
    if model_choice == 'no likelihood':
        return initialize_prior_as_posterior(), {}
    if (model_choice == 'true tree covariance'
            or model_choice == 'wishart on true tree covariance'
            or model_choice == 'empirical covariance on true tree'):
        if simulate_true_tree:
            true_tree = generate_phylogeny(
                true_tree_no_leaves, true_tree_no_admixes, nodes,
                simulate_true_tree_with_skewed_prior)
        elif isinstance(true_tree, basestring):
            if ';' in true_tree:  #this means that the true tree is a s_tree
                true_tree_s = true_tree
                true_tree = identifier_to_tree_clean(true_tree_s)
            else:
                # Otherwise the string is used as a file name whose first line
                # holds the tree string.
                with open(true_tree, 'r') as f:
                    true_tree_s = f.readline().rstrip()
                true_tree = identifier_to_tree_clean(true_tree_s)
        true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1(
            true_tree)
        # no_leaves, no_admixes and cov are computed but not read again in
        # this function.
        no_leaves = get_number_of_leaves(true_tree)
        no_admixes = get_number_of_admixes(true_tree)
        cov = make_covariance(true_tree)
        if reduce_cov is not None:
            pass
        # NOTE(review): the default reduce_true_tree=False is "not None", so
        # remove_outgroup is called with False by default - confirm intent.
        if reduce_true_tree is not None:
            true_tree = Rtree_operations.remove_outgroup(
                true_tree, reduce_true_tree)
            if reduce_true_tree == 's1' or reduce_true_tree == 0:
                pass
        if emp_cov is not None:
            if isinstance(emp_cov, basestring):
                pass
    if M is None:
        M = n_mark(emp_cov)
    if rescale:
        emp_cov, multiplier = rescale_empirical_covariance(emp_cov)
        print 'multiplier is', multiplier

    # NOTE(review): the default ``pks={}`` is one dict shared by every call
    # that omits ``pks``, and it is written to below.
    def posterior(x, pks={}):
        #print tot_branch_length
        prior_value = prior(x,
                            p=p,
                            use_skewed_distr=use_skewed_distr,
                            pks=pks)
        # Skip the likelihood when the prior already rules x out.
        if prior_value == -float('inf'):
            return -float('inf'), prior_value
        likelihood_value = likelihood(x, emp_cov, M=M)
        pks['prior'] = prior_value
        pks['likelihood'] = likelihood_value
        #pks['posterior']=prior_value+likelihood_value
        return likelihood_value, prior_value

    if rescale:
        return posterior, multiplier
    return posterior
def run_d(true_tree_as_file=None):
    """Run ``simulation_sanity.test_posterior_model`` and write a summary CSV.

    Without ``true_tree_as_file`` both the true tree and the starting tree come
    from identifiers hard-coded below and are pretty-printed. With a file, the
    true tree is read from its first line and the starting tree is
    ``create_trivial_tree`` with the same number of leaves.

    The sampler is run for 100000 iterations with ``thinning_coef=20`` and
    ``wishart_df=10000``.
    """
    if true_tree_as_file is not None:
        with open(true_tree_as_file, 'r') as f:
            s = f.readline().rstrip()
        true_tree = tree_statistics.identifier_to_tree_clean(s)
        no_leaves = Rtree_operations.get_number_of_leaves(true_tree)
        s_tree = Rtree_operations.create_trivial_tree(no_leaves)
    else:
        true_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
        )
        s_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23'
        )
        print(Rtree_operations.pretty_string(s_tree))
        print(Rtree_operations.pretty_string(true_tree))

    tree_stat = summary.s_basic_tree_statistics
    variable = summary.s_variable

    # Built up in the same order as the columns are expected downstream.
    summaries = []
    summaries.append(summary.s_posterior())
    summaries.append(variable('mhr', output='double_missing'))
    summaries.append(summary.s_no_admixes())
    summaries.append(summary.s_tree_identifier())
    summaries.append(summary.s_average_branch_length())
    summaries.append(summary.s_total_branch_length())
    summaries.append(
        tree_stat(Rtree_operations.get_number_of_ghost_populations,
                  'ghost_pops',
                  output='integer'))
    summaries.append(
        tree_stat(Rtree_operations.get_max_distance_to_root, 'max_root'))
    summaries.append(
        tree_stat(Rtree_operations.get_min_distance_to_root, 'min_root'))
    summaries.append(
        tree_stat(Rtree_operations.get_average_distance_to_root,
                  'average_root'))
    summaries.append(
        tree_stat(tree_statistics.get_admixture_proportion_string,
                  'admixtures',
                  output='string'))
    summaries.append(
        tree_stat(tree_statistics.unique_identifier_and_branch_lengths,
                  'tree',
                  output='string'))
    summaries.append(
        tree_stat(tree_statistics.majority_tree,
                  'majority_tree',
                  output='string'))
    summaries.append(variable('add', output='double'))
    summaries.append(
        variable('sliding_rescale_adap_param', output='double_missing'))
    summaries.append(variable('cutoff_distance', output='double_missing'))
    summaries.append(variable('number_of_pieces', output='double_missing'))
    summaries.append(variable('proposal_type', output='string'))
    summaries.append(
        variable('sliding_regraft_adap_param', output='double_missing'))
    summaries.append(
        variable('rescale_constrained_adap_param', output='double_missing'))
    summaries.append(variable('rescale_adap_param', output='double_missing'))
    summaries.append(summary.s_tree_identifier_new_tree())
    for prior_name in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']:
        summaries.append(variable(prior_name, output='double_missing'))

    r = simulation_sanity.test_posterior_model(
        true_tree,
        s_tree,
        100000,
        summaries=summaries,
        thinning_coef=20,
        wishart_df=10000,
        resimulate_regrafted_branch_length=False)
    print('true_tree %s' %
          (tree_statistics.unique_identifier_and_branch_lengths(r),))
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def run_analysis_of_proposals():
    """Run ``test_posterior_model`` with per-proposal difference summaries.

    The sampler starts from, and is compared against,
    ``Rcatalogue_of_trees.tree_good``. It runs for 100000 iterations with
    ``thinning_coef=2``, ``wishart_df=1000`` and ``big_posterior=True``, after
    which a summary CSV is generated.
    """
    # This converted tree is immediately replaced by tree_good below; the
    # conversion is still performed.
    true_tree = tree_statistics.identifier_to_tree_clean(
        'w.w.c.w.w.w.2.w-w.w.a.w.w.w.w-w.c.1.w.c.w.w.4-w.c.1.w.w.w-w.c.1.w.w-c.0.w.w-c.w.0-a.w-c.0.w-c.0;0.091-1.665-0.263-0.821-0.058-0.501-0.141-0.868-5.064-0.153-0.372-3.715-1.234-0.913-2.186-0.168-0.542-0.056-2.558-0.324;0.367-0.451'
    )
    true_tree = Rcatalogue_of_trees.tree_good
    # Created but not passed on: the true tree is used as starting tree.
    s_tree = Rtree_operations.create_trivial_tree(4)

    tree_stat = summary.s_basic_tree_statistics
    variable = summary.s_variable
    difference = summary.s_bposterior_difference

    summaries = []
    summaries.append(summary.s_posterior())
    summaries.append(variable('mhr'))
    summaries.append(summary.s_no_admixes())
    summaries.append(summary.s_tree_identifier())
    summaries.append(summary.s_average_branch_length())
    summaries.append(summary.s_total_branch_length())
    summaries.append(
        tree_stat(Rtree_operations.get_number_of_ghost_populations,
                  'ghost_pops',
                  output='integer'))
    summaries.append(
        tree_stat(Rtree_operations.get_max_distance_to_root, 'max_root'))
    summaries.append(
        tree_stat(Rtree_operations.get_min_distance_to_root, 'min_root'))
    summaries.append(
        tree_stat(Rtree_operations.get_average_distance_to_root,
                  'average_root'))
    summaries.append(
        tree_stat(tree_statistics.unique_identifier_and_branch_lengths,
                  'tree',
                  output='string'))
    summaries.append(
        tree_stat(tree_statistics.majority_tree,
                  'majority_tree',
                  output='string'))
    summaries.append(difference(lambda x: x[0], 'likelihood_difference'))
    summaries.append(difference(lambda x: x[1], 'prior_difference'))
    summaries.append(difference(lambda x: x[2][0], 'branch_prior_difference'))
    summaries.append(
        difference(lambda x: x[2][1], 'no_admix_prior_difference'))
    summaries.append(
        difference(lambda x: x[2][2], 'adix_prop_prior_difference'))
    summaries.append(difference(lambda x: x[2][3], 'top_prior_difference'))
    summaries.append(variable('proposal_type', output='string'))
    summaries.append(
        variable('sliding_regraft_adap_param', output='double_missing'))
    summaries.append(variable('rescale_adap_param', output='double_missing'))
    summaries.append(summary.s_tree_identifier_new_tree())
    for prior_name in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']:
        summaries.append(variable(prior_name, output='double_missing'))

    r = simulation_sanity.test_posterior_model(
        true_tree,
        true_tree,
        100000,
        summaries=summaries,
        thinning_coef=2,
        wishart_df=1000,
        resimulate_regrafted_branch_length=False,
        admixtures_of_true_tree=2,
        no_leaves_true_tree=4,
        big_posterior=True,
        rescale_empirical_cov=True)
    print('true_tree %s' %
          (tree_statistics.unique_identifier_and_branch_lengths(r),))
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def main(args):
    """Plot summaries of a posterior sample and optionally write rankings.

    Parameters
    ----------
    args : list of str
        Command-line tokens, handed unchanged to ``ArgumentParser.parse_args``.
        ``--posterior_distribution_file`` and ``--plot`` are required.

    Behaviour by ``--plot`` value
    -----------------------------
    consensus_trees
        Reads the ``pops`` column, counts how many rows each '-'-separated
        node occurs in and, for every ``--consensus_threshold`` value, builds
        a node structure from the nodes whose count exceeds
        ``int(N * threshold)``.  Each structure is plotted with
        ``drawing_name='consensus_<percent>.png'``.
    top_node_trees
        Counts whole ``pops`` strings and plots the most common ones with
        ``drawing_name='minimal_topology_<rank>.png'``.
    top_trees
        Counts ``topology`` strings and plots the most common ones with
        ``drawing_name='topology_<rank>.png'``.  Leaf names come from
        ``--nodes`` or, failing that, from the first ``pops`` entry.

    If ``--write_ranking_to_file`` is given, ``<key>,<relative frequency>``
    lines are written to that file.  ``--suppress_plot`` skips all plotting.

    Raises
    ------
    SystemExit
        Always: the function ends with ``sys.exit()`` (argparse also exits on
        ``--help``, ``--version`` and on invalid arguments).
    AssertionError
        In ``top_trees`` mode without ``--nodes`` when the leaves cannot be
        derived, or when ``--no_sort`` is combined with derived leaves.
    """
    parser = ArgumentParser(
        usage='pipeline for plotting posterior distribution summaries.')
    # ``ArgumentParser(version=...)`` is deprecated in Python 2.7 and rejected
    # by Python 3; an explicit version action keeps the same -v/--version
    # flags and works on both.
    parser.add_argument('-v', '--version', action='version', version='1.0.0')
    parser.add_argument(
        '--posterior_distribution_file',
        required=True,
        type=str,
        help=
        'The file containing posterior distributions from the "AdmixtureBayes posterior" command. It needs the two columns "pops" and topology.'
    )
    parser.add_argument(
        '--plot',
        choices=['consensus_trees', 'top_node_trees', 'top_trees'],
        required=True,
        help='The type of plot to make. Choose between: 1) consensus_trees. '
        'It plots an admixture graph based on all nodes that have a higher (marginal) posterior probability of X. '
        'Different X\'s can be supplied with the command --consensus_threshold \n'
        '2) top_node_trees. It plots the X highest posterior combinations of node types '
        'and creates the corresponding minimal topologies. X can be supplied through the command --top_node_trees_to_plot'
        '3) top_trees. It plots the X highest posterior topologies. X can be supplied by the command --top_trees_to_plot'
    )
    parser.add_argument('--outgroup',
                        default='outgroup',
                        help='name of the outgroup to plot')
    parser.add_argument(
        '--consensus_threshold',
        default=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        type=float,
        nargs='+',
        help=
        'The posterior thresholds for which to draw different consensus trees.'
    )
    parser.add_argument(
        '--top_node_trees_to_plot',
        type=int,
        default=3,
        help='The number of node trees (or minimal topologies) to plot')
    parser.add_argument('--top_trees_to_plot',
                        type=int,
                        default=3,
                        help='The number of trees (or topologies) to plot ')
    parser.add_argument(
        '--write_ranking_to_file',
        type=str,
        default='',
        help=
        'if a file is supplied here, the natural rankings for each of the plots is written here.'
    )
    parser.add_argument(
        '--rankings_to_write_to_file',
        type=int,
        default=1000,
        help=
        'the number of rankings(nodes, min topology or topology depending on --plot) to write to the ranking file.'
    )
    parser.add_argument(
        '--dont_annotate_node_posterior',
        default=False,
        action='store_true',
        help=
        'This will not color the nodes according to their posterior probability.'
    )
    parser.add_argument('--nodes',
                        default='',
                        type=str,
                        help='file where the first line is the leaf nodes')
    parser.add_argument('--suppress_plot', default=False, action='store_true')
    parser.add_argument(
        '--no_sort',
        default=False,
        action='store_true',
        help=
        'often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to '
    )
    parser.add_argument('--sep',
                        default=',',
                        type=str,
                        help='the separator used in the input file')
    parser.add_argument(
        '--consensus_method',
        choices=['descendant_frequencies'],
        default='descendant_frequencies',
        help='Which method should be used to calculate the consensus tree?')

    options = parser.parse_args(args)

    def combine_nodes(node_structure, new_node, seen_sets):
        """Attach already-built nodes below ``new_node``.

        Walks ``seen_sets`` from the largest leaf sets to the smallest and
        links every set contained in ``new_node.name`` that is either not yet
        covered by the children attached so far or still has no parent.
        """
        candidate = new_node.name
        covered = set()
        for sets_of_one_size in seen_sets[::-1]:
            for attached_branch in sets_of_one_size:
                if not attached_branch.issubset(candidate):
                    continue
                if (not attached_branch.issubset(covered)
                        or not node_structure[attached_branch].has_parent()):
                    covered.update(attached_branch)
                    new_node.add_child(node_structure[attached_branch])
                    node_structure[attached_branch].add_parent(new_node)
        return node_structure

    def node_combinations_to_node_structure(node_combinations):
        """Build ``{frozenset(leaves): Node}`` from '.'-joined leaf strings."""
        length_sorted = {}
        for node_combination in node_combinations:
            leaves = frozenset(node_combination.split('.'))
            length_sorted.setdefault(len(leaves), []).append(leaves)
        # length_sorted_list is of the form
        # [[[A],[B],[C]],[[A,B],[B,C]],...,[[A,B,C]]]
        length_sorted_list = [
            length_sorted.get(k, [])
            for k in range(1, max(length_sorted.keys()) + 1)
        ]
        node_structure = {}
        for leaf_node in length_sorted_list[0]:
            node_structure[leaf_node] = Node(leaf_node)
        added_sets = [length_sorted_list[0]]
        for sets_of_one_size in length_sorted_list[1:]:
            for branch_set in sets_of_one_size:
                new_node = Node(branch_set)
                combine_nodes(node_structure, new_node, added_sets)
                node_structure[branch_set] = new_node
            added_sets.append(sets_of_one_size)
        return node_structure

    def write_ranking(counter, total):
        """Write '<key>,<count/total>' lines if a ranking file was requested."""
        if not options.write_ranking_to_file:
            return
        with open(options.write_ranking_to_file, 'w') as f:
            for key, frequency in counter.most_common(
                    options.rankings_to_write_to_file):
                f.write(key + ',' + str(float(frequency) / total) + '\n')

    if options.plot == 'consensus_trees' or options.plot == 'top_node_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops'])
        nodes_list = df['pops'].tolist()
        # Number of rows in which each individual node occurs.
        seen_combinations = {}
        for nodes in nodes_list:
            for node in nodes.split('-'):
                seen_combinations[node] = seen_combinations.get(node, 0) + 1
        N = len(nodes_list)

        if options.plot == 'consensus_trees':
            node_combinations = []
            for threshold in options.consensus_threshold:
                total_threshold = int(N * threshold)
                node_combinations.append([
                    k for k, v in seen_combinations.items()
                    if v > total_threshold
                ])
            if not options.dont_annotate_node_posterior:
                node_count_dic = {
                    frozenset(k.split('.')): float(v) / N
                    for k, v in seen_combinations.items()
                }
            else:
                node_count_dic = None
            for i, final_node_combinations in enumerate(node_combinations):
                final_node_structure = node_combinations_to_node_structure(
                    final_node_combinations)
                if not options.suppress_plot:
                    from tree_plotting import plot_node_structure_as_directed_graph
                    plot_node_structure_as_directed_graph(
                        final_node_structure,
                        drawing_name='consensus_' +
                        str(int(100 * options.consensus_threshold[i])) +
                        '.png',
                        node_dic=node_count_dic)
            write_ranking(Counter(seen_combinations), N)

        elif options.plot == 'top_node_trees':
            c = Counter(nodes_list)
            to_plots = c.most_common(options.top_node_trees_to_plot)
            write_ranking(c, N)
            if not options.dont_annotate_node_posterior:
                node_count_dic = {
                    frozenset(key.split('.')): float(count) / N
                    for key, count in Counter(seen_combinations).most_common(
                        1000)
                }
            else:
                node_count_dic = None
            if not options.suppress_plot:
                from tree_plotting import plot_node_structure_as_directed_graph
                for i, (to_plot, count) in enumerate(to_plots):
                    node_structure = node_combinations_to_node_structure(
                        to_plot.split('-'))
                    plot_node_structure_as_directed_graph(
                        node_structure,
                        drawing_name='minimal_topology_' + str(i + 1) + '.png',
                        node_dic=node_count_dic)

    elif options.plot == 'top_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops', 'topology'])
        trees_list = df['topology'].tolist()
        # The first '-'-separated segment of a topology has one '.'-separated
        # entry per leaf.
        no_leaves = len(trees_list[0].split('-')[0].split('.'))
        N = len(trees_list)
        c = Counter(trees_list)
        to_plots = c.most_common(options.top_trees_to_plot)

        # obtaining nodes:
        if not options.nodes:
            nodes = df['pops'].tolist()[0].split('-')
            leaves = list(
                {leaf for node in nodes for leaf in node.split('.')})
            if len(leaves) == no_leaves:
                pass  # everything is good
            elif len(leaves) == no_leaves - 1:  # adding outgroup
                leaves.append(options.outgroup)
            else:
                assert False, 'The number of leaves could not be obtained'
            assert not options.no_sort, 'When nodes are not specified, they will always be sorted'
            leaves = sorted(leaves)
        else:
            leaves = read_one_line(options.nodes)
            if not options.no_sort:
                leaves = sorted(leaves)

        write_ranking(c, N)

        if not options.suppress_plot:
            from tree_plotting import plot_as_directed_graph
            for i, (to_plot, count) in enumerate(to_plots):
                tree = topological_identifier_to_tree_clean(
                    to_plot,
                    leaves=generate_predefined_list_string(deepcopy(leaves)))
                plot_as_directed_graph(tree,
                                       drawing_name='topology_' + str(i + 1) +
                                       '.png')
    # The legacy code that used to follow this call was unreachable and relied
    # on options that are no longer declared; it has been removed.
    sys.exit()
# Ad-hoc debugging script: parses one reference tree and two alternative
# trees from identifier strings, plots them, removes an admixture event from
# the first alternative with ``deladmix`` and evaluates the posterior function
# on the tree before and after the removal.
from tree_statistics import identifier_to_tree_clean
from tree_plotting import plot_as_directed_graph, pretty_string
#from sphinx.util.nodes import _new_copy

# Identifier strings consist of three ';'-separated parts.
# NOTE(review): presumably topology; branch lengths; admixture proportions -
# confirm against identifier_to_tree_clean.
true_tree_s= 'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.4-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
true_tree=identifier_to_tree_clean(true_tree_s)
wrong_trees_s=['w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.4.w.w-c.w.c.4.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23',
               'w.w.w.w.w.w.a.a.w-w.w.w.c.w.c.5.a.w.3.a-c.w.c.w.c.4.w.w.0.2.a-w.w.w.w.c.c.4.5.w-c.c.w.w.1.0.w-c.w.w.0.w-c.w.0.w-a.w.w-c.w.0.w-c.w.0-c.0;0.387-0.087-0.806-0.082-2.062-0.803-0.122-0.544-0.061-0.733-0.474-1.342-0.871-0.798-0.753-0.288-0.024-0.174-0.754-0.282-0.45-0.924-0.416-1.081-0.467-1.296-1.171-0.54-1.944-0.258-8.813-0.76-0.073-3.416;0.388-0.467-0.098-0.185-0.019-0.44']
wrong_trees=[identifier_to_tree_clean(tree) for tree in wrong_trees_s]

# Draw the reference tree and the first alternative tree, then print a
# textual rendering of the latter.
plot_as_directed_graph(true_tree, drawing_name= 'tmp0.bmp')
plot_as_directed_graph(wrong_trees[0], drawing_name = 'tmp1.bmp')
print pretty_string(wrong_trees[0])

t=wrong_trees[0]
from Rproposal_admix import deladmix
pks={}  # passed to deladmix as ``pks``; NOTE(review): looks like an output dict filled by the proposal - confirm
from Rtree_to_covariance_matrix import make_covariance
from posterior import initialize_big_posterior

# The posterior function is built from the covariance of the reference tree.
true_cov=make_covariance(true_tree)
posterior_f=initialize_big_posterior(true_cov, M=10000)

# Apply the admixture-removal proposal with a fixed target ('a1', 1) and plot
# the resulting tree (no drawing_name given, so the plotter's default is used).
nt, f,b=deladmix(t,pks=pks, fixed_remove=('a1',1))
plot_as_directed_graph(nt)

# Evaluate the posterior on the proposed tree and on the original tree; the
# second element of each argument tuple is 0 in both calls.
new_likelihood_value, new_prior_value, (new_branch_prior, new_no_admix_prior, new_admix_prop_prior, new_top_prior), new_covariance= posterior_f((nt,0))
old_likelihood_value, old_prior_value, (old_branch_prior, old_no_admix_prior, old_admix_prop_prior, old_top_prior), old_covariance= posterior_f((t,0))
def scale_tree(tree, mult):
    """Return the identifier string of ``tree`` after scaling it by ``mult``.

    ``tree`` is an identifier string; it is parsed, passed to
    ``Rtree_operations.scale_tree`` together with ``mult`` and serialised
    again with ``unique_identifier_and_branch_lengths``.  The leaf order used
    for serialisation is the module-level ``nodes``.
    """
    parsed = identifier_to_tree_clean(tree)
    rescaled = Rtree_operations.scale_tree(parsed, mult)
    return tree_statistics.unique_identifier_and_branch_lengths(
        rescaled, nodes)
def print_tree(stree):
    """Parse the identifier string ``stree`` and hand the tree to ``pretty_print``."""
    parsed = identifier_to_tree_clean(stree)
    pretty_print(parsed)
# Small plotting utility.  Without command-line arguments it prompts for tree
# strings until a quit word is entered; with arguments it treats each one as
# a file to read a tree from.  Every tree is written to 'tmp<count>.png'.
from tree_plotting import plot_as_directed_graph, pretty_string
from tree_statistics import topological_identifier_to_tree_clean, identifier_to_tree_clean, identifier_file_to_tree_clean
from Rtree_to_coefficient_matrix import get_numbers
import sys

count = 0
if len(sys.argv) > 1:
    # File mode: one tree per file named on the command line.
    files = sys.argv[1:]
    for fil in files:
        tree = identifier_file_to_tree_clean(fil)
        print(get_numbers(tree))
        plot_as_directed_graph(tree, drawing_name='tmp' + str(count) + '.png')
        count += 1
else:
    # Interactive mode.
    while True:
        var = raw_input("Please enter something: ")
        if var in ('q', 'exit', 'q()', 'exit()'):
            break
        # A ';' selects the full identifier parser, otherwise the
        # topological one is used.
        if ';' in var:
            tree = identifier_to_tree_clean(var)
        else:
            tree = topological_identifier_to_tree_clean(var)
        print(get_numbers(tree))
        plot_as_directed_graph(tree, drawing_name='tmp' + str(count) + '.png')
        count += 1
def tree_to_ms_command(stree, samples_per_population=20, snps=250000000):
    """Build the ms command for the tree encoded by ``stree``.

    The replicate count handed on as ``nreps`` is ``snps // 500000``.  The
    actual command is produced by ``tree_to_data.tree_to_ms_command``, which
    receives the module-level ``nodes`` as ``leaf_keys``.
    """
    replicate_count = snps // 500000
    parsed_tree = identifier_to_tree_clean(stree)
    return tree_to_data.tree_to_ms_command(
        parsed_tree,
        sample_per_pop=samples_per_population,
        nreps=replicate_count,
        leaf_keys=nodes,
    )
def tree_to_covariance(stree):
    """Return ``make_covariance`` of the tree encoded by ``stree``.

    The leaf keys of the parsed tree are sorted and passed as ``node_keys``.
    """
    parsed = identifier_to_tree_clean(stree)
    leaf_order = sorted(get_leaf_keys(parsed))
    return make_covariance(parsed, node_keys=leaf_order)