def save_stage(value, stage_number, prefix, full_nodes, before_added_outgroup_nodes, after_reduce_nodes, filename=None):
    """Persist the intermediate result of one pipeline stage to disk.

    The on-disk format depends on stage_number:
      1-2: single line, str(value)
      3-5: two lines - space-separated node names, then the tree identifier string
      6:   nothing (the file is produced elsewhere)
      7-8: empirical covariance matrix via emp_cov_to_file
      21-23: not saved (a message is printed instead)
      otherwise: value is assumed to be (covariance, multiplier); the matrix is
                 written and the multiplier appended to the same file.
    If filename is None it is derived from prefix and the stage's
    human-readable name in dictionary_of_reasonable_names.
    """
    if filename is None:
        save_word=dictionary_of_reasonable_names[stage_number]
        filename=prefix+save_word+'.txt'
    if stage_number==1:
        write_one_line_to_file(filename, str(value))
    elif stage_number==2:
        write_one_line_to_file(filename, str(value))
    elif stage_number==3:
        # Stage 3 trees use the node set before the outgroup was added.
        write_two_lines_to_file(filename, ' '.join(before_added_outgroup_nodes), unique_identifier_and_branch_lengths(value, before_added_outgroup_nodes))
    elif stage_number==4:
        write_two_lines_to_file(filename, ' '.join(full_nodes), unique_identifier_and_branch_lengths(value, full_nodes))
    elif stage_number==5:
        # print full_nodes
        write_two_lines_to_file(filename, ' '.join(full_nodes), unique_identifier_and_branch_lengths(value, full_nodes))
    elif stage_number==6:
        pass #print 'file is already made elsewhere'
    elif stage_number==7:
        emp_cov_to_file(value, filename, full_nodes)
    elif stage_number==8:
        # Stage 8 uses the reduced node set (after reduce_covariance-style removal).
        emp_cov_to_file(value, filename, after_reduce_nodes)
    elif stage_number in [21,22,23]:
        print 'Stage not saved'
    else:
        # Fallback: value is a (covariance, multiplier) pair.
        emp_cov_to_file(value[0], filename, after_reduce_nodes)
        with open(filename, 'a') as f:
            f.write('multiplier='+str(value[1]))
def create_treemix_csv_output(tree, add, m_scale, outfile):
    """Write a two-line CSV with header 'tree,add' and one row holding the
    tree's identifier string and the (possibly rescaled) add value.

    When m_scale is given, both the tree branch lengths and add are divided
    by it before writing.
    """
    if m_scale is not None:
        # Undo the multiplier scaling before export.
        tree = scale_tree(tree, 1.0 / m_scale)
        add = add / m_scale
    header = 'tree,add' + '\n'
    row = unique_identifier_and_branch_lengths(tree) + ',' + str(add)
    with open(outfile, 'w') as f:
        f.write(header)
        f.write(row)
def __call__(self, full_tree, **kwargs):
    """Serialize full_tree (using self.nodes as leaf order), optionally pass it
    through self.tree_unifier, prepend self.node_string, and return
    ({'string_tree': ...}, False).
    """
    serialized = unique_identifier_and_branch_lengths(full_tree, leaf_order=self.nodes)
    unifier = self.tree_unifier
    if unifier is not None:
        serialized = unifier(serialized)
    return {'string_tree': self.node_string + serialized}, False
def get_possible_permutation_strees(tree):
    """Return identifier strings for every admixture-orientation variant of *tree*.

    For each of the k admixture nodes, the admixture event can be kept or
    flipped (via change_admixture), giving 2**k variants.  Each variant is
    built on a deep copy of *tree* and encoded with
    unique_identifier_and_branch_lengths.

    Fixes: removed the unused local `prop = 1.0` and the redundant list()
    around the bit string (strings iterate per character already).
    """
    leaves, _, admixture_keys = get_categories(tree)
    k = len(admixture_keys)
    # Zero-padded binary of i selects which admixture nodes to flip.
    format_code = '{0:0' + str(k) + 'b}'
    n_trees = []
    for i in range(2**k):
        pruned_tree = deepcopy(tree)
        bina = format_code.format(i)
        for adm_key, str_bin in zip(admixture_keys, bina):
            if int(str_bin) == 1:
                pruned_tree[adm_key] = change_admixture(pruned_tree[adm_key])
        n_trees.append(unique_identifier_and_branch_lengths(pruned_tree))
    return n_trees
def autogenerate_tree(no_leaves, no_admixtures, minimum_number_of_nonzeros=1, minimum_number_of_zeros=1):
    """Generate random phylogenies until one satisfies the zero-structure
    constraints on its covariance matrix, then add a fixed outgroup, plot the
    result, and return (identifier_string, suffix).

    suffix encodes the four input parameters joined by '_'.
    Side effects: prints the covariance matrices and plots the tree.
    """
    # Rejection-sample: keep generating until the covariance has at least
    # minimum_number_of_nonzeros non-zero entries in its fullest row AND at
    # least minimum_number_of_zeros zeros in its sparsest row.
    while True:
        tree = generate_phylogeny(no_leaves, no_admixtures)
        cov = make_covariance(tree)
        zeros = [get_number_of_zeros(row) for row in cov]
        no_non_zeros = cov.shape[0] - max(zeros)
        if no_non_zeros >= minimum_number_of_nonzeros and max(zeros) >= minimum_number_of_zeros:
            break
    # Attach a fixed outgroup 'z' (hard-coded branch lengths, inner node 'Aa').
    tree = add_outgroup(tree, 'z', 0.234, 1.96, 'Aa')
    cov = make_covariance(tree)
    print cov
    print reduce_covariance(cov, 0)
    plot_as_directed_graph(tree)
    suffix = str(no_leaves) + '_' + str(no_admixtures) + '_' + str(minimum_number_of_nonzeros) + '_' + str(minimum_number_of_zeros)
    return unique_identifier_and_branch_lengths(tree), suffix
# Dispatch on the requested input type.
# 'tree': read a tree file, optionally graft an outgroup, extract the subtree
#         spanned by the requested populations, and write it with a header line.
# 'snps': subset a (possibly gzipped) SNP table to the requested populations.
if options.input_type == 'tree':
    tree = identifier_file_to_tree_clean(options.input_file)
    if options.input_add:
        # The add-file's first line holds the outgroup-to-root distance.
        with open(options.input_add, 'r') as f:
            add = float(f.readline())
        tree = add_outgroup(tree, inner_node_name='new_node', to_new_root_length=float(add), to_outgroup_length=0, outgroup_name=options.outgroup_name)
    nodes = get_leaf_keys(tree)
    assert all((a in nodes for a in options.populations)), 'Requested population was not found in the tree'
    subtree = get_subtree(tree, options.populations)
    if not options.output_file:
        # Default output name: input file name + concatenated population names.
        options.output_file = options.input_file + '_'.join(options.populations)
    with open(options.output_file, 'w') as f:
        # First line: sorted population names; second line: the subtree identifier.
        f.write(' '.join(sorted(options.populations)) + '\n')
        f.write(unique_identifier_and_branch_lengths(subtree))
if options.input_type == 'snps':
    if options.input_file.endswith('.gz'):
        options.input_file = unzip(options.input_file, overwrite=False)
    df = pd.read_csv(options.input_file, usecols=options.populations, sep=' ')
    if not options.output_file:
        options.output_file = options.input_file + '_'.join(options.populations)
    df.to_csv(options.output_file, sep=' ', index=False)
    gzip(options.output_file, overwrite=True)
def test_posterior_model_multichain(true_tree=None, start_tree=None, sim_lengths=[250] * 800, summaries=None, thinning_coef=1, admixtures_of_true_tree=None, no_leaves_true_tree=4, wishart_df=None, sim_from_wishart=False, no_chains=8, result_file='results_mc3.csv', emp_cov=None, emp_remove=-1, rescale_empirical_cov=False):
    """Run an MC3 (parallel-tempered MCMC) sanity check of the posterior model.

    If true_tree is None one is simulated (admixture count drawn from a
    geometric distribution when admixtures_of_true_tree is None).  The
    covariance target is either derived from the true tree, resampled from a
    Wishart around it (sim_from_wishart), or replaced by emp_cov.  Results
    are written to result_file by MCMCMC; the true tree is returned.

    NOTE(review): sim_lengths has a mutable default list; it is only read
    here, so this is safe but fragile.  emp_remove is accepted but unused.
    """
    if true_tree is None:
        if admixtures_of_true_tree is None:
            # Geometric prior on the number of admixture events.
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    true_x = (true_tree, 0)
    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    if start_tree is None:
        start_tree = true_tree
    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        # Replace the exact covariance by a Wishart draw centred on it.
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    if emp_cov is not None:
        m = emp_cov
    if rescale_empirical_cov:
        # Rescaled mode also returns the multiplier used for the rescaling.
        posterior, multiplier = initialize_posterior(m, wishart_df, use_skewed_distr=True, rescale=rescale_empirical_cov)
    else:
        posterior = initialize_posterior(m, wishart_df, use_skewed_distr=True, rescale=rescale_empirical_cov)
        multiplier = None
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    if rescale_empirical_cov:
        # Evaluate the true tree in the rescaled coordinate system.
        post_ = posterior((scale_tree_copy(true_x[0], 1.0 / multiplier), true_x[1] / multiplier))
    else:
        post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    print 'posterior(true_tree)', sum(post_)
    if summaries is None:
        summaries = [s_variable('posterior'), s_variable('mhr'), s_no_admixes()]
    # NOTE(review): `proposal` is assigned but never used below - the chains
    # use adaptive_proposal() instead.
    proposal = basic_meta_proposal()
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    # Every summary is recorded (1, _); only the first chain also prints (_, 1).
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme_first = deepcopy(sample_verbose_scheme)
    if 'posterior' in sample_verbose_scheme:
        sample_verbose_scheme_first['posterior'] = (1, 1) #(1,1)
        sample_verbose_scheme_first['no_admixes'] = (1, 1)
    #if 'likelihood' in sample_verbose_scheme:
    #sample_verbose_scheme_first['likelihood']=(1,1)
    print sample_verbose_scheme_first
    MCMCMC(starting_trees=[deepcopy(start_x) for _ in range(no_chains)],
           posterior_function=posterior,
           summaries=summaries,
           temperature_scheme=fixed_geometrical(800.0, no_chains),
           printing_schemes=[sample_verbose_scheme_first] + [sample_verbose_scheme for _ in range(no_chains - 1)],
           iteration_scheme=sim_lengths,
           overall_thinnings=int(thinning_coef),
           proposal_scheme=[adaptive_proposal() for _ in range(no_chains)],
           cores=no_chains,
           no_chains=no_chains,
           multiplier=multiplier,
           result_file=result_file,
           store_permuts=False)
    print 'finished MC3'
    #save_pandas_dataframe_to_csv(results, result_file)
    #save_permuts_to_csv(permuts, get_permut_filename(result_file))
    return true_tree
def test_posterior_model(true_tree=None, start_tree=None, sim_length=100000, summaries=None, thinning_coef=19, admixtures_of_true_tree=None, no_leaves_true_tree=4, filename='results.csv', sim_from_wishart=False, wishart_df=None, sap_sim=False, sap_ana=False, resimulate_regrafted_branch_length=False, emp_cov=None, big_posterior=False, rescale_empirical_cov=False):
    """Run a single-chain MCMC sanity check of the posterior model.

    Mirrors test_posterior_model_multichain but drives one basic_chain for
    sim_length iterations and saves the summary results to *filename* via
    save_to_csv.  Returns the true tree (simulated when not supplied).
    sap_sim / sap_ana toggle the skewed admixture prior for simulation and
    analysis respectively.
    """
    if true_tree is None:
        if admixtures_of_true_tree is None:
            # Geometric prior on the number of admixture events.
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree, skewed_admixture_prior=sap_sim)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    true_x = (true_tree, 0)
    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    if start_tree is None:
        start_tree = true_tree
    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        # Replace the exact covariance by a Wishart draw centred on it.
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    if emp_cov is not None:
        m = emp_cov
    if big_posterior:
        posterior = initialize_big_posterior(m, wishart_df, use_skewed_distr=sap_ana)
    else:
        posterior = initialize_posterior(m, wishart_df, use_skewed_distr=sap_ana, rescale=rescale_empirical_cov)
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    print 'posterior(true_tree)', sum(post_[:2])
    if summaries is None:
        summaries = [s_posterior(), s_variable('mhr'), s_no_admixes()]
    proposal = adaptive_proposal(resimulate_regrafted_branch_length=resimulate_regrafted_branch_length)
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    # Record every summary; always print posterior and admixture count.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme['posterior'] = (1, 1)
    sample_verbose_scheme['no_admixes'] = (1, 1)
    final_tree, final_posterior, results, _ = basic_chain(start_x,
                                                          summaries,
                                                          posterior,
                                                          proposal,
                                                          post=None,
                                                          N=sim_length,
                                                          sample_verbose_scheme=sample_verbose_scheme,
                                                          overall_thinning=int(max(thinning_coef, sim_length / 60000)),
                                                          i_start_from=0,
                                                          temperature=1.0,
                                                          proposal_update=None,
                                                          check_trees=False)
    save_to_csv(results, summaries, filename=filename)
    return true_tree
def run_posterior_multichain(wishart_df=1000, true_tree_as_identifier=None, result_file='result_mc3.csv', emp_cov_file=None, emp_remove=-1, remove_outgroup=False, make_emp_cov_file=True): if true_tree_as_identifier is None: true_tree = Rcatalogue_of_trees.tree_good else: true_tree = tree_statistics.identifier_to_tree_clean( 'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485' ) #with open(true_tree_as_identifier, 'r') as f: # s=f.readline().rstrip() # true_tree=tree_statistics.identifier_to_tree_clean(s) if remove_outgroup: true_tree = Rtree_operations.remove_outgroup(true_tree) true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1( true_tree) if make_emp_cov_file: cov = tree_to_data.get_empirical_matrix(s, factor=0.01, reps=400) tree_to_data.emp_cov_to_file(cov, filename=emp_cov_file) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths( true_tree) no_leaves = Rtree_operations.get_no_leaves(true_tree) #s_tree=tree_statistics.identifier_to_tree_clean('w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23') s_tree = Rtree_operations.create_burled_leaved_tree(no_leaves, 1.0) print 'no_leaves', no_leaves summaries = [ summary.s_posterior(), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( 
Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_basic_tree_statistics( tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'), summary.s_basic_tree_statistics( tree_statistics.majority_tree, 'majority_tree', output='string'), summary.s_variable('add', output='double'), summary.s_variable('proposal_type', output='string'), summary.s_variable('sliding_regraft_adap_param', output='double_missing'), summary.s_variable('rescale_adap_param', output='double_missing'), summary.s_likelihood(), summary.s_prior(), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, output='double_missing') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] if emp_cov_file is not None: if emp_remove < 0: emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file) else: emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file, emp_remove) else: emp_cov = None print 'emp_cov', emp_cov r = simulation_sanity.test_posterior_model_multichain( true_tree, s_tree, [50] * 20000, summaries=summaries, thinning_coef=24, wishart_df=wishart_df, result_file=result_file, emp_cov=emp_cov, rescale_empirical_cov=False) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r) analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def run_analysis_of_proposals():
    """Run the single-chain posterior test with a large diagnostic summary set
    (including per-component posterior-difference summaries) and write the
    summary CSV.

    NOTE(review): the first assignment to true_tree below is immediately
    overwritten by Rcatalogue_of_trees.tree_good - dead code kept for
    reference.
    """
    #true_tree=generate_prior_trees.generate_phylogeny(8,2)
    true_tree = tree_statistics.identifier_to_tree_clean(
        'w.w.c.w.w.w.2.w-w.w.a.w.w.w.w-w.c.1.w.c.w.w.4-w.c.1.w.w.w-w.c.1.w.w-c.0.w.w-c.w.0-a.w-c.0.w-c.0;0.091-1.665-0.263-0.821-0.058-0.501-0.141-0.868-5.064-0.153-0.372-3.715-1.234-0.913-2.186-0.168-0.542-0.056-2.558-0.324;0.367-0.451'
    )
    true_tree = Rcatalogue_of_trees.tree_good
    s_tree = Rtree_operations.create_trivial_tree(4)
    # Diagnostic summaries; the s_bposterior_difference entries pull apart the
    # posterior tuple: x[0]=likelihood, x[1]=prior, x[2]=prior components
    # (branch, no_admix, admix proportion, topology).
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'),
        summary.s_basic_tree_statistics(Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'),
        summary.s_basic_tree_statistics(tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_bposterior_difference(lambda x: x[0], 'likelihood_difference'),
        summary.s_bposterior_difference(lambda x: x[1], 'prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][0], 'branch_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][1], 'no_admix_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][2], 'adix_prop_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][3], 'top_prior_difference'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param', output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    r = simulation_sanity.test_posterior_model(true_tree,
                                               true_tree,
                                               100000,
                                               summaries=summaries,
                                               thinning_coef=2,
                                               wishart_df=1000,
                                               resimulate_regrafted_branch_length=False,
                                               admixtures_of_true_tree=2,
                                               no_leaves_true_tree=4,
                                               big_posterior=True,
                                               rescale_empirical_cov=True)
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r)
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def create_treemix_sfull_tree_csv_output(tree, m_scale, outfile):
    """Write a one-column CSV ('sfull_tree' header) containing the tree's
    identifier string, rescaling branch lengths by 1/m_scale first when
    m_scale is given.
    """
    if m_scale is not None:
        tree = scale_tree(tree, 1.0 / m_scale)
    header = 'sfull_tree' + '\n'
    body = unique_identifier_and_branch_lengths(tree)
    with open(outfile, 'w') as f:
        f.write(header)
        f.write(body)
def load_treemix_tree(prefix):
    """Read the treemix output files '<prefix>.treeout', '<prefix>.vertices'
    and '<prefix>.edges', and return the parsed tree as an identifier string
    with the fixed leaf order s1..s9, out.
    """
    suffixes = ['treeout', 'vertices', 'edges']
    files = [prefix + '.' + suffix for suffix in suffixes]
    parsed = read_treemix_file(*files)
    leaf_order = ['s' + str(i) for i in range(1, 10)] + ['out']
    return tree_statistics.unique_identifier_and_branch_lengths(parsed, leaf_order)
# CLI driver: simulate one phylogeny and write its identifier string to a file.
parser.add_argument('--leaves', type=int, default=4, help='number of leaves in tree')
parser.add_argument('--admixes', type=int, default=2, help='number of admixture events in tree')
parser.add_argument('--output_file', type=str, default='tmp.txt', help='directory')
options = parser.parse_args()
tree = generate_phylogeny(options.leaves, options.admixes)
from tree_statistics import unique_identifier_and_branch_lengths
s = unique_identifier_and_branch_lengths(tree)
pretty_print(tree)
print 'turns into', s
with open(options.output_file, 'w') as f:
    f.write(s)
# print _classify_type(12, 12, 0, 1)
#
#print _allowed_generation([1, 4, 6, 8, 9, 11],12)
#
#print _allowed_generation([2,3], 5)
#
#print _allowed_generation([1,2], 5)
#
#print _allowed_generation([2,3,5], 5)
#
# from tree_plotting import plot_graph, pretty_print
# for _ in xrange(100):
#     ak=generate_admix_topology(2, 1)
#     pretty_print(ak)
def main(args):
    """Plot posterior-distribution summaries produced by AdmixtureBayes.

    Parses the CLI arguments in *args*, then dispatches on --plot:
      consensus_trees: graphs of all nodes above each --consensus_threshold
      top_node_trees:  the most frequent node-set combinations
      top_trees:       the most frequent topologies
    Ranking files are optionally written via --write_ranking_to_file.

    NOTE(review): everything after the unconditional sys.exit() below is dead
    code referencing options (plot_tops_file, input_file, test_run, ...) that
    this parser never defines - kept verbatim.
    """
    parser = ArgumentParser(usage='pipeline for plotting posterior distribution summaries.', version='1.0.0')
    parser.add_argument('--posterior_distribution_file', required=True, type=str,
                        help='The file containing posterior distributions from the "AdmixtureBayes posterior" command. It needs the two columns "pops" and topology.')
    parser.add_argument('--plot', choices=['consensus_trees', 'top_node_trees', 'top_trees'], required=True,
                        help='The type of plot to make. Choose between: 1) consensus_trees. '
                        'It plots an admixture graph based on all nodes that have a higher (marginal) posterior probability of X. '
                        'Different X\'s can be supplied with the command --consensus_threshold \n'
                        '2) top_node_trees. It plots the X highest posterior combinations of node types '
                        'and creates the corresponding minimal topologies. X can be supplied through the command --top_node_trees_to_plot'
                        '3) top_trees. It plots the X highest posterior topologies. X can be supplied by the command --top_trees_to_plot')
    parser.add_argument('--outgroup', default='outgroup', help='name of the outgroup to plot')
    parser.add_argument('--consensus_threshold', default=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99], type=float, nargs='+',
                        help='The posterior thresholds for which to draw different consensus trees.')
    parser.add_argument('--top_node_trees_to_plot', type=int, default=3,
                        help='The number of node trees (or minimal topologies) to plot')
    parser.add_argument('--top_trees_to_plot', type=int, default=3,
                        help='The number of trees (or topologies) to plot ')
    parser.add_argument('--write_ranking_to_file', type=str, default='',
                        help='if a file is supplied here, the natural rankings for each of the plots is written here.')
    parser.add_argument('--rankings_to_write_to_file', type=int, default=1000,
                        help='the number of rankings(nodes, min topology or topology depending on --plot) to write to the ranking file.')
    parser.add_argument('--dont_annotate_node_posterior', default=False, action='store_true',
                        help='This will not color the nodes according to their posterior probability.')
    parser.add_argument('--nodes', default='', type=str, help='file where the first line is the leaf nodes')
    parser.add_argument('--suppress_plot', default=False, action='store_true')
    parser.add_argument('--no_sort', default=False, action='store_true',
                        help='often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to ')
    parser.add_argument('--sep', default=',', type=str, help='the separator used in the input file')
    #parser.add_argument('--no_header', default=False, action='store_true',help='will assume that there is no header in the file')
    #parser.add_argument('--burn_in_rows', default=0, type=int, help='the number of rows that will be skipped in the input file as burn-in period')
    #parser.add_argument('--burn_in_fraction', default=0.0, type=float, help='the proportion of the rows that are discarded as burn in period')
    #parser.add_argument('--tree_column_name', default='tree', type=str, help='the name in the header of the column with all the trees.')
    parser.add_argument('--consensus_method', choices=['descendant_frequencies'], default='descendant_frequencies',
                        help='Which method should be used to calculate the consensus tree?')
    #parser.add_argument('--min_w', default=0.0, type=float, help='a lower threshold of which descendants matter when the consensus_method is descendant_frequencies.')
    #parser.add_argument('--plot_tops_file', action='store_true', default=False, help='this will assume that the file is a tops file from downstream_analysis_parser and plot each line numbered.')
    #parser.add_argument('--get_effective_number_of_admixtures', action='store_true', default=False, help='this will cancel all the other analysis and only print the effective number of admixes(tadmixes/sadmixes or admixes) to a a file.')
    #parser.add_argument('--effective_number_of_admixtures_file', type=str, default='no_tadmixes.txt', help='this is the file in which to write the effective number of admixes in the file')
    #parser.add_argument('--type_of_effective_admixtures', type=str, choices=['sadmix','tadmix','admix'], help='this is the type of admixes to write to the file.')
    #parser.add_argument('--node_count_file', default='', type=str, help='if plot_tops option is supplied')
    #parser.add_argument('--node_count_probs', default='', type=str, help='if supplied this will make a new ')
    #parser.add_argument('--test_run', default=False, action='store_true',
    #                    help='will overwrite everything and run a test function')
    options = parser.parse_args(args)

    def combine_nodes(node_structure, new_node, seen_sets):
        # Attach already-known (smaller) node sets as children of new_node,
        # scanning from the largest sets downward.
        candidate = new_node.name
        seen = []
        for lists_of_fixed_size in seen_sets[::-1]:
            for attached_branch in lists_of_fixed_size:
                if (attached_branch.issubset(candidate) and
                        ((not attached_branch.issubset(seen)) or (not node_structure[attached_branch].has_parent()))):
                    seen.extend(list(attached_branch))
                    new_node.add_child(node_structure[attached_branch])
                    node_structure[attached_branch].add_parent(new_node)
        return node_structure

    def get_number_of_tadmixtures(node_structure):
        # Each node with more than one parent contributes (parents - 1).
        total = 0
        for key in node_structure:
            total += max(0, node_structure[key].get_number_of_parents() - 1)
        return total

    def node_combinations_to_node_structure(node_combinations):
        # Build a DAG of Node objects from '.'-joined leaf-set strings,
        # linking each set to the largest already-placed subsets.
        length_sorted = {}
        for node_combination in node_combinations:
            leaves = frozenset(node_combination.split('.'))
            k = len(leaves)
            if k in length_sorted:
                length_sorted[k].append(leaves)
            else:
                length_sorted[k] = [leaves]
        length_sorted_list = [length_sorted.get(k, []) for k in range(1, max(length_sorted.keys()) + 1)]
        #length_sorted_list is of the form [[[A],[B],[C]],[[A,B],[B,C]],...,[[A,B,C]]]
        node_structure = {}
        for leaf_node in length_sorted_list[0]:
            node_structure[leaf_node] = Node(leaf_node)
        added_sets = [length_sorted_list[0]]
        for lists_of_fixed_size in length_sorted_list[1:]:
            for branch_set in lists_of_fixed_size:
                new_node = Node(branch_set)
                combine_nodes(node_structure, new_node, added_sets)
                node_structure[branch_set] = new_node
            added_sets.append(lists_of_fixed_size)
        return node_structure

    # if options.node_count_file:
    #     with open(options.node_count_file, 'r') as f:
    #         node_count_dic={}
    #         for lin in f.readlines():
    #             key,freq=lin.rstrip().split()
    #             node_count_dic[frozenset(key.split('.'))]=float(freq)
    # else:
    #     node_count_dic=None
    if options.plot == 'consensus_trees' or options.plot == 'top_node_trees':
        # Both modes need the per-node occurrence counts from the 'pops' column.
        df = pd.read_csv(options.posterior_distribution_file, sep=options.sep, usecols=['pops'])
        nodes_list = df['pops'].tolist()
        #print(nodes_list)
        seen_combinations = {}
        for nodes in nodes_list:
            #print(nodes)
            for node in nodes.split('-'):
                #print(node)
                seen_combinations[node] = seen_combinations.get(node, 0) + 1
        N = len(nodes_list)
        #print(seen_combinations)
        if options.plot == 'consensus_trees':
            node_combinations = []
            for threshold in options.consensus_threshold:
                total_threshold = int(N * threshold)
                final_node_combinations = [k for k, v in seen_combinations.items() if v > total_threshold]
                node_combinations.append(final_node_combinations)
            if not options.dont_annotate_node_posterior:
                node_count_dic = {frozenset(k.split('.')): float(v) / N for k, v in seen_combinations.items()}
            else:
                node_count_dic = None
            for i, final_node_combinations in enumerate(node_combinations):
                #print(final_node_combinations)
                final_node_structure = node_combinations_to_node_structure(final_node_combinations)
                if not options.suppress_plot:
                    from tree_plotting import plot_node_structure_as_directed_graph
                    plot_node_structure_as_directed_graph(final_node_structure,
                                                          drawing_name='consensus_' + str(int(100 * options.consensus_threshold[i])) + '.png',
                                                          node_dic=node_count_dic)
            if options.write_ranking_to_file:
                with open(options.write_ranking_to_file, 'w') as f:
                    c = Counter(seen_combinations)
                    to_write = c.most_common(options.rankings_to_write_to_file)
                    for node, frequency in to_write:
                        f.write(node + ',' + str(float(frequency) / N) + '\n')
        elif options.plot == 'top_node_trees':
            c = Counter(nodes_list)
            to_plots = c.most_common(options.top_node_trees_to_plot)
            if options.write_ranking_to_file:
                with open(options.write_ranking_to_file, 'w') as f:
                    for tree, frequency in c.most_common(options.rankings_to_write_to_file):
                        f.write(tree + ',' + str(float(frequency) / N) + '\n')
            if not options.dont_annotate_node_posterior:
                c = Counter(seen_combinations)
                node_count_dic = {frozenset(key.split('.')): float(count) / N for key, count in c.most_common(1000)}
            else:
                node_count_dic = None
            if not options.suppress_plot:
                from tree_plotting import plot_node_structure_as_directed_graph
                for i, (to_plot, count) in enumerate(to_plots):
                    node_structure = node_combinations_to_node_structure(to_plot.split('-'))
                    plot_node_structure_as_directed_graph(node_structure,
                                                          drawing_name='minimal_topology_' + str(i + 1) + '.png',
                                                          node_dic=node_count_dic)
    elif options.plot == 'top_trees':
        df = pd.read_csv(options.posterior_distribution_file, sep=options.sep, usecols=['pops', 'topology'])
        trees_list = df['topology'].tolist()
        # Leaf count is inferred from the first node set of the first topology.
        no_leaves = len(trees_list[0].split('-')[0].split('.'))
        N = len(trees_list)
        c = Counter(trees_list)
        to_plots = c.most_common(options.top_trees_to_plot)
        #obtaining nodes:
        if not options.nodes:
            nodes = df['pops'].tolist()[0].split('-')
            leaves = list(set([leaf for node in nodes for leaf in node.split('.')]))
            if len(leaves) == no_leaves:
                pass #everything is good
            elif len(leaves) == no_leaves - 1:
                #adding outgroup
                leaves.append(options.outgroup)
            else:
                assert False, 'The number of leaves could not be obtained'
            assert not options.no_sort, 'When nodes are not specified, they will always be sorted'
            leaves = sorted(leaves)
        else:
            leaves = read_one_line(options.nodes)
            if not options.no_sort:
                leaves = sorted(leaves)
        if options.write_ranking_to_file:
            with open(options.write_ranking_to_file, 'w') as f:
                for tree, frequency in c.most_common(options.rankings_to_write_to_file):
                    f.write(tree + ',' + str(float(frequency) / N) + '\n')
        if not options.suppress_plot:
            from tree_plotting import plot_as_directed_graph
            for i, (to_plot, count) in enumerate(to_plots):
                tree = topological_identifier_to_tree_clean(to_plot, leaves=generate_predefined_list_string(deepcopy(leaves)))
                plot_as_directed_graph(tree, drawing_name='topology_' + str(i + 1) + '.png')
    sys.exit()
    # --- DEAD CODE below: unreachable after sys.exit() and relies on options
    # attributes (plot_tops_file, input_file, test_run, ...) that this parser
    # does not define. Preserved verbatim. ---
    if options.plot_tops_file:
        with open(options.input_file, 'r') as f:
            for n, lin in enumerate(f.readlines()):
                rank, probability, combination = lin.rstrip().split(',')
                all_nodes = [c.split('.') for c in combination.split('_')]
                flattened = [item for sublist in all_nodes for item in sublist]
                a = list(set(flattened))
                code = rank + '_' + str(int(100 * round(float(probability), 2))) + '_' + '_'.join(a)
                print 'code', code
                node_structure = node_combinations_to_node_structure(combination.split('_'))
                print node_structure
                plot_node_structure_as_directed_graph(node_structure, drawing_name=code + '.png', node_dic=node_count_dic)
        sys.exit()
    if options.test_run:
        from generate_prior_trees import generate_phylogeny
        from tree_statistics import unique_identifier_and_branch_lengths
        from tree_plotting import plot_node_structure_as_directed_graph, plot_as_directed_graph
        N = 5
        tree1 = generate_phylogeny(N, 1)
        plot_as_directed_graph(tree1, drawing_name='tree1.png')
        tree2 = generate_phylogeny(N, 1)
        plot_as_directed_graph(tree2, drawing_name='tree2.png')
        stree1 = unique_identifier_and_branch_lengths(tree1)
        stree2 = unique_identifier_and_branch_lengths(tree2)
        with open('tmp_tree.txt', 'w') as f:
            f.write(' '.join(['s' + str(i) for i in range(1, N + 1)]) + '\n')
            f.write(stree1)
        with open('trees.txt', 'w') as f:
            f.write(stree1 + '\n' + stree2 + '\n' + stree1)
        options.input_file = 'trees.txt'
        options.nodes = 'tmp_tree.txt'
        options.no_header = True
        options.posterior_threshold = [0.25, 0.5, 0.9]
    if options.input_file == options.node_count_file:
        node_combinations = []
        print 'using population sets from ', options.node_count_file
        for threshold in options.posterior_threshold:
            final_node_combinations = ['.'.join(sorted(list(k))) for k, v in node_count_dic.items() if v > threshold]
            node_combinations.append(final_node_combinations)
    else:
        print 'Reading file...'
        #loading trees
        if options.no_header:
            strees = []
            with open(options.input_file, 'r') as f:
                for lin in f.readlines():
                    strees.append(lin.rstrip())
        else:
            df = pd.read_csv(options.input_file, sep=options.sep, usecols=[options.tree_column_name])
            strees = df[options.tree_column_name].tolist()
        n = len(strees)
        print 'trees read: ', n
        #thinning tree list
        rows_to_remove_from_fraction = int(options.burn_in_fraction * n)
        rows_to_remove = max(rows_to_remove_from_fraction, options.burn_in_rows)
        strees = strees[rows_to_remove:]
        print 'removed burn-in:', rows_to_remove
        print 'In list are now', len(strees), 'trees'
        #thinning
        distance_between = max(1, len(strees) // options.max_number_of_trees)
        nstrees = []
        for a, stree in enumerate(strees):
            if a % distance_between == 0 and len(nstrees) < options.max_number_of_trees:
                nstrees.append(stree)
        print 'thinned'
        print 'In list are now', len(nstrees), 'trees'
        N = len(nstrees)
        seen_node_combinations = {}
        nodes = read_one_line(options.nodes)
        if not options.no_sort:
            nodes = sorted(nodes)
        tenth = len(nstrees) // 10
        trees = []
        for i, stree in enumerate(nstrees):
            if tenth > 0 and i % tenth == 0:
                print i // tenth * 10, '%'
            # ';' marks a full identifier (with branch lengths); otherwise
            # the string is a topology-only identifier.
            if ';' in stree:
                tree = identifier_to_tree_clean(stree, leaves=generate_predefined_list_string(deepcopy(nodes)))
            else:
                tree = topological_identifier_to_tree_clean(stree, leaves=generate_predefined_list_string(deepcopy(nodes)))
            trees.append(tree)
            ad = get_populations(tree, min_w=options.min_w)
            for a in ad:
                seen_node_combinations[a] = seen_node_combinations.get(a, 0) + 1
        node_combinations = []
        for threshold in options.posterior_threshold:
            total_threshold = int(N * threshold)
            final_node_combinations = [k for k, v in seen_node_combinations.items() if v > total_threshold]
            node_combinations.append(final_node_combinations)
    for i, final_node_combinations in enumerate(node_combinations):
        print 'final_node_combinations', final_node_combinations
        final_node_structure = node_combinations_to_node_structure(final_node_combinations)
        if options.get_effective_number_of_admixtures:
            with open(options.effective_number_of_admixtures_file, 'w') as f:
                if options.type_of_effective_admixtures == 'tadmix':
                    effictive_admixtures = get_number_of_tadmixtures(final_node_structure)
                    f.write(str(effictive_admixtures))
                elif options.type_of_effective_admixtures == 'sadmix':
                    val = 0
                    count = 0
                    for tree in trees:
                        val += effective_number_of_admixes(tree)
                        count += 1
                    if count == 1:
                        f.write(str(int(val)))
                    else:
                        f.write(str(float(val) / count))
                elif options.type_of_effective_admixtures == 'admix':
                    val = 0
                    count = 0
                    for tree in trees:
                        val += get_number_of_admixes(tree)
                        count += 1
                    if count == 1:
                        f.write(str(int(val)))
                    else:
                        f.write(str(float(val) / count))
        if not options.suppress_plot:
            from tree_plotting import plot_node_structure_as_directed_graph, plot_as_directed_graph
            plot_node_structure_as_directed_graph(final_node_structure,
                                                  drawing_name='tmp' + str(i + 1) + '.png',
                                                  node_dic=node_count_dic)
def simulate_tree(no_leaves, no_admixes=None):
    """Simulate one phylogeny and return its identifier string.

    When no_admixes is None the admixture count is drawn from a geometric
    distribution (p=0.5, shifted to start at 0).
    """
    if no_admixes is None:
        no_admixes = geom.rvs(p=0.5) - 1
    simulated = generate_phylogeny(no_leaves, no_admixes)
    return unique_identifier_and_branch_lengths(simulated)
def add_random_admix(stree, *args):
    """Parse the identifier string *stree*, add one admixture event (new nodes
    named 'x1'/'x2', extra positional args forwarded to addadmix), and return
    the identifier string of the resulting tree.
    """
    parsed = identifier_to_tree_clean(stree)
    result = addadmix(parsed, new_node_names=['x1', 'x2'], *args)
    return unique_identifier_and_branch_lengths(result[0])
def scale_tree(tree, mult):
    """Scale every branch length of an identifier-string tree by ``mult``.

    Parses the string into a tree, scales it, and re-serialises it.

    NOTE(review): ``nodes`` is not defined inside this function -- it is
    presumably a module-level leaf-order list; confirm it exists at call
    time, otherwise this raises NameError.
    """
    parsed = identifier_to_tree_clean(tree)
    scaled = Rtree_operations.scale_tree(parsed, mult)
    return tree_statistics.unique_identifier_and_branch_lengths(scaled, nodes)
def run_d(true_tree_as_file=None):
    """Sanity-check the posterior model by running an MCMC against a true tree.

    :param true_tree_as_file: optional path to a file whose first line is a
        tree identifier string; when None, a hard-coded example true tree and
        a deliberately different starting tree are used.

    Side effects: prints trees and the final result, and writes a summary CSV
    via ``analyse_results.generate_summary_csv``.
    """
    #true_tree=generate_prior_trees.generate_phylogeny(8,2)
    if true_tree_as_file is None:
        # Hard-coded true tree in identifier-with-branch-lengths format.
        true_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
        )
        #true_tree=Rcatalogue_of_trees.tree_good
        # Starting tree for the chain, distinct from the true tree.
        s_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23'
        )
        print Rtree_operations.pretty_string(s_tree)
        print Rtree_operations.pretty_string(true_tree)
    else:
        # Read the true tree from the first line of the given file.
        with open(true_tree_as_file, 'r') as f:
            s = f.readline().rstrip()
        true_tree = tree_statistics.identifier_to_tree_clean(s)
        no_leaves = Rtree_operations.get_number_of_leaves(true_tree)
        # Start the chain from a trivial tree with the same number of leaves.
        s_tree = Rtree_operations.create_trivial_tree(no_leaves)
    # Per-iteration statistics recorded during the MCMC run.
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr', output='double_missing'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.get_admixture_proportion_string, 'admixtures', output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_variable('add', output='double'),
        summary.s_variable('sliding_rescale_adap_param', output='double_missing'),
        summary.s_variable('cutoff_distance', output='double_missing'),
        summary.s_variable('number_of_pieces', output='double_missing'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param', output='double_missing'),
        summary.s_variable('rescale_constrained_adap_param', output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_tree_identifier_new_tree()
    ] + [
        # Prior components, recorded as optional doubles.
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    # Run the MCMC-based posterior sanity test (100000 iterations).
    r = simulation_sanity.test_posterior_model(
        true_tree, s_tree, 100000,
        summaries=summaries,
        thinning_coef=20,
        wishart_df=10000,
        resimulate_regrafted_branch_length=False)
    #, #admixtures_of_true_tree=2, no_leaves_true_tree=8, rescale_empirical_cov=True)
    # NOTE(review): the label 'true_tree' is printed with the RESULT r of the
    # run, not with the true_tree variable -- confirm this is intended.
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r)
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
#print reduce_covariance(identity(10), 5) #print file_to_emp_cov('out_stem.cov',4) from sys import exit exit() from generate_prior_trees import generate_phylogeny from Rcatalogue_of_trees import * from Rtree_operations import create_trivial_tree, scale_tree tree2=scale_tree(generate_phylogeny(5,1),0.05) print pretty_string(tree2) print pretty_string(identifier_to_tree_clean(unique_identifier_and_branch_lengths(tree2))) print supplementary_text_ms_string() tree_good=generate_phylogeny(7) a=tree_to_ms_command(tree_good, 50,20) #print call_ms_string(a, 'supp.txt') b=time_adjusted_tree_to_ms_command(tree_good,50,20) #print call_ms_string(b, 'supp2.txt') #print call_ms_string(tree_to_ms_command(tree2, 50,20), 'tmp.txt') #cov= ms_to_treemix2('supp.txt', 20, 20,400) #cov= ms_to_treemix2('tmp.txt', 50, 5,20) #cov2=calculate_covariance_matrix('tmp.txt', 50, 5,20) #print cov #print cov2 #print make_covariance(tree2) #print reduce_covariance(cov, 0) #print reduce_covariance(cov2, 0)