def evolve_to_current_time(node, finalize=False):
    """Advance viruses to ``GC.time`` or, on finalize, load the sampled trees.

    When ``finalize`` is false this is a stand-in: every virus of ``node``
    simply has its time set to ``GC.time`` (nothing happens if ``node`` is
    ``None``).  When ``finalize`` is true and ``GC.sampled_trees`` does not
    exist yet, each non-empty line of ``GC.tree_file`` is parsed as a Newick
    tree, matched to the seed its leaves descend from, and stored in
    ``GC.sampled_trees`` as ``(root virus of that seed, newick string)``;
    ``GC.PRUNE_TREES`` is then set to ``False``.
    """
    if not finalize:
        # not the end yet: only bump the clocks
        if node is not None:
            for virus in node.viruses():
                virus.set_time(GC.time)
        return
    if hasattr(GC, 'sampled_trees'):
        # trees were already stored by an earlier finalize call
        return

    # seed name -> root virus, for transmissions that have no source (seeds)
    seed_to_root_virus = {}
    for source, target, _when in GC.transmissions:
        if source is None:
            seed_to_root_virus[target.get_name()] = GC.seed_to_first_virus[target].get_root()

    # infected name -> name of the seed at the start of its transmission chain
    inf_to_seed = {}
    for source, target, _when in GC.transmissions:
        target_name = target.get_name()
        inf_to_seed[target_name] = target_name if source is None else inf_to_seed[source.get_name()]

    stripped = {
        raw.decode().strip() if isinstance(raw, bytes) else raw.strip()
        for raw in GC.tree_file
    }
    newicks = {text for text in stripped if text}

    GC.sampled_trees = set()
    for newick in newicks:
        parsed = read_tree_newick(newick)
        # the second '|'-separated field of a leaf label is looked up in inf_to_seed
        seeds = {inf_to_seed[str(leaf).split('|')[1]] for leaf in parsed.traverse_leaves()}
        assert len(seeds) == 1, "More than 1 seed in tree: %s" % newick
        GC.sampled_trees.add((seed_to_root_virus[seeds.pop()], newick))
    GC.PRUNE_TREES = False
def resolve_polytomy_helper(input_tree, output_file, hide_prefix=None):
    """Resolve the polytomies of a Newick tree and write the result.

    Parameters
    ----------
    input_tree : Newick input handed to ``treeswift.read_tree_newick``
    output_file : destination handed to ``write_tree_newick``
    hide_prefix : optional; when truthy the tree is written with
        ``hide_rooted_prefix=True``.  The original body read ``hide_prefix``
        without receiving it as an argument, so when this is left as ``None``
        a module-level ``hide_prefix`` is used if one exists, else ``False``.
    """
    if hide_prefix is None:
        # backward-compatible fallback to the global the old body relied on
        hide_prefix = globals().get('hide_prefix', False)
    full_tree = treeswift.read_tree_newick(input_tree)
    full_tree.resolve_polytomies()
    if hide_prefix:
        full_tree.write_tree_newick(output_file, hide_rooted_prefix=True)
    else:
        full_tree.write_tree_newick(output_file)
def preorder(m):
    """Time one full preorder traversal of the benchmark tree with tool ``m``.

    ``m`` is one of ``'dendropy'``, ``'biophylo'``, ``'treeswift'`` or
    ``'ete3'``.  The tree is parsed before the clock starts, so only the
    iteration is measured.  Returns the difference of two ``time()`` readings;
    any other ``m`` trips the assertion.
    """
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        for _ in tree.preorder_node_iter():
            pass
        return time() - began
    if m == 'biophylo':
        tree = Phylo.read(treeio, 'newick')
        began = time()
        for _ in tree.find_clades(order='preorder'):
            pass
        return time() - began
    if m == 'treeswift':
        tree = read_tree_newick(treestr)
        began = time()
        for _ in tree.traverse_preorder():
            pass
        return time() - began
    if m == 'ete3':
        tree = ete3.Tree(treestr, format=1)
        began = time()
        for _ in tree.traverse(strategy='preorder'):
            pass
        return time() - began
    assert False, "Invalid tool: %s" % m
def evolve_to_current_time(node, finalize=False):
    """Grow every lagging virus of ``node`` up to ``GC.time`` with dualbirth.

    For each virus whose time is behind ``GC.time`` the virus is removed from
    ``node``, the executable at ``GC.dualbirth_path`` is run for the missing
    duration, and the children of the returned tree's root are attached via
    ``GC.treenode_add_child``.  ``finalize`` is accepted but not used here.
    Does nothing when ``node`` is ``None``.
    """
    if node is None:
        return
    # snapshot first: viruses are removed from the node while we iterate
    for virus in list(node.viruses()):
        elapsed = GC.time - virus.get_time()
        if elapsed <= 0:
            continue
        node.remove_virus(virus)
        try:
            command = [GC.dualbirth_path, str(GC.rate_A), str(GC.rate_B), '-t', str(elapsed)]
            if GC.random_number_seed is not None:
                # pass the seed on and bump it so the next call gets a new one
                command.extend(['-s', str(GC.random_number_seed)])
                GC.random_number_seed += 1
            treestr = check_output(command).decode()
        except FileNotFoundError:
            from os import chdir
            chdir(GC.START_DIR)
            assert False, "dualbirth executable was not found: %s" % GC.dualbirth_path
        grown = read_tree_newick(treestr)
        virus.set_time(virus.get_time() + grown.root.edge_length)
        for child in grown.root.children:
            GC.treenode_add_child(virus, child, node)
def mrca(m):
    """Time an MRCA query over all leaves of the benchmark tree with tool ``m``.

    The tree is parsed before the clock starts; collecting the leaves and the
    MRCA call are both inside the timed region.  Returns the difference of two
    ``time()`` readings; an unknown ``m`` trips the assertion.
    """
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        tips = {leaf.taxon for leaf in tree.leaf_node_iter()}
        tree.mrca(taxa=tips)
        return time() - began
    if m == 'biophylo':
        tree = Phylo.read(treeio, 'newick')
        began = time()
        tips = tree.get_terminals()
        tree.common_ancestor(tips)
        return time() - began
    if m == 'treeswift':
        tree = read_tree_newick(treestr)
        began = time()
        tips = {str(leaf) for leaf in tree.traverse_leaves()}
        tree.mrca(tips)
        return time() - began
    if m == 'ete3':
        tree = ete3.Tree(treestr, format=1)
        began = time()
        tips = tree.get_leaf_names()
        tree.get_common_ancestor(tips)
        return time() - began
    assert False, "Invalid tool: %s" % m
def check_mulrf_scores(sfile, gfile, mulrf):
    """
    Verifies that MulRF scores agree with and without MUL-tree preprocessing

    For every line of the gene family tree file, the tree is scored once as
    read (after unrooting) and once after `preprocess_multree`; the second
    score plus the computed score shift must equal the first, otherwise the
    process exits naming the failing line. On success the summed score is
    printed and the process is terminated with `os._exit(0)`.

    Parameters
    ----------
    sfile : string
        name of file containing species tree
    gfile : string
        name of file containing gene family trees
    mulrf : string
        name including full path of MulRFScorer binary
    """
    species_tree = treeswift.read_tree(sfile, "newick")
    remove_internal_node_labels(species_tree)
    species_tree.suppress_unifurcations()

    # Prefix of the output names passed to the scorer: gfile minus extension
    out_prefix = gfile.rsplit('.', 1)[0]

    rf_sum = 0
    with open(gfile, 'r') as handle:
        for lineno, raw in enumerate(handle, start=1):
            newick = "".join(raw.split())

            # MUL-tree as given
            plain = treeswift.read_tree_newick(newick)
            remove_internal_node_labels(plain)
            unroot(plain)

            # Same tree, preprocessed
            reduced = treeswift.read_tree(newick, "newick")
            remove_internal_node_labels(reduced)
            [nEM, nLM, nR, c, nEMX, nLMX] = preprocess_multree(reduced)
            shift = compute_score_shift(nEM, nLM, nR, c, nEMX, nLMX)

            plain_score = score_with_MulRF(mulrf, species_tree, plain,
                                           out_prefix + "-scored")
            reduced_score = score_with_MulRF(mulrf, species_tree, reduced,
                                             out_prefix + "-preprocessed-and-scored")

            if reduced_score + shift != plain_score:
                sys.exit("Gene tree on line %d failed!\n" % lineno)

            rf_sum += plain_score

    sys.stdout.write('%d\n' % rf_sum)
    sys.stdout.flush()
    # Hard exit kept from the original (noted there as critical on the
    # Blue Waters login node)
    os._exit(0)
def time_to_mutation_rate(tree):
    """Rescale every branch of a Newick tree by a noncentral-F random factor.

    The first call seeds NumPy's global generator from
    ``GC.random_number_seed`` (guarded by ``GC.NUMPY_SEEDED``) and increments
    that seed.  Each edge that has a length is multiplied by an independent
    draw parameterised by ``GC.tree_rate_dfnum``, ``GC.tree_rate_dfden`` and
    ``GC.tree_rate_lambda``.  Returns the rescaled tree as a string.
    """
    if not hasattr(GC, "NUMPY_SEEDED"):
        from numpy.random import seed as seed_numpy
        seed_numpy(seed=GC.random_number_seed)
        GC.random_number_seed += 1
        GC.NUMPY_SEEDED = True
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        if node.edge_length is None:
            continue
        factor = noncentral_f(
            dfnum=GC.tree_rate_dfnum,
            dfden=GC.tree_rate_dfden,
            nonc=GC.tree_rate_lambda,
        )
        node.edge_length = node.edge_length * factor
    return str(phylo)
def read_preprocess_and_write_multrees(ifile, ofile, verbose):
    """
    Creates file with preprocessed MUL-trees for FastRFS

    A tree is skipped (not written) when its line is empty, when it has fewer
    than 4 leaves before preprocessing, or when fewer than 4 remain after
    preprocessing (the last value returned by `preprocess_multree`).

    Parameters
    ----------
    ifile : string
        name of file containing gene family trees
        (one newick string per line)
    ofile : string
        name of output file (one newick string per line)
    verbose : bool
        if true, report progress and the reason for each skipped tree
        on stdout
    """
    # Skip code -> reason; 0 means the tree was written
    reasons = {
        1: "line is empty!",
        2: "tree has <4 leaves before preprocessing!",
        3: "tree has <4 leaves after preprocessing!",
    }

    with open(ifile, 'r') as fi, open(ofile, 'w') as fo:
        for g, line in enumerate(fi, start=1):
            if verbose:
                sys.stdout.write("Preprocessing gene tree on line %d...\n" % g)
                sys.stdout.flush()

            temp = "".join(line.split())

            donot = 0
            if not temp:
                donot = 1
            else:
                tree = treeswift.read_tree_newick(temp)
                if count_leaves(tree) < 4:
                    # was misspelled "dotnot", so this case was never reported
                    donot = 2
                else:
                    # preprocess_multree is called for its effect on `tree`;
                    # only the last returned count is needed here
                    [nEM, nLM, nR, c, nEMX, nLMX] = preprocess_multree(tree)
                    if nLMX < 4:
                        donot = 3
                    else:
                        fo.write(tree.newick() + '\n')

            if donot and verbose:
                sys.stdout.write("...did not write tree as %s\n" % reasons[donot])
                sys.stdout.flush()
def _read_phylogeny(phylogeny_fp):
    """Parse the first line of ``phylogeny_fp`` as a Newick tree.

    Only one line is read from the file; it is decoded if it arrives as
    ``bytes``, stripped of surrounding whitespace, and handed to
    ``read_tree_newick``, whose result is returned.
    """
    with open(str(phylogeny_fp)) as handle:
        first_line = handle.readline()
    newick = first_line.decode() if isinstance(first_line, bytes) else first_line
    return read_tree_newick(newick.strip())
def time_to_mutation_rate(tree):
    """Rescale every branch of a Newick tree by a Pareto random factor.

    The first call seeds NumPy's global generator from
    ``GC.random_number_seed`` (guarded by ``GC.NUMPY_SEEDED``) and increments
    that seed.  Each edge that has a length is multiplied by an independent
    ``pareto(a=GC.tree_rate_shape)`` draw.  Returns the tree as a string.
    """
    if not hasattr(GC, "NUMPY_SEEDED"):
        from numpy.random import seed as seed_numpy
        seed_numpy(seed=GC.random_number_seed)
        GC.random_number_seed += 1
        GC.NUMPY_SEEDED = True
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        if node.edge_length is None:
            continue
        factor = pareto(a=GC.tree_rate_shape)
        node.edge_length = node.edge_length * factor
    return str(phylo)
def induce_tree_helper(input_tree, input_data, output_file, hide_prefix, input_type, resolve_polytomies):
    """Write the subtree of ``input_tree`` induced by a set of labels.

    Parameters
    ----------
    input_tree : Newick input for the full tree
    input_data : source of the labels to keep; a FASTA file when
        ``input_type == "fasta"`` (sequence ids are used) or a Newick tree
        when ``input_type == "newick"`` (leaf labels are used)
    output_file : destination handed to ``write_tree_newick``
    hide_prefix : when truthy, write with ``hide_rooted_prefix=True``
    input_type : ``"fasta"`` or ``"newick"``; any other value leaves the
        label set empty, as before
    resolve_polytomies : when truthy, resolve polytomies in the induced tree
        before writing
    """
    full_tree = treeswift.read_tree_newick(input_tree)
    to_keep_node_labels = set()
    if input_type == "fasta":
        # open explicitly so the handle is closed once parsing is done
        # (the original passed a bare open() and never closed it)
        with open(input_data) as fasta_handle:
            for sequence in SeqIO.parse(fasta_handle, "fasta"):
                to_keep_node_labels.add(sequence.id)
    elif input_type == "newick":
        to_keep_tree = treeswift.read_tree_newick(input_data)
        for current_node in to_keep_tree.traverse_leaves():
            to_keep_node_labels.add(current_node.label)
    induced_tree = full_tree.extract_tree_with(to_keep_node_labels)
    if resolve_polytomies:
        induced_tree.resolve_polytomies()
    if hide_prefix:
        induced_tree.write_tree_newick(output_file, hide_rooted_prefix=True)
    else:
        induced_tree.write_tree_newick(output_file)
def run_TreeCluster(threshold, tree_file, threshold_free, method, support):
    """Cluster the tree in ``tree_file`` and return the clusters.

    ``method`` is lower-cased and looked up in ``METHODS``.  With
    ``threshold_free`` set to ``None`` that method is called directly with
    ``(tree, threshold, support)``; otherwise the wrapper found in
    ``THRESHOLDFREE[threshold_free]`` is called with the method as its first
    argument.
    """
    tree = read_tree_newick(tree_file)
    if threshold_free is None:
        return METHODS[method.lower()](tree, threshold, support)
    wrapper = THRESHOLDFREE[threshold_free]
    return wrapper(METHODS[method.lower()], tree, threshold, support)
def remove_outgroups_newick(tree_filename, outgroups_filename):
    """Write a copy of a Newick tree with the outgroup leaves removed.

    Parameters
    ----------
    tree_filename : path of the tree file; ``None`` yields ``None``
    outgroups_filename : file with one outgroup label per line; when ``None``
        nothing is removed and ``tree_filename`` is returned unchanged

    Returns
    -------
    Path of the written file, ``<base>.no_outgroup.<ext>`` where base and
    extension come from ``tree_filename`` after ``rstrip_gz``.

    Raises
    ------
    ValueError if ``tree_filename`` is not an existing file.
    """
    if tree_filename is None:
        return None
    if outgroups_filename is None:
        return tree_filename
    if not isfile(tree_filename):
        raise ValueError("Invalid tree file: %s" % tree_filename)
    outgroups = {l.strip() for l in read_file(outgroups_filename)}
    tree = read_tree_newick(tree_filename)
    name_parts = rstrip_gz(tree_filename).split('.')
    out_filename = '%s.no_outgroup.%s' % ('.'.join(name_parts[:-1]), name_parts[-1])
    tree_no_og = tree.extract_tree_without(outgroups)
    tree_no_og.root.edge_length = None
    newick = tree_no_og.newick()
    # Drop a leading "[&R]" marker as a prefix.  The previous
    # lstrip('[&R] ') removed any leading run of those characters and could
    # therefore also eat the start of the tree text itself.
    if newick.startswith('[&R]'):
        newick = newick[len('[&R]'):]
    newick = newick.lstrip(' ')
    write_file('%s\n' % newick, out_filename)
    return out_filename
def relabel_tree_helper(input_tree, tax_list, output_file, hide_prefix):
    """Rename tree nodes from line numbers to the names listed in ``tax_list``.

    Line ``i`` (counting from 0) of ``tax_list`` supplies the new name for
    the node currently labelled ``str(i)``.  The mapping is printed, applied
    with ``rename_nodes`` and the tree is written to ``output_file``, with
    ``hide_rooted_prefix=True`` when ``hide_prefix`` is truthy.
    """
    full_tree = treeswift.read_tree_newick(input_tree)
    with open(tax_list, "r") as handle:
        tax_map = {str(index): name.strip() for index, name in enumerate(handle)}
    print(tax_map)
    full_tree.rename_nodes(tax_map)
    write_options = {"hide_rooted_prefix": True} if hide_prefix else {}
    full_tree.write_tree_newick(output_file, **write_options)
def time_to_mutation_rate(tree):
    """Rescale branches with rates drawn exponentially around the parent rate.

    The first call seeds NumPy's global generator from
    ``GC.random_number_seed`` (guarded by ``GC.NUMPY_SEEDED``) and increments
    that seed.  Walking the tree in preorder, the root gets rate
    ``GC.tree_rate_R0`` and every other node a draw from
    ``exponential(scale=<parent rate>)``; each edge that has a length is
    multiplied by its node's rate.  Returns the tree as a string.
    """
    if not hasattr(GC, "NUMPY_SEEDED"):
        from numpy.random import seed as seed_numpy
        seed_numpy(seed=GC.random_number_seed)
        GC.random_number_seed += 1
        GC.NUMPY_SEEDED = True
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        # preorder guarantees the parent's rate is already set
        node.rate = GC.tree_rate_R0 if node.is_root() else exponential(scale=node.parent.rate)
        if node.edge_length is None:
            continue
        node.edge_length = node.edge_length * node.rate
    return str(phylo)
def test_leaf_dijkstra(self):
    """Check ``leaf_dijkstra`` results on two small trees.

    Covers the call without an argument, the call with an integer argument,
    and the ``TypeError`` expected when a node is passed as the argument.
    """
    first = ts.read_tree_newick("(A:3.2,(B:2.1,(C:1,D:1)));")
    order = list(first.traverse_preorder())
    self.assertEqual(
        order[5].leaf_dijkstra(),
        [(3.1,"C"), (3.1,"D"), (5.300000000000001,"A")],
    )
    self.assertEqual(
        order[6].leaf_dijkstra(2),
        [(4.2,"C"), (4.2,"D")],
    )

    second = ts.read_tree_newick("((A:2.3,E:3)(B:2,(C:1.2,D:1)));")
    order = list(second.traverse_preorder())
    probe = order[7]
    self.assertEqual(
        probe.leaf_dijkstra(3),
        [(3.3,"D"), (3.5,"C"), (4.3,"B")],
    )
    # a node is not an acceptable argument
    with self.assertRaises(TypeError):
        probe.leaf_dijkstra(order[0])
def time_to_mutation_rate(tree):
    """Rescale every branch of a Newick tree by a truncated-normal factor.

    The first call seeds NumPy's global generator from
    ``GC.random_number_seed`` (guarded by ``GC.NUMPY_SEEDED``) and increments
    that seed.  Each edge that has a length is multiplied by one
    ``truncnorm.rvs`` draw configured from ``GC.tree_rate_min``,
    ``GC.tree_rate_max``, ``GC.tree_rate_loc`` and ``GC.tree_rate_scale``.
    Returns the tree as a string.
    """
    if not hasattr(GC, "NUMPY_SEEDED"):
        from numpy.random import seed as seed_numpy
        seed_numpy(seed=GC.random_number_seed)
        GC.random_number_seed += 1
        GC.NUMPY_SEEDED = True
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        if node.edge_length is None:
            continue
        draws = truncnorm.rvs(
            a=GC.tree_rate_min,
            b=GC.tree_rate_max,
            loc=GC.tree_rate_loc,
            scale=GC.tree_rate_scale,
            size=1,
        )
        node.edge_length = node.edge_length * draws[0]
    return str(phylo)
def estimate_mutation_rate(rooted_tree_filename, dates_filename):
    """Estimate the mutation rate as the slope of root-to-tip distance vs. time.

    Every leaf contributes one point: its date (converted with
    ``date_to_days``) against the sum of edge lengths from the root.  The
    first element of ``linregress`` on those points (the slope) is returned.
    """
    tree = read_tree_newick(rooted_tree_filename)
    sample_days = {label: date_to_days(when) for label, when in load_dates_ViReport(dates_filename)}
    depth = {}  # node -> root-to-node distance
    times, distances = [], []
    for node in tree.traverse_preorder():
        # preorder: the parent's distance is known before its children's
        dist = 0 if node.is_root() else depth[node.parent]
        if node.edge_length is not None:
            dist += node.edge_length
        depth[node] = dist
        if node.is_leaf():
            times.append(sample_days[node.label])
            distances.append(dist)
    # slope is mutations/site/time
    return linregress(times, distances)[0]
def time_to_mutation_rate(tree):
    """Rescale branches with log-normally drawn rates tied to the parent rate.

    The first call seeds NumPy's global generator from
    ``GC.random_number_seed`` (guarded by ``GC.NUMPY_SEEDED``) and increments
    that seed.  In preorder, the root gets ``GC.tree_rate_R0``; every other
    node must have a positive edge length and gets
    ``lognormal(mean=<parent rate>, sigma=GC.tree_rate_v * <edge length>)``.
    Each edge that has a length is then multiplied by its node's rate.
    Returns the tree as a string.
    """
    if not hasattr(GC, "NUMPY_SEEDED"):
        from numpy.random import seed as seed_numpy
        seed_numpy(seed=GC.random_number_seed)
        GC.random_number_seed += 1
        GC.NUMPY_SEEDED = True
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        branch = node.edge_length
        if node.is_root():
            node.rate = GC.tree_rate_R0
        else:
            assert branch is not None and branch > 0, "All edges must have positive lengths for TreeUnit_AutocorrelatedLogNormal"
            node.rate = lognormal(mean=node.parent.rate, sigma=GC.tree_rate_v * branch)
        # only the root may lack an incident edge at this point
        if branch is not None:
            node.edge_length = branch * node.rate
    return str(phylo)
def read_peroba_database(f_prefix, trust_global_sequences=False):
    """Load the files of a peroba database that share the prefix ``f_prefix``.

    Parameters
    ----------
    f_prefix : common file prefix; a trailing ``"."`` is dropped, so both
        ``perobaDB.0621`` and ``perobaDB.0621.`` are accepted
    trust_global_sequences : when true the unaligned-sequence file is not
        read and an empty list is returned in its place

    Returns
    -------
    ``[metadata, sequences, tree, subsample, unaligned]`` where ``metadata``
    and ``subsample`` are DataFrames indexed by ``peroba_seq_uid``
    (``subsample`` columns coerced to numeric), ``tree`` is parsed from the
    first line of the tree file, and ``sequences`` / ``unaligned`` come from
    ``common.read_fasta``.
    """
    # endswith also copes with an empty prefix, where f_prefix[-1] would raise
    if f_prefix.endswith("."):
        f_prefix = f_prefix[:-1]

    fname = f_prefix + common.suffix["metadata"]
    logger.info(f"Reading database metadata from '{fname}'")
    metadata = pd.read_csv(fname, compression="infer", index_col="peroba_seq_uid", dtype="unicode")
    metadata = common.df_finalise_metadata(metadata)

    fname = f_prefix + common.suffix["subsample"]
    logger.info(f"Reading subsampling information from '{fname}'")
    subsample = pd.read_csv(fname, compression="infer", index_col="peroba_seq_uid", dtype="unicode")
    for col in subsample.columns:
        # columns were read as text; unparsable cells become NaN
        subsample[col] = pd.to_numeric(subsample[col], errors='coerce')

    fname = f_prefix + common.suffix["tree"]
    logger.info(f"Reading database tree from '{fname}'")
    # context manager so the handle is closed (it was leaked before)
    with open(fname) as tree_handle:
        treestring = tree_handle.readline()
    # strip quotes and the rooted-tree marker before parsing
    treestring = treestring.rstrip().replace("'", "").replace('"', "").replace("[&R]", "")
    tree = treeswift.read_tree_newick(treestring)

    fname = f_prefix + common.suffix["alignment"]
    logger.info(f"Reading database alignment from '{fname}'")
    sequences = common.read_fasta(fname, check_name=False)

    unaligned = []
    if trust_global_sequences:
        logger.info("Will assume global sequences are 'better' than local when duplicates exist")
    else:
        fname = f_prefix + common.suffix["sequences"]
        logger.info(f"Reading database unaligned sequences from '{fname}'")
        unaligned = common.read_fasta(fname, check_name=False)

    logger.info(
        "Finished loading the database; dataframe has dimensions %s and it's assumed we have the same number of sequences; the tree may be smaller",
        metadata.shape)
    return [metadata, sequences, tree, subsample, unaligned]
def sample(tree, sampling_method):
    """
    Samples from a tagged tree, by taking clades at random at duplication
    vertices

    NOTE: must be run after 'tag()'
    NOTE: reseeds the global `random` module with 0 on every call, for
    reproducibility

    Parameters
    ----------
    tree: tagged treeswift tree
    sampling_method: defines the number of samples
        "linear" - number of samples = number of duplication nodes + 1
        "exp" - number of samples = 2^(number of duplication nodes)
        string of digits - that exact number of samples
        custom method - takes as parameter the number of duplication nodes,
            and returns the number of samples

    Returns samples as a list of trees
    """
    random.seed(0)  # set fixed seed for reproducibility

    if sampling_method == 'linear':
        n_sample = tree.n_dup + 1
    elif sampling_method == 'exp':
        n_sample = 2**tree.n_dup
    elif isinstance(sampling_method, str) and sampling_method.isdigit():
        # the isinstance guard lets a callable reach the branch below;
        # previously .isdigit() was called on it and raised AttributeError
        n_sample = int(sampling_method)
    else:
        n_sample = sampling_method(tree.n_dup)

    out = []
    for _ in range(n_sample):
        for node in tree.traverse_postorder(leaves=False):
            if node.tag == 'D':
                # deletes one child randomly
                [left, right] = node.child_nodes()
                # we want to keep sections with more duplicates more often
                # otherwise we can end up getting the same small tree repeatedly
                bias = (left.n_dup + 0.5) / node.n_dup
                node.delete = left if random.random() > bias else right
                node.remove_child(node.delete)
        # snapshot the pruned tree by round-tripping through newick
        out.append(treeswift.read_tree_newick(tree.newick()))
        # put the removed children back so the next round starts from the full tree
        for node in tree.traverse_preorder(leaves=False):
            if node.tag == 'D':
                node.add_child(node.delete)
    return out
def time_to_mutation_rate(tree):
    """Rescale branches with rates that step up or down from the parent rate.

    In preorder, the root gets ``GC.tree_rate_R0``.  Every other node starts
    from its parent's rate and, based on one ``random()`` draw, moves up by
    ``GC.tree_rate_delta`` (capped at ``GC.tree_rate_max``) when the draw is
    below ``GC.tree_rate_p / 2``, moves down by the same amount (floored at
    ``GC.tree_rate_min``) when it is below ``GC.tree_rate_p``, and otherwise
    stays put.  Each edge that has a length is multiplied by its node's rate.
    Returns the tree as a string.
    """
    phylo = read_tree_newick(tree)
    for node in phylo.traverse_preorder():
        if node.is_root():
            node.rate = GC.tree_rate_R0
        else:
            node.rate = node.parent.rate
            draw = random()
            if draw < GC.tree_rate_p / 2:
                # step up, never above the ceiling
                node.rate += GC.tree_rate_delta
                if node.rate > GC.tree_rate_max:
                    node.rate = GC.tree_rate_max
            elif draw < GC.tree_rate_p:
                # step down, never below the floor
                node.rate -= GC.tree_rate_delta
                if node.rate < GC.tree_rate_min:
                    node.rate = GC.tree_rate_min
        if node.edge_length is None:
            continue
        node.edge_length = node.edge_length * node.rate
    return str(phylo)
def main(args):
    """Write the maximal clades of every input tree, one Newick string per line.

    Reads ``args.input`` line by line.  The output path is ``args.output``
    or, when that is ``None``, the input name with ``-mclades`` inserted
    before its extension.  Each clade from ``find_max_clades`` is unrooted,
    has unifurcations suppressed, and is written unless ``trivial`` flags its
    Newick string and ``args.trivial`` is false.
    """
    if args.output is not None:
        output = args.output
    else:
        parts = args.input.rsplit('.', 1)
        output = parts[0] + '-mclades.' + parts[1]

    with open(args.input, 'r') as fi, open(output, 'w') as fo:
        for line in fi:
            tree = treeswift.read_tree_newick(line)
            unroot(tree)
            for clade in find_max_clades(tree, args.delimiter):
                unroot(clade)
                clade.suppress_unifurcations()
                newk = clade.newick()
                if not args.trivial and trivial(newk):
                    continue
                fo.write(newk + '\n')
def pairwise_distances(tree_filename):
    """Write all pairwise leaf-to-leaf tree distances to a CSV and return its path.

    The output is ``pairwise_distances_phylogeny.csv`` under
    ``GC.OUT_DIR_OUTFILES`` (with ``.gz`` appended when ``GC.GZIP_OUTPUT`` is
    set).  If that file, or the same name plus ``.gz``, already exists the
    computation is skipped and only a log line is written.  Raises
    ``ValueError`` when ``tree_filename`` is not an existing file.
    """
    if not isfile(tree_filename):
        raise ValueError("Invalid tree file: %s" % tree_filename)
    out_filename = '%s/pairwise_distances_phylogeny.csv' % GC.OUT_DIR_OUTFILES
    if GC.GZIP_OUTPUT:
        out_filename += '.gz'
    if isfile(out_filename) or isfile('%s.gz' % out_filename):
        GC.SELECTED['Logging'].writeln("Pairwise phylogenetic distances exist. Skipping recomputation.")
        return out_filename

    dm = read_tree_newick(tree_filename).distance_matrix(leaf_labels=True)
    labels = sorted(dm.keys())
    rows = ['ID1,ID2,Distance']
    # one row per unordered pair, in sorted-label order
    for position, first in enumerate(labels):
        for second in labels[position + 1:]:
            rows.append('%s,%s,%s' % (first, second, GC.num_str(dm[first][second])))
    GC.write_file('\n'.join(rows), out_filename)
    return out_filename
def reconstruct(rooted_tree_filename, aln_filename):
    """Run ``treetime ancestral`` and return the path of the resulting FASTA.

    Parameters
    ----------
    rooted_tree_filename : path of the rooted tree file
    aln_filename : path of the alignment; a name ending in ``.gz`` is first
        rewritten uncompressed into the TreeTime working directory

    Returns
    -------
    ``<GC.OUT_DIR_OUTFILES>/ancestral_sequences.fas`` (plus ``.gz`` when
    ``GC.GZIP_OUTPUT`` is set).  If that file already exists nothing is
    recomputed.

    Raises
    ------
    ValueError if either input path is not an existing file.
    """
    if not isfile(rooted_tree_filename):
        raise ValueError("Invalid tree file: %s" % rooted_tree_filename)
    if not isfile(aln_filename):
        raise ValueError("Invalid alignment file: %s" % aln_filename)
    treetime_dir = '%s/TreeTime_AncestralSequenceReconstruction' % GC.OUT_DIR_TMPFILES
    out_filename = '%s/ancestral_sequences.fas' % GC.OUT_DIR_OUTFILES
    if GC.GZIP_OUTPUT:
        out_filename += '.gz'
    if isfile(out_filename):
        GC.SELECTED['Logging'].writeln("Ancestral sequences exist. Skipping recomputation.")
        return out_filename

    makedirs(treetime_dir, exist_ok=True)

    # Label internal nodes ("ROOT", "I<n>") before handing the tree to treetime
    tree_with_internal_labels_filename = '%s/tree_with_internal_labels.tre' % treetime_dir
    tmp = read_tree_newick(rooted_tree_filename)
    for i, node in enumerate(tmp.traverse_levelorder(leaves=False)):
        node.label = "ROOT" if node.is_root() else "I%d" % i
    GC.write_file('%s\n' % tmp.newick(), tree_with_internal_labels_filename)

    if aln_filename.endswith('.gz'):
        unzipped_filename = '%s/aln_unzipped.fas' % treetime_dir
        GC.write_file('\n'.join(GC.read_file(aln_filename)), unzipped_filename)
        aln_filename = unzipped_filename

    command = [
        'treetime', 'ancestral',
        '--aln', aln_filename,
        '--tree', tree_with_internal_labels_filename,
        '--outdir', treetime_dir,
    ]
    # Record the exact command next to the log
    with open('%s/command.txt' % treetime_dir, 'w') as command_file:
        command_file.write('%s\n' % ' '.join(command))
    # Context manager closes the log even if the call raises
    # (the handles were previously closed by hand and could leak)
    with open('%s/log.txt' % treetime_dir, 'w') as log:
        call(command, stdout=log)

    GC.write_file(
        '\n'.join(GC.read_file('%s/ancestral_sequences.fasta' % treetime_dir)),
        out_filename)
    return out_filename
def inorder(m):
    """Time one full inorder traversal of the benchmark tree with tool ``m``.

    Only ``'dendropy'`` and ``'treeswift'`` are timed; ``'biophylo'`` and
    ``'ete3'`` return ``NA``.  The tree is parsed before the clock starts.
    Returns the difference of two ``time()`` readings; an unknown ``m`` trips
    the assertion.
    """
    if m in ('biophylo', 'ete3'):
        return NA
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        for _ in tree.inorder_node_iter():
            pass
        return time() - began
    if m == 'treeswift':
        tree = read_tree_newick(treestr)
        began = time()
        for _ in tree.traverse_inorder():
            pass
        return time() - began
    assert False, "Invalid tool: %s" % m
def load_tree(m):
    """Time parsing of the benchmark tree with tool ``m``.

    ``m`` is one of ``'dendropy'``, ``'biophylo'``, ``'treeswift'`` or
    ``'ete3'``.  Returns the difference of two ``time()`` readings taken
    around the parse call; an unknown ``m`` trips the assertion.
    """
    if m == 'dendropy':
        began = time()
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        return time() - began
    if m == 'biophylo':
        began = time()
        tree = Phylo.read(treeio, 'newick')
        return time() - began
    if m == 'treeswift':
        began = time()
        tree = read_tree_newick(treestr)
        return time() - began
    if m == 'ete3':
        began = time()
        tree = ete3.Tree(treestr, format=1)
        return time() - began
    assert False, "Invalid tool: %s" % m
def measure_memory(m):
    """Measure the change in ``memory()`` caused by loading the benchmark tree.

    ``m`` is one of ``'dendropy'``, ``'biophylo'``, ``'treeswift'`` or
    ``'ete3'``.  For dendropy, ``encode_bipartitions()`` is included in the
    measured region.  Returns the second ``memory()`` reading minus the
    first; an unknown ``m`` trips the assertion.
    """
    if m == 'dendropy':
        before = memory()
        t = dendropy.Tree.get(data=treestr, schema='newick')
        t.encode_bipartitions()
        return memory() - before
    if m == 'biophylo':
        before = memory()
        t = Phylo.read(treeio, 'newick')
        return memory() - before
    if m == 'treeswift':
        before = memory()
        t = read_tree_newick(treestr)
        return memory() - before
    if m == 'ete3':
        before = memory()
        t = ete3.Tree(treestr, format=1)
        return memory() - before
    assert False, "Invalid tool: %s" % m
def rootdistorder(m):
    """Time a root-distance-ordered traversal of the benchmark tree with ``m``.

    Only ``'dendropy'`` and ``'treeswift'`` are timed; ``'biophylo'`` and
    ``'ete3'`` return ``NA``.  For dendropy the ``calc_node_ages`` call is
    inside the timed region.  Returns the difference of two ``time()``
    readings; an unknown ``m`` trips the assertion.
    """
    if m in ('biophylo', 'ete3'):
        return NA
    if m == 'dendropy':
        tree = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        tree.calc_node_ages(is_force_max_age=True)
        for _ in tree.ageorder_node_iter(descending=True):
            pass
        return time() - began
    if m == 'treeswift':
        tree = read_tree_newick(treestr)
        began = time()
        for _ in tree.traverse_rootdistorder():
            pass
        return time() - began
    assert False, "Invalid tool: %s" % m
def finalize():
    """Create leaf ``TreeNode`` objects for every pruned tree and index them.

    For each ``(root, newick)`` pair in ``GC.pruned_newick_trees``, every
    leaf label is split on ``'|'`` into three fields; the second is looked up
    as a contact-network node label and the third is parsed as a float time.
    A ``TreeNode`` carrying the root's sequence is created per leaf, appended
    to ``GC.final_sequences[label][time]`` as ``(leaf, seq)``, and the full
    list is handed to ``root.set_leaves``.
    """
    GC.final_sequences = {}
    TreeNode = MF.modules['TreeNode']
    for root, treestr in GC.pruned_newick_trees:
        seq = root.get_seq()
        leaves = []
        for tip in read_tree_newick(treestr).traverse_leaves():
            # first field is unpacked only so that exactly three are required
            _virus_name, cn_label, t_str = (part.strip() for part in tip.label.split('|'))
            sample_time = float(t_str)
            by_time = GC.final_sequences.setdefault(cn_label, {})
            bucket = by_time.setdefault(sample_time, [])
            leaf = TreeNode(
                time=sample_time,
                seq=seq,
                contact_network_node=GC.contact_network.get_node(cn_label),
            )
            leaves.append(leaf)
            bucket.append((leaf, seq))
        root.set_leaves(leaves)