def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='auto',
           confidence=False, resolve_polytomies=True, max_iter=2, precision='auto',
           infer_gtr=True, Tc=0.01, reroot=None, use_marginal=False, fixed_pi=None,
           clock_rate=None, clock_std=None, clock_filter_iqd=None, verbosity=1,
           covariance=True, **kwarks):
    """
    Configure a TreeTime object from the given inputs, run it, and return it.

    tree, aln, ref, dates, precision -- handed to the TreeTime constructor
        (together with gtr='JC69' and verbose=verbosity). A non-None `ref`
        is treated as the marker for VCF input.
    branch_length_inference -- passed to run() as branch_length_mode; for
        VCF input the value 'auto' is replaced by 'joint'.
    confidence / use_marginal -- select the `time_marginal` argument and
        whether confidence regions are attached to the nodes afterwards.
    Tc -- number or string ('opt', 'skyline'); values float() can parse are
        converted, other strings are passed on unchanged.
    fixed_pi -- if None and `ref` is given, derived from the symbol
        frequencies of `ref`.
    clock_rate, clock_std, covariance -- forwarded to run() and used to
        decide `vary_rate`.
    clock_filter_iqd -- if truthy, run the clock filter and prune every leaf
        flagged as `bad_branch`.
    **kwarks -- forwarded verbatim to TreeTime.run().

    Returns the TreeTime instance.
    """
    from treetime import TreeTime

    # Tc could be a number or 'opt' or 'skyline'. TreeTime expects a float
    # or int if it is a number; anything float() rejects stays a string.
    try:
        Tc = float(Tc)
    except ValueError:
        pass

    is_vcf_input = ref is not None

    if is_vcf_input and fixed_pi is None:
        # VCF input: fix pi, otherwise mutation TO gaps is overestimated
        # b/c of seq length
        fixed_pi = [ref.count(symbol) / len(ref) for symbol in ['A', 'C', 'G', 'T', '-']]
        if fixed_pi[-1] == 0:
            # give gaps a small non-zero weight and take it back evenly
            # from all five entries
            fixed_pi[-1] = 0.05
            fixed_pi = [freq - 0.01 for freq in fixed_pi]

    # VCF -> set the branch length mode explicitly if auto, as
    # informative-site only trees can have big branch lengths, making this
    # set incorrectly in TreeTime
    if is_vcf_input and branch_length_inference == 'auto':
        branch_length_inference = 'joint'

    # ref is always sent along; if it is None it does no harm
    tt = TreeTime(tree=tree, aln=aln, ref=ref, dates=dates,
                  verbose=verbosity, gtr='JC69', precision=precision)

    # conditionally run clock-filter and remove bad tips
    if clock_filter_iqd:
        # treetime clock filter will mark, but not remove bad tips;
        # reroot uses whatever the caller specified
        tt.clock_filter(reroot=reroot, n_iqd=clock_filter_iqd, plot=False)
        # remove them explicitly, iterating over a snapshot of the leaves
        for leaf in list(tt.tree.get_terminals()):
            if not leaf.bad_branch:
                continue
            tt.tree.prune(leaf)
            print('pruning leaf ', leaf.name)
        # fix treetime set-up for new tree topology
        tt.prepare_tree()

    # with both flags set, confidence intervals are estimated via marginal
    # ML and the marginal ML times are assigned to nodes
    marginal = 'assign' if (confidence and use_marginal) else confidence

    # uncertainty of the clock rate is only relevant if confidence
    # intervals are estimated; otherwise it is ignored
    vary_rate = False
    if confidence:
        if clock_std:
            # an explicitly specified standard deviation of the clock wins
            vary_rate = clock_std
        elif clock_rate is None and covariance:
            # in covariance mode the standard deviation can be estimated
            vary_rate = True

    tt.run(infer_gtr=infer_gtr, root=reroot, Tc=Tc, time_marginal=marginal,
           branch_length_mode=branch_length_inference,
           resolve_polytomies=resolve_polytomies, max_iter=max_iter,
           fixed_pi=fixed_pi, fixed_clock_rate=clock_rate,
           vary_rate=vary_rate, use_covariation=covariance, **kwarks)

    if confidence:
        # attach the 0.9 max-posterior region to every clade
        for clade in tt.tree.find_clades():
            clade.num_date_confidence = list(tt.get_max_posterior_region(clade, 0.9))

    print(
        "\nInferred a time resolved phylogeny using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )
    return tt
def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='auto',
           confidence=False, resolve_polytomies=True, max_iter=2,
           infer_gtr=True, Tc=0.01, reroot=None, use_marginal=False, fixed_pi=None,
           clock_rate=None, clock_std=None, clock_filter_iqd=None, verbosity=1, **kwarks):
    """
    Create a TreeTime object, optionally prune clock outliers, run the
    inference, and return the TreeTime object.

    A non-None `ref` marks VCF input: `fixed_pi` is then derived from `ref`
    when not supplied, and a branch_length_inference of 'auto' becomes
    'joint'. `Tc` may be a number or a string such as 'opt' or 'skyline'.
    If `clock_filter_iqd` is truthy, leaves flagged `bad_branch` by the clock
    filter are pruned. If `confidence` is truthy, each clade receives a
    `num_date_confidence` list. Extra keyword arguments are passed to
    TreeTime.run().
    """
    from treetime import TreeTime

    try:
        # Tc could be a number or 'opt' or 'skyline'. TreeTime expects a
        # float or int if a number.
        Tc = float(Tc)
    except ValueError:
        pass  # let it remain a string

    if ref is not None:
        if fixed_pi is None:
            # if VCF, fix pi; otherwise mutation TO gaps is overestimated
            # b/c of seq length
            fixed_pi = [ref.count(nuc)/len(ref) for nuc in ['A', 'C', 'G', 'T', '-']]
            if fixed_pi[-1] == 0:
                fixed_pi[-1] = 0.05
                fixed_pi = [p - 0.01 for p in fixed_pi]

        # VCF -> adjust branch length: set the mode explicitly if auto, as
        # informative-site only trees can have big branch lengths, making
        # this set incorrectly in TreeTime
        if branch_length_inference == 'auto':
            branch_length_inference = 'joint'

    # send ref, if is None, does no harm
    tt = TreeTime(tree=tree, aln=aln, ref=ref, dates=dates,
                  verbose=verbosity, gtr='JC69')

    # conditionally run clock-filter and remove bad tips
    if clock_filter_iqd:
        # treetime clock filter will mark, but not remove bad tips
        tt.clock_filter(reroot='best', n_iqd=clock_filter_iqd, plot=False)
        # remove them explicitly; the leaf list is materialised up front
        # because pruning changes the tree
        tips = list(tt.tree.get_terminals())
        for tip in tips:
            if tip.bad_branch:
                tt.tree.prune(tip)
                print('pruning leaf ', tip.name)
        # fix treetime set-up for new tree topology
        tt.prepare_tree()

    # estimate confidence intervals via marginal ML and assign marginal ML
    # times to nodes only when both flags are set
    if confidence and use_marginal:
        marginal = 'assign'
    else:
        marginal = confidence

    # use the given clock_std when both rate and std are supplied,
    # otherwise pass True
    vary_rate = clock_std if (clock_rate and clock_std) else True

    run_options = dict(
        infer_gtr=infer_gtr,
        root=reroot,
        Tc=Tc,
        time_marginal=marginal,
        branch_length_mode=branch_length_inference,
        resolve_polytomies=resolve_polytomies,
        max_iter=max_iter,
        fixed_pi=fixed_pi,
        fixed_clock_rate=clock_rate,
        vary_rate=vary_rate,
    )
    tt.run(**run_options, **kwarks)

    if confidence:
        for node in tt.tree.find_clades():
            node.num_date_confidence = list(tt.get_max_posterior_region(node, 0.9))

    print("\nInferred a time resolved phylogeny using TreeTime:"
          "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
          "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")
    return tt
def estimate_clock_model(params):
    """
    implementing treetime clock

    Runs a root-to-tip regression on the tree described by `params`,
    prints the resulting clock model, writes a table of dates and
    root-to-tip distances (`<basename>rtt.csv`), writes the rerooted tree
    (`<basename>rerooted.newick`) unless `params.keep_root` is set, and
    plots the regression.

    `params` is read for: dates, date_column, name_column, aln,
    sequence_length, tree, verbose, tip_slack, clock_filter, reroot,
    keep_root, covariation, allow_negative_rate and plot_rtt.

    Returns 0 on success and 1 when the tree cannot be assured, no dates
    were parsed, or neither an alignment nor a sequence length is given.
    TreeTimeError raised during set-up or rerooting is re-raised after a
    message has been printed.
    """
    if assure_tree(params, tmp_dir='clock_model_tmp'):
        return 1
    dates = utils.parse_dates(params.dates, date_col=params.date_column,
                              name_col=params.name_column)
    if len(dates)==0:
        return 1

    outdir = get_outdir(params, '_clock')

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF (fixed_pi is not needed here)
    aln, ref, fixed_pi = read_if_vcf(params)

    ###########################################################################
    ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL
    ###########################################################################
    if params.aln is None and params.sequence_length is None:
        print("one of arguments '--aln' and '--sequence-length' is required.", file=sys.stderr)
        return 1

    basename = get_basename(params, outdir)
    try:
        myTree = TreeTime(dates=dates, tree=params.tree, aln=aln, gtr='JC69',
                          verbose=params.verbose, seq_len=params.sequence_length,
                          ref=ref)
    except TreeTimeError:
        print("\nTreeTime setup failed. Please see above for error messages and/or rerun with --verbose 4\n")
        raise

    myTree.tip_slack=params.tip_slack
    if params.clock_filter:
        # compare the flagged leaves before and after filtering so that only
        # newly flagged ones are reported
        n_bad = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        myTree.clock_filter(n_iqd=params.clock_filter, reroot=params.reroot or 'least-squares')
        n_bad_after = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        if len(n_bad_after)>len(n_bad):
            print("The following leaves don't follow a loose clock and "
                  "will be ignored in rate estimation:\n\t"
                  +"\n\t".join(set(n_bad_after).difference(n_bad)))

    if not params.keep_root:
        # reroot to optimal root, this assigns clock_model to myTree
        if params.covariation: # this requires branch length estimates
            myTree.run(root="least-squares", max_iter=0,
                       use_covariation=params.covariation)

        try:
            myTree.reroot(params.reroot,
                          force_positive=not params.allow_negative_rate)
        except TreeTimeError:
            print("ERROR: unknown root or rooting mechanism!")
            raise

        myTree.get_clock_model(covariation=params.covariation)
    else:
        myTree.get_clock_model(covariation=params.covariation)

    d2d = utils.DateConversion.from_regression(myTree.clock_model)
    print('\n',d2d)
    # The literals below are joined by implicit concatenation, so each one
    # needs its own trailing space; fill() then re-wraps the text.
    print(fill('The R^2 value indicates the fraction of variation in '
               'root-to-tip distance explained by the sampling times. '
               'Higher values corresponds more clock-like behavior (max 1.0).')+'\n')

    print(fill('The rate is the slope of the best fit of the date to '
               'the root-to-tip distance and provides an estimate of '
               'the substitution rate. The rate needs to be positive! '
               'Negative rates suggest an inappropriate root.')+'\n')

    print('\nThe estimated rate and tree correspond to a root date:')
    if params.covariation:
        # propagate the regression covariance to the root date
        # (root date = -intercept/slope)
        reg = myTree.clock_model
        dp = np.array([reg['intercept']/reg['slope']**2,-1./reg['slope']])
        droot = np.sqrt(reg['cov'][:2,:2].dot(dp).dot(dp))
        print('\n--- root-date:\t %3.2f +/- %1.2f (one std-dev)\n\n'%(-d2d.intercept/d2d.clock_rate, droot))
    else:
        print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))

    if not params.keep_root:
        # write rerooted tree to file
        outtree_name = basename+'rerooted.newick'
        Phylo.write(myTree.tree, outtree_name, 'newick')
        print("--- re-rooted tree written to \n\t%s\n"%outtree_name)

    table_fname = basename+'rtt.csv'
    with open(table_fname, 'w') as ofile:
        ofile.write("#name, date, root-to-tip distance\n")
        ofile.write("#Dates of nodes that didn't have a specified date are inferred from the root-to-tip regression.\n")
        for n in myTree.tree.get_terminals():
            if hasattr(n, "raw_date_constraint") and (n.raw_date_constraint is not None):
                # a scalar constraint is written as-is, a non-empty
                # sequence as "first-second", an empty one as blank
                if np.isscalar(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint)
                elif len(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint[0])+'-'+str(n.raw_date_constraint[1])
                else:
                    tmp_str = ''
                ofile.write("%s, %s, %f\n"%(n.name, tmp_str, n.dist2root))
            else:
                ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        for n in myTree.tree.get_nonterminals(order='preorder'):
            ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        print("--- wrote dates and root-to-tip distances to \n\t%s\n"%table_fname)

    ###########################################################################
    ### PLOT AND SAVE RESULT
    ###########################################################################
    plot_rtt(myTree, outdir+params.plot_rtt)
    return 0
def estimate_clock_model(params):
    """
    implementing treetime clock

    Runs a root-to-tip regression on the tree described by `params`,
    prints the resulting clock model, writes a table of dates and
    root-to-tip distances (`<basename>rtt.csv`), writes the rerooted tree
    (`<basename>rerooted.newick`) unless `params.keep_root` is set, and
    plots the regression.

    `params` is read for: dates, aln, sequence_length, tree, verbose,
    tip_slack, clock_filter, reroot, keep_root, covariation,
    allow_negative_rate and plot_rtt.

    Returns 0 on success and 1 when the tree cannot be assured or loaded,
    no dates were parsed, neither an alignment nor a sequence length is
    given, or rerooting reports `ttconf.ERROR`.
    """
    if assure_tree(params, tmp_dir='clock_model_tmp'):
        return 1
    dates = utils.parse_dates(params.dates)
    if len(dates)==0:
        return 1

    outdir = get_outdir(params, '_clock')

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF (fixed_pi is not needed here)
    aln, ref, fixed_pi = read_if_vcf(params)

    ###########################################################################
    ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL
    ###########################################################################
    if params.aln is None and params.sequence_length is None:
        print("one of arguments '--aln' and '--sequence-length' is required.", file=sys.stderr)
        return 1

    basename = get_basename(params, outdir)
    myTree = TreeTime(dates=dates, tree=params.tree, aln=aln, gtr='JC69',
                      verbose=params.verbose, seq_len=params.sequence_length,
                      ref=ref)
    myTree.tip_slack=params.tip_slack
    if myTree.tree is None:
        print("ERROR: tree loading failed. exiting...")
        return 1

    if params.clock_filter:
        # compare the flagged leaves before and after filtering so that only
        # newly flagged ones are reported
        n_bad = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        myTree.clock_filter(n_iqd=params.clock_filter, reroot=params.reroot or 'least-squares')
        n_bad_after = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        if len(n_bad_after)>len(n_bad):
            print("The following leaves don't follow a loose clock and "
                  "will be ignored in rate estimation:\n\t"
                  +"\n\t".join(set(n_bad_after).difference(n_bad)))

    if not params.keep_root:
        # reroot to optimal root, this assigns clock_model to myTree
        if params.covariation: # this requires branch length estimates
            myTree.run(root="least-squares", max_iter=0,
                       use_covariation=params.covariation)

        res = myTree.reroot(params.reroot,
                            force_positive=not params.allow_negative_rate)
        # bail out before fitting the clock model: there is no point in
        # running the regression when rerooting reported an error
        if res==ttconf.ERROR:
            print("ERROR: unknown root or rooting mechanism!\n"
                  "\tvalid choices are 'least-squares', 'ML', and 'ML-rough'")
            return 1

        myTree.get_clock_model(covariation=params.covariation)
    else:
        myTree.get_clock_model(covariation=params.covariation)

    d2d = utils.DateConversion.from_regression(myTree.clock_model)
    print('\n',d2d)
    print('The R^2 value indicates the fraction of variation in'
          '\nroot-to-tip distance explained by the sampling times.'
          '\nHigher values corresponds more clock-like behavior (max 1.0).')

    print('\nThe rate is the slope of the best fit of the date to'
          '\nthe root-to-tip distance and provides an estimate of'
          '\nthe substitution rate. The rate needs to be positive!'
          '\nNegative rates suggest an inappropriate root.\n')

    print('\nThe estimated rate and tree correspond to a root date:')
    if params.covariation:
        # propagate the regression covariance to the root date
        # (root date = -intercept/slope)
        reg = myTree.clock_model
        dp = np.array([reg['intercept']/reg['slope']**2,-1./reg['slope']])
        droot = np.sqrt(reg['cov'][:2,:2].dot(dp).dot(dp))
        print('\n--- root-date:\t %3.2f +/- %1.2f (one std-dev)\n\n'%(-d2d.intercept/d2d.clock_rate, droot))
    else:
        print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))

    if not params.keep_root:
        # write rerooted tree to file
        outtree_name = basename+'rerooted.newick'
        Phylo.write(myTree.tree, outtree_name, 'newick')
        print("--- re-rooted tree written to \n\t%s\n"%outtree_name)

    table_fname = basename+'rtt.csv'
    with open(table_fname, 'w') as ofile:
        ofile.write("#name, date, root-to-tip distance\n")
        ofile.write("#Dates of nodes that didn't have a specified date are inferred from the root-to-tip regression.\n")
        for n in myTree.tree.get_terminals():
            if hasattr(n, "raw_date_constraint") and (n.raw_date_constraint is not None):
                # a scalar constraint is written as-is, a non-empty
                # sequence as "first-second", an empty one as blank
                if np.isscalar(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint)
                elif len(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint[0])+'-'+str(n.raw_date_constraint[1])
                else:
                    tmp_str = ''
                ofile.write("%s, %s, %f\n"%(n.name, tmp_str, n.dist2root))
            else:
                ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        for n in myTree.tree.get_nonterminals(order='preorder'):
            ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        print("--- wrote dates and root-to-tip distances to \n\t%s\n"%table_fname)

    ###########################################################################
    ### PLOT AND SAVE RESULT
    ###########################################################################
    plot_rtt(myTree, outdir+params.plot_rtt)
    return 0