예제 #1
0
파일: refine.py 프로젝트: tomkinsc/augur
def refine(tree=None,
           aln=None,
           ref=None,
           dates=None,
           branch_length_inference='auto',
           confidence=False,
           resolve_polytomies=True,
           max_iter=2,
           precision='auto',
           infer_gtr=True,
           Tc=0.01,
           reroot=None,
           use_marginal=False,
           fixed_pi=None,
           clock_rate=None,
           clock_std=None,
           clock_filter_iqd=None,
           verbosity=1,
           covariance=True,
           **kwarks):
    from treetime import TreeTime

    try:  #Tc could be a number or  'opt' or 'skyline'. TreeTime expects a float or int if a number.
        Tc = float(Tc)
    except ValueError:
        True  #let it remain a string

    if (ref is not None) and (fixed_pi is None):  #if VCF, fix pi
        #Otherwise mutation TO gaps is overestimated b/c of seq length
        fixed_pi = [
            ref.count(base) / len(ref) for base in ['A', 'C', 'G', 'T', '-']
        ]
        if fixed_pi[-1] == 0:
            fixed_pi[-1] = 0.05
            fixed_pi = [v - 0.01 for v in fixed_pi]

    if ref is not None:  # VCF -> adjust branch length
        #set branch length mode explicitly if auto, as informative-site only
        #trees can have big branch lengths, making this set incorrectly in TreeTime
        if branch_length_inference == 'auto':
            branch_length_inference = 'joint'

    #send ref, if is None, does no harm
    tt = TreeTime(tree=tree,
                  aln=aln,
                  ref=ref,
                  dates=dates,
                  verbose=verbosity,
                  gtr='JC69',
                  precision=precision)

    # conditionally run clock-filter and remove bad tips
    if clock_filter_iqd:
        # treetime clock filter will mark, but not remove bad tips
        tt.clock_filter(reroot=reroot, n_iqd=clock_filter_iqd,
                        plot=False)  #use whatever was specified
        # remove them explicitly
        leaves = [x for x in tt.tree.get_terminals()]
        for n in leaves:
            if n.bad_branch:
                tt.tree.prune(n)
                print('pruning leaf ', n.name)
        # fix treetime set-up for new tree topology
        tt.prepare_tree()

    if confidence and use_marginal:
        # estimate confidence intervals via marginal ML and assign
        # marginal ML times to nodes
        marginal = 'assign'
    else:
        marginal = confidence

    # uncertainty of the the clock rate is relevant if confidence intervals are estimated
    if confidence and clock_std:
        vary_rate = clock_std  # if standard devivation of clock is specified, use that
    elif (clock_rate is None) and confidence and covariance:
        vary_rate = True  # if run in covariance mode, standard deviation can be estimated
    else:
        vary_rate = False  # otherwise, rate uncertainty will be ignored

    tt.run(infer_gtr=infer_gtr,
           root=reroot,
           Tc=Tc,
           time_marginal=marginal,
           branch_length_mode=branch_length_inference,
           resolve_polytomies=resolve_polytomies,
           max_iter=max_iter,
           fixed_pi=fixed_pi,
           fixed_clock_rate=clock_rate,
           vary_rate=vary_rate,
           use_covariation=covariance,
           **kwarks)

    if confidence:
        for n in tt.tree.find_clades():
            n.num_date_confidence = list(tt.get_max_posterior_region(n, 0.9))

    print(
        "\nInferred a time resolved phylogeny using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n"
    )
    return tt
예제 #2
0
def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='auto',
             confidence=False, resolve_polytomies=True, max_iter=2,
             infer_gtr=True, Tc=0.01, reroot=None, use_marginal=False, fixed_pi=None,
             clock_rate=None, clock_std=None, clock_filter_iqd=None, verbosity=1, **kwarks):
    from treetime import TreeTime

    try: #Tc could be a number or  'opt' or 'skyline'. TreeTime expects a float or int if a number.
        Tc = float(Tc)
    except ValueError:
        True #let it remain a string

    if (ref is not None) and (fixed_pi is None): #if VCF, fix pi
        #Otherwise mutation TO gaps is overestimated b/c of seq length
        fixed_pi = [ref.count(base)/len(ref) for base in ['A','C','G','T','-']]
        if fixed_pi[-1] == 0:
            fixed_pi[-1] = 0.05
            fixed_pi = [v-0.01 for v in fixed_pi]

    if ref is not None: # VCF -> adjust branch length
        #set branch length mode explicitly if auto, as informative-site only
        #trees can have big branch lengths, making this set incorrectly in TreeTime
        if branch_length_inference == 'auto':
            branch_length_inference = 'joint'

    #send ref, if is None, does no harm
    tt = TreeTime(tree=tree, aln=aln, ref=ref, dates=dates,
                  verbose=verbosity, gtr='JC69')

    # conditionally run clock-filter and remove bad tips
    if clock_filter_iqd:
        # treetime clock filter will mark, but not remove bad tips
        tt.clock_filter(reroot='best', n_iqd=clock_filter_iqd, plot=False)
        # remove them explicitly
        leaves = [x for x in tt.tree.get_terminals()]
        for n in leaves:
            if n.bad_branch:
                tt.tree.prune(n)
                print('pruning leaf ', n.name)
        # fix treetime set-up for new tree topology
        tt.prepare_tree()

    if confidence and use_marginal:
        # estimate confidence intervals via marginal ML and assign
        # marginal ML times to nodes
        marginal = 'assign'
    else:
        marginal = confidence

    vary_rate = False
    if clock_rate and clock_std:
        vary_rate = clock_std
    else:
        vary_rate = True

    tt.run(infer_gtr=infer_gtr, root=reroot, Tc=Tc, time_marginal=marginal,
           branch_length_mode=branch_length_inference, resolve_polytomies=resolve_polytomies,
           max_iter=max_iter, fixed_pi=fixed_pi, fixed_clock_rate=clock_rate,
           vary_rate=vary_rate, **kwarks)

    if confidence:
        for n in tt.tree.find_clades():
            n.num_date_confidence = list(tt.get_max_posterior_region(n, 0.9))

    print("\nInferred a time resolved phylogeny using TreeTime:"
          "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
          "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")
    return tt
예제 #3
0
def estimate_clock_model(params):
    """
    implementing treetime clock
    """

    if assure_tree(params, tmp_dir='clock_model_tmp'):
        return 1
    dates = utils.parse_dates(params.dates, date_col=params.date_column, name_col=params.name_column)
    if len(dates)==0:
        return 1

    outdir = get_outdir(params, '_clock')

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    ###########################################################################
    ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL
    ###########################################################################
    if params.aln is None and params.sequence_length is None:
        print("one of arguments '--aln' and '--sequence-length' is required.", file=sys.stderr)
        return 1

    basename = get_basename(params, outdir)
    try:
        myTree = TreeTime(dates=dates, tree=params.tree, aln=aln, gtr='JC69',
                      verbose=params.verbose, seq_len=params.sequence_length,
                      ref=ref)
    except TreeTimeError as e:
        print("\nTreeTime setup failed. Please see above for error messages and/or rerun with --verbose 4\n")
        raise e

    myTree.tip_slack=params.tip_slack
    if params.clock_filter:
        n_bad = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        myTree.clock_filter(n_iqd=params.clock_filter, reroot=params.reroot or 'least-squares')
        n_bad_after = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        if len(n_bad_after)>len(n_bad):
            print("The following leaves don't follow a loose clock and "
                  "will be ignored in rate estimation:\n\t"
                  +"\n\t".join(set(n_bad_after).difference(n_bad)))

    if not params.keep_root:
        # reroot to optimal root, this assigns clock_model to myTree
        if params.covariation: # this requires branch length estimates
            myTree.run(root="least-squares", max_iter=0,
                       use_covariation=params.covariation)

        try:
            res = myTree.reroot(params.reroot,
                      force_positive=not params.allow_negative_rate)
        except TreeTimeError as e:
            print("ERROR: unknown root or rooting mechanism!")
            raise e

        myTree.get_clock_model(covariation=params.covariation)
    else:
        myTree.get_clock_model(covariation=params.covariation)

    d2d = utils.DateConversion.from_regression(myTree.clock_model)
    print('\n',d2d)
    print(fill('The R^2 value indicates the fraction of variation in'
          'root-to-tip distance explained by the sampling times.'
          'Higher values corresponds more clock-like behavior (max 1.0).')+'\n')

    print(fill('The rate is the slope of the best fit of the date to'
          'the root-to-tip distance and provides an estimate of'
          'the substitution rate. The rate needs to be positive!'
          'Negative rates suggest an inappropriate root.')+'\n')

    print('\nThe estimated rate and tree correspond to a root date:')
    if params.covariation:
        reg = myTree.clock_model
        dp = np.array([reg['intercept']/reg['slope']**2,-1./reg['slope']])
        droot = np.sqrt(reg['cov'][:2,:2].dot(dp).dot(dp))
        print('\n--- root-date:\t %3.2f +/- %1.2f (one std-dev)\n\n'%(-d2d.intercept/d2d.clock_rate, droot))
    else:
        print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))

    if not params.keep_root:
        # write rerooted tree to file
        outtree_name = basename+'rerooted.newick'
        Phylo.write(myTree.tree, outtree_name, 'newick')
        print("--- re-rooted tree written to \n\t%s\n"%outtree_name)

    table_fname = basename+'rtt.csv'
    with open(table_fname, 'w') as ofile:
        ofile.write("#name, date, root-to-tip distance\n")
        ofile.write("#Dates of nodes that didn't have a specified date are inferred from the root-to-tip regression.\n")
        for n in myTree.tree.get_terminals():
            if hasattr(n, "raw_date_constraint") and (n.raw_date_constraint is not None):
                if np.isscalar(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint)
                elif len(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint[0])+'-'+str(n.raw_date_constraint[1])
                else:
                    tmp_str = ''
                ofile.write("%s, %s, %f\n"%(n.name, tmp_str, n.dist2root))
            else:
                ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        for n in myTree.tree.get_nonterminals(order='preorder'):
            ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        print("--- wrote dates and root-to-tip distances to \n\t%s\n"%table_fname)


    ###########################################################################
    ### PLOT AND SAVE RESULT
    ###########################################################################
    plot_rtt(myTree, outdir+params.plot_rtt)
    return 0
예제 #4
0
def estimate_clock_model(params):
    """
    implementing treetime clock
    """

    if assure_tree(params, tmp_dir='clock_model_tmp'):
        return 1
    dates = utils.parse_dates(params.dates)
    if len(dates)==0:
        return 1

    outdir = get_outdir(params, '_clock')

    ###########################################################################
    ### READ IN VCF
    ###########################################################################
    #sets ref and fixed_pi to None if not VCF
    aln, ref, fixed_pi = read_if_vcf(params)
    is_vcf = True if ref is not None else False

    ###########################################################################
    ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL
    ###########################################################################
    if params.aln is None and params.sequence_length is None:
        print("one of arguments '--aln' and '--sequence-length' is required.", file=sys.stderr)
        return 1

    basename = get_basename(params, outdir)
    myTree = TreeTime(dates=dates, tree=params.tree, aln=aln, gtr='JC69',
                      verbose=params.verbose, seq_len=params.sequence_length,
                      ref=ref)
    myTree.tip_slack=params.tip_slack
    if myTree.tree is None:
        print("ERROR: tree loading failed. exiting...")
        return 1

    if params.clock_filter:
        n_bad = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        myTree.clock_filter(n_iqd=params.clock_filter, reroot=params.reroot or 'least-squares')
        n_bad_after = [n.name for n in myTree.tree.get_terminals() if n.bad_branch]
        if len(n_bad_after)>len(n_bad):
            print("The following leaves don't follow a loose clock and "
                  "will be ignored in rate estimation:\n\t"
                  +"\n\t".join(set(n_bad_after).difference(n_bad)))

    if not params.keep_root:
        # reroot to optimal root, this assigns clock_model to myTree
        if params.covariation: # this requires branch length estimates
            myTree.run(root="least-squares", max_iter=0,
                       use_covariation=params.covariation)

        res = myTree.reroot(params.reroot,
                      force_positive=not params.allow_negative_rate)
        myTree.get_clock_model(covariation=params.covariation)

        if res==ttconf.ERROR:
            print("ERROR: unknown root or rooting mechanism!\n"
                  "\tvalid choices are 'least-squares', 'ML', and 'ML-rough'")
            return 1
    else:
        myTree.get_clock_model(covariation=params.covariation)

    d2d = utils.DateConversion.from_regression(myTree.clock_model)
    print('\n',d2d)
    print('The R^2 value indicates the fraction of variation in'
          '\nroot-to-tip distance explained by the sampling times.'
          '\nHigher values corresponds more clock-like behavior (max 1.0).')

    print('\nThe rate is the slope of the best fit of the date to'
          '\nthe root-to-tip distance and provides an estimate of'
          '\nthe substitution rate. The rate needs to be positive!'
          '\nNegative rates suggest an inappropriate root.\n')

    print('\nThe estimated rate and tree correspond to a root date:')
    if params.covariation:
        reg = myTree.clock_model
        dp = np.array([reg['intercept']/reg['slope']**2,-1./reg['slope']])
        droot = np.sqrt(reg['cov'][:2,:2].dot(dp).dot(dp))
        print('\n--- root-date:\t %3.2f +/- %1.2f (one std-dev)\n\n'%(-d2d.intercept/d2d.clock_rate, droot))
    else:
        print('\n--- root-date:\t %3.2f\n\n'%(-d2d.intercept/d2d.clock_rate))

    if not params.keep_root:
        # write rerooted tree to file
        outtree_name = basename+'rerooted.newick'
        Phylo.write(myTree.tree, outtree_name, 'newick')
        print("--- re-rooted tree written to \n\t%s\n"%outtree_name)

    table_fname = basename+'rtt.csv'
    with open(table_fname, 'w') as ofile:
        ofile.write("#name, date, root-to-tip distance\n")
        ofile.write("#Dates of nodes that didn't have a specified date are inferred from the root-to-tip regression.\n")
        for n in myTree.tree.get_terminals():
            if hasattr(n, "raw_date_constraint") and (n.raw_date_constraint is not None):
                if np.isscalar(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint)
                elif len(n.raw_date_constraint):
                    tmp_str = str(n.raw_date_constraint[0])+'-'+str(n.raw_date_constraint[1])
                else:
                    tmp_str = ''
                ofile.write("%s, %s, %f\n"%(n.name, tmp_str, n.dist2root))
            else:
                ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        for n in myTree.tree.get_nonterminals(order='preorder'):
            ofile.write("%s, %f, %f\n"%(n.name, d2d.numdate_from_dist2root(n.dist2root), n.dist2root))
        print("--- wrote dates and root-to-tip distances to \n\t%s\n"%table_fname)


    ###########################################################################
    ### PLOT AND SAVE RESULT
    ###########################################################################
    plot_rtt(myTree, outdir+params.plot_rtt)
    return 0