예제 #1
0
    def get_neg_ll(cls,
            patterns, pattern_weights,
            ts, tv, syn, nonsyn, full_compo,
            theta,
            ):
        """Evaluate the negative log likelihood with free nt distributions.

        The trailing nine entries of ``theta`` are read as a 3x3 block of
        log-scale nucleotide weights, one row per nucleotide distribution.
        Each row is padded with a fourth log weight fixed at zero,
        exponentiated and normalized to sum to one.  The resulting 3x4
        array is turned into a codon distribution by
        ``codon1994.get_f3x4_codon_distn`` and the remaining leading
        entries of ``theta`` are forwarded, together with that
        distribution, to ``A.get_neg_ll``.

        Returns whatever ``A.get_neg_ll`` returns for these arguments.
        """
        # The last column is left at zero, so each row has only three
        # free log-scale parameters; they come from the tail of theta.
        padded_logs = algopy.zeros((3, 4), dtype=theta)
        free_logs = algopy.reshape(theta[-9:], (3, 3))
        padded_logs[:, :-1] = free_logs

        # Everything before the last nine entries belongs to model A.
        model_a_theta = theta[:-9]

        # Exponentiate, then scale each row so that it sums to one.
        weights = algopy.exp(padded_logs)
        totals = algopy.sum(weights, axis=1)
        nt_distns = (weights.T / totals).T

        # Codon distribution implied by the three nucleotide distributions.
        codon_distn = codon1994.get_f3x4_codon_distn(full_compo, nt_distns)

        return A.get_neg_ll(
                patterns, pattern_weights,
                codon_distn,
                ts, tv, syn, nonsyn,
                model_a_theta,
                )
예제 #2
0
def main(args):
    """Fit model A with empirical, then with free, nucleotide frequencies.

    Reads a genetic code table, a directed edge list, alignment patterns
    and pattern weights from tab-separated files, computes the empirical
    codon distribution, and runs two ``scipy.optimize.fmin`` searches:
    first ``A.get_neg_ll`` with the empirically derived stationary
    distribution, then ``A_free.get_neg_ll`` started from the first
    optimum extended with nine log-ratio nucleotide parameters.
    Progress and results are printed to standard output.

    Args:
        args: object providing the input file names as the attributes
            ``code_in``, ``edges_in``, ``patterns_in`` and ``weights_in``.

    Raises:
        ValueError: if the first column of the genetic code file is not
            the sequence 0, 1, 2, ...
        Exception: if the number of stop codons is not 2, 3 or 4, or if
            the stop codons are not the last rows of the genetic code.
    """

    # read the description of the genetic code
    with open(args.code_in) as fin_gcode:
        arr = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*arr)
    # Materialize the range: on Python 3 a list never equals a range object,
    # so the bare comparison would always report a mismatch.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code rows to be indexed 0, 1, 2, ...')

    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    aminos = aminos[:-nstop]
    codons = codons[:-nstop]

    # load the ordered directed edges
    # NOTE: DE is not used below; the file is still read as before.
    DE = np.loadtxt(args.edges_in, delimiter='\t', dtype=int)

    # load the alignment pattern
    patterns = np.loadtxt(args.patterns_in, delimiter='\t', dtype=int)

    # load the alignment weights
    weights = np.loadtxt(args.weights_in, delimiter='\t', dtype=float)

    # get the empirical codon distribution;
    # a state of -1 is skipped and contributes no weight
    ncodons = len(codons)
    nsites = patterns.shape[0]
    ntaxa = patterns.shape[1]
    v_emp = np.zeros(ncodons, dtype=float)
    for i in range(nsites):
        for j in range(ntaxa):
            state = patterns[i, j]
            if state != -1:
                v_emp[state] += weights[i]
    v_emp /= np.sum(v_emp)
    # Single-argument print calls give the same output on Python 2 and 3.
    print('empirical codon distribution:')
    print(v_emp)
    print('')

    # precompute some design matrices
    # NOTE: adj is not used below; the call is kept as in the original.
    adj = design.get_adjacency(codons)
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    full_compo = design.get_full_compo(codons)

    # For all of the data in the alignment,
    # compute the grand total nucleotide counts at each of the three
    # nucleotide positions within a codon.
    # The full_compo ndarray has shape (ncodons, 3, 4)
    # whereas the nucleotide distribution ndarray has shape (3, 4).
    position_specific_nt_distns = np.tensordot(full_compo, v_emp, axes=(0,0))

    # This is pre-computed if we want to use an empirical
    # stationary distribution, but it is postponed if we want to use
    # max likelihood parameters for the stationary distribution.
    stationary_distn = codon1994.get_f3x4_codon_distn(
            full_compo,
            position_specific_nt_distns,
            )

    # construct the args to the neg log likelihood functions
    likelihood_args_empirical = (
            patterns, weights,
            stationary_distn,
            ts, tv, syn, nonsyn,
            )

    likelihood_args_free = (
            patterns, weights,
            ts, tv, syn, nonsyn, full_compo,
            )

    # get the model A estimates using plain fmin
    model_A_opt = scipy.optimize.fmin(
            functools.partial(A.get_neg_ll, *likelihood_args_empirical),
            A.get_guess(),
            )
    print('optimal params for model A:')
    print(np.exp(model_A_opt))
    print('')

    # Build the initial guess for the free-frequency model:
    # the model A optimum followed by, for each codon position, the logs of
    # the first three nucleotide frequencies relative to the fourth.
    d = position_specific_nt_distns
    lastcol = d[:, -1]
    dratio = (d.T / lastcol).T
    log_nt_guess = np.log(dratio[:, :-1]).reshape(9)
    guess = np.hstack([model_A_opt, log_nt_guess])

    # define the objective function for the free-frequency model
    f = functools.partial(A_free.get_neg_ll, *likelihood_args_free)

    # do the search with plain fmin, which needs no gradient or hessian
    results = scipy.optimize.fmin(
            f,
            guess,
            )

    # report the initial model results
    print('model A with empirical frequencies:')
    print(model_A_opt)
    print(np.exp(model_A_opt))
    print('')

    # report a summary of the maximum likelihood search
    print('model A with free frequencies:')
    print(results)
    print(np.exp(results))
    print('')