def get_neg_ll(
        cls,
        patterns, pattern_weights,
        ts, tv, syn, nonsyn, full_compo,
        theta,
        ):
    """
    Negative log likelihood with free nucleotide frequency parameters.

    The last nine entries of theta are read as a (3, 3) block of log
    weights, one row per nucleotide distribution.  The fourth weight of
    each row is not a parameter: its log stays at the zero it was
    initialized with, so its unnormalized weight is exp(0) = 1.
    The remaining leading entries of theta are forwarded unchanged
    to A.get_neg_ll together with the codon distribution returned by
    codon1994.get_f3x4_codon_distn for the normalized distributions.

    @param cls: not used in the body
    @param patterns: forwarded unchanged to A.get_neg_ll
    @param pattern_weights: forwarded unchanged to A.get_neg_ll
    @param ts: forwarded unchanged to A.get_neg_ll
    @param tv: forwarded unchanged to A.get_neg_ll
    @param syn: forwarded unchanged to A.get_neg_ll
    @param nonsyn: forwarded unchanged to A.get_neg_ll
    @param full_compo: passed to codon1994.get_f3x4_codon_distn
    @param theta: parameter vector whose last nine entries are the
                  log nucleotide weights
    @return: whatever A.get_neg_ll returns
    """
    # three nucleotide distributions with three free log weights each
    n_nt_params = 9

    # split theta into the model A part and the nucleotide part
    theta_head = theta[:-n_nt_params]
    theta_tail = theta[-n_nt_params:]

    # embed the free log weights into a (3, 4) array of zeros;
    # dtype=theta is what the original passes to algopy.zeros
    # NOTE(review): presumably this lets algopy allocate an array that is
    # compatible with theta for differentiation -- confirm in algopy docs
    log_weights = algopy.zeros((3, 4), dtype=theta)
    log_weights[:, :-1] = algopy.reshape(theta_tail, (3, 3))

    # exponentiate and then scale each row so that it sums to one
    weights = algopy.exp(log_weights)
    totals = algopy.sum(weights, axis=1)
    nt_distns = (weights.T / totals).T

    # get the implied codon distribution
    codon_distn = codon1994.get_f3x4_codon_distn(full_compo, nt_distns)

    return A.get_neg_ll(
            patterns, pattern_weights,
            codon_distn,
            ts, tv, syn, nonsyn,
            theta_head,
            )
def _read_genetic_code(path):
    """
    Read a tab separated genetic code table and drop its stop codons.

    Each row is expected to have three fields: index, amino acid, codon.
    The indices must be 0, 1, 2, ... in row order, and the stop codons
    (rows whose amino acid field is 'stop', case insensitive) must be
    the last 2, 3, or 4 rows.

    @param path: path of the tab separated genetic code file
    @return: (aminos, codons) with the stop codon rows removed;
             aminos is a list of lower case strings, codons is a tuple
    @raise ValueError: if the indices are not 0, 1, 2, ... in order
    @raise Exception: if the stop codons are not as described above
    """
    with open(path) as fin_gcode:
        rows = list(csv.reader(fin_gcode, delimiter='\t'))
    indices, aminos, codons = zip(*rows)

    # Compare against an explicit list; a bare range object never equals
    # a list under Python 3, which would reject every valid table.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the genetic code rows to be indexed '
                '0, 1, 2, ... in order')

    aminos = [x.lower() for x in aminos]
    nstop = aminos.count('stop')
    if nstop not in (2, 3, 4):
        raise Exception('expected 2 or 3 or 4 stop codons')
    if any(x == 'stop' for x in aminos[:-nstop]):
        raise Exception('expected stop codons at the end of the genetic code')

    # trim the stop codons
    return aminos[:-nstop], codons[:-nstop]


def _get_empirical_codon_distn(patterns, weights, ncodons):
    """
    Compute the weighted empirical distribution over codon states.

    Every entry of patterns that is not -1 is treated as a codon state
    index, and contributes the weight of its row to that state.

    @param patterns: 2d integer array indexed by (site, taxon)
    @param weights: per-row weights, indexed like the rows of patterns
    @param ncodons: length of the returned distribution
    @return: float ndarray of length ncodons that sums to one
    """
    v_emp = np.zeros(ncodons, dtype=float)
    for i, row in enumerate(patterns):
        for state in row:
            # entries equal to -1 are skipped
            if state != -1:
                v_emp[state] += weights[i]
    v_emp /= np.sum(v_emp)
    return v_emp


def main(args):
    """
    Fit codon model A with empirical and then with free frequencies.

    First model A is fitted by scipy.optimize.fmin using a stationary
    distribution computed from the empirical position specific
    nucleotide frequencies.  Those estimates, together with the log
    ratios of the empirical nucleotide frequencies, are then used as the
    initial guess for a second fmin search in which the nucleotide
    frequencies are free parameters (A_free.get_neg_ll).
    Both sets of estimates are printed.

    Printing uses single argument print(...) calls, which produce the
    same output under the Python 2 print statement and the Python 3
    print function.

    @param args: object with the attributes code_in, edges_in,
                 patterns_in, and weights_in, each naming a tab
                 separated input file
    """
    # read the description of the genetic code
    aminos, codons = _read_genetic_code(args.code_in)

    # load the ordered directed edges
    # (the loaded array is not used below; the load itself is kept)
    DE = np.loadtxt(args.edges_in, delimiter='\t', dtype=int)

    # load the alignment pattern
    patterns = np.loadtxt(args.patterns_in, delimiter='\t', dtype=int)

    # load the alignment weights
    weights = np.loadtxt(args.weights_in, delimiter='\t', dtype=float)

    # get the empirical codon distribution
    v_emp = _get_empirical_codon_distn(patterns, weights, len(codons))
    print('empirical codon distribution:')
    print(v_emp)
    print('')

    # precompute some design matrices
    # (adj is computed as before but is not used below)
    adj = design.get_adjacency(codons)
    ts = design.get_nt_transitions(codons)
    tv = design.get_nt_transversions(codons)
    syn = design.get_syn(codons, aminos)
    nonsyn = design.get_nonsyn(codons, aminos)
    full_compo = design.get_full_compo(codons)

    # For all of the data in the alignment,
    # compute the grand total nucleotide counts at each of the three
    # nucleotide positions within a codon.
    # The full_compo ndarray has shape (ncodons, 3, 4)
    # whereas the nucleotide distribution ndarray has shape (3, 4).
    position_specific_nt_distns = np.tensordot(
            full_compo, v_emp, axes=(0, 0))

    # This is pre-computed if we want to use an empirical
    # stationary distribution, but it is postponed if we want to use
    # max likelihood parameters for the stationary distribution.
    stationary_distn = codon1994.get_f3x4_codon_distn(
            full_compo,
            position_specific_nt_distns,
            )

    # construct the args to the neg log likelihood functions
    likelihood_args_empirical = (
            patterns, weights,
            stationary_distn,
            ts, tv, syn, nonsyn,
            )
    likelihood_args_free = (
            patterns, weights,
            ts, tv, syn, nonsyn, full_compo,
            )

    # get the model A estimates using plain fmin
    model_A_opt = scipy.optimize.fmin(
            functools.partial(A.get_neg_ll, *likelihood_args_empirical),
            A.get_guess(),
            )
    print('optimal params for model A:')
    print(np.exp(model_A_opt))
    print('')

    # Build the initial guess for the free frequency model.
    # The nucleotide part is, for each codon position, the log of the
    # first three empirical frequencies relative to the fourth one.
    d = position_specific_nt_distns
    lastcol = d[:, -1]
    dratio = (d.T / lastcol).T
    log_nt_guess = np.log(dratio[:, :-1]).reshape(9)
    guess = np.hstack([model_A_opt, log_nt_guess])

    # define the objective function and the gradient and hessian
    # for the model with free nucleotide frequencies
    f = functools.partial(A_free.get_neg_ll, *likelihood_args_free)
    # g and h are not passed to the derivative free search below;
    # a search using them (scipy.optimize.fmin_ncg) was disabled
    # in the original code.
    g = functools.partial(eval_grad, f)
    h = functools.partial(eval_hess, f)

    # do the search
    results = scipy.optimize.fmin(f, guess)

    # report the initial model results
    print('model A with empirical frequencies:')
    print(model_A_opt)
    print(np.exp(model_A_opt))
    print('')

    # report a summary of the maximum likelihood search
    print('model A with free frequencies:')
    print(results)
    print(np.exp(results))
    print('')