def objective_and_gradient(scene, X):
    """Return the cost at X together with its gradient.

    The cost is the negative of the log likelihood reported by
    jsonctmctree for ``scene`` after the parameters packed in ``X``
    have been written into it.  ``scene`` is modified in place.

    The leading (non-edge-specific) entries of the gradient are
    forward finite-difference slopes of the cost.  The trailing entries
    are the negated per-edge derivatives returned by jsonctmctree; those
    derivatives are of the log likelihood with respect to the logs of
    the edge rates, and are negated because the cost is minimized
    whereas the log likelihood would be maximized.

    Returns a (cost, gradient) pair, with gradient as a numpy array.
    """
    step = 1e-8
    want_loglik = {'property' : 'snnlogl'}
    want_edge_derivs = {'property' : 'sdnderi'}

    def install_model(distn, kappa, tau):
        # Rebuild the process definition and root prior and store
        # both in the scene.
        process_defn, root_prior = get_process_defn_and_prior(
                distn, kappa, tau)
        scene['root_prior'] = root_prior
        scene['process_definitions'] = [process_defn]

    def query(requests):
        # Send the current scene with the given requests and return
        # the list of responses.
        reply = jsonctmctree.interface.process_json_in({
            'scene' : scene,
            'requests' : requests})
        return reply['responses']

    # Evaluate the log likelihood and the per-edge derivatives at X.
    distn, kappa, tau, rates = unpack(X)
    scene['tree']['edge_rate_scaling_factors'] = rates.tolist()
    install_model(distn, kappa, tau)
    log_likelihood, edge_gradient = query([want_loglik, want_edge_derivs])
    cost = -log_likelihood

    # Every entry of X that is not an edge parameter gets a
    # finite-difference slope.  Only the process definition and root
    # prior are refreshed for the perturbed point; the edge rates
    # already stored in the scene are left alone.
    edge_count = len(scene['tree']['row_nodes'])
    shared_count = len(X) - edge_count
    slopes = []
    for k in range(shared_count):
        nudged = np.copy(X)
        nudged[k] += step
        distn, kappa, tau, rates = unpack(nudged)
        install_model(distn, kappa, tau)
        nudged_cost = -query([want_loglik])[0]
        slopes.append((nudged_cost - cost) / step)

    # Flip the sign of the edge derivatives to match the cost.
    slopes.extend([-d for d in edge_gradient])
    return cost, np.array(slopes)
def objective(scene, X):
    """Return the negative log likelihood of ``scene`` at parameters X.

    The packed vector X is unpacked into the nucleotide distribution,
    kappa, tau and the edge rates.  These are written into ``scene``
    (which is modified in place) before jsonctmctree is asked for the
    log likelihood.
    """
    distn, kappa, tau, rates = unpack(X)

    # Push the current parameter values into the scene.
    scene['root_prior']['probabilities'] = distn.tolist()
    scene['tree']['edge_rate_scaling_factors'] = rates.tolist()

    # Transpose the (row state, column state, rate) triples into three
    # parallel sequences.
    from_states, to_states, rate_values = zip(
            *gen_transitions(distn, kappa, tau))
    scene['process_definitions'] = [{
        'row_states' : list(map(list, from_states)),
        'column_states' : list(map(list, to_states)),
        'transition_rates' : list(rate_values)}]

    # Request only the log likelihood and negate it to get the cost.
    query = {
        'scene' : scene,
        'requests' : [{'property' : 'snnlogl'}]}
    reply = jsonctmctree.interface.process_json_in(query)
    return -reply['responses'][0]
def main():
    """Fit the model to 'paralogs.fasta' with L-BFGS-B and print the result.

    The input file is read as pairs of lines: a header line whose last
    three characters name the paralog ('ecp' or 'edn') and whose
    characters between the first one and the paralog name give the
    species, followed by a sequence line over the alphabet ACGT.
    Reading stops at the first empty line or at end of file.

    The optimizer is driven by objective_and_gradient, which supplies
    both the cost and its gradient (jac=True).
    """
    name_to_node = {
        'tamarin' : 0,
        'macaque' : 1,
        'orangutan' : 2,
        'chimpanzee' : 3,
        'gorilla' : 4}
    paralog_to_variable = {
        'ecp' : 0,
        'edn' : 1}

    # Parallel lists: one entry per sequence in the file.
    nodes = []
    variables = []
    rows = []
    with open('paralogs.fasta') as fin:
        while True:
            line = fin.readline().strip().lower()
            if not line:
                break
            name = line[1:-3]
            paralog = line[-3:]
            seq = fin.readline().strip()
            # Encode each nucleotide as its index in 'ACGT'.
            row = ['ACGT'.index(x) for x in seq]
            nodes.append(name_to_node[name])
            variables.append(paralog_to_variable[paralog])
            rows.append(row)

    # Transpose so that each entry is one alignment column.
    columns = [list(x) for x in zip(*rows)]

    # Initial parameter values.
    distn = [0.25, 0.25, 0.25, 0.25]
    rates = [1, 1, 1, 1, 1, 1, 1, 1]
    kappa = 2.0
    tau = 3.0
    process_defn, root_prior = get_process_defn_and_prior(distn, kappa, tau)

    scene = {
        "node_count" : 9,
        "process_count" : 1,
        "state_space_shape" : [4, 4],
        "tree" : {
            "row_nodes" : [5, 5, 6, 6, 7, 7, 8, 8],
            "column_nodes" : [0, 6, 1, 7, 2, 8, 3, 4],
            "edge_rate_scaling_factors" : rates,
            "edge_processes" : [0, 0, 0, 0, 0, 0, 0, 0]
            },
        "root_prior" : root_prior,
        # Use the plural, list-valued key that objective_and_gradient
        # writes on every evaluation, so the initial scene has the same
        # layout and carries no stray singular key.
        "process_definitions" : [process_defn],
        "observed_data" : {
            "nodes" : nodes,
            "variables" : variables,
            "iid_observations" : columns
            }
        }

    X = pack(distn, kappa, tau, rates)
    f = functools.partial(objective_and_gradient, scene)
    result = minimize(f, X, jac=True, method='L-BFGS-B')

    print('final value of objective function:', result.fun)
    distn, kappa, tau, rates = unpack(result.x)
    print('nucleotide distribution:')
    for nt, p in zip('ACGT', distn):
        print(' ', nt, ':', p)
    print('kappa:', kappa)
    print('tau:', tau)
    print('edge rate scaling factors:')
    for r in rates:
        print(' ', r)
def main():
    """Fit the model to 'paralogs.fasta' without an analytic gradient.

    Sequences are read as header/sequence line pairs.  The last three
    characters of each header name the paralog, and the characters
    between the first one and the paralog name give the species.  The
    empirical nucleotide frequencies of the alignment are used as the
    starting nucleotide distribution, and L-BFGS-B minimizes the cost
    returned by objective.  Summary information and the fitted
    parameters are printed.
    """
    species_index = {
        'tamarin' : 0,
        'macaque' : 1,
        'orangutan' : 2,
        'chimpanzee' : 3,
        'gorilla' : 4}
    paralog_index = {
        'ecp' : 0,
        'edn' : 1}

    nodes, variables, encoded = [], [], []
    with open('paralogs.fasta') as fin:
        header = fin.readline().strip().lower()
        # An empty header (blank line or end of file) ends the input.
        while header:
            species = header[1:-3]
            paralog = header[-3:]
            sequence = fin.readline().strip()
            # Encode each nucleotide as its index in 'ACGT'.
            codes = ['ACGT'.index(nt) for nt in sequence]
            nodes.append(species_index[species])
            variables.append(paralog_index[paralog])
            encoded.append(codes)
            header = fin.readline().strip().lower()

    # One list per alignment column.
    columns = [list(site) for site in zip(*encoded)]
    print('number of sites in the alignment:', len(columns))
    print('number of sequences:', len(nodes))

    # Empirical nucleotide frequencies over the whole alignment.
    tally = np.zeros(4)
    for code in np.ravel(columns):
        tally[code] += 1
    distn = tally / tally.sum()

    rates = [1, 1, 1, 1, 1, 1, 1, 1]
    tree = {
        "row_nodes" : [5, 5, 6, 6, 7, 7, 8, 8],
        "column_nodes" : [0, 6, 1, 7, 2, 8, 3, 4],
        "edge_rate_scaling_factors" : rates,
        "edge_processes" : [0, 0, 0, 0, 0, 0, 0, 0]}
    root_prior = {
        "states" : [[0, 0], [1, 1], [2, 2], [3, 3]],
        "probabilities" : distn}
    observed_data = {
        "nodes" : nodes,
        "variables" : variables,
        "iid_observations" : columns}
    scene = {
        "node_count" : 9,
        "process_count" : 1,
        "state_space_shape" : [4, 4],
        "tree" : tree,
        "root_prior" : root_prior,
        "observed_data" : observed_data}

    # Start from kappa = 2.0 and tau = 3.0.
    start = pack(distn, 2.0, 3.0, rates)
    cost_fn = functools.partial(objective, scene)
    result = minimize(cost_fn, start, method='L-BFGS-B')

    print('final value of objective function:', result.fun)
    distn, kappa, tau, rates = unpack(result.x)
    print('nucleotide distribution:')
    for nt, prob in zip('ACGT', distn):
        print(' ', nt, ':', prob)
    print('kappa:', kappa)
    print('tau:', tau)
    print('edge rate scaling factors:')
    for rate in rates:
        print(' ', rate)
def custom_unpack(rate_expansion, X):
    """Unpack X and replace its rates with their expanded form.

    The distribution, kappa and tau come straight from unpack(X).  The
    rates are passed through hardcoded_rate_expand together with
    ``rate_expansion`` before being returned.
    """
    distn, kappa, tau, packed_rates = unpack(X)
    expanded_rates = hardcoded_rate_expand(rate_expansion, packed_rates)
    return distn, kappa, tau, expanded_rates