def get_response_content(fs):
    """
    Summarize a serialized mixture model as plain text.
    @param fs: object whose 'model' attribute is handed to
    DirectProtein.deserialize_mixture_model
    @return: a multi-line string with the expected rate followed by the
    nucleotide, amino acid, and codon stationary distributions
    @raise HandlingError: when deserialization raises ValueError
    """
    try:
        model = DirectProtein.deserialize_mixture_model(fs.model)
    except ValueError as e:
        raise HandlingError(e)
    # query the model in a fixed order before formatting anything
    rate = model.get_expected_rate()
    codon_probs = model.get_codon_stationary_distribution()
    aa_probs = model.get_aa_stationary_distribution()
    nt_probs = model.get_nt_stationary_distribution()
    sorted_codons = sorted(Codon.g_non_stop_codons)
    # each section is a heading followed by one 'label : value' row per state
    sections = (
        ('nucleotide distribution:', Codon.g_nt_letters, nt_probs),
        ('amino acid distribution:', Codon.g_aa_letters, aa_probs),
        ('codon distribution:', sorted_codons, codon_probs),
    )
    lines = ['expected codon substitution rate:', '%s' % (rate,)]
    for heading, labels, values in sections:
        # a blank line separates consecutive sections
        lines.append('')
        lines.append(heading)
        for label, value in zip(labels, values):
            lines.append('%s : %s' % (label, value))
    # every line, including the last one, is newline terminated
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the nucleotide distribution and amino acid energies computed by
    DirectProtein.get_nt_distribution_and_aa_energies.
    @param fs: object providing the 'nucleotides' and 'aminoacids' fields
    @return: a multi-line string listing the nucleotide stationary
    distribution followed by the amino acid energies
    """
    # parse the two user supplied distributions
    nt_weights = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_weights = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # put the weights into the canonical letter order
    mutation_distribution = [nt_weights[nt] for nt in nt_letters]
    aa_distribution = [aa_weights[aa] for aa in aa_letters]
    nt_distribution, aa_energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                mutation_distribution, aa_distribution))
    # build the report one line at a time
    lines = ['nucleotide stationary distribution:']
    lines.extend(
            '%s : %s' % (nt, value)
            for nt, value in zip(nt_letters, nt_distribution))
    lines.append('')
    lines.append('amino acid energies:')
    lines.extend(
            '%s : %s' % (aa, value)
            for aa, value in zip(aa_letters, aa_energies))
    return '\n'.join(lines) + '\n'
def get_form():
    """
    @return: the body of a form; a single multi-line 'model' field that is
    pre-filled with the sample xml string from DirectProtein
    """
    model_field = Form.MultiLine(
            'model', 'mixture model', DirectProtein.get_sample_xml_string())
    return [model_field]
def get_form():
    """
    @return: the body of a form with a newick tree field, a mixture model
    field, and an integer field for the number of codon columns
    """
    # parse the default tree and reformat it with a width argument of 60
    default_newick = '(((Human:0.1, Chimpanzee:0.2):0.8, Gorilla:0.3):0.7, Orangutan:0.4, Gibbon:0.5);'
    default_tree = Newick.parse(default_newick, Newick.NewickTree)
    narrow_newick = Newick.get_narrow_newick_string(default_tree, 60)
    # build the fields in the order they are returned
    tree_field = Form.MultiLine('tree', 'newick tree', narrow_newick)
    model_field = Form.MultiLine(
            'model', 'Direct Protein mixture model',
            DirectProtein.get_sample_xml_string().strip())
    ncols_field = Form.Integer(
            'ncols', 'sample this many codon columns',
            100, low=1, high=1000)
    return [tree_field, model_field, ncols_field]
def evaluate(self, X):
    """
    @param X: the three vars defining the mutation process nt distribution.
    @return: a pair (evaluation, aa_energies); the evaluation is a tuple of
    three log ratios which is the zero vector when the guess was right.
    @raise ValueError: if X does not have three entries or if the
    normalized weights do not sum to approximately one
    """
    if len(X) != 3:
        raise ValueError("incorrect number of parameters")
    # the first weight is pinned to one; the others are exp of the inputs
    weights = (1,) + tuple(math.exp(value) for value in X)
    candidate_dist = normalized(weights)
    if not almost_equals(sum(candidate_dist), 1.0):
        raise ValueError(
                "detected possibly invalid objective function in put: "
                + str(X))
    implied_dist, aa_energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                candidate_dist, self.aa_dist))
    # only the first three entries of each distribution are compared
    log_ratios = tuple(
            math.log(target / implied)
            for target, implied in zip(self.nt_dist[:3], implied_dist[:3]))
    return log_ratios, aa_energies
def __call__(self, X):
    """
    Record the guess and evaluate the objective function at it.
    The three input variables define the mutation process
    nucleotide distribution.
    @param X: a triple
    @return: a tuple of three log ratios;
    the zero vector when the guess was right
    @raise ValueError: if X does not have exactly three entries
    """
    # the guess is recorded before it is validated
    self.guesses.append(X)
    if len(X) != 3:
        raise ValueError('incorrect number of parameters')
    # the first weight is pinned to one; the others are exp of the inputs
    weights = (1,) + tuple(math.exp(value) for value in X)
    candidate_dist = normalized(weights)
    # the amino acid energies are computed as well but are not returned
    implied_dist, _aa_energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                candidate_dist, self.aa_dist))
    targets = self.nt_dist[:3]
    return tuple(
            math.log(target / implied)
            for target, implied in zip(targets, implied_dist[:3]))
def get_response_content(fs):
    """
    Show one view of a DirectProteinRateMatrix built from the form input.
    The first true flag among fs.srm, fs.urm, fs.cstat, fs.astat, fs.nstat
    and fs.sf selects what is written; if none is set only a newline
    is returned.
    @param fs: object providing 'nucleotides', 'aminoacids', 'kappa'
    and the selection flags
    @return: the selected text followed by an extra newline
    """
    # parse the mutation nucleotide distribution and the amino acid energies
    nt_weights = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_ordered)
    aa_energies = SnippetUtil.get_dictionary(
            fs.aminoacids, 'amino acid', 'energy', aa_ordered)
    nt_weight_list = [nt_weights[nt] for nt in nt_ordered]
    aa_energy_list = [aa_energies[aa] for aa in aa_ordered]
    rate_matrix = DirectProtein.DirectProteinRateMatrix(
            fs.kappa, nt_weight_list, aa_energy_list)
    # collect the things to show; each becomes one newline terminated line
    shown = []
    if fs.srm:
        # the scaled rate matrix is the rate matrix after normalization
        rate_matrix.normalize()
        shown.append(MatrixUtil.m_to_string(
            rate_matrix.get_row_major_rate_matrix()))
    elif fs.urm:
        # the unscaled rate matrix
        shown.append(MatrixUtil.m_to_string(
            rate_matrix.get_row_major_rate_matrix()))
    elif fs.cstat:
        # the codon stationary distribution
        codon_probs = rate_matrix.get_codon_distribution()
        for codon in codons_ordered:
            shown.append('%s : %s' % (codon, codon_probs[codon]))
    elif fs.astat:
        # the amino acid stationary distribution
        aa_probs = rate_matrix.get_aa_distribution()
        for aa in aa_ordered:
            shown.append('%s : %s' % (aa, aa_probs[aa]))
    elif fs.nstat:
        # the nucleotide stationary distribution
        nt_probs = rate_matrix.get_nt_distribution()
        for nt in nt_ordered:
            shown.append('%s : %s' % (nt, nt_probs[nt]))
    elif fs.sf:
        # the rate matrix scaling factor
        shown.append(rate_matrix.get_expected_rate())
    body = ''.join('%s\n' % (item,) for item in shown)
    return body + '\n'
def __call__(self, X):
    """
    Evaluate the objective function at a guess.
    The guess is appended to self.guesses before anything else happens.
    The three entries of the guess define the mutation process
    nucleotide distribution.
    @param X: a triple
    @return: a tuple of three log ratios;
    the zero vector when the guess was right
    @raise ValueError: if X does not have exactly three entries
    """
    self.guesses.append(X)
    if len(X) != 3:
        raise ValueError('incorrect number of parameters')
    # unnormalized weights; the leading weight is fixed at one
    exponentiated = [math.exp(value) for value in X]
    mutation_dist = normalized(tuple([1] + exponentiated))
    stationary_dist, _unused_energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                mutation_dist, self.aa_dist))
    # compare the first three target and implied probabilities
    log_ratios = []
    for target, implied in zip(self.nt_dist[:3], stationary_dist[:3]):
        log_ratios.append(math.log(target / implied))
    return tuple(log_ratios)
def evaluate(self, X):
    """
    @param X: the three vars defining the mutation process nt distribution.
    @return: a pair (evaluation, aa_energies); the evaluation is a tuple of
    three log ratios which is the zero vector when the guess was right.
    @raise ValueError: if X does not have three entries or if the
    normalized weights do not sum to approximately one
    """
    if len(X) != 3:
        raise ValueError('incorrect number of parameters')
    # unnormalized weights; the leading weight is fixed at one
    exponentiated = [math.exp(value) for value in X]
    mutation_dist = normalized(tuple([1] + exponentiated))
    total = sum(mutation_dist)
    if not almost_equals(total, 1.0):
        complaint = 'detected possibly invalid objective function in put: '
        raise ValueError(complaint + str(X))
    stationary_dist, aa_energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                mutation_dist, self.aa_dist))
    # compare the first three target and implied probabilities
    log_ratios = []
    for target, implied in zip(self.nt_dist[:3], stationary_dist[:3]):
        log_ratios.append(math.log(target / implied))
    return tuple(log_ratios), aa_energies
def get_response_content(fs):
    """
    Simulate a codon alignment on a tree under a Direct Protein mixture model.
    @param fs: object providing 'tree' (a newick string), 'model'
    (the serialized mixture model) and 'ncols' (passed to the simulator
    as the number of columns)
    @return: the fasta sequences of the tree tips, one per line,
    followed by a trailing newline
    @raise HandlingError: if the tree raises NewickSyntaxError,
    if deserializing the model raises ValueError,
    or if the simulation raises SimulationError
    """
    # get the newick tree
    try:
        tree = Newick.parse(fs.tree, Newick.NewickTree)
        tree.assert_valid()
    except Newick.NewickSyntaxError as e:
        raise HandlingError(e)
    # Get the Direct Protein mixture model.
    # A ValueError from the deserializer is reported as a HandlingError,
    # like the other input errors handled in this function.
    try:
        mixture_model = DirectProtein.deserialize_mixture_model(fs.model)
    except ValueError as e:
        raise HandlingError(e)
    mixture_model.normalize()
    # simulate the alignment
    try:
        alignment = PhyLikelihood.simulate_alignment(
                tree, mixture_model, fs.ncols)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # collect one fasta sequence per tip of the tree
    fasta_sequences = [
            alignment.get_fasta_sequence(tip.name)
            for tip in tree.gen_tips()]
    return '\n'.join(fasta_sequences) + '\n'
def get_response_content(fs):
    """
    Render the symmetric component of each rate matrix of a mixture model
    as an html page.
    @param fs: object whose 'model' attribute is handed to
    DirectProtein.deserialize_mixture_model
    @return: the html text, one newline terminated line per written item
    @raise HandlingError: when deserialization raises ValueError
    """
    try:
        mixture_model = DirectProtein.deserialize_mixture_model(fs.model)
    except ValueError as e:
        raise HandlingError(e)
    # Normalize the mixture model to have an expected rate of one
    # substitution per unit of branch length.
    mixture_model.normalize()
    # the html header
    chunks = [
        '<html>',
        '<head>',
        '<style type="text/css">td{font-size:x-small;}</style>',
        '</head>',
        '<body>',
    ]
    # one table per category; categories are numbered from one
    for category, rate_matrix in enumerate(mixture_model.rate_matrices, 1):
        stationary = rate_matrix.get_stationary_distribution()
        rates = rate_matrix.dictionary_rate_matrix
        weighted_codons = list(zip(codons, stationary))
        symmetrized = {}
        for source, p_source in weighted_codons:
            for sink, p_sink in weighted_codons:
                # rescale each rate by sqrt(p_source) / sqrt(p_sink)
                scale = math.sqrt(p_sink) / math.sqrt(p_source)
                symmetrized[(source, sink)] = rates[(source, sink)] / scale
        chunks.append('the symmetric component of the rate matrix')
        chunks.append('for category %d:' % category)
        chunks.append('<table>')
        chunks.append(
                RateMatrix.codon_rate_matrix_to_html_string(symmetrized))
        chunks.append('</table>')
        chunks.append('<br/><br/>')
    # the html footer
    chunks.append('</body>')
    chunks.append('</html>')
    return ''.join('%s\n' % (chunk,) for chunk in chunks)
def get_form():
    """
    @return: the body of a form; a single multi-line 'model' field that is
    pre-filled with the stripped sample xml string from DirectProtein
    """
    sample_xml = DirectProtein.get_sample_xml_string()
    model_field = Form.MultiLine('model', 'mixture model', sample_xml.strip())
    return [model_field]
def get_response_content(fs):
    """
    Estimate a codon stationary distribution from a nucleotide distribution
    and an amino acid distribution.
    If fs.corrected is set the estimate comes from a broyden2 search over
    mutation process nucleotide weights; otherwise if fs.hb is set the
    codon weights are computed directly from the two distributions.
    If neither is set both reported distributions are empty.
    @param fs: object providing 'nucleotides', 'aminoacids',
    'corrected' and 'hb'
    @return: a multi-line string listing the codon distribution followed
    by the implied nucleotide distribution
    """
    # parse the two user supplied distributions
    nt_weights = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_weights = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # put the weights into the canonical letter order
    target_nt_dist = [nt_weights[nt] for nt in nt_letters]
    target_aa_dist = [aa_weights[aa] for aa in aa_letters]
    codon_probs = []
    implied_nt_dist = []
    if fs.corrected:
        # search from the origin using twenty as the iteration argument
        objective = MyObjective(target_aa_dist, target_nt_dist)
        solution = scipy.optimize.nonlin.broyden2(objective, (0, 0, 0), 20)
        x, y, z = solution
        mutation_dist = normalized(
                (1, math.exp(x), math.exp(y), math.exp(z)))
        # Given the mutation distribution and the amino acid distribution,
        # get the stationary distribution.
        implied_nt_dist, aa_energies = (
                DirectProtein.get_nt_distribution_and_aa_energies(
                    mutation_dist, target_aa_dist))
        # kappa doesn't matter because we are only concerned
        # with stationary distributions
        rate_matrix = DirectProtein.DirectProteinRateMatrix(
                1.0, mutation_dist, aa_energies)
        codon_probs = rate_matrix.get_stationary_distribution()
    elif fs.hb:
        # Weight each codon by its amino acid weight times its share of
        # the nucleotide weight products among codons of that amino acid.
        raw_codon_weights = []
        for codon in codons:
            aa = Codon.g_codon_to_aa_letter[codon]
            siblings = Codon.g_aa_letter_to_codons[aa]
            aa_weight = aa_weights[aa]
            own_product = np.prod([nt_weights[nt] for nt in codon])
            sibling_total = sum(
                    np.prod([nt_weights[nt] for nt in sibling])
                    for sibling in siblings)
            raw_codon_weights.append(aa_weight * own_product / sibling_total)
        codon_probs = normalized(raw_codon_weights)
        # accumulate the codon probabilities onto their nucleotides
        nt_totals = dict(zip(nt_letters, [0] * 4))
        for codon, p in zip(codons, codon_probs):
            for nt in codon:
                nt_totals[nt] += p
        implied_nt_dist = normalized(nt_totals[nt] for nt in nt_letters)
    # build the report one line at a time
    lines = ['estimated codon stationary distribution:']
    for codon, p in zip(codons, codon_probs):
        lines.append('%s : %s' % (codon, p))
    lines.append('')
    lines.append('implied nucleotide stationary distribution:')
    for nt, p in zip(nt_letters, implied_nt_dist):
        lines.append('%s : %s' % (nt, p))
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report an estimated codon stationary distribution and the nucleotide
    distribution it implies.
    When fs.corrected is set, broyden2 is used to look for mutation process
    nucleotide weights and the codon distribution is read from a
    DirectProteinRateMatrix.
    Otherwise, when fs.hb is set, the codon weights are computed directly
    from the nucleotide and amino acid weights.
    When neither flag is set both reported distributions are empty.
    @param fs: object providing 'nucleotides', 'aminoacids',
    'corrected' and 'hb'
    @return: a multi-line string with the codon distribution, a blank line,
    and the implied nucleotide distribution
    """
    # read the nucleotide and amino acid weights from the form
    nt_table = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_table = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # lists in the order of the letter sequences
    observed_nt = [nt_table[nt] for nt in nt_letters]
    observed_aa = [aa_table[aa] for aa in aa_letters]
    codon_distribution = []
    implied_nt = []
    if fs.corrected:
        start = (0, 0, 0)
        niterations = 20
        objective = MyObjective(observed_aa, observed_nt)
        best = scipy.optimize.nonlin.broyden2(objective, start, niterations)
        x, y, z = best
        best_weights = (1, math.exp(x), math.exp(y), math.exp(z))
        best_mutation_dist = normalized(best_weights)
        # Given the mutation distribution and the amino acid distribution,
        # get the stationary distribution.
        implied_nt, energies = (
                DirectProtein.get_nt_distribution_and_aa_energies(
                    best_mutation_dist, observed_aa))
        # Any kappa would do here because only the stationary
        # distribution of the rate matrix is used.
        codon_distribution = DirectProtein.DirectProteinRateMatrix(
                1.0, best_mutation_dist, energies
                ).get_stationary_distribution()
    elif fs.hb:
        def nt_product(triple):
            # product of the nucleotide weights of the letters of a codon
            return np.prod([nt_table[nt] for nt in triple])
        unnormalized = []
        for codon in codons:
            aa = Codon.g_codon_to_aa_letter[codon]
            same_aa_codons = Codon.g_aa_letter_to_codons[aa]
            aa_weight = aa_table[aa]
            numerator = aa_weight * nt_product(codon)
            denominator = sum(nt_product(other) for other in same_aa_codons)
            unnormalized.append(numerator / denominator)
        codon_distribution = normalized(unnormalized)
        # each codon contributes its probability once per letter
        nt_mass = dict(zip(nt_letters, [0] * 4))
        for codon, p in zip(codons, codon_distribution):
            for nt in codon:
                nt_mass[nt] += p
        implied_nt = normalized(nt_mass[nt] for nt in nt_letters)
    # format the two distributions as 'label : value' rows
    codon_rows = [
            '%s : %s' % (codon, p)
            for codon, p in zip(codons, codon_distribution)]
    nt_rows = [
            '%s : %s' % (nt, p)
            for nt, p in zip(nt_letters, implied_nt)]
    lines = ['estimated codon stationary distribution:']
    lines.extend(codon_rows)
    lines.append('')
    lines.append('implied nucleotide stationary distribution:')
    lines.extend(nt_rows)
    return ''.join(line + '\n' for line in lines)
objective_function = MyCodonObjective(aa_distribution, observed_nt_stationary_distribution) initial_stationary_guess = halpern_bruno_nt_estimate(nt_to_probability, aa_to_probability) A, C, G, T = initial_stationary_guess initial_guess = (math.log(C / A), math.log(G / A), math.log(T / A)) iterations = 20 try: best = scipy.optimize.nonlin.broyden2(objective_function, initial_guess, iterations) except Exception, e: debugging_information = objective_function.get_history() raise HandlingError(str(e) + "\n" + debugging_information) x, y, z = best best_mutation_weights = (1, math.exp(x), math.exp(y), math.exp(z)) best_mutation_distribution = normalized(best_mutation_weights) # Given the mutation distribution and the amino acid distribution, # get the stationary distribution. result = DirectProtein.get_nt_distribution_and_aa_energies(best_mutation_distribution, aa_distribution) result_stationary_nt_dist, result_aa_energies = result # make a results string out = StringIO() # write the stationary nucleotide distribution of the mutation process print >> out, "mutation nucleotide stationary distribution:" for nt, probability in zip(nt_letters, best_mutation_distribution): print >> out, "%s : %s" % (nt, probability) # write the centered amino acid energies print >> out, "" print >> out, "amino acid energies:" for aa, energy in zip(aa_letters, result_aa_energies): print >> out, "%s : %s" % (aa, energy) # return the response return out.getvalue()
aa_to_probability) A, C, G, T = initial_stationary_guess initial_guess = (math.log(C/A), math.log(G/A), math.log(T/A)) iterations = 20 try: best = scipy.optimize.nonlin.broyden2(objective_function, initial_guess, iterations) except Exception, e: debugging_information = objective_function.get_history() raise HandlingError(str(e) + '\n' + debugging_information) x, y, z = best best_mutation_weights = (1, math.exp(x), math.exp(y), math.exp(z)) best_mutation_distribution = normalized(best_mutation_weights) # Given the mutation distribution and the amino acid distribution, # get the stationary distribution. result = DirectProtein.get_nt_distribution_and_aa_energies( best_mutation_distribution, aa_distribution) result_stationary_nt_dist, result_aa_energies = result # make a results string out = StringIO() # write the stationary nucleotide distribution of the mutation process print >> out, 'mutation nucleotide stationary distribution:' for nt, probability in zip(nt_letters, best_mutation_distribution): print >> out, '%s : %s' % (nt, probability) # write the centered amino acid energies print >> out, '' print >> out, 'amino acid energies:' for aa, energy in zip(aa_letters, result_aa_energies): print >> out, '%s : %s' % (aa, energy) # return the response return out.getvalue()