def testLikelihood(self):
    """
    Smoke test: run both estimation paths and expect no exception.
    The closed form estimates and the numerically optimized estimates
    are each plugged back into the pairwise log likelihood.
    """
    # a simple (but not completely degenerate) alignment
    sa = 'AAAACCCCGGGGTTAA'
    sb = 'GAAACCTCGGCGTAAA'
    pair = (sa, sb)
    # Path 1: the analytical formula, which is not necessarily the mle.
    closed_form = get_closed_form_estimates(pair)
    cf_distance, cf_kappa, cf_A, cf_C, cf_G, cf_T = closed_form
    cf_model = create_rate_matrix(cf_kappa, (cf_A, cf_C, cf_G, cf_T))
    # The value is discarded; the call only has to complete.
    PairLikelihood.get_log_likelihood(cf_distance, pair, cf_model)
    # Path 2: maximum likelihood estimates from a numeric optimizer.
    objective = Objective(pair)
    start = list(objective.get_initial_parameters())
    optimum = scipy.optimize.fmin(objective, start, ftol=1e-10, disp=0)
    opt_distance, opt_kappa, opt_wC, opt_wG, opt_wT = optimum
    opt_distribution = parameters_to_distribution((opt_wC, opt_wG, opt_wT))
    opt_model = create_rate_matrix(opt_kappa, opt_distribution)
    # Again only the absence of an error matters here.
    PairLikelihood.get_log_likelihood(opt_distance, pair, opt_model)
def get_response_content(fs):
    """
    Simulate a pair of nucleotide sequences and return them as FASTA text.
    @param fs: an object providing the nucleotide weights (A, C, G, T),
    kappa, distance, and length
    @return: the FASTA string of the simulated pair followed by a newline
    @raise HandlingError: when the weights cannot be normalized or when
    kappa is incompatible with the nucleotide probabilities
    """
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # Compute the normalizing constant once, and report weights that
    # sum to zero as a handling error instead of letting the division
    # below fail with an unhandled ZeroDivisionError.
    total_weight = float(sum(nt_weights))
    if not total_weight:
        raise HandlingError("the nucleotide weights must not sum to zero")
    # convert the nucleotide weights to probabilities
    nt_probs = [x / total_weight for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError("the frequency of a purine must be positive")
    if Y <= 0:
        raise HandlingError("the frequency of a pyrimidine must be positive")
    if fs.kappa <= max(-Y, -R):
        msg_a = "kappa must be greater than max(-R, -Y) "
        msg_b = "where R and Y are the purine and pyrimidine frequencies"
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
            fs.distance, model, fs.length)
    # Build the raw FASTA text directly; this produces the same text as
    # printing each line to a buffer, without relying on the
    # print statement.
    raw_lines = [
            ">first", "".join(sequence_pair[0]),
            ">second", "".join(sequence_pair[1])]
    raw_fasta = "\n".join(raw_lines) + "\n"
    # Round trip through the alignment object so that the response
    # uses the formatting of its to_fasta_string method.
    return Fasta.Alignment(StringIO(raw_fasta)).to_fasta_string() + "\n"
def get_response_content(fs):
    """
    Simulate a pair of nucleotide sequences and return them as FASTA text.
    @param fs: an object providing the nucleotide weights (A, C, G, T),
    kappa, distance, and length
    @return: the FASTA string of the simulated pair followed by a newline
    @raise HandlingError: when the weights cannot be normalized or when
    kappa is incompatible with the nucleotide probabilities
    """
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # A zero total cannot be normalized; raise the handling error here
    # rather than an unhandled ZeroDivisionError from the division below.
    # The total is also computed only once instead of once per weight.
    weight_sum = float(sum(nt_weights))
    if not weight_sum:
        raise HandlingError('the nucleotide weights must not sum to zero')
    # convert the nucleotide weights to probabilities
    nt_probs = [x / weight_sum for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        msg_a = 'kappa must be greater than max(-R, -Y) '
        msg_b = 'where R and Y are the purine and pyrimidine frequencies'
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
            fs.distance, model, fs.length)
    first, second = sequence_pair[0], sequence_pair[1]
    # Assemble the header and sequence lines as plain text; the result
    # is the same text that printing each line to a buffer would give.
    raw_fasta = '\n'.join(
            ['>first', ''.join(first), '>second', ''.join(second)]) + '\n'
    # Parse and re-serialize so that the alignment object decides
    # the final layout of the FASTA string.
    alignment = Fasta.Alignment(StringIO(raw_fasta))
    return alignment.to_fasta_string() + '\n'
def testLikelihood(self):
    """
    Assert that no errors occur during the analysis.
    Nothing is compared; both likelihood evaluations just have to finish.
    """
    # define a simple (but not completely degenerate) alignment
    sa = 'AAAACCCCGGGGTTAA'
    sb = 'GAAACCTCGGCGTAAA'
    aligned = (sa, sb)
    # estimates from an analytical formula (not necessarily the mle)
    est_distance, est_kappa, est_A, est_C, est_G, est_T = (
            get_closed_form_estimates(aligned))
    est_distribution = (est_A, est_C, est_G, est_T)
    est_rate_matrix = create_rate_matrix(est_kappa, est_distribution)
    # evaluated only to show that the call does not fail
    PairLikelihood.get_log_likelihood(
            est_distance, aligned, est_rate_matrix)
    # estimates from a numeric optimizer
    negative_log_likelihood = Objective(aligned)
    guess = list(negative_log_likelihood.get_initial_parameters())
    fitted = scipy.optimize.fmin(
            negative_log_likelihood, guess, ftol=1e-10, disp=0)
    fit_distance, fit_kappa, fit_wC, fit_wG, fit_wT = fitted
    fit_distribution = parameters_to_distribution(
            (fit_wC, fit_wG, fit_wT))
    fit_rate_matrix = create_rate_matrix(fit_kappa, fit_distribution)
    # evaluated only to show that the call does not fail
    PairLikelihood.get_log_likelihood(
            fit_distance, aligned, fit_rate_matrix)
def __call__(self, branch_length):
    """
    Evaluate the objective for a one dimensional minimizer.
    @param branch_length: the distance between the two aligned sequences
    @return: the negative log likelihood, or infinity when the branch
    length is negative or no log likelihood is reported
    """
    worst = float('inf')
    # a negative branch length is rejected without evaluating the model
    if branch_length < 0:
        return worst
    value = PairLikelihood.get_log_likelihood(
            branch_length, self.sequence_pair, self.rate_matrix)
    # None from the likelihood function is mapped to the worst score
    return worst if value is None else -value
def __call__(self, theta):
    """
    Evaluate the objective at one point of the parameter space.
    @param theta: the vector of estimated parameters, in the order
    (distance, kappa, wC, wG, wT)
    @return: the negative log likelihood to be minimized,
    or infinity when no log likelihood is reported
    """
    # unpack the parameters
    distance, kappa, wC, wG, wT = theta
    nt_distribution = parameters_to_distribution((wC, wG, wT))
    # make the rate matrix
    model = create_rate_matrix(kappa, nt_distribution)
    # get the likelihood
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, self.sequence_pair, model)
    # NOTE(review): this assumes get_log_likelihood reports None when it
    # cannot produce a value -- confirm against PairLikelihood.
    # Negating None would raise a TypeError in the middle of the
    # optimization, so score such a point as infinitely bad instead.
    if log_likelihood is None:
        return float('inf')
    return -log_likelihood
def __call__(self, theta):
    """
    Evaluate the objective at one point of the parameter space.
    @param theta: the vector of estimated parameters, in the order
    (distance, kappa, wC, wG, wT)
    @return: the negative log likelihood to be minimized,
    or infinity when no log likelihood is reported
    """
    # unpack the parameters
    distance, kappa, wC, wG, wT = theta
    nt_distribution = parameters_to_distribution((wC, wG, wT))
    # make the rate matrix
    model = create_rate_matrix(kappa, nt_distribution)
    # get the likelihood
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, self.sequence_pair, model)
    # NOTE(review): presumably get_log_likelihood can return None when
    # the likelihood is unavailable -- verify in PairLikelihood.
    # Returning infinity keeps the minimizer running where the unary
    # minus on None would otherwise raise a TypeError.
    if log_likelihood is None:
        return float('inf')
    return -log_likelihood
def get_response_content(fs):
    """
    Report numerically optimized estimates for a pair of sequences.
    @param fs: an object whose fasta attribute holds the alignment text
    @return: a multi-line string with one labeled estimate per line
    @raise HandlingError: when the alignment is rejected
    """
    # parse the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # exactly two sequences are required
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # The column count must survive force_nucleotide unchanged;
    # a change is reported as a gapped or ambiguous alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    columns_after = alignment.get_column_count()
    if columns_before != columns_after:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # minimize the objective with a numeric optimizer
    objective = F84.Objective(alignment.sequences)
    start = list(objective.get_initial_parameters())
    optimum = scipy.optimize.fmin(objective, start, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = optimum
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # each report line is a label, a space, the value, and a newline
    report = (
            ('ML distance:', distance),
            ('ML kappa:', kappa),
            ('ML A frequency:', A),
            ('ML C frequency:', C),
            ('ML G frequency:', G),
            ('ML T frequency:', T),
            ('log likelihood:', log_likelihood))
    return ''.join('%s %s\n' % (label, value) for label, value in report)
def get_response_content(fs):
    """
    Report numerically optimized estimates for a pair of sequences.
    @param fs: an object whose fasta attribute holds the alignment text
    @return: a multi-line string with one labeled estimate per line
    @raise HandlingError: when the alignment is rejected
    """
    # read the alignment from the submitted text
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    sequence_count = len(alignment.sequences)
    if sequence_count != 2:
        raise HandlingError('expected a pair of sequences')
    # Compare the column count on both sides of force_nucleotide;
    # a difference means the input was not accepted as given.
    initial_columns = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if initial_columns != alignment.get_column_count():
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimates from a numeric optimizer
    f = F84.Objective(alignment.sequences)
    initial_theta = list(f.get_initial_parameters())
    best_theta = scipy.optimize.fmin(f, initial_theta, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = best_theta
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # write one "label value" line per reported quantity
    labels = (
            'ML distance:', 'ML kappa:',
            'ML A frequency:', 'ML C frequency:',
            'ML G frequency:', 'ML T frequency:',
            'log likelihood:')
    values = (distance, kappa, A, C, G, T, log_likelihood)
    out = StringIO()
    for label, value in zip(labels, values):
        out.write('%s %s\n' % (label, value))
    return out.getvalue()
def get_response_content(fs):
    """
    Report closed form estimates for a pair of aligned sequences.
    @param fs: an object whose fasta attribute holds the alignment text
    @return: a multi-line string with one labeled estimate per line
    @raise HandlingError: when the alignment is rejected
    """
    # parse the alignment from its lines
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # exactly two sequences are required
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # The column count must survive force_nucleotide unchanged;
    # a change is reported as a gapped or ambiguous alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    columns_after = alignment.get_column_count()
    if columns_before != columns_after:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # closed form estimates of the distance, kappa, and frequencies
    sequence_pair = alignment.sequences
    estimates = F84.get_closed_form_estimates(sequence_pair)
    distance, kappa, A, C, G, T = estimates
    # log likelihood of the pair under the estimated model
    model = F84.create_rate_matrix(kappa, (A, C, G, T))
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # each report line is a label, a space, the value, and a newline
    report = (
            ('distance:', distance),
            ('kappa:', kappa),
            ('A frequency:', A),
            ('C frequency:', C),
            ('G frequency:', G),
            ('T frequency:', T),
            ('log likelihood:', log_likelihood))
    return ''.join('%s %s\n' % (label, value) for label, value in report)
def get_response_content(fs):
    """
    Report closed form estimates for a pair of aligned sequences.
    @param fs: an object whose fasta attribute holds the alignment text
    @return: a multi-line string with one labeled estimate per line
    @raise HandlingError: when the alignment is rejected
    """
    # read the alignment from the lines of the submitted text
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    sequence_count = len(alignment.sequences)
    if sequence_count != 2:
        raise HandlingError('expected a pair of sequences')
    # Compare the column count on both sides of force_nucleotide;
    # a difference means the input was not accepted as given.
    initial_columns = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if initial_columns != alignment.get_column_count():
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the estimates from the closed form formula
    sequence_pair = alignment.sequences
    distance, kappa, A, C, G, T = F84.get_closed_form_estimates(
            sequence_pair)
    # get the log likelihood under the estimated rate matrix
    nt_distribution = (A, C, G, T)
    rate_matrix_object = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, rate_matrix_object)
    # write one "label value" line per reported quantity
    labels = (
            'distance:', 'kappa:',
            'A frequency:', 'C frequency:',
            'G frequency:', 'T frequency:',
            'log likelihood:')
    values = (distance, kappa, A, C, G, T, log_likelihood)
    out = StringIO()
    for label, value in zip(labels, values):
        out.write('%s %s\n' % (label, value))
    return out.getvalue()