def test_unequal_log_likelihood(self):
    """Assert that the log likelihood function gives greater log likelihood to the more likely choice.

    Two rate matrices are built over a uniform nucleotide distribution,
    one with kappa 1.0 and one with kappa 0.0, and both are evaluated on
    the same sequence pair at the same first argument (1.0).  The test
    passes when the kappa 1.0 model scores strictly higher.
    """
    # define a sequence pair with many more transitions than transversions
    sequence_pair = ('AAAAAAAAAACCCCCCCCCC', 'ATGGGGGGGGCGTTTTTTTT')
    uniform_nt_dist = (0.25, 0.25, 0.25, 0.25)
    more_likely_model = F84.create_rate_matrix(1.0, uniform_nt_dist)
    less_likely_model = F84.create_rate_matrix(0.0, uniform_nt_dist)
    greater_log_likelihood = get_log_likelihood(
            1.0, sequence_pair, more_likely_model)
    smaller_log_likelihood = get_log_likelihood(
            1.0, sequence_pair, less_likely_model)
    # assertTrue replaces the deprecated failUnless alias; the message
    # exposes both values when the comparison fails.
    self.assertTrue(
            greater_log_likelihood > smaller_log_likelihood,
            'expected %r > %r' % (
                greater_log_likelihood, smaller_log_likelihood))
def get_response_content(fs):
    """Simulate a pair of nucleotide sequences and return them as FASTA text.

    Parameters:
        fs: object exposing the attributes A, C, G and T (nucleotide
            weights), kappa, distance and length.
            NOTE(review): presumably the parsed request fields; confirm
            against the caller.

    Returns:
        The string produced by Fasta.Alignment(...).to_fasta_string()
        for the simulated pair, followed by a newline.

    Raises:
        HandlingError: if the weights sum to zero, if the purine (A + G)
            or pyrimidine (C + T) frequency is not positive, or if kappa
            is not greater than max(-R, -Y).
    """
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # A zero total cannot be normalized; report it as a handled error
    # instead of letting the division below raise ZeroDivisionError.
    total_weight = float(sum(nt_weights))
    if total_weight == 0:
        raise HandlingError("the nucleotide weights must not sum to zero")
    # convert the nucleotide weights to probabilities
    nt_probs = [x / total_weight for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError("the frequency of a purine must be positive")
    if Y <= 0:
        raise HandlingError("the frequency of a pyrimidine must be positive")
    if fs.kappa <= max(-Y, -R):
        msg_a = "kappa must be greater than max(-R, -Y) "
        msg_b = "where R and Y are the purine and pyrimidine frequencies"
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
            fs.distance, model, fs.length)
    # convert the pair of sequences to an alignment object
    aln = StringIO()
    print >> aln, ">first"
    print >> aln, "".join(sequence_pair[0])
    print >> aln, ">second"
    print >> aln, "".join(sequence_pair[1])
    # round-trip through Fasta.Alignment so the output is its own rendering
    return Fasta.Alignment(StringIO(aln.getvalue())).to_fasta_string() + "\n"
def get_response_content(fs):
    """Simulate a pair of nucleotide sequences and return them as FASTA text.

    Parameters:
        fs: object exposing the attributes A, C, G and T (nucleotide
            weights), kappa, distance and length.
            NOTE(review): presumably the parsed request fields; confirm
            against the caller.

    Returns:
        The string produced by Fasta.Alignment(...).to_fasta_string()
        for the simulated pair, followed by a newline.

    Raises:
        HandlingError: if the weights sum to zero, if the purine (A + G)
            or pyrimidine (C + T) frequency is not positive, or if kappa
            is not greater than max(-R, -Y).
    """
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # Weights that sum to zero cannot be turned into probabilities; raise
    # a handled error here rather than a ZeroDivisionError further down.
    weight_sum = float(sum(nt_weights))
    if weight_sum == 0:
        raise HandlingError('the nucleotide weights must not sum to zero')
    # convert the nucleotide weights to probabilities
    nt_probs = [x / weight_sum for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        msg_a = 'kappa must be greater than max(-R, -Y) '
        msg_b = 'where R and Y are the purine and pyrimidine frequencies'
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
            fs.distance, model, fs.length)
    # convert the pair of sequences to an alignment object
    aln = StringIO()
    print >> aln, '>first'
    print >> aln, ''.join(sequence_pair[0])
    print >> aln, '>second'
    print >> aln, ''.join(sequence_pair[1])
    # round-trip through Fasta.Alignment so the output is its own rendering
    return Fasta.Alignment(StringIO(aln.getvalue())).to_fasta_string() + '\n'
def test_basic_log_likelihood(self):
    """Smoke test: the log likelihood of a simple pair evaluates without error."""
    rate_matrix = F84.create_rate_matrix(0.5, (0.25, 0.25, 0.25, 0.25))
    observed_pair = ('AAAA', 'ACGT')
    elapsed = 2.0
    # Only the absence of an exception matters here, so the returned
    # value is deliberately discarded.
    get_log_likelihood(elapsed, observed_pair, rate_matrix)
def get_response_content(fs):
    """Report numerically optimized F84 estimates for a pair of sequences.

    Parameters:
        fs: object whose fasta attribute holds the alignment text.

    Returns:
        A multi-line string listing the fitted distance, kappa, the four
        nucleotide frequencies and the log likelihood.

    Raises:
        HandlingError: if the alignment cannot be parsed, does not hold
            exactly two sequences, cannot be forced to nucleotides, or
            changes its column count when forced to nucleotides.
    """
    # parse the submitted FASTA text
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # exactly two sequences are required
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # the column count must be unchanged by force_nucleotide()
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # minimize the objective numerically, starting from its own guess
    objective = F84.Objective(alignment.sequences)
    start = list(objective.get_initial_parameters())
    fitted = scipy.optimize.fmin(objective, start, ftol=1e-10, disp=0)
    distance, kappa, c_param, g_param, t_param = fitted
    # turn the three free parameters back into four frequencies
    nt_distribution = F84.parameters_to_distribution(
            (c_param, g_param, t_param))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # write one labelled line per reported quantity
    report = (
            ('ML distance:', distance),
            ('ML kappa:', kappa),
            ('ML A frequency:', A),
            ('ML C frequency:', C),
            ('ML G frequency:', G),
            ('ML T frequency:', T),
            ('log likelihood:', log_likelihood),
            )
    out = StringIO()
    for label, value in report:
        print >> out, label, value
    return out.getvalue()
def get_response_content(fs):
    """Fit F84 parameters to a sequence pair with a numeric optimizer.

    Parameters:
        fs: object whose fasta attribute holds the alignment text.

    Returns:
        Text with one labelled line each for the fitted distance, kappa,
        the A, C, G and T frequencies, and the log likelihood.

    Raises:
        HandlingError: for an unparsable alignment, for anything other
            than two sequences, or for an alignment that fails or is
            altered by the nucleotide check.
    """
    # build the alignment from the FASTA text
    try:
        aln = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # only a pair of sequences is accepted
    if len(aln.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # compare the column count before and after force_nucleotide()
    width_before = aln.get_column_count()
    try:
        aln.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    width_after = aln.get_column_count()
    if width_before != width_after:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # run the optimizer from the objective's initial parameters
    objective = F84.Objective(aln.sequences)
    initial_guess = list(objective.get_initial_parameters())
    optimum = scipy.optimize.fmin(
            objective, initial_guess, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = optimum
    # recover the nucleotide distribution and score the fitted model
    frequencies = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = frequencies
    rate_matrix = F84.create_rate_matrix(kappa, frequencies)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, aln.sequences, rate_matrix)
    # assemble the response text
    response = StringIO()
    print >> response, 'ML distance:', distance
    print >> response, 'ML kappa:', kappa
    print >> response, 'ML A frequency:', A
    print >> response, 'ML C frequency:', C
    print >> response, 'ML G frequency:', G
    print >> response, 'ML T frequency:', T
    print >> response, 'log likelihood:', log_likelihood
    return response.getvalue()
def get_response_content(fs):
    """Report closed-form F84 estimates for a pair of aligned sequences.

    Parameters:
        fs: object whose fasta attribute holds the alignment text.

    Returns:
        A multi-line string listing the estimated distance, kappa, the
        four nucleotide frequencies and the log likelihood.

    Raises:
        HandlingError: if the alignment cannot be parsed, does not hold
            exactly two sequences, cannot be forced to nucleotides, or
            changes its column count when forced to nucleotides.
    """
    # parse the FASTA text line by line
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # exactly two sequences are required
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # the column count must be unchanged by force_nucleotide()
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # closed-form parameter estimates for the pair
    estimates = F84.get_closed_form_estimates(alignment.sequences)
    distance, kappa, A, C, G, T = estimates
    # log likelihood of the pair under the estimated model
    model = F84.create_rate_matrix(kappa, (A, C, G, T))
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # write one labelled line per reported quantity
    report = (
            ('distance:', distance),
            ('kappa:', kappa),
            ('A frequency:', A),
            ('C frequency:', C),
            ('G frequency:', G),
            ('T frequency:', T),
            ('log likelihood:', log_likelihood),
            )
    out = StringIO()
    for label, value in report:
        print >> out, label, value
    return out.getvalue()
def test_basic_simulation(self):
    """Assert that the sequence pair simulation does not give an error for simple input.

    Beyond running without error, the result must hold two sequences of
    n_sites states each, and every state must be one of A, C, G or T.
    """
    kappa = 0.5
    nt_dist = (0.25, 0.25, 0.25, 0.25)
    model = F84.create_rate_matrix(kappa, nt_dist)
    t = 2.0
    n_sites = 100
    result = simulate_sequence_pair(t, model, n_sites)
    self.assertEqual(len(result), 2)
    # compare against n_sites so the request and the check cannot drift
    self.assertEqual(len(result[0]), n_sites)
    self.assertEqual(len(result[1]), n_sites)
    result_state_set = set(result[0]+result[1])
    # assertTrue replaces the deprecated failUnless alias
    self.assertTrue(result_state_set <= set('ACGT'))
def test_basic_simulation(self):
    """Assert that the sequence pair simulation does not give an error for simple input.

    Beyond running without error, the result must hold two sequences of
    n_sites states each, and every state must be one of A, C, G or T.
    """
    kappa = 0.5
    nt_dist = (0.25, 0.25, 0.25, 0.25)
    model = F84.create_rate_matrix(kappa, nt_dist)
    t = 2.0
    n_sites = 100
    result = simulate_sequence_pair(t, model, n_sites)
    self.assertEqual(len(result), 2)
    # the expected length is the requested n_sites, not a repeated literal
    self.assertEqual(len(result[0]), n_sites)
    self.assertEqual(len(result[1]), n_sites)
    result_state_set = set(result[0] + result[1])
    # failUnless is a deprecated alias; assertTrue is its replacement
    self.assertTrue(result_state_set <= set('ACGT'))
def get_response_content(fs):
    """Estimate F84 parameters for a sequence pair in closed form.

    Parameters:
        fs: object whose fasta attribute holds the alignment text.

    Returns:
        Text with one labelled line each for the estimated distance,
        kappa, the A, C, G and T frequencies, and the log likelihood.

    Raises:
        HandlingError: for an unparsable alignment, for anything other
            than two sequences, or for an alignment that fails or is
            altered by the nucleotide check.
    """
    # build the alignment from the lines of the FASTA text
    try:
        aln = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # only a pair of sequences is accepted
    if len(aln.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # compare the column count before and after force_nucleotide()
    width_before = aln.get_column_count()
    try:
        aln.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    width_after = aln.get_column_count()
    if width_before != width_after:
        raise HandlingError(
                'expected a gapless unambiguous nucleotide alignment')
    # closed-form estimates computed from the two sequences
    pair = aln.sequences
    distance, kappa, A, C, G, T = F84.get_closed_form_estimates(pair)
    # score the pair under the model built from those estimates
    frequencies = (A, C, G, T)
    rate_matrix = F84.create_rate_matrix(kappa, frequencies)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, aln.sequences, rate_matrix)
    # assemble the response text
    response = StringIO()
    print >> response, 'distance:', distance
    print >> response, 'kappa:', kappa
    print >> response, 'A frequency:', A
    print >> response, 'C frequency:', C
    print >> response, 'G frequency:', G
    print >> response, 'T frequency:', T
    print >> response, 'log likelihood:', log_likelihood
    return response.getvalue()