def _get_proposed_sample(self):
    """
    Draw one candidate sample without deciding whether to keep it.
    The sequences come from JC69.sample_sequences using the stored tree,
    name order and sequence length, and the matrix is computed from
    those sequences by JC69.get_ML_distance_matrix.
    @return: a proposed (sequence_list, distance_matrix) pair
    that may need to be rejected
    """
    proposal = JC69.sample_sequences(
            self.tree, self.ordered_names, self.sequence_length)
    return proposal, JC69.get_ML_distance_matrix(proposal)
def gen_samples_or_none(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs or None.
    The generator will stop if it sees that it cannot meet its goal
    in the allotted number of steps.
    The time between yielded results is bounded.
    When the sequence length is infinite no sequences are sampled:
    each yielded pair is (None, true distance matrix) and max_steps
    is not consulted.
    Otherwise None is yielded for every rejected proposal.
    @param count: the requested number of distance matrices
    or None for no bound
    @param max_steps: an upper bound on the number of steps allowed
    for the computation or None for no bound
    @raise DMSamplerError: the true distance matrix is degenerate
    or the step budget has been exceeded
    """
    # record the requested number of samples
    self.requested_matrix_count = count
    if self.sequence_length == float('inf'):
        # get the true distance matrix
        distance_matrix = self.tree.get_distance_matrix()
        # A degenerate true matrix can never be accepted, so fail at once.
        # The zero check is guarded by the flag itself, consistently
        # with the infinity check and with the sampling branch below.
        if self.reject_zero:
            if matrix_has_zero_off_diagonal(distance_matrix):
                error_message = 'the true distance matrix has a zero off-diagonal entry'
                raise DMSamplerError(error_message)
        if self.reject_inf:
            if matrix_has_inf_off_diagonal(distance_matrix):
                error_message = 'the true distance matrix has an infinite off-diagonal entry'
                raise DMSamplerError(error_message)
        # Yield copies of the true distance matrix;
        # a count of None means the stream is unbounded.
        nyielded = 0
        while count is None or nyielded < count:
            self.accepted_sample_count += 1
            nyielded += 1
            yield (None, distance_matrix)
    else:
        # do some rejection sampling
        while True:
            # if we are done sampling then return
            if count is not None:
                if self.accepted_sample_count >= count:
                    return
            # if we are taking too many computrons
            # then bail with an error message
            if max_steps is not None:
                if self.get_complexity() > max_steps:
                    raise DMSamplerError(self._get_error_message())
            # do the sampling
            sequence_list = JC69.sample_sequences(
                    self.tree, self.ordered_names, self.sequence_length)
            # get the estimated distance matrix
            distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
            # look for degeneracies
            if self.reject_zero and matrix_has_zero_off_diagonal(
                    distance_matrix):
                self.rejected_zero_sample_count += 1
                yield None
            elif self.reject_inf and matrix_has_inf_off_diagonal(
                    distance_matrix):
                self.rejected_inf_sample_count += 1
                yield None
            else:
                self.accepted_sample_count += 1
                yield sequence_list, distance_matrix
def gen_samples_or_none(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs or None.
    The generator will stop if it sees that it cannot meet its goal
    in the allotted number of steps.
    The time between yielded results is bounded.
    When the sequence length is infinite no sequences are sampled:
    each yielded pair is (None, true distance matrix) and max_steps
    is not consulted.
    Otherwise None is yielded for every rejected proposal.
    @param count: the requested number of distance matrices
    or None for no bound
    @param max_steps: an upper bound on the number of steps allowed
    for the computation or None for no bound
    @raise DMSamplerError: the true distance matrix is degenerate
    or the step budget has been exceeded
    """
    # record the requested number of samples
    self.requested_matrix_count = count
    if self.sequence_length == float('inf'):
        # get the true distance matrix
        distance_matrix = self.tree.get_distance_matrix()
        # A degenerate true matrix can never be accepted, so fail at once.
        # The zero check is guarded by the flag itself, consistently
        # with the infinity check and with the sampling branch below.
        if self.reject_zero:
            if matrix_has_zero_off_diagonal(distance_matrix):
                error_message = 'the true distance matrix has a zero off-diagonal entry'
                raise DMSamplerError(error_message)
        if self.reject_inf:
            if matrix_has_inf_off_diagonal(distance_matrix):
                error_message = 'the true distance matrix has an infinite off-diagonal entry'
                raise DMSamplerError(error_message)
        # Yield copies of the true distance matrix;
        # a count of None means the stream is unbounded.
        nyielded = 0
        while count is None or nyielded < count:
            self.accepted_sample_count += 1
            nyielded += 1
            yield (None, distance_matrix)
    else:
        # do some rejection sampling
        while True:
            # if we are done sampling then return
            if count is not None:
                if self.accepted_sample_count >= count:
                    return
            # if we are taking too many computrons
            # then bail with an error message
            if max_steps is not None:
                if self.get_complexity() > max_steps:
                    raise DMSamplerError(self._get_error_message())
            # do the sampling
            sequence_list = JC69.sample_sequences(
                    self.tree, self.ordered_names, self.sequence_length)
            # get the estimated distance matrix
            distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
            # look for degeneracies
            if self.reject_zero and matrix_has_zero_off_diagonal(
                    distance_matrix):
                self.rejected_zero_sample_count += 1
                yield None
            elif self.reject_inf and matrix_has_inf_off_diagonal(
                    distance_matrix):
                self.rejected_inf_sample_count += 1
                yield None
            else:
                self.accepted_sample_count += 1
                yield sequence_list, distance_matrix
def get_tikz_body(fs):
    """
    Build the TikZ commands that plot two identity slope information curves.
    One curve is built from (fs.a_mu, fs.a_N) and the other from
    (fs.b_mu, fs.b_N); each is drawn as a chain of Hermite bezier chunks
    between geometrically spaced knot times.
    @param fs: object providing plot_width, plot_height, t_max,
    a_mu, a_N, b_mu and b_N
    @return: the TikZ text with trailing whitespace removed
    """
    out = StringIO()
    width = fs.plot_width
    height = fs.plot_height
    t_max = fs.t_max
    # the two curves to compare
    f_a = JC69.IdentitySlopeInformation(fs.a_mu, fs.a_N)
    f_b = JC69.IdentitySlopeInformation(fs.b_mu, fs.b_N)
    # knot times halve repeatedly starting from the time scale
    times = [t_max * 2**-i for i in range(10)]
    # leave some headroom above the larger value at the earliest knot
    ymax = max(f_a(min(times)), f_b(min(times))) * 1.2
    plotscale = np.array((width / t_max, height / ymax))
    origin = (0, 0)
    # draw the boundary of the plot
    axis_args = (
            tikz.point_to_tikz(origin),
            'edge node[color=black,below]',
            '$t$',
            tikz.point_to_tikz((width, 0)))
    out.write(r'\draw[color=gray] %s %s {%s} %s;' % axis_args + '\n')
    out.write(r'\draw[color=gray] ' + get_segment(origin, (0, height)) + '\n')
    # draw the bezier curves hitting the right knots
    knots = sorted(times)
    for f in (f_a, f_b):
        bchunks = []
        for t0, t1 in iterutils.pairwise(knots):
            p0 = np.array((t0, f(t0)))
            p1 = np.array((t1, f(t1)))
            v0 = np.array((1, f.deriv(t0)))
            v1 = np.array((1, f.deriv(t1)))
            bchunks.append(bezier.create_bchunk_hermite(
                    t0, t1,
                    p0 * plotscale, p1 * plotscale,
                    v0 * plotscale, v1 * plotscale))
        out.write(r'\draw[color=gray] ' + get_tikz_bezier(bchunks) + '\n')
    # draw filled black dots at the origin and where each curve meets t=0
    dot_points = [origin, (0, f_a(0)), (0, f_b(0))]
    for p in dot_points:
        out.write('%s %s %s\n' % (
                r'\fill[color=black,inner sep=0pt]',
                tikz.point_to_tikz(np.array(p) * plotscale),
                'circle (1pt);'))
    # draw some text annotations
    pt_txt_pairs = [
            ((0, 0), '0'),
            ]
    for i, (pt, txt) in enumerate(pt_txt_pairs):
        node_args = ('ylabel%d' % i, tikz.point_to_tikz(pt), txt)
        out.write(r'\node[anchor=east] (%s) at %s {%s};' % node_args + '\n')
    return out.getvalue().rstrip()
def get_response_content(fs):
    """
    Sample sequences on a tree under JC69 and return them as fasta text.
    If fs.order lists any names they must match the tip names of the tree
    exactly as a set; otherwise the order in which tree.gen_tips() yields
    the tips is used.
    @param fs: object providing tree (newick text), order (text with
    one name per line) and length (passed to JC69.sample_sequences)
    @return: the fasta string of the sampled alignment plus a newline
    @raise HandlingError: the ordered names do not match the tip names
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the sequence order if it exists
    ordered_names = Util.get_stripped_lines(fs.order.splitlines())
    if ordered_names:
        observed_name_set = set(ordered_names)
        expected_name_set = set(node.get_name() for node in tree.gen_tips())
        extra_names = observed_name_set - expected_name_set
        missing_names = expected_name_set - observed_name_set
        if extra_names:
            msg_a = 'the list of ordered names includes these names '
            msg_b = 'not found in the tree: %s' % str(tuple(extra_names))
            raise HandlingError(msg_a + msg_b)
        if missing_names:
            msg_a = 'the tree includes these names not found in the list '
            msg_b = 'of ordered names: %s' % str(tuple(missing_names))
            raise HandlingError(msg_a + msg_b)
    else:
        # No explicit order was given, so fall back to the tip order.
        # The loop variable must be the tip whose name is read.
        ordered_names = [tip.get_name() for tip in tree.gen_tips()]
    # do the sampling
    sampled_sequences = JC69.sample_sequences(tree, ordered_names, fs.length)
    alignment = Fasta.create_alignment(ordered_names, sampled_sequences)
    # return the response
    return alignment.to_fasta_string() + '\n'
def get_zygosity_distribution(ref_length, child_length):
    """
    Compute the probabilities of the four genotype classes.
    This is based on the Jukes-Cantor model on a three taxon tree.
    Every joint assignment of the internal state and the two child
    states is enumerated; Felsenstein pruning would be more efficient
    but is not used here.
    @param ref_length: length of the reference taxon branch
    @param child_length: length of each child taxon branch
    @return: the distribution (RR, RA, AA, AB)
    @raise DGRPError: the four probabilities do not sum to one
    """
    p_ref_change = JC69.distance_to_probability(ref_length)
    p_child_change = JC69.distance_to_probability(child_length)
    nstates = 4
    # the reference taxon is fixed at state zero
    ref = 0
    # per-branch weights for keeping or changing the state
    ref_keep = 1.0 - p_ref_change
    ref_switch = p_ref_change / 3.0
    child_keep = 1.0 - p_child_change
    child_switch = p_child_change / 3.0
    # slots of the accumulator, in the order of the returned tuple
    RR, RA, AA, AB = range(4)
    weights = [0.0, 0.0, 0.0, 0.0]
    for hub in range(nstates):
        w_hub = ref_keep if hub == ref else ref_switch
        for c1 in range(nstates):
            w_c1 = w_hub * (child_keep if c1 == hub else child_switch)
            for c2 in range(nstates):
                w = w_c1 * (child_keep if c2 == hub else child_switch)
                # classify the joint child states relative to the reference
                if c1 == ref and c2 == ref:
                    slot = RR
                elif c1 == ref or c2 == ref:
                    slot = RA
                elif c1 == c2:
                    slot = AA
                else:
                    slot = AB
                weights[slot] += w
    v = tuple(weights)
    total = sum(v)
    if abs(total - 1) > 1e-7:
        raise DGRPError('probabilities do not sum to one')
    return v
def gen_distance_matrices(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs.
    Alignments are simulated on the stored tree under a Jukes-Cantor
    rate matrix, and a proposal is yielded only when its estimated
    distance matrix has no zero and no infinite off-diagonal entry.
    Rejected proposals are counted but nothing is yielded for them.
    The generator will stop if it sees that it cannot meet its goal
    in the allotted number of steps.
    @param count: the requested number of distance matrices
    @param max_steps: an upper bound on the allowed number of steps
    """
    # define the jukes cantor rate matrix
    states = list('ACGT')
    jc_dict = RateMatrix.get_jukes_cantor_rate_matrix()
    jc_rows = MatrixUtil.dict_to_row_major(jc_dict, states, states)
    model = RateMatrix.RateMatrix(jc_rows, states)
    # record the requested number of samples
    self.requested_matrix_count = count
    infinity = float('inf')
    # do some rejection sampling
    while True:
        # stop when the step budget is spent or enough samples are accepted
        if self.get_complexity() >= max_steps or (
                self.accepted_sample_count >= count):
            return
        # simulate an alignment from the tree
        alignment = PhyLikelihood.simulate_alignment(
                self.tree, model, self.sequence_length)
        # put the simulated sequences into the requested name order
        by_name = dict(zip(alignment.headers, alignment.sequences))
        sequence_list = [by_name[name] for name in self.ordered_names]
        # get the estimated distance matrix
        distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
        # gather the off-diagonal entries and look for degeneracies
        off_diagonal = [
                value
                for i, row in enumerate(distance_matrix)
                for j, value in enumerate(row)
                if i != j]
        if any(value == 0.0 for value in off_diagonal):
            self.rejected_zero_sample_count += 1
        elif any(value == infinity for value in off_diagonal):
            self.rejected_inf_sample_count += 1
        else:
            self.accepted_sample_count += 1
            yield sequence_list, distance_matrix
def gen_distance_matrices(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs.
    Each proposal is an alignment simulated on the stored tree under
    a Jukes-Cantor rate matrix; it is accepted and yielded only when
    its estimated distance matrix has neither a zero nor an infinite
    off-diagonal entry.  Rejections only update the rejection counters.
    The generator will stop if it sees that it cannot meet its goal
    in the allotted number of steps.
    @param count: the requested number of distance matrices
    @param max_steps: an upper bound on the allowed number of steps
    """
    # define the jukes cantor rate matrix
    nt_states = list('ACGT')
    rate_dict = RateMatrix.get_jukes_cantor_rate_matrix()
    model = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(rate_dict, nt_states, nt_states),
            nt_states)
    # record the requested number of samples
    self.requested_matrix_count = count
    # do some rejection sampling
    while True:
        # give up when out of steps, finish when enough was accepted
        if self.get_complexity() >= max_steps:
            return
        if self.accepted_sample_count >= count:
            return
        # simulate an alignment from the tree
        alignment = PhyLikelihood.simulate_alignment(
                self.tree, model, self.sequence_length)
        # reorder the simulated sequences by the requested names
        name_to_sequence = dict(zip(alignment.headers, alignment.sequences))
        sequence_list = []
        for name in self.ordered_names:
            sequence_list.append(name_to_sequence[name])
        # get the estimated distance matrix
        distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
        # scan the off-diagonal entries for degeneracies
        saw_zero = False
        saw_inf = False
        for i, row in enumerate(distance_matrix):
            for j, value in enumerate(row):
                if i == j:
                    continue
                saw_zero = saw_zero or value == 0.0
                saw_inf = saw_inf or value == float('inf')
        # a zero entry takes precedence over an infinite one
        if saw_zero:
            self.rejected_zero_sample_count += 1
            continue
        if saw_inf:
            self.rejected_inf_sample_count += 1
            continue
        self.accepted_sample_count += 1
        yield sequence_list, distance_matrix
def get_response_content(fs):
    """
    Return the JC69 maximum likelihood distance matrix of an alignment.
    @param fs: object providing fasta (the alignment text) and
    infinity (the value substituted for infinite distances)
    @return: the matrix formatted by MatrixUtil.m_to_string plus a newline
    @raise HandlingError: the alignment cannot be read
    or has fewer than two sequences
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # Create the distance matrix,
    # replacing infinite entries with the requested representation.
    infinite = float('inf')
    estimated = JC69.get_ML_distance_matrix(alignment.sequences)
    row_major_distance_matrix = [
            [fs.infinity if x == infinite else x for x in row]
            for row in estimated]
    # return the response
    return MatrixUtil.m_to_string(row_major_distance_matrix) + '\n'
def sample_distance_matrix(xtree_root, sequence_length):
    """
    Sample sequences and return their symmetric matrix of JC69 distances.
    All pairs are validated before any distance is computed.
    @param xtree_root: passed to JC69.sample_xtree_sequences
    @param sequence_length: the number of columns to sample
    @return: a square numpy array with a zero diagonal
    @raise ZeroDistanceError: some pair of sequences has no mismatch
    @raise InfiniteDistanceError: some pair of sequences mismatches
    in at least three quarters of sequence_length
    """
    sampled = JC69.sample_xtree_sequences(xtree_root, sequence_length)
    n = len(sampled)
    # first pass: count mismatches per pair and reject degenerate pairs
    mismatch_counts = np.zeros((n, n))
    for i, seq_i in enumerate(sampled):
        for j, seq_j in enumerate(sampled):
            if j <= i:
                continue
            ndiff = sum(1 for x, y in zip(seq_i, seq_j) if x != y)
            if ndiff == 0:
                raise ZeroDistanceError()
            if 4 * ndiff >= 3 * sequence_length:
                raise InfiniteDistanceError()
            mismatch_counts[i][j] = ndiff
    # second pass: turn mismatch proportions into distance estimates
    saturation = 0.75
    D = np.zeros_like(mismatch_counts)
    for i in range(n):
        for j in range(i + 1, n):
            proportion = mismatch_counts[i][j] / float(sequence_length)
            estimate = -saturation * math.log(1 - proportion / saturation)
            D[i][j] = estimate
            D[j][i] = estimate
    return D
def sample_distance_matrix(xtree_root, sequence_length):
    """
    Sample sequences and estimate all pairwise JC69 distances.
    Degenerate pairs are detected for every pair before the first
    distance is computed.
    @param xtree_root: passed to JC69.sample_xtree_sequences
    @param sequence_length: the number of columns to sample
    @return: a symmetric square numpy array with a zero diagonal
    @raise ZeroDistanceError: two sequences agree at every compared column
    @raise InfiniteDistanceError: two sequences differ in at least
    three quarters of sequence_length
    """
    seqs = JC69.sample_xtree_sequences(xtree_root, sequence_length)
    nseqs = len(seqs)
    # count the differing columns of each pair, checking as we go
    diff_counts = np.zeros((nseqs, nseqs))
    for i, left in enumerate(seqs):
        for j, right in enumerate(seqs):
            if not i < j:
                continue
            k = sum(1 for u, v in zip(left, right) if u != v)
            if not k:
                raise ZeroDistanceError()
            if k * 4 >= sequence_length * 3:
                raise InfiniteDistanceError()
            diff_counts[i][j] = k
    # convert each raw proportion into its distance estimate
    b = 0.75
    length = float(sequence_length)
    D = np.zeros_like(diff_counts)
    upper_pairs = [(i, j) for i in range(nseqs) for j in range(nseqs) if i < j]
    for i, j in upper_pairs:
        d_raw = diff_counts[i][j] / length
        d_mle = -b * math.log(1 - d_raw / b)
        D[i][j] = d_mle
        D[j][i] = d_mle
    return D
def get_response_content(fs):
    """
    Report the JC69 maximum likelihood distance between two sequences.
    @param fs: object providing fasta (the alignment text)
    @return: a one-line report of the distance estimate
    @raise HandlingError: the alignment cannot be read, does not have
    exactly two sequences, or loses columns when forced to nucleotides
    """
    # get the alignment object
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError("alignment error: " + str(e))
    # exactly two sequences are required
    if len(alignment.sequences) != 2:
        raise HandlingError("expected a pair of sequences")
    # A change in the column count after forcing nucleotides
    # means that the input was not gap-free and unambiguous.
    ncols_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError("nucleotide alignment error: " + str(e))
    if alignment.get_column_count() != ncols_before:
        raise HandlingError(
                "expected a gap-free unambiguous nucleotide alignment")
    # get the maximum likelihood estimate
    mle = JC69.get_ML_distance(*alignment.sequences)
    # return the response
    return "ML distance estimate: %f\n" % mle
def get_response_content(fs):
    """
    Estimate the JC69 distance between the two sequences of an alignment.
    @param fs: object providing fasta (the alignment text)
    @return: a one-line report of the maximum likelihood estimate
    @raise HandlingError: the alignment cannot be read, is not a pair,
    or is not a gap-free unambiguous nucleotide alignment
    """
    # get the alignment object
    fasta_lines = fs.fasta.splitlines()
    try:
        alignment = Fasta.Alignment(fasta_lines)
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # the alignment must be of exactly two sequences
    nsequences = len(alignment.sequences)
    if nsequences != 2:
        raise HandlingError('expected a pair of sequences')
    # Forcing nucleotides must not change the number of columns.
    old_ncols = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_ncols = alignment.get_column_count()
    if new_ncols != old_ncols:
        msg = 'expected a gap-free unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimate and report it
    estimate = JC69.get_ML_distance(*alignment.sequences)
    return 'ML distance estimate: %f\n' % estimate
def get_tikz_body(fs):
    """
    Build the TikZ commands that plot a fast and a slow information curve.
    The kind of curve is chosen by the first selector that is set among
    fs.info_identity_slope, fs.info_mi and fs.info_fi.  Each curve is
    drawn as a chain of Hermite bezier chunks between knot times.
    For identity slope curves with distinct rates, the time where the
    two curves intersect is added as a knot and marked with a dot.
    @param fs: object providing plot_width, plot_height, t_max,
    fast_mu, slow_mu and the three info selectors
    @return: the TikZ text with trailing whitespace removed
    @raise ValueError: none of the info selectors is set
    """
    out = StringIO()
    # define user variables
    plot_width = fs.plot_width
    plot_height = fs.plot_height
    timescale = fs.t_max
    fast_mu = fs.fast_mu
    slow_mu = fs.slow_mu
    # create the function objects
    if fs.info_identity_slope:
        f_fast = JC69.IdentitySlopeInformation(fast_mu)
        f_slow = JC69.IdentitySlopeInformation(slow_mu)
    elif fs.info_mi:
        f_fast = JC69.MutualInformation(fast_mu)
        f_slow = JC69.MutualInformation(slow_mu)
    elif fs.info_fi:
        # JC69.FisherInformationTheano was an alternative tried here.
        f_fast = JC69.FisherInformation(fast_mu)
        f_slow = JC69.FisherInformation(slow_mu)
    else:
        # Fail here with a clear message instead of hitting
        # an unbound local name further down.
        raise ValueError('no information type was selected')
    # Define some times for evaluation of the curve.
    times = [timescale * 2**-i for i in range(10)]
    # Compute the intersection time when there is one to compute;
    # equal rates would otherwise divide by zero.
    t_x = None
    if fs.info_identity_slope and fast_mu != slow_mu:
        t_x = math.log(fast_mu / slow_mu) / (fast_mu - slow_mu)
        times.extend([t_x / 2, t_x, (t_x + timescale) / 2])
    # define some more intermediate values
    ymax = max(f_fast(min(times)), f_slow(min(times))) * 1.2
    plotscale = np.array((plot_width / timescale, plot_height / ymax))
    origin = (0, 0)
    # draw the boundary of the plot
    out.write(r'\draw[color=gray] %s %s {%s} %s;' % (
            tikz.point_to_tikz(origin),
            'edge node[color=black,below]',
            '$t$',
            tikz.point_to_tikz((plot_width, 0))) + '\n')
    out.write(
            r'\draw[color=gray] ' +
            get_segment(origin, (0, plot_height)) + '\n')
    # draw the bezier curves hitting the right knots
    knots = sorted(times)
    for f in (f_slow, f_fast):
        bchunks = []
        for a, b in iterutils.pairwise(knots):
            pta = np.array((a, f(a)))
            ptb = np.array((b, f(b)))
            dta = np.array((1, f.deriv(a)))
            dtb = np.array((1, f.deriv(b)))
            bchunk = bezier.create_bchunk_hermite(
                    a, b,
                    pta * plotscale, ptb * plotscale,
                    dta * plotscale, dtb * plotscale)
            bchunks.append(bchunk)
        out.write(r'\draw[color=gray] ' + get_tikz_bezier(bchunks) + '\n')
    # draw filled black dots at some intersections
    dot_points = [origin]
    if not fs.info_fi:
        # the curves are not evaluated at time zero for fisher information
        dot_points.append((0, f_fast(0)))
        dot_points.append((0, f_slow(0)))
    if t_x is not None:
        dot_points.append((t_x, f_slow(t_x)))
    for p in dot_points:
        out.write('%s %s %s\n' % (
                r'\fill[color=black,inner sep=0pt]',
                tikz.point_to_tikz(np.array(p) * plotscale),
                'circle (1pt);'))
    # draw some text annotations
    pt_txt_pairs = [
            ((0, 0), '0'),
            ]
    for i, (pt, txt) in enumerate(pt_txt_pairs):
        out.write(r'\node[anchor=east] (%s) at %s {%s};' % (
                'ylabel%d' % i, tikz.point_to_tikz(pt), txt) + '\n')
    return out.getvalue().rstrip()