def get_orthoplanar_intersection_times(self):
    """
    Get the sorted list of intersection times per axis.
    This is a geometric concept.
    """
    npoints = len(self.points)
    abstol = 1e-6
    time_seqs = []
    for axis in range(self.ndim):
        time_seq = []
        # check points for exact intersections
        for p, t in zip(self.points, self.times):
            if abs(p[axis]) < abstol:
                time_seq.append(t)
        # check line segments for intersections
        for i, j in iterutils.pairwise(range(npoints)):
            pa, pb = self.points[i], self.points[j]
            ta, tb = self.times[i], self.times[j]
            if abs(pa[axis]) > abstol and abs(pb[axis]) > abstol:
                if pa[axis] * pb[axis] < 0:
                    # linearly interpolate the time of the sign change
                    t_local = pa[axis] / (pa[axis] - pb[axis])
                    t_global = ta + t_local * (tb - ta)
                    time_seq.append(t_global)
        time_seqs.append(sorted(time_seq))
    return time_seqs
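# Every function in this section consumes consecutive overlapping pairs via
# iterutils.pairwise, which is not defined here.  A minimal sketch of the
# assumed behavior, following the classic itertools recipe (Python 2 idiom,
# matching the print >> syntax used elsewhere in this section):
import itertools

def pairwise(iterable):
    """s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    a, b = itertools.tee(iterable)
    next(b, None)
    return itertools.izip(a, b)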
def annotate_posteriors(self, stickiness, hidden_models):
    """
    @param stickiness: a nonnegative integer that defines the transition matrix
    @param hidden_models: a list of statistical models
    """
    # define the transition matrix
    nhidden = len(hidden_models)
    prandom = 0.01 ** stickiness
    transition_object = TransitionMatrix.UniformTransitionObject(
            prandom, nhidden)
    # define the HMM
    cache_size = 100000
    hmm = FastHMM.Model(transition_object, hidden_models, cache_size)
    # define the observations and distances
    observations = [tuple(sorted(coverage)) for coverage in self.nt_coverages]
    distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
    # get the posterior distribution for each observation
    dp_info = hmm.get_dp_info(observations, distances)
    distribution_list = hmm.scaled_posterior_durbin(dp_info)
    # store the MAP state and conditional MAP substate
    annotation_list = []
    for obs, distribution in zip(observations, distribution_list):
        map_p, map_index = max((p, i) for i, p in enumerate(distribution))
        map_subindex = hidden_models[map_index].get_maximum_posterior(obs)
        annotation_list.append((map_index, map_subindex))
    self.annotation_lists.append(annotation_list)
def get_table_string_and_scripts(fs):
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1.0
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        else:
            raise ValueError('expected a selection variance option')
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1/s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1/s) for i in range(nstates)]
        else:
            raise ValueError('expected a selection skewness option')
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    coeff = math.log(tau) / (1 - 1/tau)
                    Q[i, j] = Q_mut[i, j] * coeff
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points
    incr = (fs.t_high - fs.t_low) / (fs.ntimes - 1)
    times = [fs.t_low + i*incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    pairs = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    mi_sign_lists, time_stats = zip(*pairs)
    ncrossing_list = []
    # look at how the signs change over time for each selection sample
    for signs in zip(*mi_sign_lists):
        count = 0
        for sign_a, sign_b in iterutils.pairwise(signs):
            if sign_a != sign_b:
                count += 1
        ncrossing_list.append(count)
    # get the R scripts
    scripts = [
            get_r_band_script(nsels, time_stats),
            get_r_prop_script(nsels, time_stats),
            get_r_cross_script(ncrossing_list)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
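# A quick numeric sanity check of the Halpern-Bruno coefficient used above
# (this check is an addition, not part of the original code).  With
# d = sels[j] - sels[i], the coefficient simplifies to d / (exp(d) - 1),
# which approaches 1 as d approaches 0, so nearly neutral substitutions
# keep their bare mutation rate.  Note that d == 0 exactly would divide
# by zero; the continuous samplers above make that a measure-zero event.
import math

def hb_coeff(d):
    tau = math.exp(-d)
    return math.log(tau) / (1 - 1/tau)

assert abs(hb_coeff(1e-8) - 1.0) < 1e-6
assert abs(hb_coeff(2.0) - 2.0 / (math.exp(2.0) - 1)) < 1e-12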
def get_bezier_path(self):
    bchunks = []
    for i, (pa, pb) in enumerate(iterutils.pairwise(self.points)):
        b = bezier.create_bchunk_line_segment(pa, pb)
        b.start_time = float(i)
        b.stop_time = float(i + 1)
        bchunks.append(b)
    return pcurve.BezierPath(bchunks)
def get_tikz_body(fs):
    out = StringIO()
    # define user variables
    plot_width = fs.plot_width
    plot_height = fs.plot_height
    timescale = fs.t_max
    # create the function objects
    f_a = JC69.IdentitySlopeInformation(fs.a_mu, fs.a_N)
    f_b = JC69.IdentitySlopeInformation(fs.b_mu, fs.b_N)
    # define some times for evaluation of the curve
    times = [timescale * 2**-i for i in range(10)]
    # define some more intermediate values
    ymax = max(f_a(min(times)), f_b(min(times))) * 1.2
    plotscale = np.array((plot_width / timescale, plot_height / ymax))
    origin = (0, 0)
    # draw the boundary of the plot
    print >> out, r'\draw[color=gray] %s %s {%s} %s;' % (
            tikz.point_to_tikz(origin),
            'edge node[color=black,below]', '$t$',
            tikz.point_to_tikz((plot_width, 0)))
    print >> out, r'\draw[color=gray] ' + get_segment(
            origin, (0, plot_height))
    # draw the bezier curves hitting the right knots
    for f in (f_a, f_b):
        bchunks = []
        for a, b in iterutils.pairwise(sorted(times)):
            pta = np.array((a, f(a)))
            ptb = np.array((b, f(b)))
            dta = np.array((1, f.deriv(a)))
            dtb = np.array((1, f.deriv(b)))
            bchunk = bezier.create_bchunk_hermite(
                    a, b,
                    pta * plotscale, ptb * plotscale,
                    dta * plotscale, dtb * plotscale)
            bchunks.append(bchunk)
        print >> out, r'\draw[color=gray] ' + get_tikz_bezier(bchunks)
    # draw filled black dots at some intersections
    dot_points = [origin]
    dot_points.append((0, f_a(0)))
    dot_points.append((0, f_b(0)))
    for p in dot_points:
        print >> out, r'\fill[color=black,inner sep=0pt]',
        print >> out, tikz.point_to_tikz(np.array(p) * plotscale),
        print >> out, 'circle (1pt);'
    # draw some text annotations
    pt_txt_pairs = [
            ((0, 0), '0'),
            ]
    for i, (pt, txt) in enumerate(pt_txt_pairs):
        print >> out, r'\node[anchor=east] (%s) at %s {%s};' % (
                'ylabel%d' % i,
                tikz.point_to_tikz(pt),
                txt)
    # return the tikz code
    return out.getvalue().rstrip()
def test_clocklike(self):
    nleaves = 10
    R, B = sample(nleaves)
    paths_to_root = get_paths_to_root(R)
    ages = []
    for path in paths_to_root:
        # the age of a leaf is the sum of branch lengths on its path to the root
        age = sum(B[frozenset(p)] for p in iterutils.pairwise(path))
        ages.append(age)
    self.assertEqual(len(ages), nleaves)
    # a clocklike tree puts every leaf at the same age
    self.assertEqual(len(set(ages)), 1)
def get_bezier_path(self):
    bchunks = []
    npoints = len(self.points)
    for i, j in iterutils.pairwise(range(npoints)):
        pa, pb = self.points[i], self.points[j]
        ta, tb = self.times[i], self.times[j]
        b = bezier.create_bchunk_line_segment(pa, pb)
        b.start_time = ta
        b.stop_time = tb
        bchunks.append(b)
    return pcurve.BezierPath(bchunks)
def create_adjacency_matrix(affinity, nvertices):
    """
    @param affinity: affinity between adjacent vertices
    @param nvertices: the number of vertices in the graph
    @return: a numpy matrix
    """
    A = np.zeros((nvertices, nvertices))
    for i, j in iterutils.pairwise(range(nvertices)):
        A[i, j] = affinity
        A[j, i] = affinity
    return A
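# Usage sketch (an addition for illustration): the function builds the
# symmetric adjacency matrix of a path graph, so affinity 2.0 on three
# vertices places 2.0 on the first off-diagonals and zero elsewhere:
#
#   create_adjacency_matrix(2.0, 3)
#   # array([[ 0.,  2.,  0.],
#   #        [ 2.,  0.,  2.],
#   #        [ 0.,  2.,  0.]])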
def draw_curve(self):
    scale = np.array((self.plot_width / self.timescale, self.plot_height))
    times = self._get_knot_times()
    bchunks = []
    for a, b in iterutils.pairwise(times):
        pta = np.array((a, self.f(a)))
        ptb = np.array((b, self.f(b)))
        dta = np.array((1, self.f.deriv(a)))
        dtb = np.array((1, self.f.deriv(b)))
        bchunk = bezier.create_bchunk_hermite(
                a, b,
                pta * scale, ptb * scale,
                dta * scale, dtb * scale)
        bchunks.append(bchunk)
    return r'\draw ' + get_tikz_bezier(bchunks)
def get_expected_transitions_brute(prandom, nstates, nsteps):
    """
    This function is for transition matrices defined
    by their size and a single parameter.
    Use brute force to compute transition expectations.
    This function returns two values.
    The first value is the expected number of transitions
    when the endpoints are the same.
    The second value is the expected number of transitions
    when the endpoints are different.
    @param prandom: the probability of randomization at each step
    @param nstates: the number of states in the chain
    @param nsteps: one fewer than the length of the sequence
    @return: (expected_t_same, expected_t_different)
    """
    # handle corner cases
    if not nsteps:
        return 0.0, float('nan')
    if nsteps == 1:
        return 0.0, 1.0
    if not prandom:
        return 0.0, float('nan')
    # precalculate stuff
    p_notrans = prandom / nstates + (1 - prandom)
    p_particular_trans = prandom / nstates
    # initialize probabilities
    total_p_different = 0
    total_p_same = 0
    # initialize expectations
    e_same = 0
    e_different = 0
    # define expectations
    for sequence in itertools.product(range(nstates), repeat=nsteps+1):
        # Calculate the probability of the sequence
        # and the number of transitions.
        ntransitions = 0
        p = 1.0 / nstates
        for a, b in iterutils.pairwise(sequence):
            if a == b:
                p *= p_notrans
            else:
                p *= p_particular_trans
                ntransitions += 1
        # add to the expectation
        if sequence[0] == sequence[-1]:
            total_p_same += p
            e_same += p * ntransitions
        else:
            total_p_different += p
            e_different += p * ntransitions
    e_same /= total_p_same
    e_different /= total_p_different
    return e_same, e_different
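# Usage sketch (an addition): for a two-state chain the brute-force sums
# can be cross-checked by hand.  With prandom = 1.0 every step randomizes
# uniformly, so each of the nsteps steps is a transition with probability
# (nstates - 1) / nstates independently of the rest of the sequence.
#
#   e_same, e_different = get_expected_transitions_brute(1.0, 2, 4)
#   # both expectations come out to 4 * (1/2) = 2.0 here; for a binomial
#   # count with p = 1/2, conditioning on even or odd parity of the
#   # transition count (same or different endpoints) leaves the mean at 2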
def shatter(self, times):
    """
    Return a collection of BezierPath objects.
    The returned objects should be annotated
    with characteristic times corresponding to intersections.
    @param times: sorted filtered intersection times
    @return: a collection of BezierPath objects
    """
    # handle the edge case of no intersections
    if not times:
        self.characteristic_time = 0.5 * (
                self.get_start_time() + self.get_stop_time())
        return [self]
    # handle the edge case of a single intersection
    if len(times) == 1:
        self.characteristic_time = times[0]
        return [self]
    # Compute quiescence times.
    # TODO use weak spatially quiescent midpoints
    # instead of naive temporally quiescent midpoints
    quiescence_times = [0.5*(a+b) for a, b in iterutils.pairwise(times)]
    # Construct the bchunks sequences.
    # Use whole bchunks when possible,
    # but at quiescence times we might have to split the bchunks.
    remaining = deque(self.bchunks)
    groups = []
    g = []
    # repeatedly split the remaining sequence
    for q in quiescence_times:
        while True:
            b = remaining.popleft()
            if b.start_time <= q <= b.stop_time:
                ba, bb = b.split_global(q)
                g.append(ba)
                remaining.appendleft(bb)
                groups.append(g)
                g = []
                break
            else:
                g.append(b)
    g.extend(remaining)
    groups.append(g)
    # Create a piecewise bezier curve from each group,
    # and give each piecewise curve a characteristic time.
    piecewise_curves = []
    for t, group in zip(times, groups):
        curve = self.__class__(group)
        curve.characteristic_time = t
        piecewise_curves.append(curve)
    return piecewise_curves
def create_laplacian_matrix(nvertices):
    """
    @param nvertices: the number of vertices in the graph
    @return: a numpy matrix
    """
    affinity = nvertices * 2.0
    A = np.zeros((nvertices, nvertices), dtype=float)
    for i, j in iterutils.pairwise(range(nvertices)):
        A[i, j] = affinity
        A[j, i] = affinity
    L = Euclid.adjacency_to_laplacian(A)
    return L
def annotate_posteriors(self, T, hidden_models):
    """
    @param T: a matrix of transition probabilities among the hidden states
    @param hidden_models: a list of statistical models
    """
    # define the HMM
    hmm = MissingHMM.MissingHMM(T, hidden_models)
    # define the observations and distances
    observations = self.nt_coverages
    distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
    # do the annotation
    self.posterior_distributions = hmm.scaled_posterior_durbin(
            observations, distances)
def get_orthoplanar_intersection_times(self):
    """
    Get the intersection times for the plane orthogonal to each axis.
    Note that this function assumes interlacing roots.
    """
    root_seqs = [[]]
    for f in self.fps:
        root_seq = []
        # the roots of each function interlace the roots of the previous one,
        # so the previous roots bracket each new root for brentq
        for low, high in iterutils.pairwise(
                [self.t_initial] + root_seqs[-1] + [self.t_final]):
            root_seq.append(scipy.optimize.brentq(f, low, high))
        root_seqs.append(root_seq)
    return root_seqs[1:]
def get_transition_expectations_brute(self, initial_state, final_state,
        nsteps):
    """
    @return: a matrix of expected transition counts
    """
    T = self.transition_object.get_transition_probability
    # initialize the matrix of expected counts
    A = np.zeros((self.nstates, self.nstates))
    # compute the probability of observing the final state
    # conditional on the first state
    p_total = T(initial_state, final_state, nsteps)
    # iterate over all possible sequences of missing states
    for missing_sequence in itertools.product(
            range(self.nstates), repeat=nsteps-1):
        sequence = [initial_state] + list(missing_sequence) + [final_state]
        # get the probability of observing
        # this continuation of the initial state
        p = 1.0
        for a, b in iterutils.pairwise(sequence):
            p *= T(a, b)
        # add the weighted transitions of each type
        for a, b in iterutils.pairwise(sequence):
            A[a, b] += p
    # divide by the total probability so that the conditioning is correct
    A /= p_total
    return A
def get_patches(self, times):
    """
    The idea is to patch over the quiescent joints.
    This will erase the small imperfection caused by
    drawing two background-erased curves
    butted against each other or overlapping each other.
    The characteristic times of the returned bpaths
    should be equal to the quiescence time.
    The endpoints of each patch are at one third and two thirds
    of the way between the neighboring intersection times,
    bracketing the quiescence time.
    @param times: sorted filtered intersection times
    @return: a collection of BezierPath objects
    """
    # if no quiescence time exists then no patch is needed
    if len(times) < 2:
        return []
    # avoid numerical error at piecewise boundaries
    abstol = 1e-6
    # define the patch endtimes and characteristic times
    patch_triples = []
    for intersect_a, intersect_b in iterutils.pairwise(times):
        tq = 0.5 * (intersect_a + intersect_b)
        ta = (2.0 / 3.0) * intersect_a + (1.0 / 3.0) * intersect_b
        tb = (1.0 / 3.0) * intersect_a + (2.0 / 3.0) * intersect_b
        patch_triples.append((ta, tq, tb))
    # make the patches
    patches = []
    remaining = deque(self.bchunks)
    for ta, tq, tb in patch_triples:
        # chop until we are near time ta
        while remaining[0].start_time < ta - abstol:
            b = remaining.popleft()
            if ta < b.stop_time:
                ba, bb = b.split_global(ta)
                remaining.appendleft(bb)
        # eat until we are near time tb
        g = []
        while remaining[0].start_time < tb - abstol:
            b = remaining.popleft()
            if tb < b.stop_time:
                ba, bb = b.split_global(tb)
                g.append(ba)
                remaining.appendleft(bb)
            else:
                g.append(b)
        # add the patch
        patch = self.__class__(g)
        patch.characteristic_time = tq
        patches.append(patch)
    return patches
def annotate_posteriors(self, transition_object, hidden_models):
    """
    @param transition_object: has transition matrix information
    @param hidden_models: a list of statistical models
    """
    # define the HMM
    cache_size = 10000
    hmm = FastHMM.Model(transition_object, hidden_models, cache_size)
    # define the observations and distances
    observations = [tuple(sorted(coverage[:-1]))
            for coverage in self.nt_coverages]
    distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
    # do the annotation
    dp_info = hmm.get_dp_info(observations, distances)
    self.posterior_distributions = hmm.scaled_posterior_durbin(dp_info)
def evaluate(self, t_target):
    """
    This is slow.
    @param t_target: target time
    """
    if not self.times[0] <= t_target <= self.times[-1]:
        raise ValueError('out of range')
    npoints = len(self.points)
    for i, j in iterutils.pairwise(range(npoints)):
        pa, pb = self.points[i], self.points[j]
        ta, tb = self.times[i], self.times[j]
        if ta <= t_target <= tb:
            # linearly interpolate within the bracketing segment
            t_local = (t_target - ta) / (tb - ta)
            p = (1 - t_local) * pa + t_local * pb
            return p
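# The docstring above notes that the linear scan is slow.  A sketch of a
# faster lookup (an addition, assuming the knot times are sorted and
# strictly increasing): bisect finds the bracketing segment in O(log n)
# instead of O(n).
import bisect

def evaluate_by_bisection(times, points, t_target):
    # times: sorted knot times; points: conformant list of numpy vectors
    if not times[0] <= t_target <= times[-1]:
        raise ValueError('out of range')
    j = bisect.bisect_left(times, t_target)
    if j == 0:
        return points[0]
    i = j - 1
    ta, tb = times[i], times[j]
    t_local = (t_target - ta) / (tb - ta)
    return (1 - t_local) * points[i] + t_local * points[j]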
def get_segmentation(p, t0, t1):
    """
    A segmentation is a sequence of triples (left, right, sign).
    @param p: a sympy Poly
    @param t0: initial time
    @param t1: final time
    @return: a segmentation
    """
    roots = sorted(float(r) for r in sympy.roots(p))
    points = [t0] + roots + [t1]
    segmentation = []
    for left, right in iterutils.pairwise(points):
        # use true division so that integer endpoints do not truncate
        mid = (left + right) / 2.0
        sign = -1 if p.eval(mid) <= 0 else 1
        seg = (left, right, sign)
        segmentation.append(seg)
    return segmentation
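# Usage sketch (an addition): sympy.roots returns a {root: multiplicity}
# dict, so iterating over it yields the distinct roots.  For a polynomial
# with real rational roots like t**2 - 1, the signs alternate across them:
#
#   t = sympy.Symbol('t')
#   p = sympy.Poly(t**2 - 1, t)
#   get_segmentation(p, -2.0, 2.0)
#   # [(-2.0, -1.0, 1), (-1.0, 1.0, -1), (1.0, 2.0, 1)]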
def test_check_sign_lacing_true(self):
    id_to_adj = {
            1: [5],
            2: [5],
            3: [6],
            4: [6],
            5: [1, 2, 6],
            6: [3, 4, 5]}
    vs = [
            {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1},
            {1: -1, 2: -1, 3: 1, 4: 1, 5: -1, 6: 1},
            {1: -1, 2: 1, 3: 1, 4: -1, 5: 1, 6: 1},
            {1: -1, 2: -1, 3: 1, 4: -1, 5: 1, 6: 1}]
    for va, vb in iterutils.pairwise(vs):
        observed = check_sign_lacing(id_to_adj, va, vb)
        expected = True
        self.assertEqual(observed, expected)
def annotate_posteriors(self, stickiness, hidden_models):
    """
    @param stickiness: a nonnegative integer that defines the transition matrix
    @param hidden_models: a list of statistical models
    """
    # use unlimited cache sizes
    cache_limit = None
    # define the transition matrix
    nhidden = len(hidden_models)
    prandom = 0.1 ** stickiness
    transition_object = TransitionMatrix.UniformTransitionObject(
            prandom, nhidden, cache_limit)
    # define the HMM
    hmm = FastHMM.Model(transition_object, hidden_models, cache_limit)
    # define the observations and distances
    observations = [tuple(sorted(coverage)) for coverage in self.nt_coverages]
    distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
    # get the posterior distribution for each observation
    dp_info = hmm.get_dp_info(observations, distances)
    distribution_list = hmm.scaled_posterior_durbin(dp_info)
    # initialize the counts
    for model in hidden_models:
        self.expected_count_vectors.append(np.zeros(len(model.states)))
    # accumulate the counts
    for observation, distribution in zip(observations, distribution_list):
        for p in distribution:
            if math.isnan(p):
                raise ValueError('nan in distribution: %s' % distribution)
        vectors = [model.get_posterior_distribution(observation)
                for model in hidden_models]
        for v in vectors:
            for x in v:
                if math.isnan(x):
                    raise ValueError('nan in posterior mixture: %s' % v)
        normalized_vectors = [v*p for v, p in zip(vectors, distribution)]
        for i, v in enumerate(normalized_vectors):
            self.expected_count_vectors[i] += v
    # compute the log likelihood
    self.log_likelihood = hmm.get_log_likelihood(dp_info)
    # compute the expected number of hidden state transitions
    self.ntransitions_expected = hmm.scaled_ntransitions_expected(dp_info)
def get_bezier_path(fp, fv, t_initial, t_final, nchunks):
    """
    @param fp: a python function from t to position vector
    @param fv: a python function from t to velocity vector
    @param t_initial: initial time
    @param t_final: final time
    @param nchunks: use this many chunks in the piecewise approximation
    @return: a BezierPath
    """
    bchunks = []
    npoints = nchunks + 1
    duration = t_final - t_initial
    # use float division so that integer endpoints do not truncate
    incr = duration / float(nchunks)
    times = [t_initial + i*incr for i in range(npoints)]
    for ta, tb in iterutils.pairwise(times):
        b = bezier.create_bchunk_hermite(
                ta, tb, fp(ta), fp(tb), fv(ta), fv(tb))
        bchunks.append(b)
    return BezierPath(bchunks)
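# Usage sketch (an addition; assumes numpy and the bezier helpers shown
# in this section): approximate one turn of the unit circle by matching
# positions and velocities at the knots.
#
#   fp = lambda t: np.array((math.cos(t), math.sin(t)))
#   fv = lambda t: np.array((-math.sin(t), math.cos(t)))
#   path = get_bezier_path(fp, fv, 0.0, 2*math.pi, 8)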
def transition_expectations(self, observations, forward, backward):
    """
    @param observations: an observation source
    @param forward: a source of forward vectors
    @param backward: a source of backward vectors
    @return: a matrix of expected hidden state transition counts
    """
    nhidden = len(self.hidden_state_objects)
    # initialize the matrix of expected counts
    A = np.zeros((nhidden, nhidden))
    # get the expected counts for each transition
    dp_source = itertools.izip(observations, forward, backward)
    for old, new in iterutils.pairwise(dp_source):
        o_old, f_old, b_old = old
        o_new, f_new, b_new = new
        likelihoods = self.get_likelihoods(o_new)
        for i, j in itertools.product(range(nhidden), repeat=2):
            tprob = self.T.get_transition_probability(i, j)
            A[i, j] += f_old[i] * tprob * likelihoods[j] * b_new[j]
    return A
def get_linear_tikz_pane(shape, width, height, time_lists,
        t_initial, t_final, vgap, cut_radius):
    abstol = 1e-6
    duration = float(t_final - t_initial)
    arr = []
    for i in range(4):
        c = g_colors[i]
        xa = t_initial * (width / duration)
        xb = t_final * (width / duration)
        # draw the thin line of the correct color
        line = '\\draw[%s] %s -- %s;' % (
                c,
                tikz.point_to_tikz((xa, -i*vgap)),
                tikz.point_to_tikz((xb, -i*vgap)))
        arr.append(line)
        # draw the thick segments of the correct color
        if i:
            augmented_times = [t_initial] + time_lists[i-1] + [t_final]
            for ta, tb in iterutils.pairwise(augmented_times):
                t = (ta + tb) / 2.0
                xa = ta * (width / duration)
                xb = tb * (width / duration)
                value = shape.fps[i-1](t)
                if value > 0:
                    line = '\\draw[very thick,%s] %s -- %s;' % (
                            c,
                            tikz.point_to_tikz((xa, -i*vgap)),
                            tikz.point_to_tikz((xb, -i*vgap)))
                    arr.append(line)
        # draw the cuts in black ink
        if i < 3:
            times = time_lists[i]
            for t in times:
                x = t * (width / duration)
                line = '\\draw %s -- %s;' % (
                        tikz.point_to_tikz((x, cut_radius-i*vgap)),
                        tikz.point_to_tikz((x, -cut_radius-i*vgap)))
                arr.append(line)
    return '\n'.join(arr)
def annotate_posteriors(self, stickiness, hidden_models):
    """
    @param stickiness: a nonnegative integer that defines the transition matrix
    @param hidden_models: a list of statistical models
    """
    # define the transition matrix
    nhidden = len(hidden_models)
    prandom = 0.1 ** stickiness
    transition_object = TransitionMatrix.UniformTransitionObject(
            prandom, nhidden)
    # define the HMM
    cache_size = 10000
    hmm = FastHMM.Model(transition_object, hidden_models, cache_size)
    # define the observations and distances
    observations = [tuple(sorted(coverage[:-1]))
            for coverage in self.nt_coverages]
    distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
    # do the annotation
    dp_info = hmm.get_dp_info(observations, distances)
    distribution_list = hmm.scaled_posterior_durbin(dp_info)
    # store the annotation with its respective stickiness
    self.posterior_distribution_lists.append(distribution_list)
    self.stickinesses.append(stickiness)
def test_lfdi_approximation(self):
    """
    As N increases, the approximation should become closer.
    More precisely, as N becomes large,
    multiplying N by ten should add
    one decimal place of accuracy to the approximation.
    Here the accuracy of the approximation
    is taken to be the Frobenius norm of the error matrix.
    """
    lfdo = tree_string_to_LFDO(g_tree_string)
    lfdi = LFDO_to_LFDI(lfdo)
    # For these values of N,
    # the error for N should be more than 9 times the error for 10N.
    # When N is very large,
    # the error for N should approach 10 times the error for 10N.
    Ns = (10, 100, 1000, 10000)
    lfdns = [LFDO_to_LFDN(lfdo, N) for N in Ns]
    error_norms = [np.linalg.norm(lfdi.M - lfdn.M) for lfdn in lfdns]
    for ea, eb in iterutils.pairwise(error_norms):
        # ea should be more than nine times as bad as eb
        self.assertTrue(ea / eb > 9)
def get_orthoplanar_intersections(self):
    """
    Get the list of intersection points per axis.
    This is a geometric concept.
    """
    abstol = 1e-6
    point_seqs = []
    for axis in range(self.ndim):
        point_seq = []
        # check points for exact intersections
        for p in self.points:
            if abs(p[axis]) < abstol:
                point_seq.append(p)
        # check line segments for intersections
        for pa, pb in iterutils.pairwise(self.points):
            if abs(pa[axis]) > abstol and abs(pb[axis]) > abstol:
                if pa[axis] * pb[axis] < 0:
                    # linearly interpolate the point of the sign change
                    p = (pb[axis]*pa - pa[axis]*pb) / (pb[axis] - pa[axis])
                    point_seq.append(p)
        point_seqs.append(point_seq)
    return point_seqs
def get_response_content(fs):
    # get the combo info
    combo_triples = list(gen_combo_line_triples(fs.combo.splitlines()))
    names, lows, highs = zip(*combo_triples)
    ranges = zip(lows, highs)
    if lows[0] != 1:
        raise ValueError('expected the first lower bound to be 1')
    for (low, high), (nlow, nhigh) in iterutils.pairwise(ranges):
        if high + 1 != nlow:
            raise ValueError(
                    'expected the next lower bound '
                    'to be one more than the current upper bound')
    # get the phylip info
    headers, sequences = Phylip.decode(fs.phylip.splitlines())
    phylip_columns = zip(*sequences)
    counts = [len(set(col)) for col in phylip_columns]
    # validate the compatibility between the combo and phylip data
    if highs[-1] != len(phylip_columns):
        raise ValueError(
                'expected the last upper bound to be '
                'equal to the number of columns of the phylip alignment')
    # get the sum of counts in each combination group
    combo_counts = []
    for i, (low, high) in enumerate(ranges):
        combo_count = 0
        # note that low and high are 1-based and inclusive
        for j in range(low-1, high):
            combo_count += counts[j]
        combo_counts.append(combo_count)
    # write the new combo log
    out = StringIO()
    print >> out, 'Loci combined'
    print >> out
    k = 0
    for name, count in zip(names, combo_counts):
        low = k + 1
        high = k + count
        print >> out, '%s\t%d-%d' % (name, low, high)
        k += count
    return out.getvalue()
def brute_posterior_decoding(self, observations):
    """
    Get the distribution of hidden states at each position
    given the observed sequence.
    This is done inefficiently by summing over
    each possible hidden state sequence.
    @param observations: the sequence of observations
    @return: hidden state distributions at each position,
            and total probability
    """
    nhidden = len(self.hidden_state_objects)
    total_log_sums = []
    # precalculate the log likelihood
    # for each observation for each hidden state
    position_log_likelihoods = []
    for obs in observations:
        log_likelihoods = [state.get_log_likelihood(obs)
                for state in self.hidden_state_objects]
        position_log_likelihoods.append(log_likelihoods)
    # each hidden state at each position gets a list of log likelihoods
    total_accum = [[[] for i in range(nhidden)] for j in observations]
    # calculate the log likelihood for each hidden sequence
    for hidden_sequence in itertools.product(
            range(nhidden), repeat=len(observations)):
        accum = 0
        accum += math.log(self.initial_distribution[hidden_sequence[0]])
        for i, j in iterutils.pairwise(hidden_sequence):
            accum += math.log(self.transition_matrix[i, j])
        for index, log_likelihoods in zip(
                hidden_sequence, position_log_likelihoods):
            accum += log_likelihoods[index]
        # accumulate the log likelihood
        for i, hidden_state in enumerate(hidden_sequence):
            total_accum[i][hidden_state].append(accum)
        # add to the total probability
        total_log_sums.append(accum)
    # get the distribution at each position
    # (scipy.misc.logsumexp moved to scipy.special in later scipy releases)
    distributions = []
    for log_distribution_lists in total_accum:
        distribution = [scipy.misc.logsumexp(x)
                for x in log_distribution_lists]
        distribution = [d - max(distribution) for d in distribution]
        distribution = [math.exp(d) for d in distribution]
        distribution = [d / sum(distribution) for d in distribution]
        distributions.append(distribution)
    total_probability = math.exp(scipy.misc.logsumexp(total_log_sums))
    return distributions, total_probability
def parse_hky_output(lines):
    """
    @param lines: lines of output
    @return: a dictionary with keys 'kappa', 'A', 'C', 'G', 'T', and 'lnL'
    """
    d = {}
    lines = Util.get_stripped_lines(lines)
    for line in lines:
        # read kappa
        if line.startswith('kappa under HKY85'):
            arr = [x.strip() for x in line.split(':')]
            d['kappa'] = float(arr[1])
        # read the log likelihood
        if line.startswith('lnL('):
            arr = line.split()
            d['lnL'] = float(arr[-2])
    # read the frequency parameters from the line
    # that follows the header line
    for first, second in iterutils.pairwise(lines):
        if first.startswith('base frequency parameters'):
            bases = list('TCAG')
            frequencies = [float(x) for x in second.split()]
            d.update(zip(bases, frequencies))
    return d
def get_joint_log_likelihood(self, hidden_seq, observed_seq):
    """
    The two arguments are conformantly ordered.
    @param hidden_seq: a sequence of hidden state indices
    @param observed_seq: a conformant sequence of observation objects
    @return: the joint log likelihood of the hidden and observed sequences
    """
    # do validation
    if len(hidden_seq) != len(observed_seq):
        raise ValueError('expected conformant input sequences')
    # initialize the log likelihood
    log_accum = 0
    # add the contribution of the initial hidden state
    initial_hidden_state = hidden_seq[0]
    log_accum += math.log(self.initial_distribution[initial_hidden_state])
    # add the contribution of hidden state transitions
    for i, j in iterutils.pairwise(hidden_seq):
        log_accum += math.log(self.transition_matrix[i, j])
    # add the contribution of emissions
    for i, observation in zip(hidden_seq, observed_seq):
        log_accum += self.hidden_state_objects[i].get_log_likelihood(
                observation)
    # return the log likelihood
    return log_accum
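# For reference (an addition): the quantity accumulated above factors as
#   log P(h, o) = log pi(h[0])
#               + sum_k log T[h[k-1], h[k]]
#               + sum_k log e(h[k], o[k])
# where pi is the initial distribution, T the transition matrix,
# and e the per-state emission likelihood.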
def get_expectations_brute(self, initial_state, final_state, nsteps):
    """
    Get the number of times each state was expected to occur
    between the initial and final positions.
    @return: an expectation for each state
    """
    T = self.transition_object.get_transition_probability
    # initialize the vector of expected counts
    v = np.zeros(self.nstates)
    # compute the probability of observing the final state
    # conditional on the first state
    p_total = T(initial_state, final_state, nsteps)
    # iterate over all possible sequences of missing states
    for missing_sequence in itertools.product(
            range(self.nstates), repeat=nsteps-1):
        sequence = [initial_state] + list(missing_sequence) + [final_state]
        # get the probability of observing
        # this continuation of the initial state
        p = 1.0
        for a, b in iterutils.pairwise(sequence):
            p *= T(a, b)
        # add the weighted occupancies of the intermediate states
        for state in missing_sequence:
            v[state] += p
    # divide by the total probability so that the conditioning is correct
    v /= p_total
    return v