def test_without_endgaps_basic(self):
    """Terminal gaps in the query alone determine the region bounds.

    The query has two leading and three trailing gap columns while the
    subject has none, so the expected region starts at column 2 and its
    end index (7) is one past the last non-gap query column.
    """
    query = ("q", "--ABCDE---")
    subject = ("s", "FGHIJKLMNO")
    pair = AlignedPair(query, subject)

    region = AlignedRegion.without_endgaps(pair)

    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 7)
def _get_indiv_probability(self, alignment):
    """Compute the incompatibility probability for a single alignment.

    Matches and mismatches are counted inside the aligned region (end
    gaps removed, ends trimmed).  Those counts, offset by
    ``self.prior_alpha`` and ``self.prior_beta``, parameterize the call
    to ``beta_binomial_cdf`` over the subject positions that lie outside
    the region.  The value returned by that call is treated as the
    probability that the pair is compatible, i.e. that the total
    mismatch count stays within ``1 - self.species_threshold`` of all
    positions.

    Args:
        alignment: Pairwise alignment; must provide ``subject_len`` and
            ``subject_id`` and be accepted by
            ``AlignedRegion.without_endgaps``.

    Returns:
        dict with the subject id, the incompatibility probability, and
        the intermediate counts used to derive it.
    """
    # Observed part of the alignment
    trimmed = AlignedRegion.without_endgaps(alignment).trim_ends()
    n_positions = trimmed.alignment_len
    n_matches = trimmed.count_matches()
    n_mismatches = n_positions - n_matches

    # Observed counts shifted by the configured prior
    alpha = n_mismatches + self.prior_alpha
    beta = n_matches + self.prior_beta

    # Subject positions not covered by the aligned region
    n_outside = alignment.subject_len - trimmed.subject_len
    n_total = n_positions + n_outside

    # Mismatch budget over the full length, and what is left of it once
    # the mismatches already seen in the region are subtracted
    allowed_mismatch_fraction = 1 - self.species_threshold
    mismatch_budget = int(math.floor(allowed_mismatch_fraction * n_total))
    remaining_budget = mismatch_budget - n_mismatches

    p_compatible = beta_binomial_cdf(
        remaining_budget, n_outside, alpha, beta)

    result = {
        "typestrain_id": alignment.subject_id,
        "probability_incompatible": 1 - p_compatible,
        "region_mismatches": n_mismatches,
        "region_positions": n_positions,
        "region_matches": n_matches,
        "nonregion_positions_in_subject": n_outside,
        "max_nonregion_mismatches": remaining_budget,
    }
    return result
def test_without_endgaps_hard(self):
    """Region bounds when both sequences have end gaps and internal gaps.

    The query opens with two gap columns and the subject closes with
    three, and both also contain gaps in between.  The expected region is
    still columns 2 through 6 (end index 7), so the internal gaps are
    expected not to move the bounds.
    """
    query = ("q", "--A-CDEFGH")
    #                |||||
    subject = ("s", "FG-IJ-L---")
    pair = AlignedPair(query, subject)

    region = AlignedRegion.without_endgaps(pair)

    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 7)
def unassign_threshold(self, min_id=0.975, soft_threshold=False):
    """Estimate the probability that the query is incompatible with the subject.

    This follows the beta-binomial approach of the constant mismatch
    rate method, except that the expected mismatch rate outside the
    aligned region is allowed to differ from the rate inside it.  The
    beta distribution is expressed through its mean ``mu`` and
    ``v = alpha + beta``.  ``v`` is held constant and ``mu`` is scaled
    by ``exp(gamma)``, where ``gamma`` is the average of
    ``log(mu2 / mu1)`` taken over the reference sequences.  Five zeros
    are always part of that average to stabilize it, so with no
    reference data ``mu2 == mu1`` (the constant-rate behavior).

    Args:
        min_id: Minimum fractional identity; ``1 - min_id`` is the
            mismatch fraction used as the threshold.
        soft_threshold: If true, ``soft_species_probability`` is passed
            as the threshold function; otherwise
            ``hard_species_probability`` is.

    Returns:
        dict with the subject id, the incompatibility probability and
        the intermediate quantities.  ``num_references`` is the length
        of the list being averaged and therefore includes the five
        stabilizing zeros.  ``max_nonregion_mismatches`` is reported
        only; it is not an input to the probability calculation.
    """
    def smoothed_rate(mismatches, positions):
        # mu = alpha / (alpha + beta), with alpha = mismatches + 0.5 and
        # beta = (positions - mismatches) + 0.5
        return (mismatches + 0.5) / (positions + 1)

    # Aligned region, with end gaps removed and ends trimmed
    region = AlignedRegion.without_endgaps(self.alignment)
    trimmed = region.trim_ends()
    n_positions = trimmed.alignment_len
    n_matches = trimmed.count_matches()
    n_mismatches = n_positions - n_matches
    n_subject_in_region = trimmed.subject_len

    # Beta parameters inside the aligned region, and their (mu, v) form
    alpha1 = n_mismatches + 0.5
    beta1 = n_matches + 0.5
    v1 = alpha1 + beta1
    mu1 = alpha1 / v1

    # Subject positions that fall outside the aligned region
    n_subject_outside = self.alignment.subject_len - n_subject_in_region
    n_total = n_positions + n_subject_outside

    # Mismatch counts (inside region, outside region) for the reference
    # sequences stored against this type strain
    typestrain_id = self.alignment.subject_id
    subject_start, subject_end = region.in_subject()
    reference_mismatches = self._get_mismatches(
        typestrain_id, subject_start, subject_end)

    # gamma = mean of log(mu2 / mu1); the five leading zeros pull the
    # estimate toward "no change in rate".
    # TODO: add weighting
    log_ratios = [0, 0, 0, 0, 0] + [
        math.log(
            smoothed_rate(outside_mms, n_subject_outside)
            / smoothed_rate(inside_mms, n_subject_in_region))
        for inside_mms, outside_mms in reference_mismatches
    ]
    gamma = numpy.mean(log_ratios)

    # log(mu2) = log(mu1) + gamma; v is unchanged, so
    # alpha2 = mu2 * v2 and beta2 = v2 - alpha2
    mu2 = math.exp(math.log(mu1) + gamma)
    v2 = v1
    alpha2 = mu2 * v2
    beta2 = v2 - alpha2

    # Largest number of mismatches permitted outside the observed region
    mismatch_fraction = 1 - min_id
    mismatch_budget = int(math.floor(mismatch_fraction * n_total))
    remaining_budget = mismatch_budget - n_mismatches

    threshold_fcn = (
        soft_species_probability if soft_threshold
        else hard_species_probability)
    p_compatible = threshold_assignment_probability(
        n_mismatches,
        n_positions,
        n_subject_outside,
        alpha2,
        beta2,
        100 * mismatch_fraction,
        threshold_fcn,
    )

    return {
        "typestrain_id": self.alignment.subject_id,
        "region_mismatches": n_mismatches,
        "region_positions": n_positions,
        "probability_incompatible": 1 - p_compatible,
        "mu1": mu1,
        "num_references": len(log_ratios),
        "mu2": mu2,
        "nonregion_positions_in_subject": n_subject_outside,
        "max_nonregion_mismatches": remaining_budget,
    }