def expectedOutcomes(self, search_results, expected_outcomes):
    """Assert that results and expectations agree once consolidated.

    Both ``search_results`` and ``expected_outcomes`` are split into groups
    with ``group_matches()`` and reduced to one match per group with
    ``get_best_match_in_group()``; the two reduced lists are then compared
    with ``assertEqual``.
    """
    def best_per_group(matches):
        # One representative match for every group found in `matches`.
        return [
            get_best_match_in_group(match_group)
            for match_group in group_matches(matches)
        ]

    actual = best_per_group(search_results)
    wanted = best_per_group(expected_outcomes)
    return self.assertEqual(actual, wanted)
def expectedOutcomes(self, search_results, expected_outcomes,
                     *args, **kwargs):
    """Compare the per-group best matches of results and expectations.

    Each input list is grouped via ``group_matches()`` and every group is
    reduced with ``get_best_match_in_group()``.  Extra positional and
    keyword arguments are forwarded untouched to ``assertEqual``.
    """
    reduced_results = list(
        map(get_best_match_in_group, group_matches(search_results))
    )
    reduced_expected = list(
        map(get_best_match_in_group, group_matches(expected_outcomes))
    )
    return self.assertEqual(
        reduced_results, reduced_expected, *args, **kwargs
    )
def trim_primer(
    seq_record: SeqIO.SeqRecord,
    primer_seqs: List[str],
    max_mismatch: Union[float, int] = 0.14,
) -> SeqIO.SeqRecord:
    """
    Trim primer sequences.

    The forward primer and the reverse complement of the reverse primer are
    searched for in the record's sequence with ``find_near_matches``.  The
    record is sliced from the end of the forward match to the start of the
    reverse-complement match; a side whose primer is not found is left
    untrimmed.

    Parameters
    ----------
    seq_record : Bio.Seq.SeqRecord
        input sequence record
    primer_seqs : list
        list of the forward and reverse primer sequences
    max_mismatch : float
        Maximum number (or proportion) of mismatches allowed when searching
        for the primer sequences (default: 0.14).  Values greater than 1 are
        used directly as the distance limit; values in (0, 1] are multiplied
        by each primer's length and rounded.

    Returns
    -------
    Bio.Seq.SeqRecord
        The slice of ``seq_record`` between the matched primers.

    Raises
    ------
    ValueError
        If ``max_mismatch`` is not greater than zero.
    """
    target = seq_record.seq
    fwd_primer, rev_primer = primer_seqs
    rev_primer_rc = revc(rev_primer)
    fwd_len, rev_len = len(fwd_primer), len(rev_primer)

    # NOTE(review): a value of exactly 1 falls in the proportion branch
    # (i.e. 100% of the primer length) -- confirm this is intended.
    if max_mismatch > 1:
        fwd_limit = rev_limit = max_mismatch
    elif max_mismatch > 0:
        fwd_limit = round(fwd_len * max_mismatch)
        rev_limit = round(rev_len * max_mismatch)
    else:
        raise ValueError("max_mismatch must be a positive value")

    fwd_hits = find_near_matches(fwd_primer, str(target), max_l_dist=fwd_limit)
    rev_hits = find_near_matches(
        rev_primer_rc, str(target), max_l_dist=rev_limit
    )

    # A bound of None leaves that side of the record untrimmed.
    trim_from = None
    trim_to = None
    if fwd_hits:
        trim_from = get_best_match_in_group(fwd_hits).end
    if rev_hits:
        trim_to = get_best_match_in_group(rev_hits).start

    return seq_record[trim_from:trim_to]
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    Dispatches to one of three search strategies depending on the
    parameters:

    * ``max_l_dist == 0``: exact search via ``search_exact()``;
    * ``len(subsequence) // (max_l_dist + 1) >= 3``: the n-gram based search;
    * otherwise: the linear-programming search, whose raw matches are
      grouped and reduced to the best match of each group, then sorted.

    Returns a list of Match objects describing the matching parts of the
    sequence.

    Raises ValueError if the subsequence is empty or max_l_dist is negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    pattern_len = len(subsequence)

    # Only exact matches are possible: no fuzzy machinery needed.
    if max_l_dist == 0:
        exact_matches = []
        for start_index in search_exact(subsequence, sequence):
            exact_matches.append(
                Match(start_index, start_index + pattern_len, 0)
            )
        return exact_matches

    if pattern_len // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist
        )

    raw_matches = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist
    )
    return sorted(
        get_best_match_in_group(match_group)
        for match_group in group_matches(raw_matches)
    )
def expectedOutcomes(self, search_result, expected_outcomes,
                     *args, **kwargs):
    """Assert that ``search_result`` is already consolidated.

    The search result is grouped with ``group_matches()`` and reduced to the
    best match of each group; the original result must equal that reduced
    list.  Extra arguments are forwarded to ``assertEqual``.

    NOTE(review): ``expected_outcomes`` is accepted but never read here --
    presumably deliberate for this test variant; confirm.
    """
    consolidated = []
    for match_group in group_matches(search_result):
        consolidated.append(get_best_match_in_group(match_group))
    self.assertEqual(search_result, consolidated, *args, **kwargs)
def search(self, subsequence, sequence, max_subs):
    """Search with ``c_fnm_generic_ngrams`` and keep one match per group.

    The search parameters are built as
    ``LevenshteinSearchParams(max_subs, 0, 0, max_subs)``.  The raw matches
    are grouped with ``group_matches()`` and each group is reduced with
    ``get_best_match_in_group()``.
    """
    params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
    raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
    best_matches = []
    for match_group in group_matches(raw_matches):
        best_matches.append(get_best_match_in_group(match_group))
    return best_matches
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """Find substitution-only near-matches, preferring the bytes-like path.

    When neither argument is a ``text_type`` instance, the bytes-like
    implementation ``_subs_only_fnm_ngram_byteslike`` is tried first.  Its
    result indexes are turned into ``Match`` objects (with the distance
    recomputed via ``count_differences_with_maximum``), grouped, and reduced
    to the best match of each group.

    If either argument is text, or the bytes-like call raises ``TypeError``,
    the work is delegated to ``py_find_near_matches_substitutions_ngrams``.
    """
    neither_is_text = (
        not isinstance(subsequence, text_type)
        and not isinstance(sequence, text_type)
    )
    if neither_is_text:
        try:
            start_indexes = _subs_only_fnm_ngram_byteslike(
                subsequence, sequence, max_substitutions)
        except TypeError:
            # Inputs unsupported by the bytes-like implementation: fall
            # through to the generic implementation below.
            pass
        else:
            pattern_len = len(subsequence)
            candidates = []
            for start in start_indexes:
                stop = start + pattern_len
                n_diffs = count_differences_with_maximum(
                    sequence[start:stop],
                    subsequence,
                    max_substitutions + 1,
                )
                candidates.append(Match(start, stop, n_diffs))
            return [
                get_best_match_in_group(match_group)
                for match_group in group_matches(candidates)
            ]

    return py_find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
def search(self, subsequence, sequence, max_l_dist):
    """Search with ``fnm_generic_lp`` and keep the best match per group.

    ``max_l_dist`` is used for all four ``LevenshteinSearchParams``
    arguments.
    """
    params = LevenshteinSearchParams(
        max_l_dist, max_l_dist, max_l_dist, max_l_dist
    )
    raw_matches = fnm_generic_lp(subsequence, sequence, params)
    return list(map(get_best_match_in_group, group_matches(raw_matches)))
def search(self, subsequence, sequence, max_l_dist):
    """Search with ``c_fnm_generic_lp`` and keep the best match per group.

    All four ``LevenshteinSearchParams`` arguments are set to
    ``max_l_dist``.
    """
    limits = (max_l_dist,) * 4
    found = c_fnm_generic_lp(
        subsequence, sequence, LevenshteinSearchParams(*limits)
    )
    best_matches = []
    for match_group in group_matches(found):
        best_matches.append(get_best_match_in_group(match_group))
    return best_matches
def search(self, subsequence, sequence, max_subs):
    """Search with ``c_fnm_generic_ngrams``, skipping unsupported inputs.

    The test is skipped when ``max_subs >= len(subsequence)``.  Otherwise the
    raw matches are grouped and the best match of each group is returned.
    """
    pattern_len = len(subsequence)
    if not max_subs < pattern_len:
        self.skipTest("avoiding calling c_fnm_generic_ngrams() "
                      "with max_subs >= len(subsequence)")

    params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
    raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
    return list(map(get_best_match_in_group, group_matches(raw_matches)))
def expectedOutcomes(self, search_results, expected_outcomes,
                     *args, **kwargs):
    """Assert consolidated results equal consolidated expectations.

    The search results are grouped with ``group_matches()`` and reduced to
    the best match of each group; the expected outcomes are passed through
    ``consolidate_overlapping_matches()``.  Extra arguments are forwarded to
    ``assertEqual``.
    """
    reduced_results = []
    for match_group in group_matches(search_results):
        reduced_results.append(get_best_match_in_group(match_group))

    reduced_expected = consolidate_overlapping_matches(expected_outcomes)

    return self.assertEqual(
        reduced_results, reduced_expected, *args, **kwargs
    )
def search(self, subsequence, sequence, max_subs):
    """Run ``c_fnm_generic_ngrams`` and return the best match per group.

    Skips the test instead of calling the search function when
    ``max_subs >= len(subsequence)``.
    """
    if max_subs >= len(subsequence):
        skip_reason = ("avoiding calling c_fnm_generic_ngrams() "
                       "with max_subs >= len(subsequence)")
        self.skipTest(skip_reason)

    found = c_fnm_generic_ngrams(
        subsequence,
        sequence,
        LevenshteinSearchParams(max_subs, 0, 0, max_subs),
    )
    best_matches = []
    for match_group in group_matches(found):
        best_matches.append(get_best_match_in_group(match_group))
    return best_matches
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
           max_l_dist=None):
    """Search with ``c_fnm_generic_ngrams`` using the given limits.

    The four limits are packed into ``LevenshteinSearchParams``; the raw
    matches are grouped and only the best match of each group is returned.
    """
    params = LevenshteinSearchParams(max_subs, max_ins, max_dels, max_l_dist)
    raw_matches = c_fnm_generic_ngrams(pattern, sequence, params)
    return list(map(get_best_match_in_group, group_matches(raw_matches)))
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches by anchoring on exact n-gram hits and expanding.

    The subsequence is cut into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``.  Each n-gram is searched for
    exactly in the sequence; every hit is expanded to the right and then to
    the left with ``_expand()``, the left expansion being limited to the
    distance budget the right expansion left over.

    Overlapping matches are grouped and only the best match of each group is
    returned, sorted.

    Raises ValueError when the n-gram length would be zero.
    """
    pattern_len = len(subsequence)
    text_len = len(sequence)

    ngram_len = pattern_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    found = []
    for gram_start in xrange(0, pattern_len - ngram_len + 1, ngram_len):
        gram_end = gram_start + ngram_len
        gram = subsequence[gram_start:gram_end]

        # Pattern parts on either side of the n-gram; the left part is
        # reversed so it can be expanded outwards from the n-gram.
        prefix_reversed = subsequence[:gram_start][::-1]
        suffix = subsequence[gram_end:]

        # Window of the sequence in which this n-gram is searched for.
        search_from = max(0, gram_start - max_l_dist)
        search_to = min(
            text_len, text_len - pattern_len + gram_end + max_l_dist
        )

        for hit in search_exact(gram, sequence, search_from, search_to):
            # Expand to the right of the n-gram first.
            right_stop = hit - gram_start + pattern_len + max_l_dist
            right_dist, right_size = _expand(
                suffix,
                sequence[hit + ngram_len:right_stop],
                max_l_dist,
            )
            if right_dist is None:
                continue

            # Expand to the left with whatever distance budget remains.
            budget_left = max_l_dist - right_dist
            left_start = max(0, hit - gram_start - budget_left)
            left_dist, left_size = _expand(
                prefix_reversed,
                sequence[left_start:hit][::-1],
                budget_left,
            )
            if left_dist is None:
                continue

            assert left_dist + right_dist <= max_l_dist

            found.append(
                Match(
                    start=hit - left_size,
                    end=hit + ngram_len + right_size,
                    dist=left_dist + right_dist,
                ))

    # Don't return overlapping matches; keep the best match of each group of
    # overlapping matches instead.
    return sorted(
        get_best_match_in_group(match_group)
        for match_group in group_matches(found)
    )
def search(self, subsequence, sequence, max_subs):
    """Search with ``fnm_subs_ngrams_byteslike`` and build full matches.

    Each returned index becomes a ``Match`` spanning ``len(subsequence)``
    items, with its distance computed by ``count_differences_with_maximum``
    and its ``matched`` field set to the corresponding slice of the
    sequence.  The matches are grouped and the best of each group returned.
    """
    pattern_len = len(subsequence)
    candidates = []
    for start in fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs):
        stop = start + pattern_len
        n_diffs = count_differences_with_maximum(
            sequence[start:stop],
            subsequence,
            max_subs + 1,
        )
        candidates.append(
            Match(start, stop, n_diffs, matched=sequence[start:stop])
        )

    return list(map(get_best_match_in_group, group_matches(candidates)))
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
           max_l_dist=None):
    """Return the best match of each group found by ``c_fnm_generic_ngrams``.

    The substitution, insertion, deletion and total-distance limits are
    passed on through a ``LevenshteinSearchParams`` instance.
    """
    found = c_fnm_generic_ngrams(
        pattern,
        sequence,
        LevenshteinSearchParams(max_subs, max_ins, max_dels, max_l_dist),
    )
    best_matches = []
    for match_group in group_matches(found):
        best_matches.append(get_best_match_in_group(match_group))
    return best_matches
def search(self, subsequence, sequence, max_subs):
    """Search with ``fnm_subs_ngrams_byteslike`` and consolidate the hits.

    Every index returned by the search is wrapped in a ``Match`` whose
    distance is computed with ``count_differences_with_maximum``; the matches
    are then grouped and reduced to the best match of each group.
    """
    pattern_len = len(subsequence)

    def to_match(start):
        # Build the Match for a hit starting at `start`.
        stop = start + pattern_len
        n_diffs = count_differences_with_maximum(
            sequence[start:stop],
            subsequence,
            max_subs + 1,
        )
        return Match(start, stop, n_diffs)

    hits = fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)
    candidates = [to_match(start) for start in hits]
    return [
        get_best_match_in_group(match_group)
        for match_group in group_matches(candidates)
    ]
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence using n-grams.

    The nearly-matching parts of the sequence must respect the limits given
    in ``search_params`` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Overlapping matches are not returned separately: they are grouped, and
    only the best match of each group is kept.  The result is sorted.

    Raises ValueError if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    raw_matches = list(
        _find_near_matches_generic_ngrams(
            subsequence, sequence, search_params
        )
    )

    # Collapse each group of overlapping matches into its best member.
    return sorted(
        get_best_match_in_group(match_group)
        for match_group in group_matches(raw_matches)
    )
def find_near_matches_generic(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must respect the limits given
    in ``search_params`` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    The search method is picked from ``search_params.max_l_dist``: exact
    search when it is zero, the n-gram method when the n-gram length would be
    at least 3, and the linear programming method otherwise.

    Raises ValueError if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    pattern_len = len(subsequence)
    max_l_dist = search_params.max_l_dist

    # Limits so strict that only exact matches are allowed: use
    # search_exact().
    if max_l_dist == 0:
        exact_matches = []
        for start_index in search_exact(subsequence, sequence):
            exact_matches.append(
                Match(start_index, start_index + pattern_len, 0)
            )
        return exact_matches

    # The n-gram length would be at least 3: use the n-gram search method.
    if pattern_len // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(
            subsequence, sequence, search_params
        )

    # Otherwise use the linear programming search method and consolidate its
    # overlapping matches.
    raw_matches = find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
    return sorted(
        get_best_match_in_group(match_group)
        for match_group in group_matches(raw_matches)
    )
def search(self, subsequence, sequence, max_l_dist):
    """Search with ``fnm_levenshtein_lp`` and keep the best match per group.

    The raw matches are grouped with ``group_matches()`` and each group is
    reduced with ``get_best_match_in_group()``.
    """
    raw_matches = fnm_levenshtein_lp(subsequence, sequence, max_l_dist)
    best_matches = []
    for match_group in group_matches(raw_matches):
        best_matches.append(get_best_match_in_group(match_group))
    return best_matches