def expectedOutcomes(self, search_results, expected_outcomes):
    """Assert that results and expectations agree after per-group reduction.

    Both inputs are passed through group_matches(); every resulting group is
    reduced with get_best_match_in_group(), and the two reduced lists are
    compared with assertEqual().
    """
    def best_per_group(matches):
        # One representative per group, in the order group_matches() yields.
        return [get_best_match_in_group(g) for g in group_matches(matches)]

    actual = best_per_group(search_results)
    expected = best_per_group(expected_outcomes)
    return self.assertEqual(actual, expected)
def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
    """Assert that results and expectations agree after per-group reduction.

    Both inputs are passed through group_matches(); every resulting group is
    reduced with get_best_match_in_group(), and the two reduced lists are
    compared with assertEqual(). Extra positional and keyword arguments are
    forwarded to assertEqual() unchanged.
    """
    def best_per_group(matches):
        # One representative per group, in the order group_matches() yields.
        return [get_best_match_in_group(g) for g in group_matches(matches)]

    actual = best_per_group(search_results)
    expected = best_per_group(expected_outcomes)
    return self.assertEqual(actual, expected, *args, **kwargs)
def expectedOutcomes(self, search_result, expected_outcomes, *args, **kwargs):
    """Assert that search_result equals its own per-group best matches.

    The result is grouped with group_matches(), each group is reduced with
    get_best_match_in_group(), and the reduced list must equal search_result
    itself. Extra arguments are forwarded to assertEqual().

    NOTE(review): expected_outcomes is accepted but never read here;
    confirm that this is intentional.
    """
    regrouped = []
    for group in group_matches(search_result):
        regrouped.append(get_best_match_in_group(group))
    self.assertEqual(search_result, regrouped, *args, **kwargs)
def expectedOutcomes(self, search_result, expected_outcomes, *args, **kwargs):
    """Assert that search_result equals its own per-group best matches.

    The result is grouped with group_matches(), each group is reduced with
    get_best_match_in_group(), and the reduced list must equal search_result
    itself. Extra arguments are forwarded to assertEqual().

    NOTE(review): expected_outcomes is accepted but never read here;
    confirm that this is intentional.
    """
    regrouped = []
    for group in group_matches(search_result):
        regrouped.append(get_best_match_in_group(group))
    self.assertEqual(search_result, regrouped, *args, **kwargs)
def search(self, subsequence, sequence, max_subs):
    """Run c_fnm_generic_ngrams() and keep one best match per group.

    The search parameters are built as
    LevenshteinSearchParams(max_subs, 0, 0, max_subs); presumably this
    allows substitutions only -- verify against LevenshteinSearchParams.
    """
    params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
    raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def find_variable_ssrs(align, min_variants=3, **kwargs):
    """
    Find variable SSR regions from a multiple sequence alignment

    Every record of the alignment is scanned with find_ssrs() (always with
    max_interrupt=0), all hits are pooled and grouped with group_matches(),
    and groups holding at least ``min_variants`` hits are reported.

    Parameters
    ----------
    align: Bio.AlignIO.MultipleSeqAlignment
        input alignment
    min_variants: int
        Minimum number of variants for variable SSR regions
    **kwargs
        The keyword arguments are forwarded to find_ssrs()

    Returns
    -------
    tuple
        ``(ssr_regions, motifs)``: a list of ``(min start, max end)`` pairs,
        one per reported group, and a parallel list with the motif of the
        entry picked by get_longest_RepData() for that group.

    See Also
    ----------
    find_ssrs()
    """
    per_record_hits = []
    for record in align:
        per_record_hits.append(
            find_ssrs(str(record.seq), max_interrupt=0, **kwargs))

    ssr_regions = []
    motifs = []
    for group in group_matches(chain(*per_record_hits)):
        if len(group) < min_variants:
            continue
        # Transpose the (start, end) pairs into two parallel tuples.
        starts, ends = zip(*((rep.start, rep.end) for rep in group))
        ssr_regions.append((np.min(starts), np.max(ends)))
        motifs.append(get_longest_RepData(group).motif)
    return ssr_regions, motifs
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    """Find substitution-only near-matches, trying the bytes-like path first.

    When neither argument is a ``text_type`` instance, the search is first
    attempted with _subs_only_fnm_ngram_byteslike(). If that call raises
    TypeError, or if either argument is text, the search is delegated to
    py_find_near_matches_substitutions_ngrams().
    """
    neither_is_text = (
        not isinstance(subsequence, text_type)
        and not isinstance(sequence, text_type)
    )
    if neither_is_text:
        try:
            start_indexes = _subs_only_fnm_ngram_byteslike(
                subsequence, sequence, max_substitutions)
        except TypeError:
            # Fall through to the pure-Python implementation below.
            pass
        else:
            pattern_len = len(subsequence)
            candidates = []
            for begin in start_indexes:
                stop = begin + pattern_len
                n_diffs = count_differences_with_maximum(
                    sequence[begin:stop],
                    subsequence,
                    max_substitutions + 1,
                )
                candidates.append(Match(begin, stop, n_diffs))
            return [
                get_best_match_in_group(g)
                for g in group_matches(candidates)
            ]

    return py_find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    A search implementation is picked from the given parameters:

    * ``max_l_dist == 0``: exact search via search_exact()
    * ``len(subsequence) // (max_l_dist + 1) >= 3``: the n-grams search
    * otherwise: the linear-programming search, reduced to one best match
      per group and sorted

    Returns a list of fuzzysearch.Match objects describing the matching
    parts of the sequence.

    Raises ValueError if the subsequence is empty or max_l_dist is negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    if max_l_dist == 0:
        pattern_len = len(subsequence)
        exact = []
        for begin in search_exact(subsequence, sequence):
            exact.append(Match(begin, begin + pattern_len, 0))
        return exact

    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist)

    candidates = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    winners = [
        get_best_match_in_group(g) for g in group_matches(candidates)
    ]
    winners.sort()
    return winners
def search(self, subsequence, sequence, max_l_dist):
    """Run fnm_generic_lp() and keep one best match per group.

    All four LevenshteinSearchParams arguments are set to max_l_dist.
    """
    limit = max_l_dist
    params = LevenshteinSearchParams(limit, limit, limit, limit)
    raw_matches = fnm_generic_lp(subsequence, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def search(self, subsequence, sequence, max_l_dist):
    """Run c_fnm_generic_lp() and keep one best match per group.

    All four LevenshteinSearchParams arguments are set to max_l_dist.
    """
    limit = max_l_dist
    params = LevenshteinSearchParams(limit, limit, limit, limit)
    raw_matches = c_fnm_generic_lp(subsequence, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def test_separate(self):
    """Three matches with disjoint spans must each form a singleton group."""
    spans = [(19, 29, 1), (42, 52, 1), (99, 109, 0)]
    matches = [
        Match(start=begin, end=stop, dist=dist, matched='x'*10)
        for begin, stop, dist in spans
    ]
    actual_groups = group_matches(matches)
    expected_groups = [{m} for m in matches]
    self.assertEqual(actual_groups, expected_groups)
def test_separate_with_duplicate(self):
    """A repeated match must not add a group or enlarge an existing one."""
    spans = [(19, 29, 1), (42, 52, 1), (99, 109, 0)]
    matches = [
        Match(start=begin, end=stop, dist=dist)
        for begin, stop, dist in spans
    ]
    # Feed the middle match a second time.
    with_duplicate = matches + [matches[1]]
    actual_groups = group_matches(with_duplicate)
    expected_groups = [{m} for m in matches]
    self.assertEqual(actual_groups, expected_groups)
def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
    """Assert that consolidated results equal consolidated expectations.

    The search results are grouped with group_matches() and reduced with
    get_best_match_in_group(); the expected outcomes are passed through
    consolidate_overlapping_matches(). The two lists are then compared with
    assertEqual(), to which any extra arguments are forwarded.
    """
    actual = []
    for group in group_matches(search_results):
        actual.append(get_best_match_in_group(group))
    expected = consolidate_overlapping_matches(expected_outcomes)
    return self.assertEqual(actual, expected, *args, **kwargs)
def search(self, subsequence, sequence, max_subs):
    """Run c_fnm_generic_ngrams() and keep one best match per group.

    The test is skipped when max_subs >= len(subsequence), so that
    c_fnm_generic_ngrams() is never called with such parameters.
    """
    if max_subs >= len(subsequence):
        reason = ("avoiding calling c_fnm_generic_ngrams() " +
                  "with max_subs >= len(subsequence)")
        self.skipTest(reason)

    params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
    raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def test_separate_with_duplicate(self):
    """A repeated match must not add a group or enlarge an existing one."""
    spans = [(19, 29, 1), (42, 52, 1), (99, 109, 0)]
    matches = [
        Match(start=begin, end=stop, dist=dist)
        for begin, stop, dist in spans
    ]
    # Feed the middle match a second time.
    with_duplicate = matches + [matches[1]]
    actual_groups = group_matches(with_duplicate)
    expected_groups = [{m} for m in matches]
    self.assertEqual(actual_groups, expected_groups)
def search(self, subsequence, sequence, max_subs):
    """Run c_fnm_generic_ngrams() and keep one best match per group.

    The test is skipped when max_subs >= len(subsequence), so that
    c_fnm_generic_ngrams() is never called with such parameters.
    """
    if max_subs >= len(subsequence):
        reason = ("avoiding calling c_fnm_generic_ngrams() " +
                  "with max_subs >= len(subsequence)")
        self.skipTest(reason)

    params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
    raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
           max_l_dist=None):
    """Run c_fnm_generic_ngrams() and keep one best match per group.

    The four limits are packed, in the order given, into a
    LevenshteinSearchParams instance.
    """
    params = LevenshteinSearchParams(max_subs, max_ins, max_dels, max_l_dist)
    raw_matches = c_fnm_generic_ngrams(pattern, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches by exact n-gram search followed by expansion.

    The subsequence is cut into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``. Each n-gram is looked up with
    search_exact(); every hit is then extended to the right and to the left
    with _expand(), the left side only receiving the distance budget the
    right side did not use. Candidates are grouped with group_matches() and
    the best match of each group is returned, sorted.

    Raises ValueError when the n-gram length would be zero.
    """
    pattern_len = len(subsequence)
    text_len = len(sequence)
    ngram_len = pattern_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    found = []
    for ngram_start in xrange(0, pattern_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        # The part before the n-gram is reversed so that the leftward
        # expansion can be run through the same _expand() helper.
        reversed_prefix = subsequence[:ngram_start][::-1]
        suffix = subsequence[ngram_end:]

        search_from = max(0, ngram_start - max_l_dist)
        search_to = min(
            text_len,
            text_len - pattern_len + ngram_end + max_l_dist,
        )

        for hit in search_exact(ngram, sequence, search_from, search_to):
            # Try to expand to the right first, with the full budget.
            right_window = sequence[
                hit + ngram_len:
                hit - ngram_start + pattern_len + max_l_dist
            ]
            dist_right, grow_right = _expand(
                suffix, right_window, max_l_dist)
            if dist_right is None:
                continue

            # Whatever distance the right side left over goes to the left.
            budget_left = max_l_dist - dist_right
            left_from = max(0, hit - ngram_start - budget_left)
            left_window_reversed = sequence[left_from:hit][::-1]
            dist_left, grow_left = _expand(
                reversed_prefix, left_window_reversed, budget_left)
            if dist_left is None:
                continue

            total_dist = dist_left + dist_right
            assert total_dist <= max_l_dist

            found.append(
                Match(
                    start=hit - grow_left,
                    end=hit + ngram_len + grow_right,
                    dist=total_dist,
                ))

    # Overlapping matches are not returned individually: they are grouped
    # together and only the best match of each group is kept.
    winners = [get_best_match_in_group(g) for g in group_matches(found)]
    winners.sort()
    return winners
def search(self, subsequence, sequence, max_subs):
    """Run fnm_subs_ngrams_byteslike() and keep one best match per group.

    Each index yielded by fnm_subs_ngrams_byteslike() is turned into a Match
    spanning len(subsequence) items, carrying the difference count from
    count_differences_with_maximum() and the matched slice of the sequence.
    """
    pattern_len = len(subsequence)
    candidates = []
    for begin in fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs):
        stop = begin + pattern_len
        n_diffs = count_differences_with_maximum(
            sequence[begin:stop],
            subsequence,
            max_subs + 1,
        )
        candidates.append(
            Match(begin, stop, n_diffs, matched=sequence[begin:stop]))
    return [get_best_match_in_group(g) for g in group_matches(candidates)]
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
           max_l_dist=None):
    """Run c_fnm_generic_ngrams() and keep one best match per group.

    The four limits are packed, in the order given, into a
    LevenshteinSearchParams instance.
    """
    params = LevenshteinSearchParams(max_subs, max_ins, max_dels, max_l_dist)
    raw_matches = c_fnm_generic_ngrams(pattern, sequence, params)
    return [get_best_match_in_group(g) for g in group_matches(raw_matches)]
def search(self, subsequence, sequence, max_subs):
    """Run fnm_subs_ngrams_byteslike() and keep one best match per group.

    Each index yielded by fnm_subs_ngrams_byteslike() is turned into a Match
    spanning len(subsequence) items, carrying the difference count from
    count_differences_with_maximum().
    """
    pattern_len = len(subsequence)
    candidates = []
    for begin in fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs):
        stop = begin + pattern_len
        n_diffs = count_differences_with_maximum(
            sequence[begin:stop],
            subsequence,
            max_subs + 1,
        )
        candidates.append(Match(begin, stop, n_diffs))
    return [get_best_match_in_group(g) for g in group_matches(candidates)]
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    Near-matching parts of the sequence must respect these limits, all
    relative to the subsequence:

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Raises ValueError if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    candidates = list(_find_near_matches_generic_ngrams(
        subsequence, sequence, search_params))

    # Overlapping matches are not returned individually: they are grouped
    # together and only the best match of each group is kept.
    winners = [get_best_match_in_group(g) for g in group_matches(candidates)]
    winners.sort()
    return winners
def find_near_matches_generic(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    Near-matching parts of the sequence must respect these limits, all
    relative to the subsequence:

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Raises ValueError if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist

    # Limits so strict that only exact matches qualify: use search_exact().
    if max_l_dist == 0:
        pattern_len = len(subsequence)
        exact = []
        for begin in search_exact(subsequence, sequence):
            exact.append(Match(begin, begin + pattern_len, 0))
        return exact

    # An n-gram length of at least 3 makes the n-gram method applicable.
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(
            subsequence, sequence, search_params)

    # Otherwise use the linear programming search method.
    candidates = find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
    winners = [get_best_match_in_group(g) for g in group_matches(candidates)]
    winners.sort()
    return winners
def find_ssrs(
    seq: str,
    min_repeats: int = 3,
    motif: Optional[Union[str, List[str]]] = None,
    motif_class: Optional[Union[str, List[str]]] = None,
    min_motif_len: int = 2,
    max_motif_len: int = 6,
    max_interrupt: int = 0,
    start: int = 0,
    end: Optional[int] = None,
    **kwargs,
) -> Optional[List[RepData]]:
    """
    Find short sequence repeats in the given sequence string.

    When ``motif`` is given, only those motifs are searched. Otherwise every
    motif of the requested ``motif_class`` values is searched, or of all
    classes produced by gen_motif_classes(min_motif_len, max_motif_len) when
    no class is given. Hits are grouped with group_matches() and the entry
    picked by get_longest_RepData() is kept for each group.

    Parameters
    ----------
    seq: str
        input sequence string
    min_repeats: int
        minimum number of repeats to search (default: 3)
    motif: str or list
        repeat motif(s) to search (default: None); an empty value means
        "search by motif class instead"
    motif_class: str or list
        class(es) of repeat motif to search (default: None); an empty value
        means all classes within the motif length limits
    min_motif_len: int
        minimum length of repeat motif (default: 2); only used when neither
        motif nor motif_class is given
    max_motif_len: int
        maximum length of repeat motif (default: 6); only used when neither
        motif nor motif_class is given
    max_interrupt: int
        maximum length of interruption to allow (default: 0)
    start: int
        starting position of the region to search (default: 0)
    end: int
        ending position of the region to search (default: None, i.e. up to
        the end of the sequence)
    **kwargs
        accepted and ignored

    Returns
    -------
    list of RepData
        the selected repeats, sorted by start position; start and end are
        shifted by ``start`` so they refer to ``seq`` rather than to the
        searched slice

    Raises
    ------
    TypeError
        if a non-empty ``motif`` or ``motif_class`` is neither a str nor a
        list
    """
    subseq = seq[start:end]

    if motif:
        if isinstance(motif, list):
            motifs = motif
        elif isinstance(motif, str):
            motifs = [motif]
        else:
            raise TypeError("motif must be a str or list")
        # Resolve every class up front, before any searching starts.
        candidates = [(m, get_motif_class(m)) for m in motifs]
    else:
        if not motif_class:
            motif_classes = list(
                gen_motif_classes(min_motif_len, max_motif_len))
        elif isinstance(motif_class, list):
            motif_classes = motif_class
        elif isinstance(motif_class, str):
            motif_classes = [motif_class]
        else:
            raise TypeError("motif_class must be a str or list")
        # Lazily walk every motif belonging to each requested class.
        candidates = (
            (m, mcls) for mcls in motif_classes for m in motif_set(mcls)
        )

    matches = [
        RepData(s + start, e + start, n, rs, m, mcls)
        for m, mcls in candidates
        for s, e, n, rs in _find_ssrs(subseq, m, min_repeats, max_interrupt)
    ]

    best_matches = [
        get_longest_RepData(g) for g in group_matches(matches) if g
    ]
    return sorted(best_matches, key=lambda rep: rep.start)
def search(self, subsequence, sequence, max_l_dist):
    """Run fnm_levenshtein_lp() and keep one best match per group."""
    raw_matches = fnm_levenshtein_lp(subsequence, sequence, max_l_dist)
    winners = []
    for group in group_matches(raw_matches):
        winners.append(get_best_match_in_group(group))
    return winners