def longest_target_self_matches(self, minimum_length=None):
    """Return, per target, the length of its longest internal repeat.

    For every target in ``self.targets`` a window of
    ``self._minimum_length`` characters is slid along ``target.seq``.  Each
    window is looked up with ``self.find_exact`` (excluding the window's own
    position); when the hit is in the same target it is extended with
    ``longest_match`` and the longest extended length seen so far is kept.

    Args:
        minimum_length: Accepted for interface compatibility but not read;
            the window size always comes from ``self._minimum_length``.

    Returns:
        dict mapping ``target.name`` to the longest self-match length found,
        or to ``self._minimum_length`` when no self-match was found.
    """
    min_len = self._minimum_length
    matches = {}
    for target in self.targets:
        seq = target.seq
        seq_len = target.n
        index = 0
        best_len = None
        while True:
            query = seq[index:index + min_len]
            match_target, match_index = self.find_exact(query, exclude=(target, index))
            if match_target == target:
                # The window's own position was excluded from the lookup.
                assert match_index != index
                # presumably left/right are the extra matching characters on
                # each side of the seed window -- confirm against longest_match
                left, right = longest_match(seq, (index, min_len),
                                            match_target.seq, (match_index, min_len))
                total_len = min_len + left + right
                if not best_len or total_len > best_len:
                    best_len = total_len
            index += 1
            # Stop once the remaining tail is too short to beat the best match.
            # Until a match exists only min_len bounds the scan; comparing an
            # int against None (as max() would) raises TypeError on Python 3.
            tail = min_len if best_len is None else max(min_len, best_len)
            if index >= seq_len - tail:
                break
        # note that we can't say there's no self-match below min_len,
        # so if we didn't find one, just return that
        matches[target.name] = best_len or min_len
    return matches
def find_partial_prefix(self, query):
    """Find the longest indexed match anchored on the first word of ``query``.

    Only the leading ``self._index_word_length`` characters of ``query`` are
    used as the index key.  Every hit for that key is extended with
    ``longest_match`` and kept if its total length reaches
    ``self._minimum_length``.

    Args:
        query: Sequence to look up.

    Returns:
        A four-element list ``[target, 0 - left, total_len, index - left]``
        for the longest qualifying hit, or ``[None, None, None, None]`` when
        the query is shorter than one index word or nothing qualifies.  When
        several different targets tie for the longest length, the first
        element is a list of those targets.
    """
    threshold = self._minimum_length
    word = self._index_word_length
    best = [None, None, None, None]
    if len(query) < word:
        return best
    for hit_target, hit_pos in self._index.get(query[:word], []):
        ext_left, ext_right = longest_match(query, (0, word),
                                            hit_target.seq, (hit_pos, word))
        span = ext_left + ext_right + word
        if span < threshold:
            continue
        if not best[2] or span > best[2]:
            # New longest match: replace whatever was held before.
            best = [hit_target, 0 - ext_left, span, hit_pos - ext_left]
            continue
        tied = span == best[2] and hit_target != best[0]
        if not tied:
            continue
        # Another target reached the same maximum length: remember it too.
        holders = best[0]
        if not isinstance(holders, list):
            best[0] = [hit_target, holders]
        elif hit_target not in holders:
            best[0] = [hit_target] + holders
    return best
def _find_partial(self, query, force_target, min_length_override, multiple):
    """Find the longest indexed match of any part of ``query``.

    Words of ``self._index_word_length`` characters are sampled from
    ``query`` at a fixed stride, looked up in ``self._index`` and each hit is
    extended with ``longest_match``.  The longest extension that reaches the
    minimum length wins.

    Args:
        query: Sequence to look up.
        force_target: If truthy, hits in any other target are ignored.
        min_length_override: If truthy, used as the minimum match length.
        multiple: Factor applied to ``self._minimum_length`` when no
            override is given.

    Returns:
        A four-element list ``[target, site - left, total_len, index - left]``
        for the longest qualifying hit, or ``[None, None, None, None]`` when
        nothing qualifies.  When several different targets tie for the
        longest length, the first element is a list of those targets.
    """
    min_len = min_length_override or self._minimum_length * multiple
    word_len = self._index_word_length
    # Stride between sampled sites; per the original author's note, this
    # spacing has been proved to guarantee finding a match if one exists.
    check_every = max(min_len - word_len, 1)
    last = len(query) - max(check_every, word_len)
    # range() has no append() on Python 3, so build a real list before
    # adding the final site.
    check_sites = list(range(0, last, check_every))
    check_sites.append(last)
    candidate = [None, None, None, None]
    # NOTE: it's important to check all sites, and all hits -- to find the longest match.
    for site in check_sites:
        site_key = query[site:site + word_len]
        for target, index in self._index.get(site_key, []):
            if force_target and target != force_target:
                continue
            left, right = longest_match(query, (site, word_len),
                                        target.seq, (index, word_len))
            total_len = left + right + word_len
            if total_len < min_len:
                continue
            if not candidate[2] or total_len > candidate[2]:
                # keep it if it's the best match so far
                candidate = [target, site - left, total_len, index - left]
            elif total_len == candidate[2] and target != candidate[0]:
                # need to keep track if multiple candidates have this same max length
                if isinstance(candidate[0], list):
                    if target not in candidate[0]:
                        candidate[0] = [target] + candidate[0]
                else:
                    candidate[0] = [target, candidate[0]]
    return candidate
def longest_target_self_matches(self, minimum_length=None):
    """Return, per target, the length of its longest internal repeat.

    For every target in ``self.targets`` a window of
    ``self._minimum_length`` characters is slid along ``target.seq``.  Each
    window is looked up with ``self.find_exact`` (excluding the window's own
    position); when the hit is in the same target it is extended with
    ``longest_match`` and the longest extended length seen so far is kept.

    Args:
        minimum_length: Accepted for interface compatibility but not read;
            the window size always comes from ``self._minimum_length``.

    Returns:
        dict mapping ``target.name`` to the longest self-match length found,
        or to ``self._minimum_length`` when no self-match was found.
    """
    min_len = self._minimum_length
    matches = {}
    for target in self.targets:
        seq = target.seq
        seq_len = target.n
        index = 0
        best_len = None
        while True:
            query = seq[index:index + min_len]
            match_target, match_index = self.find_exact(query, exclude=(target, index))
            if match_target == target:
                # The window's own position was excluded from the lookup.
                assert match_index != index
                # presumably left/right are the extra matching characters on
                # each side of the seed window -- confirm against longest_match
                left, right = longest_match(seq, (index, min_len),
                                            match_target.seq, (match_index, min_len))
                total_len = min_len + left + right
                if not best_len or total_len > best_len:
                    best_len = total_len
            index += 1
            # Stop once the remaining tail is too short to beat the best match.
            # Until a match exists only min_len bounds the scan; comparing an
            # int against None (as max() would) raises TypeError on Python 3.
            tail = min_len if best_len is None else max(min_len, best_len)
            if index >= seq_len - tail:
                break
        # note that we can't say there's no self-match below min_len,
        # so if we didn't find one, just return that
        matches[target.name] = best_len or min_len
    return matches
def find_partial_all(self, query, min_length=0):
    """Collect every distinct indexed match of any part of ``query``.

    Words of ``self._index_word_length`` characters are sampled from
    ``query`` at a fixed stride, looked up in ``self._index`` and each hit is
    extended with ``longest_match``.  Every extension that reaches the
    minimum length is reported once.

    Args:
        query: Sequence to look up.
        min_length: Minimum total match length; a falsy value (the default)
            falls back to ``self._minimum_length``.

    Returns:
        A list of ``[target, site - left, total_len, index - left]`` lists,
        in discovery order and without duplicates; empty when nothing
        qualifies.
    """
    min_len = min_length or self._minimum_length
    word_len = self._index_word_length
    # Stride between sampled sites; per the original author's note, this
    # spacing has been proved to guarantee finding a match if one exists.
    check_every = max(min_len - word_len, 1)
    last = len(query) - max(check_every, word_len)
    # range() has no append() on Python 3, so build a real list before
    # adding the final site.
    check_sites = list(range(0, last, check_every))
    check_sites.append(last)
    candidates = []
    # NOTE: it's important to check all sites, and all hits -- to find the longest match.
    for site in check_sites:
        site_key = query[site:site + word_len]
        for target, index in self._index.get(site_key, []):
            left, right = longest_match(query, (site, word_len),
                                        target.seq, (index, word_len))
            total_len = left + right + word_len
            if total_len < min_len:
                continue
            candidate = [target, site - left, total_len, index - left]
            # The same extended match can be reached from several sites.
            if candidate not in candidates:
                candidates.append(candidate)
    return candidates
def find_partial_prefix(self, query):
    """Find the longest indexed match anchored on the first word of ``query``.

    Only the leading ``self._index_word_length`` characters of ``query`` are
    used as the index key.  Every hit for that key is extended with
    ``longest_match`` and kept if its total length reaches
    ``self._minimum_length``.

    Args:
        query: Sequence to look up.

    Returns:
        A four-element list ``[target, 0 - left, total_len, index - left]``
        for the longest qualifying hit, or ``[None, None, None, None]`` when
        the query is shorter than one index word or nothing qualifies.  When
        several different targets tie for the longest length, the first
        element is a list of those targets.
    """
    threshold = self._minimum_length
    word = self._index_word_length
    best = [None, None, None, None]
    if len(query) < word:
        return best
    for hit_target, hit_pos in self._index.get(query[:word], []):
        ext_left, ext_right = longest_match(query, (0, word),
                                            hit_target.seq, (hit_pos, word))
        span = ext_left + ext_right + word
        if span < threshold:
            continue
        if not best[2] or span > best[2]:
            # New longest match: replace whatever was held before.
            best = [hit_target, 0 - ext_left, span, hit_pos - ext_left]
            continue
        tied = span == best[2] and hit_target != best[0]
        if not tied:
            continue
        # Another target reached the same maximum length: remember it too.
        holders = best[0]
        if not isinstance(holders, list):
            best[0] = [hit_target, holders]
        elif hit_target not in holders:
            best[0] = [hit_target] + holders
    return best
def _find_partial(self, query, force_target, min_length_override, multiple):
    """Find the longest indexed match of any part of ``query``.

    Words of ``self._index_word_length`` characters are sampled from
    ``query`` at a fixed stride, looked up in ``self._index`` and each hit is
    extended with ``longest_match``.  The longest extension that reaches the
    minimum length wins.

    Args:
        query: Sequence to look up.
        force_target: If truthy, hits in any other target are ignored.
        min_length_override: If truthy, used as the minimum match length.
        multiple: Factor applied to ``self._minimum_length`` when no
            override is given.

    Returns:
        A four-element list ``[target, site - left, total_len, index - left]``
        for the longest qualifying hit, or ``[None, None, None, None]`` when
        nothing qualifies.  When several different targets tie for the
        longest length, the first element is a list of those targets.
    """
    min_len = min_length_override or self._minimum_length * multiple
    word_len = self._index_word_length
    # Stride between sampled sites; per the original author's note, this
    # spacing has been proved to guarantee finding a match if one exists.
    check_every = max(min_len - word_len, 1)
    last = len(query) - max(check_every, word_len)
    # range() has no append() on Python 3, so build a real list before
    # adding the final site.
    check_sites = list(range(0, last, check_every))
    check_sites.append(last)
    candidate = [None, None, None, None]
    # NOTE: it's important to check all sites, and all hits -- to find the longest match.
    for site in check_sites:
        site_key = query[site:site + word_len]
        for target, index in self._index.get(site_key, []):
            if force_target and target != force_target:
                continue
            left, right = longest_match(query, (site, word_len),
                                        target.seq, (index, word_len))
            total_len = left + right + word_len
            if total_len < min_len:
                continue
            if not candidate[2] or total_len > candidate[2]:
                # keep it if it's the best match so far
                candidate = [target, site - left, total_len, index - left]
            elif total_len == candidate[2] and target != candidate[0]:
                # need to keep track if multiple candidates have this same max length
                if isinstance(candidate[0], list):
                    if target not in candidate[0]:
                        candidate[0] = [target] + candidate[0]
                else:
                    candidate[0] = [target, candidate[0]]
    return candidate
def find_partial_all(self, query, min_length=0):
    """Collect every distinct indexed match of any part of ``query``.

    Words of ``self._index_word_length`` characters are sampled from
    ``query`` at a fixed stride, looked up in ``self._index`` and each hit is
    extended with ``longest_match``.  Every extension that reaches the
    minimum length is reported once.

    Args:
        query: Sequence to look up.
        min_length: Minimum total match length; a falsy value (the default)
            falls back to ``self._minimum_length``.

    Returns:
        A list of ``[target, site - left, total_len, index - left]`` lists,
        in discovery order and without duplicates; empty when nothing
        qualifies.
    """
    min_len = min_length or self._minimum_length
    word_len = self._index_word_length
    # Stride between sampled sites; per the original author's note, this
    # spacing has been proved to guarantee finding a match if one exists.
    check_every = max(min_len - word_len, 1)
    last = len(query) - max(check_every, word_len)
    # range() has no append() on Python 3, so build a real list before
    # adding the final site.
    check_sites = list(range(0, last, check_every))
    check_sites.append(last)
    candidates = []
    # NOTE: it's important to check all sites, and all hits -- to find the longest match.
    for site in check_sites:
        site_key = query[site:site + word_len]
        for target, index in self._index.get(site_key, []):
            left, right = longest_match(query, (site, word_len),
                                        target.seq, (index, word_len))
            total_len = left + right + word_len
            if total_len < min_len:
                continue
            candidate = [target, site - left, total_len, index - left]
            # The same extended match can be reached from several sites.
            if candidate not in candidates:
                candidates.append(candidate)
    return candidates