def suffix_search_lcs(a, b):
    """Return the length of the longest common substring of ``a`` and ``b``.

    A suffix tree is built over ``a`` and queried with every length-``m``
    window of ``b``.  The largest feasible ``m`` is found by doubling an upper
    bound until a length fails (or exceeds ``len(a)``), then running a
    right-most binary search between the resulting bounds.

    Args:
        a: Sequence handed to the suffix tree.
        b: Sequence whose windows are looked up in the tree.

    Returns:
        The longest common substring length, or ``0`` when nothing is shared
        (including when either input is empty).  Never negative.
    """
    # left and right bounds, max sizes
    len_a, len_b = len(a) + 1, len(b) + 1
    short = min(len(a), len(b))
    # NOTE(review): the first constructor argument is presumably the
    # library's case-insensitivity flag -- confirm against its documentation.
    tree = SuffixTree(False, [a])

    def found_common(m):
        """Return whether any length-``m`` window of ``b`` is found in the tree."""
        return any(tree.findStringIdx(b[i - m:i]) for i in range(m, len_b))

    # A common substring of length 0 always exists, so the search starts at 1.
    # Probing m == 0 would look up the empty string len(b) + 1 times and, if
    # the tree reports no match for it, would push the result down to -1.
    l, r = 1, 1

    # exponentially increase l and r
    while r < len_a and found_common(r):
        l, r = r + 1, r * 2
    r = min(r, short)

    # right-most binary search on whether a substring length is possible
    while l <= r:
        m = (l + r) // 2
        if found_common(m):
            l = m + 1
        else:
            r = m - 1
    return r
def transform(self, s):
    """Burrows-Wheeler transform with SuffixTree.

    Appends ``self.EOS`` to ``s``, builds a suffix tree over the result, and
    emits, for every suffix length returned by ``self._walk``, the character
    that sits immediately to the left of that suffix.

    Args:
        s: Input string; must not contain ``self.EOS``.

    Returns:
        The transformed string (one character per suffix length reported by
        ``self._walk``).

    Raises:
        AssertionError: If ``s`` already contains ``self.EOS``.
    """
    # NOTE: kept as an assert so callers see the same exception type; be aware
    # that asserts are stripped when Python runs with -O.
    assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

    # add end of text marker
    s += self.EOS

    st = SuffixTree()

    # construct a suffix tree O(n * log n)
    # can also be done in O(n) time (complexity as noted by the original author)
    st.add(s)

    # walk inorder to find sorted suffixes
    # only get the length of each suffix
    lens = self._walk(st.root)
    total = len(lens)

    # as the last column letter will be left of the suffix
    # this means it's len(suffix) + 1 from the end of the input string s.
    # A suffix as long as the number of suffixes has nothing to its left
    # (s[-l - 1] would be out of range), so it maps to the EOS marker.
    return ''.join(
        self.EOS if suffix_len == total else s[-suffix_len - 1]
        for suffix_len in lens
    )
def __init__(self, dna):
    """Store ``dna`` and prepare the empty index structures.

    Keeps a reference to ``dna``, creates a ``SuffixTree`` from ``len(dna)``,
    initialises ``suffix_array``, ``first_col``, ``bwt`` and ``ltof`` as
    separate empty lists, and finally calls ``self.init_self()``
    (presumably the step that fills those structures -- confirm there).

    Args:
        dna: The input sequence; only its length is passed to the tree here.
    """
    self.dna = dna
    self.suffix_tree = SuffixTree(len(dna))

    # Four distinct empty lists, assigned in the original attribute order.
    self.suffix_array, self.first_col, self.bwt, self.ltof = [], [], [], []

    self.init_self()
def suffix_search_lcs(a, b):
    """Return the longest-common-substring length of ``a`` and ``b``, with tracing.

    A suffix tree is built over ``a`` and queried with every length-``m``
    window of ``b``.  The largest feasible ``m`` is found by doubling an upper
    bound until a length fails (or exceeds ``len(a)``), then running a
    right-most binary search.  Progress (the bounds and each probed length)
    and the distinct longest common substrings are printed to stdout.

    Args:
        a: String handed to the suffix tree.
        b: String whose windows are looked up in the tree.

    Returns:
        The longest common substring length, or ``0`` when nothing is shared
        (including when either input is empty).  Never negative.
    """
    # left and right bounds, max sizes
    len_a, len_b = len(a) + 1, len(b) + 1
    short = min(len(a), len(b))
    # NOTE(review): the first constructor argument is presumably the
    # library's case-insensitivity flag -- confirm against its documentation.
    tree = SuffixTree(True, [a])
    print('Completed suffix tree')

    def found_common(m):
        """Return whether any length-``m`` window of ``b`` is found in the tree."""
        return any(tree.findStringIdx(b[i - m:i]) for i in range(m, len_b))

    # A common substring of length 0 always exists, so the search starts at 1.
    # Probing m == 0 would look up the empty string len(b) + 1 times and, if
    # the tree reports no match for it, would push the result down to -1.
    l, r = 1, 1
    print(l, r)

    # exponentially increase l and r
    while r < len_a and found_common(r):
        l, r = r + 1, r * 2
        print(l, r)
    r = min(r, short)
    print(l, r)

    # right-most binary search on whether a substring length is possible
    while l <= r:
        m = (l + r) // 2
        print(m)
        if found_common(m):
            l = m + 1
        else:
            r = m - 1

    print('Longest Common Substrings:')
    if r > 0:
        common = {
            b[i - r:i]
            for i in range(r, len_b)
            if tree.findStringIdx(b[i - r:i])
        }
    else:
        # Nothing shared: skip the tree lookups; an empty line is printed.
        common = set()
    print('\n'.join(common))
    return r
@dataclass
class Point:
    """A latitude/longitude pair usable as a hashable suffix-tree symbol.

    Equality and hashing are both derived from the ``(lat, lon)`` pair, so
    points that compare equal always hash equal.
    """

    lat: float
    lon: float

    def __hash__(self) -> int:
        """Hash the ``(lat, lon)`` pair."""
        return hash((self.lat, self.lon))

    def __eq__(self, o: object) -> bool:
        """Compare coordinates field by field.

        Comparing hashes instead would treat any two values whose hashes
        collide as equal (and would also equate a ``Point`` with a plain
        tuple), so the fields themselves are compared and other types are
        deferred to via ``NotImplemented``.
        """
        if not isinstance(o, Point):
            return NotImplemented
        return (self.lat, self.lon) == (o.lat, o.lon)

    def __str__(self) -> str:
        """Render as ``"lat,lon"``."""
        return f'{self.lat},{self.lon}'


s = SuffixTree()
s.generate(
    (
        Point(1, 1),
        Point(1, 0),
        Point(0, 1),
        Point(1, 1),
        Point(1, 0),
        Point(0, 0),
    )
)
# s.generate('MISSISSIPPI$')

# annotate graph for visualization
for i in range(1, s.order()):
    parent = s.parent_id(i)
'''
Created on Oct 23, 2018

@author: ckennington
'''
from stlm import STLM
from suffixtree import SuffixTree
from sequence import Sequence

# Demo script: build a suffix trie from a short token sequence, print it,
# then wrap it in an STLM and prepare a few token sequences to test with.
trie = SuffixTree()

# Tokens are added to the trie one at a time, in order.
text = 'c a c a o'.split()
for w in text:
    print('adding', w)
    trie.add(w)

# Dump the resulting tree, surrounded by blank lines for readability.
print('\n')
trie.print_tree()
print('\n')

# Counts are refreshed on the finished trie before it is handed to STLM
# (presumably STLM relies on them -- confirm in stlm.py).
trie.update_all_counts()
stlm = STLM(trie)

# Token sequences (each a list of single-character tokens) to run through.
tests = [ 'c a'.split(), 'c a o'.split(), 'a o'.split(), 'o'.split(), 'c'.split() ]
for test in tests:
    # A fresh Sequence is created for every test case.
    seq = Sequence()