def transform(self, s):
    """
    Burrows-Wheeler transform with SuffixTree

    Appends ``self.EOS`` to ``s``, builds a suffix tree over the result and
    derives the last BWT column from the suffix lengths returned by
    ``self._walk`` (presumably in sorted-suffix order; confirm in ``_walk``).

    :param s: input text; must not already contain ``self.EOS``
    :return: the transformed text as a single string
    :raises AssertionError: if ``s`` contains ``self.EOS``
    """
    # NOTE: kept as an assert so callers see the same exception type; be
    # aware that asserts are skipped when Python runs with -O.
    assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

    # add end of text marker
    s += self.EOS

    st = SuffixTree()

    # construct a suffix tree O(n * log n)
    # can also be done in O(n) time
    st.add(s)

    # walk inorder to find sorted suffixes
    # only get the length of each suffix
    lens = self._walk(st.root)
    total = len(lens)

    # as the last column letter will be left of the suffix
    # this means it's len(suffix) + 1
    # from the end of the input string s.
    # A length equal to `total` has no character to its left in s, so the
    # original code emits the end marker for it instead.
    return ''.join(
        self.EOS if length == total else s[-length - 1]
        for length in lens
    )
def transform(self, s):
    """
    Burrows-Wheeler transform with SuffixTree

    The end-of-text marker ``self.EOS`` is appended to ``s``, a suffix tree
    is built for the marked text, and the output is assembled from the
    suffix lengths that ``self._walk`` reports (assumed to come back in
    sorted-suffix order; verify against ``_walk``).

    :param s: text to transform; it must not contain ``self.EOS``
    :return: string made of one output character per reported suffix
    :raises AssertionError: if ``self.EOS`` occurs in ``s``
    """
    # NOTE: an assert is retained to keep the exception type stable for
    # callers; it is not evaluated under ``python -O``.
    assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

    # add end of text marker
    s += self.EOS

    st = SuffixTree()

    # construct a suffix tree O(n * log n)
    # can also be done in O(n) time
    st.add(s)

    # walk inorder to find sorted suffixes
    # only get the length of each suffix
    lens = self._walk(st.root)
    suffix_count = len(lens)

    # as the last column letter will be left of the suffix
    # this means it's len(suffix) + 1
    # from the end of the input string s
    last_column = []
    for suffix_len in lens:
        if suffix_len == suffix_count:
            # nothing precedes this suffix inside s: use the end marker
            last_column.append(self.EOS)
        else:
            last_column.append(s[-suffix_len - 1])
    return ''.join(last_column)
def __init__(self, dna):
    """Keep *dna*, create empty lookup tables and populate them.

    The suffix tree is constructed with ``len(dna)`` as its only argument;
    the remaining tables start empty and are filled by ``init_self``.
    """
    self.dna = dna

    # tables filled in by init_self()
    self.ltof = []
    self.bwt = []
    self.first_col = []
    self.suffix_array = []

    self.suffix_tree = SuffixTree(len(dna))

    # must run last: it relies on every attribute set above
    self.init_self()
def suffix_search_lcs(a, b):
    """Search for the largest window length shared by ``a`` and ``b``.

    A suffix tree is built over ``a``; windows of ``b`` are then probed with
    ``tree.findStringIdx`` (an external helper - a truthy result is taken to
    mean "this window occurs in ``a``"). The length is located with an
    exponential ramp-up followed by a right-most binary search.

    Returns the final upper bound of that binary search.
    """
    size_a = len(a)
    size_b = len(b)
    shortest = min(size_a, size_b)

    tree = SuffixTree(False, [a])

    def found_common(width):
        # True if any window of `width` characters of b is found in the tree
        for stop in range(width, size_b + 1):
            if tree.findStringIdx(b[stop - width:stop]):
                return True
        return False

    # exponential phase: keep doubling the upper bound while it still matches
    lo, hi = 0, 1
    while hi <= size_a and found_common(hi):
        lo = hi + 1
        hi *= 2

    # a common window can never be longer than the shorter input
    hi = min(hi, shortest)

    # right-most binary search on whether a given width is possible
    while lo <= hi:
        mid = (lo + hi) // 2
        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    return hi
class Notebook(object):
    """
    Notebook object that represents a single text document that is loaded
    into memory to allow the program to query and report information about
    its contents. The core logic of the program should be found within this
    class.
    """

    def __init__(self, file_path):
        """Remember *file_path*, create an empty suffix tree and index the file."""
        self.file_path = file_path
        self.suffix_tree = SuffixTree()
        self._parse_file(self.file_path)

    def reload(self, file_path=None):
        """Re-index the notebook, optionally switching to a new *file_path*.

        A falsy *file_path* (the default) re-reads the current file.
        """
        if file_path:
            self.file_path = file_path
        self.suffix_tree.root = Node()  # Restart the suffix tree
        self._parse_file(self.file_path)

    def _parse_file(self, file_path):
        """Split every line of *file_path* into words and index each one.

        A word is a maximal run of characters that are neither in
        ``WHITE_KEYS`` nor in ``SPECIAL_CHARS`` (module-level collections
        defined elsewhere in this file). Each word is passed to
        ``_add_suffixes`` with its starting column and zero-based line number.
        """
        with open(file_path, 'r') as notebook:
            # build the contents found in the file
            for line_no, line in enumerate(notebook):
                start = -1  # column where the current word began, -1 if none
                for col, char in enumerate(line):
                    if char in WHITE_KEYS or char in SPECIAL_CHARS:
                        if start != -1:
                            self._add_suffixes(line[start:col], start, line_no)
                            start = -1
                    elif start == -1:
                        start = col
                # Flush a word that runs to the end of the line. Without
                # this, the last word of a line with no trailing delimiter
                # (e.g. a final line lacking a newline) was never indexed.
                if start != -1:
                    self._add_suffixes(line[start:], start, line_no)

    def _add_suffixes(self, word, position, line_no, whole_word=True):
        """Register the lower-cased *word* with the suffix tree.

        The tree receives ``((position, line_no), whole_word)`` as the value
        attached to the word. Empty words are ignored. (The original comment
        says this adds the word and all of its suffixes; that work would
        happen inside ``SuffixTree.add_word``, which is not visible here.)
        """
        if not word:
            return
        self.suffix_tree.add_word(word.lower(), ((position, line_no), whole_word))
def suffix_search_lcs(a, b):
    """Search for the largest window length shared by ``a`` and ``b``, verbosely.

    Builds a suffix tree over ``a`` and probes windows of ``b`` with
    ``tree.findStringIdx`` (external helper; a truthy result is treated as
    "window occurs in ``a``"). The search bounds are printed at every step,
    and the distinct windows of the final length that were found in the tree
    are printed before returning.

    Returns the final upper bound of the binary search.
    """
    size_a = len(a)
    size_b = len(b)
    shortest = min(size_a, size_b)

    tree = SuffixTree(True, [a])
    print('Completed suffix tree')

    def found_common(width):
        # True if any window of `width` characters of b is found in the tree
        for stop in range(width, size_b + 1):
            if tree.findStringIdx(b[stop - width:stop]):
                return True
        return False

    # exponential phase: keep doubling the upper bound while it still matches
    lo, hi = 0, 1
    print(lo, hi)
    while hi <= size_a and found_common(hi):
        lo = hi + 1
        hi *= 2
        print(lo, hi)

    # a common window can never be longer than the shorter input
    hi = min(hi, shortest)
    print(lo, hi)

    # right-most binary search on whether a given width is possible
    while lo <= hi:
        mid = (lo + hi) // 2
        print(mid)
        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    print('Longest Common Substrings:')
    matches = {
        b[stop - hi:stop]
        for stop in range(hi, size_b + 1)
        if tree.findStringIdx(b[stop - hi:stop])
    }
    print('\n'.join(matches))

    return hi
@dataclass
class Point:
    """A latitude/longitude pair usable as a hashable sequence symbol."""

    lat: float
    lon: float

    def __hash__(self) -> int:
        return hash((self.lat, self.lon))

    def __eq__(self, o: object) -> bool:
        # Compare the coordinates themselves. Comparing hashes (as before)
        # makes distinct points equal whenever their hashes collide, e.g.
        # Point(-1, 0) and Point(-2, 0) in CPython, and also makes a Point
        # equal to any unrelated object that happens to share its hash.
        if not isinstance(o, Point):
            return NotImplemented
        return (self.lat, self.lon) == (o.lat, o.lon)

    def __str__(self) -> str:
        return f'{self.lat},{self.lon}'


s = SuffixTree()
s.generate(
    (
        Point(1, 1),
        Point(1, 0),
        Point(0, 1),
        Point(1, 1),
        Point(1, 0),
        Point(0, 0),
    )
)
# s.generate('MISSISSIPPI$')

# annotate graph for visualization
for i in range(1, s.order()):
    parent = s.parent_id(i)
'''
Created on Oct 23, 2018

@author: ckennington
'''
from stlm import STLM
from suffixtree import SuffixTree
from sequence import Sequence

# Build the trie one token at a time, announcing each insertion.
trie = SuffixTree()
text = 'c a c a o'.split()
for token in text:
    print('adding', token)
    trie.add(token)

# Dump the tree, framed by blank output on either side.
print('\n')
trie.print_tree()
print('\n')

# Counts must be refreshed before the tree is handed to the model.
trie.update_all_counts()
stlm = STLM(trie)

# Token sequences to try against the model.
tests = [
    phrase.split()
    for phrase in ('c a', 'c a o', 'a o', 'o', 'c')
]

for test in tests:
    seq = Sequence()
class Mapper:
    """Find occurrences of a pattern in a DNA string.

    On construction the class builds a suffix tree over ``dna``, derives a
    suffix array from it, and from that array the labelled first column,
    the labelled BWT column and the mapping between the two. ``map`` then
    walks the pattern backwards through those columns.
    """

    def __init__(self, dna):
        """Store *dna* and build every table that ``map`` relies on."""
        self.dna = dna
        self.suffix_tree = SuffixTree(len(dna))
        self.suffix_array = []
        self.first_col = []
        self.bwt = []
        self.ltof = []
        self.init_self()

    def init_self(self):
        """initializes lists needed for mapping"""
        for c in self.dna:
            self.suffix_tree.add_char(c)
        root = self.suffix_tree.nodes[self.suffix_tree.root]
        self.traverse_tree(root, root.start)

        # create_subscripts is defined elsewhere; presumably it tags each
        # character with an occurrence number so labels are distinct
        # (map() relies on labels starting with the plain character).
        self.first_col = create_subscripts(
            [self.dna[x] for x in self.suffix_array])
        self.bwt = create_subscripts([
            self.dna[x - 1] if x > 0 else self.dna[-1]
            for x in self.suffix_array
        ])

        # Index the first column once instead of calling list.index() for
        # every BWT entry (which is quadratic in the genome length).
        # setdefault keeps the first position of a label, as list.index did.
        first_index = {}
        for position, label in enumerate(self.first_col):
            first_index.setdefault(label, position)
        self.ltof = [first_index[label] for label in self.bwt]

    def traverse_tree(self, node, char_depth):
        """traverse the tree recursively to generate suffix array

        Leaves (nodes without edges) append ``node.start - char_depth - 1``
        to ``self.suffix_array``. Children are visited in the fixed order
        ``$, A, C, G, T``; edges for any other character are not followed.
        """
        if not node.edges:
            self.suffix_array.append(node.start - char_depth - 1)
            return

        # identical for every child of this node, so compute it once
        new_depth = char_depth + (node.end - node.start)
        for char in ('$', 'A', 'C', 'G', 'T'):
            try:
                next_node = self.suffix_tree.nodes[node.edges[char]]
            except KeyError:
                # no edge for this character
                continue
            # recurse outside the try so the handler only covers the lookup
            self.traverse_tree(next_node, new_depth)

    def get_position_range(self, char, list_type, start, end):
        """returns range of positions for a char in specified type of list

        Scans ``list_type[start..end]`` (inclusive) for labels starting with
        *char*. The result is ``[]`` when there is none, otherwise the first
        matching index followed by the last matching index found when
        scanning back from *end* down to ``start + 1``. A single match at
        *start* therefore yields a one-element list.
        """
        positions = []
        for index in range(start, end + 1):
            if list_type[index].startswith(char):
                positions.append(index)
                break
        if not positions:
            # the backward scan covers a subset of the forward one, so it
            # cannot succeed where the forward scan failed
            return positions
        for index in range(end, start, -1):
            if list_type[index].startswith(char):
                positions.append(index)
                break
        return positions

    def map(self, pattern):
        """find a given pattern in the genome

        Returns a list of suffix-array entries, one per row that is still
        selected after the whole pattern has been consumed. The list is
        empty when the pattern is empty or cannot be found.
        """
        found_positions = []
        if not pattern:
            # nothing to search for; previously this raised IndexError
            return found_positions

        rev_pattern = pattern[::-1]
        current_positions = self.get_position_range(
            rev_pattern[0], self.first_col, 0, len(self.first_col) - 1)

        for char in rev_pattern[1:]:
            try:
                bwt_positions = self.get_position_range(
                    char, self.bwt,
                    current_positions[0], current_positions[-1])
                current_positions = range(
                    self.ltof[bwt_positions[0]],
                    self.ltof[bwt_positions[-1]] + 1)
            except IndexError:
                # may not be able to find, simply return so don't have to
                # go through entire pattern
                return found_positions

        # after last char, push every position in SA to found_positions
        found_positions.extend(self.suffix_array[i] for i in current_positions)
        return found_positions
def __init__(self, file_path):
    """Create an empty suffix tree, remember *file_path* and index that file."""
    # start from a fresh tree; _parse_file fills it in
    self.suffix_tree = SuffixTree()
    self.file_path = file_path
    self._parse_file(self.file_path)