def __init__(self, D_plus, D_minus, alphabet):
    """Encode the two sample collections as one word and index it.

    The items of ``D_plus`` are joined with ``'1'`` and followed by a
    trailing ``'1'``; the items of ``D_minus`` are joined with ``'0'`` and
    followed by ``'0$'``.  The three marker characters are added to
    ``alphabet``.
    """
    positive_part = '1'.join(D_plus) + '1'
    negative_part = '0'.join(D_minus) + '0$'
    self.word = positive_part + negative_part
    self.suffixTree = SuffixTree(self.word)
    self.alphabet = alphabet | {'0', '1', '$'}
    self.edges = self.suffixTree.edges
    # bug fix: when the root has outgoing edges its suffix_node is set to 0
    root_has_children = self.__has_edges(0)
    if root_has_children:
        self.suffixTree.nodes[0].suffix_node = 0
def init_with_text(self, text):
    """Store ``text`` with a ``$`` appended and build its suffix array."""
    terminated = text + "$"
    self.__text = terminated
    self.__suffix_array = []
    builder = SuffixTree()
    # The tree itself receives the text without the appended terminator.
    builder.init_with_text(terminated[:-1])
    self.__suffix_array = builder.gen_suffix_array()
def constructSuffixArray(main_sequence):
    """Feed ``main_sequence`` to a SuffixTree item by item.

    Returns whatever the tree's ``depthFirstSearch()`` produces.
    """
    suffix_tree = SuffixTree(len(main_sequence))
    for symbol in main_sequence:
        suffix_tree.add_char(symbol)
    # suffix_tree.print_graphviz_tree()
    result = suffix_tree.depthFirstSearch()
    return result
def find_shortest_nonshared_substring(seq_1, seq_2):
    """Return the first k-mer of ``seq_1`` that ``seq_2`` does not contain.

    Lengths are tried in increasing order, so the returned k-mer is a
    shortest one among the lengths examined.  ``None`` is returned
    implicitly when every examined k-mer occurs in ``seq_2``.
    """
    index = SuffixTree(seq_2)
    # NOTE(review): the search starts at length 2, so single characters are
    # never reported -- confirm this is intended for the expected inputs.
    for length in range(2, len(seq_1) + 1):
        for candidate in generate_kmers(seq_1, length):
            if not index.has_substring(candidate):
                return candidate
def test_init(self):
    """Check the row layout and selected nodes of the tree built for 'foo'."""
    st = SuffixTree('foo')
    rows = st.get_rows()

    # root plus len(foo) + null term
    self.assertEqual(len(rows), 5)
    expected_row_sizes = (
        1,  # The root level
        3,  # 'f', 'o', and '\0'
        3,  # 'o' (of f), 'o' (of o), '\0' (of o)
        2,  # 'o' (of o of f), '\0' (of o of o)
        1,  # '\0' (of o)
    )
    for level, size in enumerate(expected_row_sizes):
        self.assertEqual(len(rows[level]), size)

    root_item = rows[0][0]
    self.assertEqual(root_item.let, None)
    self.assertEqual(root_item.parent, None)
    self.assertEqual(root_item.depth, 0)
    self.assertEqual(root_item.positions, {None})
    for letter in ('f', 'o', "\0"):
        self.assertIn(letter, root_item.children)

    f_item = root_item.children['f']
    o_item = root_item.children['o']
    null_item = root_item.children['\0']
    row1 = (f_item, o_item, null_item)
    self.assertEqual([item.let for item in row1], ['f', 'o', '\0'])
    self.assertEqual([item.parent for item in row1], [rows[0][0]] * 3)
    self.assertEqual([item.depth for item in row1], [1, 1, 1])
    self.assertEqual([item.positions for item in row1],
                     [{0}, {1, 2}, {3}])

    f_child_o = f_item.children['o']
    self.assertEqual(f_child_o.let, 'o')
    self.assertEqual(f_child_o.parent, f_item)
    self.assertEqual(f_child_o.depth, 2)
    self.assertEqual(f_child_o.positions, {1})

    o_child_o = o_item.children['o']
    self.assertEqual(o_child_o.let, 'o')
    self.assertEqual(o_child_o.parent, o_item)
    self.assertEqual(o_child_o.depth, 2)
    self.assertEqual(o_child_o.positions, {2})

    # FIXME: Comprehensive testing is really called for here. Test every
    # node in the tree.
    lowest_null_parent = f_child_o.children['o']
    lowest_null = lowest_null_parent.children['\0']
    self.assertEqual(lowest_null.let, '\0')
    self.assertEqual(lowest_null.parent, lowest_null_parent)
    self.assertEqual(lowest_null.depth, 4)
    self.assertEqual(lowest_null.positions, {3})
def test_insertion(string: str):
    """Insert one string and check the open-ended node count and all suffixes."""
    st = SuffixTree()
    st.insert_string(string)

    # Nodes whose ``end`` is None are counted against len(string) + 2.
    open_ended = [node for node in st.nodes if node.end is None]
    assert len(open_ended) == len(string) + 2

    for suffix in suffixes(string):
        assert suffix in st
def test_repeated_string(self):
    """Every prefix of 'aaa' is found at index 0; 'b' is reported as -1."""
    st = SuffixTree("aaa")
    prefixes = ('a', 'aa', 'aaa')

    for needle in prefixes:
        self.assertEqual(st.find_substring(needle), 0)
    self.assertEqual(st.find_substring('b'), -1)

    for needle in prefixes:
        self.assertTrue(st.has_substring(needle))
def test_insert_multiple(lst):
    """Insert every string of ``lst`` and verify node count and suffixes.

    The number of nodes whose ``end`` is None must equal the sum of
    ``len(string) + 1`` over all inserted strings, plus one.  Every suffix
    of every inserted string must then be contained in the tree.
    """
    st = SuffixTree()
    for string in lst:
        st.insert_string(string)

    open_ended = [node for node in st.nodes if node.end is None]
    assert len(open_ended) == sum(len(string) + 1 for string in lst) + 1

    # Iterate over all inserted strings explicitly.  Reusing the variable
    # left over from the insertion loop would check only the last string and
    # raise NameError when ``lst`` is empty.
    for string in lst:
        for suffix in suffixes(string):
            assert suffix in st
def measure_suffix_tree(text, name):
    """Append the SuffixTree construction time for ``text`` to report.txt.

    The duration is recorded twice: in nanoseconds via ``time.time_ns()``
    and in seconds via ``time.time()``.  ``name`` labels the report entry.
    """
    header = "Measuring time for Suffix Tree --- {}\n".format(name)
    summary = "It took --- {} ns\nIt took --- {} s\n\n"
    with open("report.txt", "a+", encoding="utf-8") as report:
        report.write(header)

        start_ns = time.time_ns()
        start_s = time.time()
        # Bound to a name so the tree stays alive until both end stamps
        # have been taken.
        tree = SuffixTree(text)
        end_ns = time.time_ns()
        end_s = time.time()

        elapsed_ns = end_ns - start_ns
        elapsed_s = end_s - start_s
        report.write(summary.format(elapsed_ns, elapsed_s))
def test_get_depth_tuples(self):
    """``items()`` of the tree for a, a, b yields the expected (path, count) pairs."""
    tree = SuffixTree.from_seq(['a', 'a', 'b'])
    expected_items = {
        (('a', '$'), 2),
        (('a', 'a', '$'), 1),
        (('a', 'a', 'b', '$'), 1),
        (('a', 'b', '$'), 1),
        (('b', '$'), 1),
    }
    expect(tree.items()).to(equal(expected_items))
def create_suffix_tree(t):
    """Build a SuffixTree for ``t`` and print construction time and memory.

    Returns the constructed tree.
    """
    process = psutil.Process(os.getpid())
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for measuring an elapsed interval.
    start = time.perf_counter()
    suffix_tree = SuffixTree(t)
    end = time.perf_counter()
    # Resident set size of the whole process, scaled by 1024 twice
    # (presumably bytes -> MiB; confirm against the psutil documentation).
    mem = process.memory_info().rss / 1024 / 1024
    print('------------------------------------------------------')
    print('Creating index struct - memory usage: ' + str(mem) + ' Mb')
    print('Creating index struct - time elapsed: ' + str(end - start) + 's')
    print('------------------------------------------------------')
    return suffix_tree
def test_search(self):
    """``search`` returns the list of match positions, or None without a match."""
    st = SuffixTree('This is a test')
    expected_hits = (
        ('T', [0]),
        ('Th', [0]),
        ('h', [1]),
        ('is', [2, 5]),
        ('qqqqq', None),
    )
    for needle, positions in expected_hits:
        self.assertEqual(st.search(needle), positions)
def test_implicit():
    """``as_dict()`` matches the expected mapping for 'abcd' and for 'xabxa'."""
    cases = (
        ('abcd', {'d': {}, 'cd': {}, 'bcd': {}, 'abcd': {}}),
        ('xabxa', {'bxa': {}, 'abxa': {}, 'xabxa': {}}),
    )
    for text, expected in cases:
        st = SuffixTree(text)
        st_d = st.as_dict()
        assert st_d == expected
def make_suffix_tree(filename):
    """Parse ``filename`` into titled texts and build a SuffixTree for each.

    ``flag`` drives a small state machine: a non-blank line read while
    ``flag`` is 0 is a title, non-blank lines read while ``flag`` is 1 or 2
    are content, and every blank line decrements ``flag``.

    Returns the list of trees.  Each title line flushes the text collected
    so far, so the entry appended at the first title is built from an empty
    string and empty title.
    """
    final = []
    string = ''
    title = ''
    flag = 0
    with open(filename, "r", encoding="utf8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                flag -= 1
            elif flag == 0:
                # It is a title: flush the previous story and start anew.
                final.append(SuffixTree(string, title, False))
                string = ''
                title = line
                flag = 2
            elif flag in (1, 2):
                # Content
                flag = 2
                string += line + ' '
    final.append(SuffixTree(string, title, False))
    return final
def test_two_elements_in_tree(self):
    """For the sequence a, a: 'a' is counted twice and 'a' -> 'a' once."""
    tree = SuffixTree.from_seq(['a', 'a'])
    first_level = tree['a']
    expect(first_level.count).to(equal(2))
    second_level = tree['a']['a']
    expect(second_level.count).to(equal(1))
def test_contains_not(text, search_term):
    """``search_term`` must not be reported as contained after inserting ``text``."""
    st = SuffixTree()
    st.insert_string(text)
    is_present = search_term in st
    assert not is_present
def test_chinese_text(self):
    """A Chinese phrase is found in the UTF-8 encoded test.txt."""
    # Close the file deterministically instead of leaking the handle.
    with codecs.open("test.txt", encoding="utf-8") as f:
        st = SuffixTree(f.read())
    # find_substring yields a position and -1 when nothing is found (see the
    # other find_substring tests).  Truthiness would accept -1 and reject a
    # match at index 0, so compare against the sentinel explicitly.
    self.assertNotEqual(st.find_substring(u'概括性总结'), -1)
def test_text_string(self):
    """The UTF-8 encoded test.txt contains the letter 'a'."""
    # Use a context manager so the file handle is closed after reading.
    with codecs.open("test.txt", encoding='utf-8') as f:
        st = SuffixTree(f.read())
    self.assertTrue(st.has_substring(u'a'))
def test_empty_string(self):
    """A tree over the empty string contains nothing, not even ''."""
    st = SuffixTree('')
    queries = ('not there', '')

    for query in queries:
        self.assertEqual(st.find_substring(query), -1)
    for query in queries:
        self.assertFalse(st.has_substring(query))
def test_case_sensitivity(self):
    """With ``case_insensitive=True`` both spellings are located in test.txt."""
    # Use a context manager so the file handle is closed after reading.
    with open("test.txt") as f:
        st = SuffixTree(f.read(), case_insensitive=True)
    self.assertEqual(st.find_substring('ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 1830)
import sys
from suffix_tree import SuffixTree

# Index test.txt case-insensitively, then print the result of looking up the
# prefix given as the first command-line argument.
with open("test.txt") as f:
    contents = f.read()
    st = SuffixTree(contents, case_insensitive=True)
prefix = sys.argv[1]
print(st.find_word_by_prefix(prefix))
# for i in range(0, len(seqs[0]) - l + 1): # s = seqs[0][i:i + l] # #print(s) # bad = False # for lv in stree.leaves: # if lv.pathLabel.startswith(s): # bad = True # continue # if not bad: # if len(s) < len(shortest): # shortest = s # print(s) # #print(shortest) stree = SuffixTree(seqs[0] + seqs[1]) shortest = seqs[0] for l in stree.leaves: print(l.pathLabel) if len(l.pathLabel) < len(shortest): shortest = l.pathLabel print(shortest) #print(list((n.pathLabel for n in stree.innerNodes))) #res = '' #for l in stree.postOrderNodes: # print(l.edgeLabel) # res += l.edgeLabel + '\n'
def test_create():
    """A SuffixTree can be constructed without arguments."""
    assert SuffixTree() is not None
def test_repr(self):
    """The repr of the tree for 't' is a header line followed by one row."""
    st = SuffixTree("t")
    header = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n'
    row = '\t0 \t1 \t-1 \t0 \t0 \t\nt\n'
    output = header + row
    self.assertEqual(repr(st), output)
import re  # regular expression
import dataset

fables = []  # Stores the SuffixTree Objects. One object per Story.
string = ''
title = ''
# State of the reader: a non-blank line read at 0 is a title, non-blank
# lines read at 1 or 2 are story content, each blank line decrements it.
flag = 0

# Read from the Data file.
with open("AesopTales.txt", "r") as f:
    for i in f:
        line = i.strip()
        if not line:
            flag -= 1
        elif flag == 0:
            # It is a title: flush the story collected so far.
            fables.append(SuffixTree(string, title, True))
            string = ''
            title = line
            flag = 2
        elif flag in (1, 2):
            # Content
            flag = 2
            string += line + ' '

fables.append(SuffixTree(string, title, True))
# Drop the first entry, which was appended before any story text was read.
del fables[0]

n = len(fables)  # There are 312 stories in the Given Dataset.
print("Number of stories Read : ", n)
artificial_text_input += c i = 0 for input in basic_inputs: i = i + 1 trie = Trie(input) check_validity(input, trie, 1, test_string="Trie- Basic test no. " + str(i)) simple_suffix_tree = SuffixTreeNaive(input) check_validity(input, simple_suffix_tree, 1, test_string="Simple SuffixTree- Basic test no. " + str(i)) fast_suffix_tree = SuffixTree(input) check_validity(input, fast_suffix_tree, 1, test_string="McCreight- Basic test no. " + str(i)) trie = Trie(natural_text_input[:2000]) check_validity(natural_text_input[:2000], trie, 10, test_string="Trie- law act test") simple_suffix_tree = SuffixTreeNaive(natural_text_input) check_validity(natural_text_input, simple_suffix_tree, 1000, test_string="Simple SuffixTree- law act text")
class SuffixClassifier(object):
    """Suffix tree over an encoded sample word, with per-node counters.

    The word is built from two collections of strings; ``'1'``, ``'0'`` and
    ``'$'`` act as marker characters inside it.  ``truncate_tree`` prunes
    the tree at those markers and accumulates ``positive`` / ``negative``
    counters on the nodes.
    """

    def __init__(self, D_plus, D_minus, alphabet):
        """Build the encoded word and its suffix tree.

        The items of ``D_plus`` are joined with ``'1'`` and followed by a
        trailing ``'1'``; the items of ``D_minus`` are joined with ``'0'``
        and followed by ``'0$'``.  The marker characters are added to
        ``alphabet``.
        """
        self.word = '1'.join(D_plus) + '1' + '0'.join(D_minus) + '0$'
        self.suffixTree = SuffixTree(self.word)
        self.alphabet = alphabet | {'0', '1', '$'}
        self.edges = self.suffixTree.edges
        # A suffix_node of -1 is treated as "leaf" by the methods below, so
        # a root that has outgoing edges is given suffix_node 0.
        if self.__has_edges(0): # bug fix
            self.suffixTree.nodes[0].suffix_node = 0

    def print_suffix_tree(self, printDirectDebug = True):
        """Print the word, the tree drawn from node 0 and optionally the tree's repr."""
        print(self.word)
        self.print_tree_part('0', 0)
        print()
        if printDirectDebug:
            print(self.suffixTree.__repr__())

    def print_tree_part(self, prefix, nodeId):
        """Recursively print the edges leaving ``nodeId``.

        ``prefix`` is the text accumulated for the current output line.  It
        is printed once a node without outgoing edges is reached.
        """
        found = False
        first = True
        for e in self.suffixTree.edges.values():
            if e.source_node_index == nodeId:
                found = True
                text = self.suffixTree.string[e.first_char_index:e.last_char_index+1]
                if first:
                    first = False
                else:
                    # Later siblings: blank out the shared prefix and mark
                    # the branch with a backslash.
                    prefix = (' ' * (len(prefix) - 1)) + '\\'
                data_node = self.__print_data_node(e.source_node_index)
                link = "%s%s--'%s'--%d" % (prefix, data_node, text, e.dest_node_index)
                if not self.__has_edges(e.dest_node_index):
                    link += self.__print_data_node(e.dest_node_index)
                self.print_tree_part(link, e.dest_node_index)
        if found == False:
            # No outgoing edge: emit the finished line, preceded by a blank
            # line when the prefix starts with '0' or a backslash.
            print(('\n' if prefix[0] in ['0','\\'] else '') + prefix)

    def __print_data_node(self, nodeId):
        """Return ``"(negative,positive)"`` for a node carrying counters, else ``""``."""
        if hasattr(self.suffixTree.nodes[nodeId], 'negative'):
            node = self.suffixTree.nodes[nodeId]
            return "(%d,%d)" % (node.negative, node.positive)
        return ""

    def truncate_tree(self):
        """Zero the counters of every node and prune the tree from node 0."""
        self.nodes = {} # convert to dictionary because we need node
        # at specific index after remove any elements before
        for i in range(len(self.suffixTree.nodes)):
            u = self.suffixTree.nodes[i]
            u.positive = 0 #n_+
            u.negative = 0 #n_-
            self.nodes[i] = u
        self.__truncate(0)

    def __truncate(self, u: int):
        """Prune the edges leaving ``u`` according to the markers in their labels.

        The case letters (a)-(f) are the author's own labels.
        """
        for v, l in self.__traverse(u):
            # Label contains no marker at all: descend into the child.
            if not any(c in l for c in ['0','1','$']): # case (a)
                self.__truncate(v)
            # Label starts with '0' or '1': remove the edge with its subtree
            # and credit the returned leaf count to u.
            elif l[0] in ['0', '1']: # (b) or (c)
                leafs = self.__remove_with_edge(u, v, l)
                self.__increment(u, l[0], leafs)
            # Label starts with '$': drop the edge and its target node only.
            elif l[0] == '$': #(d)
                self.edges.pop((u, l[0]))
                self.nodes.pop(v) # not remove u! 
            # A marker occurs later in the label: cut the subtree below v,
            # credit v under the first '0'/'1' found, and shorten the edge
            # so that it ends just before that position.
            else:
                e = self.edges[(u, l[0])]
                sub = e.first_char_index
                for c in l:
                    if c in ['0', '1']: # (e) or (f)
                        leafs = self.__remove_from_node(v)
                        self.__increment(v, c, leafs)
                        break
                    sub += 1
                e.last_char_index = sub - 1

    def __remove_from_node(self, u: int): #remove subtree from node
        """Remove everything below ``u`` and mark ``u`` as a leaf.

        Returns 1 when ``u`` already is a leaf (``suffix_node == -1``),
        otherwise the sum of the values returned for its children.
        """
        if self.nodes[u].suffix_node == -1: #speedup, leaf
            return 1
        leafs = 0
        for v, l in self.__traverse(u):
            leafs += self.__remove_from_node(v)
            self.edges.pop((u, l[0]))
            self.nodes.pop(v)
        self.nodes[u].suffix_node = -1
        return leafs

    def __remove_with_edge(self, u: int, v: int, l: Label): #remove subtree from edge
        """Remove the edge ``u -> v`` (labelled ``l``) together with ``v``'s subtree.

        Marks ``u`` as a leaf when it has no edges left.  Returns the leaf
        count reported by ``__remove_from_node(v)``.
        """
        leafs = self.__remove_from_node(v)
        self.edges.pop((u, l[0]))
        self.nodes.pop(v)
        #check if u is not leaf now
        if not self.__has_edges(u):
            self.nodes[u].suffix_node = -1
        return leafs

    def recalculate(self, u):
        """Add the counters of ``u``'s descendants to ``u``, recursively.

        Returns the pair ``(negative, positive)`` of ``u`` after the update.
        A leaf returns its counters unchanged.
        """
        if self.nodes[u].suffix_node == -1:
            return self.nodes[u].negative, self.nodes[u].positive
        negative = 0
        positive = 0
        for v, l in self.__traverse(u):
            neg, pos = self.recalculate(v)
            negative += neg
            positive += pos
        self.__increment(u, '0', negative)
        self.__increment(u, '1', positive)
        return self.nodes[u].negative, self.nodes[u].positive

    def __increment(self, u: int, type: str, leafs):
        """Add ``leafs`` to ``u``'s negative counter for type '0', else to its positive counter."""
        if type == '0':
            self.nodes[u].negative += leafs
        else:
            self.nodes[u].positive += leafs

    def __has_edges(self, u: int):
        """Return True when at least one edge leaves ``u``."""
        return any(self.__traverse(u))

    def __traverse(self, u):
        """Yield ``(dest_node_index, label)`` for every edge leaving ``u``.

        Edges are looked up in ``self.edges`` under the key ``(u, c)`` for
        each character ``c`` of the alphabet.  The lookup happens lazily, so
        edges removed while the generator is being consumed are skipped.
        """
        for c in self.alphabet:
            if (u, c) in self.edges:
                e = self.edges[(u, c)]
                l = Label(self, e.first_char_index, e.last_char_index)
                yield (e.dest_node_index, l)
class TestSuffixTree(unittest.TestCase):
    """Suffix-array checks against a single tree built for 'abracadabra'."""

    # Created once when the class body runs and shared by all test methods.
    suffix_tree = SuffixTree('abracadabra')

    def test_get_suffix_array(self):
        """'abra' is an element of the suffix array."""
        suffix_array = self.suffix_tree.get_suffix_array()
        self.assertIn('abra', suffix_array)
def test_find_all(text_search_string_locations):
    """``find_all`` reports exactly the expected locations, in any order."""
    text, search_string, locations = text_search_string_locations
    st = SuffixTree()
    st.insert_string(text)

    # The second element of each hit is compared against the locations.
    found = [hit[1] for hit in st.find_all(search_string)]
    found.sort()
    assert found == sorted(locations)
def test_repr(self):
    """The repr of the tree for 't' is a header line followed by one row."""
    st = SuffixTree("t")
    output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
    # The leftover pdb.set_trace() breakpoint was removed: it blocks any
    # non-interactive test run waiting for debugger input.
    self.assertEqual(st.__repr__(), output)
import sys

from suffix_tree import SuffixTree

# Only the first line of the input file is analysed.  The handle is closed
# as soon as the content has been read.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")
text = Input[0].strip()

stree = SuffixTree(text)

# Keep the longest path label among the inner nodes.  Starting from an
# empty string keeps the name bound even when no inner node has a
# non-empty label.
max_len = 0
longestRep = ''
for i in stree.innerNodes:
    s = i.pathLabel
    if len(s) > max_len:
        max_len = len(s)
        longestRep = s

# write() behaves the same on Python 2 and 3, whereas the ``print >> out``
# statement form fails at runtime on Python 3.
with open("output.txt", 'w') as out:
    out.write(str(longestRep) + "\n")
def build(self):
    """Create a SuffixTree from ``self.corpus_str`` and keep it on the instance."""
    corpus = self.corpus_str
    self.suffix_tree = SuffixTree(corpus)
def test_repeated_string(self):
    """Prefixes of 'aaa' are found at 0; longer, foreign or upper-case text is not."""
    st = SuffixTree("aaa")
    prefixes = ('a', 'aa', 'aaa')

    for needle in prefixes:
        self.assertEqual(st.find_substring(needle), 0)
    self.assertEqual(st.find_substring('b'), -1)

    for needle in prefixes:
        self.assertTrue(st.has_substring(needle))
    for needle in ('aaaa', 'b'):
        self.assertFalse(st.has_substring(needle))

    #case sensitive by default
    self.assertFalse(st.has_substring('A'))
def test_occurrances(text_search_term_insertion_count):
    """``occurrences`` reports the expected count for the search term."""
    (text, search_term,
     insertion_count) = text_search_term_insertion_count
    st = SuffixTree()
    st.insert_string(text)
    observed = st.occurrences(search_term)
    assert observed == insertion_count
def test_long_string(self):
    """Known words are located in test.txt; matching is case-sensitive by default."""
    # Use a context manager so the file handle is closed after reading.
    with open("test.txt") as f:
        st = SuffixTree(f.read())
    self.assertEqual(st.find_substring('Ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 11131)
    self.assertFalse(st.has_substring('ukkonen'))
def test_chinese_string(self):
    """Substring lookup works on a tree built from a Chinese string."""
    st = SuffixTree(u"才高八斗")

    contained = st.has_substring(u'高')
    self.assertTrue(contained)

    missing = st.has_substring(u'豆豆')
    self.assertFalse(missing)
from suffix_tree import SuffixTree
import sys

# Read the whole input file; the context manager closes the handle instead
# of leaving it to the garbage collector.
with open(sys.argv[1]) as handle:
    File = handle.read()

# SuffixTree
stree = SuffixTree(File)
def test_repr(self):
    """The repr of the tree for 't' is a header line followed by one row."""
    st = SuffixTree("t")
    output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
    # The leftover pdb.set_trace() breakpoint was removed: it blocks any
    # non-interactive test run waiting for debugger input.
    self.assertEqual(st.__repr__(), output)
#! /usr/bin/env python
from suffix_tree import SuffixTree
import sys

if __name__ == "__main__":
    # Only the first line of the input file is used.
    with open(sys.argv[1]) as fd:
        text = fd.readline().strip()
    tree = SuffixTree(len(text))
    for char in text:
        tree.add_char(char)
    # Parenthesised so the script also parses on Python 3; for a single
    # value the output on Python 2 is unchanged.
    print(tree.depthFirstSearch())
def test_elements_are_reversed(self):
    """For a, a, b: the path 'a' -> 'b' is counted once, 'b' -> 'a' never."""
    tree = SuffixTree.from_seq(['a', 'a', 'b'])
    a_then_b = tree['a']['b']
    expect(a_then_b.count).to(equal(1))
    b_then_a = tree['b']['a']
    expect(b_then_a.count).to(equal(0))
def test_single_suffix_tree(self):
    """For the single element 'a': 'a' and 'a' -> '$' count 1, 'b' counts 0."""
    tree = SuffixTree.from_seq(['a'])
    present = tree['a']
    expect(present.count).to(equal(1))
    absent = tree['b']
    expect(absent.count).to(equal(0))
    terminated = tree['a']['$']
    expect(terminated.count).to(equal(1))
from suffix_tree import SuffixTree, GST from document import Directory #f = open("AesopTales.txt") Aesop=Directory() string=Aesop.documentify("AesopTales.txt") tree= SuffixTree(string) # all occurences of a substring in all the stories ip=input() indices=tree.all_occurences(ip) print " all occurences of",ip,": ",indices count=0 for i in indices: title='' j=i for doc in Aesop.docs: if(i> doc.start and i< doc.end): title=doc.title j-=doc.start print j, title, string[i:i+40],"\n\n" count+=1 print"(",count,"occurences )" #first occurence/closest match in each story query=input() for doc in Aesop.docs: if doc.start-doc.end >=0: continue story= string[doc.start:doc.end] st=SuffixTree(story)