def test_implicit():
    """Check ``as_dict()`` for two short inputs built without a terminator.

    For 'abcd' every suffix is expected as its own key.  For 'xabxa' only
    the three longest suffixes are expected as keys.
    """
    all_suffixes = SuffixTree('abcd').as_dict()
    assert all_suffixes == {key: {} for key in ('d', 'cd', 'bcd', 'abcd')}

    longest_only = SuffixTree('xabxa').as_dict()
    assert longest_only == {key: {} for key in ('bxa', 'abxa', 'xabxa')}
def __init__(self, D_plus, D_minus, alphabet):
    """Build a suffix tree over the concatenated D_plus and D_minus words.

    The words of ``D_plus`` are joined with '1' and followed by '1'; the
    words of ``D_minus`` are joined with '0' and followed by '0$'.  The
    stored alphabet is ``alphabet`` extended with those three marker
    symbols.
    """
    plus_part = '1'.join(D_plus) + '1'
    minus_part = '0'.join(D_minus) + '0$'
    self.word = plus_part + minus_part

    self.suffixTree = SuffixTree(self.word)
    self.alphabet = alphabet | {'0', '1', '$'}
    self.edges = self.suffixTree.edges

    # bug fix: when node 0 has edges, point its suffix link at itself.
    if self.__has_edges(0):
        self.suffixTree.nodes[0].suffix_node = 0
def test_insertion(string: str):
    """Insert one string and check the node count and suffix membership.

    The number of nodes whose ``end`` is ``None`` must equal
    ``len(string) + 2``, and every suffix of ``string`` must be found.
    """
    tree = SuffixTree()
    tree.insert_string(string)

    open_ended = sum(1 for node in tree.nodes if node.end is None)
    assert open_ended == len(string) + 2

    for tail in suffixes(string):
        assert tail in tree
def init_with_text(self, text):
    """Store ``text`` with a trailing '$' and compute its suffix array.

    The suffix array is produced by a temporary ``SuffixTree`` that is
    initialised with the stored text minus its final '$'.
    """
    self.__text = text + "$"
    # Reset first so the attribute exists even if tree construction fails.
    self.__suffix_array = []

    builder = SuffixTree()
    builder.init_with_text(self.__text[:-1])
    self.__suffix_array = builder.gen_suffix_array()
def test_insert_multiple(lst):
    """Insert several strings and check node count and suffix membership.

    The number of nodes whose ``end`` is ``None`` must equal the sum of
    ``len(string) + 1`` over all inserted strings, plus one.  Every suffix
    of every inserted string must then be found in the tree.
    """
    st = SuffixTree()
    for string in lst:
        st.insert_string(string)

    open_ended = sum(1 for node in st.nodes if node.end is None)
    assert open_ended == sum(len(string) + 1 for string in lst) + 1

    # Check every inserted string, not only the last one left bound by the
    # insertion loop; this also keeps an empty ``lst`` from raising NameError.
    for string in lst:
        for suffix in suffixes(string):
            assert suffix in st
def find_shortest_nonshared_substring(seq_1, seq_2):
    """Return the first k-mer of ``seq_1`` that is not a substring of ``seq_2``.

    Lengths are tried in increasing order starting at 2, and k-mers in the
    order ``generate_kmers`` yields them.  Returns ``None`` when every k-mer
    of every tried length occurs in ``seq_2``.
    """
    index = SuffixTree(seq_2)
    for length in range(2, len(seq_1) + 1):
        for candidate in generate_kmers(seq_1, length):
            if not index.has_substring(candidate):
                return candidate
    return None
def create_suffix_tree(t):
    """Build a ``SuffixTree`` for ``t`` and print build time and memory use.

    Prints the resident set size of the current process in megabytes and
    the time spent in the constructor in seconds.

    Returns:
        The constructed suffix tree.
    """
    process = psutil.Process(os.getpid())

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    suffix_tree = SuffixTree(t)
    end = time.perf_counter()

    mem = process.memory_info().rss / 1024 / 1024  # bytes -> MiB
    print('------------------------------------------------------')
    print('Creating index struct - memory usage: ' + str(mem) + ' Mb')
    print('Creating index struct - time elapsed: ' + str(end - start) + 's')
    print('------------------------------------------------------')
    return suffix_tree
def measure_suffix_tree(text, name):
    """Time ``SuffixTree(text)`` and append the result to ``report.txt``.

    Two measurements are written: one from ``time.time_ns()`` in
    nanoseconds and one from ``time.time()`` in seconds.  ``name`` labels
    the entry in the report.
    """
    with open("report.txt", "a+", encoding="utf-8") as report:
        heading = "Measuring time for Suffix Tree --- {}\n".format(name)
        report.write(heading)

        start_ns = time.time_ns()
        start_s = time.time()
        # Keep the tree bound so it is not freed inside the timed window.
        built = SuffixTree(text)
        end_ns = time.time_ns()
        end_s = time.time()

        elapsed_ns = end_ns - start_ns
        elapsed_s = end_s - start_s
        summary = "It took --- {} ns\nIt took --- {} s\n\n".format(
            elapsed_ns, elapsed_s)
        report.write(summary)
def test_repeated_string(self):
    """Query a tree built from 'aaa' for present and absent substrings."""
    tree = SuffixTree("aaa")

    # Every run of 'a' up to the full length is reported at position 0.
    for needle in ('a', 'aa', 'aaa'):
        self.assertEqual(tree.find_substring(needle), 0)
    self.assertEqual(tree.find_substring('b'), -1)

    for needle in ('a', 'aa', 'aaa'):
        self.assertTrue(tree.has_substring(needle))

    # 'A' is rejected because matching is case sensitive by default.
    for needle in ('aaaa', 'b', 'A'):
        self.assertFalse(tree.has_substring(needle))
def make_suffix_tree(filename):
    """Parse a titled text file into a list of ``SuffixTree`` objects.

    A small counter drives the parser: every blank line decrements it, a
    non-blank line seen at 0 is taken as a title, and non-blank lines seen
    at 1 or 2 are appended to the current body (followed by a space).
    Seeing a title first flushes the text collected so far, so the first
    list element is built from an empty body and empty title.

    NOTE: once the counter drops below 0 no branch matches, so later
    non-blank lines are ignored.

    Returns:
        One ``SuffixTree(body, title, False)`` per flush, in file order.
    """
    trees = []
    body = ''
    heading = ''
    state = 0
    with open(filename, "r", encoding="utf8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                state -= 1
            elif state == 0:
                # It is a title: flush the previous entry and start a new one.
                trees.append(SuffixTree(body, heading, False))
                body = ''
                heading = line
                state = 2
            elif state in (1, 2):
                # Content line.
                state = 2
                body += line + ' '
    trees.append(SuffixTree(body, heading, False))
    return trees
def test_long_string(self):
    """Look up known words in a tree built from the contents of test.txt."""
    # Read through a context manager so the file handle is always closed.
    with open("test.txt") as f:
        text = f.read()

    st = SuffixTree(text)
    self.assertEqual(st.find_substring('Ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 11131)
    # Lookups are case sensitive here, so the lower-case form is absent.
    self.assertFalse(st.has_substring('ukkonen'))
import sys

from suffix_tree import SuffixTree

# Only the first line of the input file is used as the text.
with open(sys.argv[1], 'r') as in_file:
    Input = in_file.read().split("\n")
text = Input[0].strip()

stree = SuffixTree(text)

# Keep the first path label with the greatest length among the inner nodes.
# Start from the empty string so the name is bound even when no inner node
# has a non-empty label.
max_len = 0
longestRep = ''
for i in stree.innerNodes:
    s = i.pathLabel
    if len(s) > max_len:
        max_len = len(s)
        longestRep = s

# write() behaves the same on Python 2 and 3 (unlike ``print >> out``), and
# the context manager closes the output file.
with open("output.txt", 'w') as out:
    out.write("%s\n" % (longestRep,))
import sys

from suffix_tree import SuffixTree

# Build a case-insensitive tree from test.txt; the file is only needed
# while the tree is being constructed.
with open("test.txt") as f:
    st = SuffixTree(f.read(), case_insensitive=True)

# Print the lookup result for the prefix given as the first CLI argument.
print(st.find_word_by_prefix(sys.argv[1]))
def test_contains_not(text, search_term):
    """Check that ``search_term`` is not found after inserting ``text``."""
    tree = SuffixTree()
    tree.insert_string(text)
    assert not (search_term in tree)
import sys

from suffix_tree import SuffixTree

# Read the whole input file named by the first CLI argument; the context
# manager closes the handle instead of leaving it to the garbage collector.
with open(sys.argv[1]) as source:
    File = source.read()

# SuffixTree
stree = SuffixTree(File)
def test_empty_string(self):
    """A tree built from '' finds nothing, not even the empty string."""
    tree = SuffixTree('')
    queries = ('not there', '')

    for needle in queries:
        self.assertEqual(tree.find_substring(needle), -1)
    for needle in queries:
        self.assertFalse(tree.has_substring(needle))
artificial_text_input += c i = 0 for input in basic_inputs: i = i + 1 trie = Trie(input) check_validity(input, trie, 1, test_string="Trie- Basic test no. " + str(i)) simple_suffix_tree = SuffixTreeNaive(input) check_validity(input, simple_suffix_tree, 1, test_string="Simple SuffixTree- Basic test no. " + str(i)) fast_suffix_tree = SuffixTree(input) check_validity(input, fast_suffix_tree, 1, test_string="McCreight- Basic test no. " + str(i)) trie = Trie(natural_text_input[:2000]) check_validity(natural_text_input[:2000], trie, 10, test_string="Trie- law act test") simple_suffix_tree = SuffixTreeNaive(natural_text_input) check_validity(natural_text_input, simple_suffix_tree, 1000, test_string="Simple SuffixTree- law act text")
def main():
    """Read a command and query sequences, find k-mer hits, print alignments.

    Python 2 code (print statements).  Steps visible in this function:

    1. Read a command line from stdin and take a "hit" flag (-l/-s/-f)
       and an "extend" flag (-d/-s/-g) from it, re-prompting until both
       are valid.
    2. Read ">"-delimited records from ``query_output.txt``.
    3. Generate neighbours for the queries and locate matches in the
       sequences loaded from ``ecoliProtSeq.fasta`` using the method chosen
       by the hit flag.
    4. Extend every match using the method chosen by the extend flag.
    5. Redirect ``sys.stdout`` to ``output_result.txt`` and print the
       stored segments per protein.

    NOTE(review): the statement layout below was reconstructed from
    whitespace-collapsed text; confirm the nesting against the original.
    """
    # a and b are only passed through to FMIndex in the "-f" branch.
    a = 3
    b = 3
    # NOTE(review): this handle is never closed in the visible code.
    read = open("query_output.txt", "r")
    hit_inputs = ["-l", "-s", "-f"]
    extend_inputs = ["-d", "-s", "-g"]
    print "Please type your command:",
    inputs = sys.stdin.readline()
    # hit = the two characters starting at the first non-space character.
    # NOTE(review): `hit` stays unbound if the line has only spaces.
    for i in range(len(inputs)):
        if inputs[i] != " ":
            hit = inputs[i:i + 2]
            break
    # extend = the two characters before the last one (presumably the
    # trailing newline -- confirm).
    extend = inputs[-3:-1]
    while hit not in hit_inputs or extend not in extend_inputs:
        print "Wrong command, please type again: ",
        # NOTE(review): `input` shadows the builtin of the same name.
        input = sys.stdin.readline()
        for i in range(len(input)):
            if input[i] != " ":
                # NOTE(review): the first prompt slices [i:i + 2] but this
                # retry slices [i:2]; the two differ whenever i > 0.
                hit = input[i:2]
                break
        extend = input[-3:-1]
    # Collect one string per record: a line starting with ">" closes the
    # current record; other lines are appended without a trailing newline.
    query_sequences = []
    query_sequence = ""
    while True:
        s = read.readline()
        if s == "":
            # End of file: flush the record in progress, if any.
            if query_sequence != "":
                query_sequences.append(query_sequence)
            break
        if s[0] == ">":
            if query_sequence != "":
                query_sequences.append(query_sequence)
                query_sequence = ""
        else:
            if s[-1] == "\n":
                s = s[:-1]
            query_sequence += s
    k_size = 3
    threshold = 11
    query_neighbors = generate_all_neighbors(query_sequences, k_size,
                                             threshold)
    # protein_name is passed in empty and indexed per protein further down,
    # so get_protein_data presumably fills it in place -- confirm.
    protein_name = []
    protein_sequences = get_protein_data("ecoliProtSeq.fasta", protein_name)
    # Exactly one of the three branches runs, because `hit` was validated.
    if hit == "-l":
        all_match_positions = get_match_position_org(protein_sequences,
                                                     query_neighbors, k_size)
    if hit == "-s":
        # One suffix tree per protein sequence.
        proteins_suffix_trees = []
        for i in range(len(protein_sequences)):
            proteins_suffix_trees.append(SuffixTree(protein_sequences[i]))
        all_match_positions = get_match_position_with_suffixtree(
            query_neighbors, proteins_suffix_trees)
    if hit == "-f":
        # One FM-index per protein sequence, built with "$" appended.
        protein_fmindexs = []
        for i in range(len(protein_sequences)):
            protein_fmindexs.append(FMIndex(protein_sequences[i] + "$", a, b))
        all_match_positions = get_match_position_with_fmindex(
            query_neighbors, protein_fmindexs)
    # print len(all_match_positions)
    result = []
    for k in range(len(all_match_positions)):
        for i in range(len(protein_sequences)):
            # NOTE(review): one dict is appended per (k, i) pair, but all
            # reads and writes below index result[i] only -- confirm.
            result.append({})
            for l in range(len(query_neighbors)):
                for k_mer in query_neighbors[l]:
                    for j in range(len(query_neighbors[l][k_mer])):
                        if k_mer in all_match_positions[k][i]:
                            for pos in all_match_positions[k][i][k_mer]:
                                # NOTE(review): the bare except skips this
                                # position on any error in the extension.
                                try:
                                    if extend == "-g":
                                        score, q_offset, s_offset, comp_seg, q_seg, s_seg = extending_with_gap(
                                            query_sequences[l],
                                            query_neighbors[l][k_mer][j],
                                            protein_sequences[i], pos,
                                            k_size, -5, 10)
                                    if extend == "-s":
                                        score, q_offset, s_offset, comp_seg, q_seg, s_seg = extending_with_score(
                                            query_sequences[l],
                                            query_neighbors[l][k_mer][j],
                                            protein_sequences[i], pos,
                                            k_size, 10)
                                    if extend == "-d":
                                        score, q_offset, s_offset, comp_seg, q_seg, s_seg = extending_with_drop(
                                            query_sequences[l],
                                            query_neighbors[l][k_mer][j],
                                            protein_sequences[i], pos,
                                            k_size, 3)
                                except:
                                    continue
                                # NOTE(review): `list` shadows the builtin.
                                list = [
                                    q_offset, s_offset, comp_seg, q_seg, s_seg
                                ]
                                # NOTE(review): this tests membership in the
                                # list `result`, while the write targets the
                                # dict result[i].  Also, append() would nest
                                # a list, whereas the display loop below
                                # reads a flat run of five values.
                                if (score in result):
                                    result[i][score].append(list)
                                else:
                                    result[i][score] = list
    # print "i", i
    # print "k", k
    # From here on all print output goes to the result file; stdout is not
    # restored in the visible code.
    sys.stdout = open("output_result.txt", "w")
    for i in range(len(protein_sequences)):
        print(protein_name[i])
        display = result[i]
        # Highest scores first.
        sort_order = sorted(display, reverse=True)
        # m counts scores but is never read in the visible code.
        m = 0
        for j in sort_order:
            k = 0
            m += 1
            # display[j] is consumed five values at a time:
            # q_offset, s_offset, comp_seg, q_seg, s_seg.
            while k < len(display[j]):
                score = j
                q_offset = display[j][k]
                k += 1
                s_offset = display[j][k]
                k += 1
                comp_seg = display[j][k]
                k += 1
                q_seg = display[j][k]
                k += 1
                s_seg = display[j][k]
                k += 1
                print "Score: ", score
                print "Query: ", "{0:4d}".format(
                    q_offset), " ", q_seg, " ", q_offset + len(q_seg)
                print " ", comp_seg
                print "Sbjct: ", "{0:4d}".format(
                    s_offset), " ", s_seg, " ", s_offset + len(s_seg)
                print ""
        print(
            "-----------------------------------------------------------------------------------------"
        )
line = line.replace('\n', '') line = line.upper() if (line == ""): continue if (line[0] == '>'): if (temp != ""): referenceGenome.append(temp) temp = "" continue temp += line referenceGenome.append(temp) # print referenceGenome[0] print "Reference Genome read" print "Suffix Tree start" stree = SuffixTree(referenceGenome[0]) print "Suffix Tree created" with open(alignmentResults, 'w+') as f: c = '' c += "%exact" + newLine f.write(c) for filename in inputFiles: inputFasta = [] inputFastaId = [] # outputResults={} temp = "" countAlignments = 0 with open(filename) as f: temp = ""
def __init__(self, string):
    """Build a ``SuffixTree`` from ``string`` and keep it privately."""
    tree = SuffixTree(string)
    self.__stree = tree
def test_create():
    """An empty ``SuffixTree`` can be constructed without arguments."""
    assert SuffixTree() is not None
def test_find_all(text_search_string_locations):
    """Compare positions returned by ``find_all`` with the expected ones.

    The fixture is a ``(text, search_string, locations)`` triple.  Element
    1 of each ``find_all`` result is compared, order-insensitively, with
    ``locations``.
    """
    text, search_string, locations = text_search_string_locations

    tree = SuffixTree()
    tree.insert_string(text)

    found = sorted(hit[1] for hit in tree.find_all(search_string))
    assert found == sorted(locations)
def test_occurrances(text_search_term_insertion_count):
    """Check ``occurrences`` against the expected count from the fixture.

    The fixture is a ``(text, search_term, insertion_count)`` triple.
    """
    text, search_term, insertion_count = text_search_term_insertion_count

    tree = SuffixTree()
    tree.insert_string(text)

    counted = tree.occurrences(search_term)
    assert counted == insertion_count
def test_case_sensitivity(self):
    """Look up words in a case-insensitive tree built from test.txt."""
    # Read through a context manager so the file handle is always closed.
    with open("test.txt") as f:
        text = f.read()

    st = SuffixTree(text, case_insensitive=True)
    self.assertEqual(st.find_substring('ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 1830)
def test_repr(self):
    """``repr`` of a one-character tree matches the expected table text."""
    st = SuffixTree("t")
    output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
    # The leftover pdb.set_trace() breakpoint was removed: it blocks on
    # stdin and would hang any unattended test run.
    self.assertEqual(repr(st), output)
import re  # regular expression
import dataset

# One SuffixTree object per story.
fables = []
string = ''
title = ''
# Parser state: each blank line decrements it; a non-blank line seen at 0 is
# a title; non-blank lines seen at 1 or 2 are story content.
flag = 0

# Read from the data file.
with open("AesopTales.txt", "r") as f:
    for i in f:
        line = i.strip()
        if not line:
            flag -= 1
        elif flag == 0:
            # It is a title: flush the story collected so far.
            fables.append(SuffixTree(string, title, True))
            string = ''
            title = line
            flag = 2
        elif flag in (1, 2):
            # Content
            flag = 2
            string += line + ' '

# Flush the last story.
fables.append(SuffixTree(string, title, True))
# The first flush happens before any story text was read; drop that entry.
del fables[0]

n = len(fables)  # There are 312 stories in the given dataset.
print("Number of stories Read : ", n)
def build(self):
    """Create the suffix tree for ``self.corpus_str`` and store it."""
    corpus = self.corpus_str
    self.suffix_tree = SuffixTree(corpus)
# for i in range(0, len(seqs[0]) - l + 1): # s = seqs[0][i:i + l] # #print(s) # bad = False # for lv in stree.leaves: # if lv.pathLabel.startswith(s): # bad = True # continue # if not bad: # if len(s) < len(shortest): # shortest = s # print(s) # #print(shortest) stree = SuffixTree(seqs[0] + seqs[1]) shortest = seqs[0] for l in stree.leaves: print(l.pathLabel) if len(l.pathLabel) < len(shortest): shortest = l.pathLabel print(shortest) #print(list((n.pathLabel for n in stree.innerNodes))) #res = '' #for l in stree.postOrderNodes: # print(l.edgeLabel) # res += l.edgeLabel + '\n'
class TestSuffixTree(unittest.TestCase):
    """Tests run against one shared tree built from 'abracadabra'."""

    # Created once when the class body executes; shared by all test methods.
    suffix_tree = SuffixTree('abracadabra')

    def test_get_suffix_array(self):
        """'abra' must be a member of the returned suffix array."""
        suffix_array = self.suffix_tree.get_suffix_array()
        self.assertIn('abra', suffix_array)
def test_contains(text_and_search_term):
    """Every suffix of the search term must be found in the tree.

    The fixture is a ``(text, search_term)`` pair; ``text`` is inserted
    into a fresh tree before the membership checks.
    """
    text, search_term = text_and_search_term

    tree = SuffixTree()
    tree.insert_string(text)

    for tail in suffixes(search_term):
        assert tail in tree