示例#1
0
def test_implicit():
    """Compare ``SuffixTree(text).as_dict()`` with a hand-written dict for two texts.

    Every expected entry maps a key string to an empty dict.
    """
    cases = (
        ('abcd', ('d', 'cd', 'bcd', 'abcd')),
        ('xabxa', ('bxa', 'abxa', 'xabxa')),
    )
    for text, keys in cases:
        expected = {key: {} for key in keys}
        assert SuffixTree(text).as_dict() == expected
示例#2
0
 def __init__(self, D_plus, D_minus, alphabet):
     """Build the combined word, its suffix tree and the extended alphabet.

     The word is ``D_plus`` joined by '1' followed by '1', then ``D_minus``
     joined by '0' followed by '0$'. The alphabet stored on the instance is
     ``alphabet`` extended with the three marker characters '0', '1', '$'.
     """
     positive_part = '1'.join(D_plus) + '1'
     negative_part = '0'.join(D_minus) + '0$'
     self.word = positive_part + negative_part

     tree = SuffixTree(self.word)
     self.suffixTree = tree
     self.alphabet = alphabet | {'0', '1', '$'}
     self.edges = tree.edges

     # Labelled "bug fix" by the original author: when __has_edges(0) holds,
     # node 0's suffix_node is set to 0.
     if self.__has_edges(0):
         tree.nodes[0].suffix_node = 0
示例#3
0
def test_insertion(string: str):
    """Insert *string* and check the open-ended node count and suffix membership.

    The number of nodes whose ``end`` is ``None`` must equal
    ``len(string) + 2``, and every value yielded by ``suffixes(string)`` must
    satisfy ``in`` on the tree.
    """
    tree = SuffixTree()
    tree.insert_string(string)

    open_ended = sum(1 for node in tree.nodes if node.end is None)
    assert open_ended == len(string) + 2

    for tail in suffixes(string):
        assert tail in tree
示例#4
0
    def init_with_text(self, text):
        """Store ``text + "$"`` and compute the suffix array through a SuffixTree.

        The tree itself is initialised with the stored text minus its last
        character (the ``$`` that was just appended).
        """
        terminated = text + "$"
        self.__text = terminated
        # Start from an empty array; it is replaced once the tree has produced one.
        self.__suffix_array = []

        builder = SuffixTree()
        builder.init_with_text(terminated[:-1])
        self.__suffix_array = builder.gen_suffix_array()
示例#5
0
def test_insert_multiple(lst):
    """Insert every string of *lst* into one tree and verify its contents.

    Checks that the number of nodes whose ``end`` is ``None`` equals
    ``sum(len(s) + 1 for s in lst) + 1`` and that every suffix of every
    inserted string satisfies ``in`` on the tree.
    """
    st = SuffixTree()
    for string in lst:
        st.insert_string(string)

    open_ended = [node for node in st.nodes if node.end is None]
    assert len(open_ended) == sum(len(string) + 1 for string in lst) + 1

    # Iterate over all strings explicitly. The previous version relied on the
    # loop variable leaking out of the insertion loop, so it only checked the
    # last string and raised NameError for an empty list.
    for string in lst:
        for suffix in suffixes(string):
            assert suffix in st
def find_shortest_nonshared_substring(seq_1, seq_2):
    """Return the first k-mer of *seq_1* that is not a substring of *seq_2*.

    Lengths are tried in increasing order from 2 up to ``len(seq_1)``; for
    each length the k-mers come from ``generate_kmers(seq_1, k)`` and are
    looked up with ``SuffixTree(seq_2).has_substring``. Returns ``None`` when
    every k-mer is found.
    """
    index = SuffixTree(seq_2)
    for length in range(2, len(seq_1) + 1):
        for candidate in generate_kmers(seq_1, length):
            if not index.has_substring(candidate):
                return candidate
示例#7
0
def create_suffix_tree(t):
    """Build a ``SuffixTree`` for *t*, print memory and timing figures, return the tree.

    Memory is the resident set size of the current process after
    construction, converted from bytes to Mb; time is the elapsed time
    of the ``SuffixTree(t)`` call in seconds.
    """
    process = psutil.Process(os.getpid())

    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    suffix_tree = SuffixTree(t)
    end = time.perf_counter()

    mem = process.memory_info().rss / 1024 / 1024
    print('------------------------------------------------------')
    print('Creating index struct - memory usage: ' + str(mem) + ' Mb')
    print('Creating index struct - time elapsed: ' + str(end - start) + 's')
    print('------------------------------------------------------')
    return suffix_tree
示例#8
0
 def measure_suffix_tree(text, name):
     """Time ``SuffixTree(text)`` and append the result to ``report.txt``.

     The report gets a header line containing *name*, followed by the
     elapsed time in nanoseconds (``time.time_ns``) and in seconds
     (``time.time``).
     """
     with open("report.txt", "a+", encoding="utf-8") as report:
         header = "Measuring time for Suffix Tree --- {}\n".format(name)
         report.write(header)

         start_ns = time.time_ns()
         start_s = time.time()
         # Keep the tree bound to a name so it stays alive until both end
         # timestamps have been taken.
         tree = SuffixTree(text)
         end_ns = time.time_ns()
         end_s = time.time()

         elapsed_ns = end_ns - start_ns
         elapsed_s = end_s - start_s
         summary = "It took --- {} ns\nIt took --- {} s\n\n".format(
             elapsed_ns, elapsed_s)
         report.write(summary)
    def test_repeated_string(self):
        """Exercise ``find_substring`` and ``has_substring`` on the text "aaa"."""
        st = SuffixTree("aaa")

        expected_positions = (('a', 0), ('aa', 0), ('aaa', 0), ('b', -1))
        for needle, position in expected_positions:
            self.assertEqual(st.find_substring(needle), position)

        for needle in ('a', 'aa', 'aaa'):
            self.assertTrue(st.has_substring(needle))

        # 'A' is listed because matching is case sensitive by default.
        for needle in ('aaaa', 'b', 'A'):
            self.assertFalse(st.has_substring(needle))
示例#10
0
def make_suffix_tree(filename):
    """Parse *filename* into titled text sections and return one SuffixTree per section.

    A small counter drives the parser. It starts at 0, is decremented on
    every blank line and is reset to 2 by any accepted non-blank line:

    * counter == 0: the line is a title. The text collected so far is
      flushed as ``SuffixTree(text, title, False)`` and a new section starts.
    * counter in (1, 2): the line is content and is appended to the current
      text followed by a single space.

    The first flush happens before any title has been read, so the first
    element of the returned list is built from an empty text and title.
    """
    trees = []
    body = ''
    heading = ''
    state = 0
    with open(filename, "r", encoding="utf8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                state -= 1
            elif state == 0:  # It is a title
                trees.append(SuffixTree(body, heading, False))
                body = ''
                heading = stripped
                state = 2
            elif state in (1, 2):  # Content
                state = 2
                body += stripped + ' '
            # NOTE(review): once the counter is negative no branch matches,
            # so every later non-blank line is dropped - confirm this is intended.
    trees.append(SuffixTree(body, heading, False))
    return trees
 def test_long_string(self):
     """Look up mixed-case words in a tree built from the contents of ``test.txt``."""
     # The context manager closes the fixture file; the bare open() used
     # before left the handle open for the rest of the test run.
     with open("test.txt") as f:
         st = SuffixTree(f.read())
     self.assertEqual(st.find_substring('Ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 11131)
     self.assertFalse(st.has_substring('ukkonen'))
示例#12
0
import sys

from suffix_tree import SuffixTree

# Read the file named on the command line; only its first line is indexed.
# The context manager closes the handle, which the bare open() never did.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")

text = Input[0].strip()

stree = SuffixTree(text)

# Keep the longest pathLabel among the tree's inner nodes. longestRep starts
# as the empty string so it is defined even when no inner node has a
# non-empty label (previously that case raised NameError).
max_len = 0
longestRep = ''
for i in stree.innerNodes:
    s = i.pathLabel
    if len(s) > max_len:
        max_len = len(s)
        longestRep = s

# `print >> out, value` is Python 2 only syntax and raises TypeError on
# Python 3; an explicit write produces the same line on both versions.
with open("output.txt", 'w') as out:
    out.write(str(longestRep) + "\n")
示例#13
0
import sys

from suffix_tree import SuffixTree

# Index the whole of test.txt with case-insensitive matching enabled.
with open("test.txt") as f:
    st = SuffixTree(f.read(), case_insensitive=True)

# The prefix to look up is the first command-line argument.
prefix = sys.argv[1]
matches = st.find_word_by_prefix(prefix)
print(matches)
示例#14
0
def test_contains_not(text, search_term):
    """After inserting *text*, *search_term* must not satisfy ``in`` on the tree."""
    tree = SuffixTree()
    tree.insert_string(text)
    found = search_term in tree
    assert not found
示例#15
0
from suffix_tree import SuffixTree
import sys

# Read the file named on the command line. The context manager closes the
# handle, which the bare open(...).read() left to the garbage collector.
with open(sys.argv[1]) as source:
    File = source.read()
# Build the suffix tree over the full file contents.
stree = SuffixTree(File)
 def test_empty_string(self):
     """A tree built from '' reports no match, including for the empty needle."""
     st = SuffixTree('')
     needles = ('not there', '')
     for needle in needles:
         self.assertEqual(st.find_substring(needle), -1)
     for needle in needles:
         self.assertFalse(st.has_substring(needle))
示例#17
0
    artificial_text_input += c

# Run check_validity against three index structures (Trie, SuffixTreeNaive,
# SuffixTree) for every entry of basic_inputs. `i` numbers the inputs from 1
# and only appears in the test_string label.
# NOTE(review): the meaning of check_validity's third positional argument
# (1 here, 10 and 1000 below) is not visible in this chunk - confirm.
i = 0
for input in basic_inputs:
    i = i + 1
    trie = Trie(input)
    check_validity(input,
                   trie,
                   1,
                   test_string="Trie- Basic test no. " + str(i))
    simple_suffix_tree = SuffixTreeNaive(input)
    check_validity(input,
                   simple_suffix_tree,
                   1,
                   test_string="Simple SuffixTree- Basic test no. " + str(i))
    fast_suffix_tree = SuffixTree(input)
    check_validity(input,
                   fast_suffix_tree,
                   1,
                   test_string="McCreight- Basic test no. " + str(i))

# Natural-text checks: the Trie only receives the first 2000 characters,
# while SuffixTreeNaive is built from the complete natural_text_input.
trie = Trie(natural_text_input[:2000])
check_validity(natural_text_input[:2000],
               trie,
               10,
               test_string="Trie- law act test")
simple_suffix_tree = SuffixTreeNaive(natural_text_input)
check_validity(natural_text_input,
               simple_suffix_tree,
               1000,
               test_string="Simple SuffixTree- law act text")
示例#18
0
def _parse_command(line, hit_inputs, extend_inputs):
    """Return ``(hit, extend)`` parsed from *line*, or ``None`` if a flag is invalid.

    The hit flag is the first two characters and the extend flag the last two
    characters of the line once surrounding whitespace is removed.
    """
    stripped = line.strip()
    hit = stripped[:2]
    extend = stripped[-2:]
    if hit in hit_inputs and extend in extend_inputs:
        return hit, extend
    return None


def _read_query_sequences(path):
    """Return the sequences found in the FASTA-style file at *path*.

    A line starting with ``>`` ends the current record; all other lines are
    concatenated after their trailing newline is removed. Empty records are
    not stored.
    """
    sequences = []
    current = ""
    with open(path, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                if current != "":
                    sequences.append(current)
                    current = ""
            else:
                current += line.rstrip("\n")
    if current != "":
        sequences.append(current)
    return sequences


def _write_alignments(out, alignments_by_score):
    """Write every alignment in *alignments_by_score* to *out*, best score first.

    Each value is a flat list holding five consecutive fields per alignment:
    query offset, subject offset, comparison segment, query segment and
    subject segment.
    """
    for score in sorted(alignments_by_score, reverse=True):
        fields = alignments_by_score[score]
        for start in range(0, len(fields), 5):
            q_offset, s_offset, comp_seg, q_seg, s_seg = fields[start:start + 5]
            out.write("Score:  {0}\n".format(score))
            out.write("Query:   {0:4d}    {1}    {2}\n".format(
                q_offset, q_seg, q_offset + len(q_seg)))
            out.write("                 {0}\n".format(comp_seg))
            out.write("Sbjct:   {0:4d}    {1}    {2}\n".format(
                s_offset, s_seg, s_offset + len(s_seg)))
            out.write("\n")


def main():
    """Prompt for a command, search the protein set and write ``output_result.txt``.

    The command read from stdin must start with a hit flag (``-l``, ``-s`` or
    ``-f``) and end with an extend flag (``-d``, ``-s`` or ``-g``); the user
    is re-prompted until both are valid. Query sequences come from
    ``query_output.txt`` and proteins from ``ecoliProtSeq.fasta``. The hit
    flag selects which ``get_match_position_*`` helper locates the k-mers and
    the extend flag selects which ``extending_with_*`` helper extends each
    hit. The resulting alignments are written per protein, highest score
    first.

    Differences from the previous revision: the report is written through an
    explicit file handle instead of permanently rebinding ``sys.stdout``, and
    alignments sharing a score are all kept instead of overwriting each other.

    Raises:
        EOFError: if stdin is exhausted before a valid command is entered.
    """
    a = 3
    b = 3
    hit_inputs = ["-l", "-s", "-f"]
    extend_inputs = ["-d", "-s", "-g"]

    prompt = "Please type your command: "
    while True:
        sys.stdout.write(prompt)
        sys.stdout.flush()
        line = sys.stdin.readline()
        if line == "":
            # readline() returns "" only at end of input; without this check
            # the loop would spin forever on a closed stdin.
            raise EOFError("stdin closed before a valid command was entered")
        command = _parse_command(line, hit_inputs, extend_inputs)
        if command is not None:
            break
        prompt = "Wrong command, please type again: "
    hit, extend = command

    query_sequences = _read_query_sequences("query_output.txt")

    k_size = 3
    threshold = 11
    query_neighbors = generate_all_neighbors(query_sequences, k_size,
                                             threshold)

    protein_name = []
    protein_sequences = get_protein_data("ecoliProtSeq.fasta", protein_name)

    if hit == "-l":
        all_match_positions = get_match_position_org(protein_sequences,
                                                     query_neighbors, k_size)
    elif hit == "-s":
        proteins_suffix_trees = [SuffixTree(seq) for seq in protein_sequences]
        all_match_positions = get_match_position_with_suffixtree(
            query_neighbors, proteins_suffix_trees)
    else:  # hit == "-f", guaranteed by the validation above
        protein_fmindexs = [
            FMIndex(seq + "$", a, b) for seq in protein_sequences
        ]
        all_match_positions = get_match_position_with_fmindex(
            query_neighbors, protein_fmindexs)

    # One score -> flat-field-list mapping per protein, created up front so
    # every protein has an entry even when there are no match sets.
    result = [{} for _ in protein_sequences]
    for positions_by_protein in all_match_positions:
        for i, protein in enumerate(protein_sequences):
            matches = positions_by_protein[i]
            for l, neighbors in enumerate(query_neighbors):
                for k_mer in neighbors:
                    if k_mer not in matches:
                        continue
                    for neighbor in neighbors[k_mer]:
                        for pos in matches[k_mer]:
                            try:
                                if extend == "-g":
                                    alignment = extending_with_gap(
                                        query_sequences[l], neighbor, protein,
                                        pos, k_size, -5, 10)
                                elif extend == "-s":
                                    alignment = extending_with_score(
                                        query_sequences[l], neighbor, protein,
                                        pos, k_size, 10)
                                else:  # "-d"
                                    alignment = extending_with_drop(
                                        query_sequences[l], neighbor, protein,
                                        pos, k_size, 3)
                                (score, q_offset, s_offset, comp_seg, q_seg,
                                 s_seg) = alignment
                            except Exception:
                                # Best effort, as before: a hit that cannot be
                                # extended is skipped. Unlike a bare except,
                                # KeyboardInterrupt and SystemExit still
                                # propagate.
                                continue
                            # The old membership test looked in the list
                            # `result` rather than in this protein's dict, so
                            # it never matched and equal scores overwrote each
                            # other. extend() keeps the flat five-field layout
                            # the report writer expects.
                            result[i].setdefault(score, []).extend(
                                [q_offset, s_offset, comp_seg, q_seg, s_seg])

    with open("output_result.txt", "w") as out:
        for i in range(len(protein_sequences)):
            out.write("{0}\n".format(protein_name[i]))
            _write_alignments(out, result[i])
            out.write(
                "-----------------------------------------------------------------------------------------\n"
            )
示例#19
0
        line = line.replace('\n', '')
        line = line.upper()
        if (line == ""):
            continue
        if (line[0] == '>'):
            if (temp != ""):
                referenceGenome.append(temp)
            temp = ""
            continue
        temp += line
# Store the sequence still held in `temp` once the reading loop above is done.
referenceGenome.append(temp)
# print referenceGenome[0]
print "Reference Genome read"

# Only the first stored sequence is indexed.
print "Suffix Tree start"
stree = SuffixTree(referenceGenome[0])
print "Suffix Tree created"

# (Re)create the results file; 'w+' truncates it, so it starts with just the
# "%exact" line followed by newLine.
with open(alignmentResults, 'w+') as f:
    c = ''
    c += "%exact" + newLine
    f.write(c)
for filename in inputFiles:
    inputFasta = []
    inputFastaId = []
    # outputResults={}
    temp = ""
    countAlignments = 0

    with open(filename) as f:
        temp = ""
示例#20
0
 def __init__(self, string):
     """Build a ``SuffixTree`` from *string* and keep it in the private ``__stree`` attribute."""
     self.__stree = SuffixTree(string)
示例#21
0
def test_create():
    """A ``SuffixTree`` can be constructed without arguments."""
    assert SuffixTree() is not None
示例#22
0
def test_find_all(text_search_string_locations):
    """``find_all`` must report exactly the expected locations, in any order.

    The parameter is a ``(text, search_string, locations)`` triple; element 1
    of each ``find_all`` result is compared with ``locations``.
    """
    text, search_string, locations = text_search_string_locations
    tree = SuffixTree()
    tree.insert_string(text)
    found = sorted(hit[1] for hit in tree.find_all(search_string))
    assert found == sorted(locations)
示例#23
0
def test_occurrances(text_search_term_insertion_count):
    """``occurrences(search_term)`` must equal the expected count.

    The parameter is a ``(text, search_term, insertion_count)`` triple.
    """
    text, search_term, insertion_count = text_search_term_insertion_count
    tree = SuffixTree()
    tree.insert_string(text)
    actual = tree.occurrences(search_term)
    assert actual == insertion_count
 def test_case_sensitivity(self):
     """Look up words in a case-insensitive tree built from ``test.txt``."""
     # The context manager closes the fixture file; the bare open() used
     # before left the handle open for the rest of the test run.
     with open("test.txt") as f:
         st = SuffixTree(f.read(), case_insensitive=True)
     self.assertEqual(st.find_substring('ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 1830)
 def test_repr(self):
     """The repr of a one-character tree must match the expected table text."""
     st = SuffixTree("t")
     output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
     # The leftover pdb.set_trace() breakpoint was removed: it stopped every
     # run at an interactive prompt and blocks non-interactive test runs.
     self.assertEqual(repr(st), output)
示例#26
0
import re  # regular expression
import dataset

fables = []  # One SuffixTree object per story, in file order.
string = ''
title = ''
flag = 0
# Parse the data file. `flag` starts at 0, drops by one on every blank line
# and is reset to 2 by every accepted non-blank line.
with open("AesopTales.txt", "r") as f:
    for i in f:
        line = i.strip()
        if not line:
            flag -= 1
        elif flag == 0:  # It is a title
            # Flush the text collected for the previous title first.
            fables.append(SuffixTree(string, title, True))
            string = ''
            title = line
            flag = 2
        elif flag in (1, 2):  # Content
            flag = 2
            string += line + ' '

# Flush the last story, then discard the first entry, which was created
# before any title had been read.
fables.append(SuffixTree(string, title, True))
fables.pop(0)
n = len(fables)  # There are 312 stories in the Given Dataset.
print("Number of stories Read : ", n)
示例#27
0
 def build(self):
     """Build a ``SuffixTree`` from ``self.corpus_str`` and store it as ``self.suffix_tree``."""
     self.suffix_tree = SuffixTree(self.corpus_str)
    #    for i in range(0, len(seqs[0]) - l + 1):
    #        s = seqs[0][i:i + l]
    #        #print(s)
    #        bad = False
    #        for lv in stree.leaves:
    #            if lv.pathLabel.startswith(s):
    #                bad = True
    #                continue
    #        if not bad:
    #            if len(s) < len(shortest):
    #                shortest = s
    #                print(s)
    #
    #print(shortest)

    stree = SuffixTree(seqs[0] + seqs[1])

    shortest = seqs[0]
    for l in stree.leaves:
        print(l.pathLabel)
        if len(l.pathLabel) < len(shortest):
            shortest = l.pathLabel

    print(shortest)

    #print(list((n.pathLabel for n in stree.innerNodes)))

    #res = ''
    #for l in stree.postOrderNodes:
    #    print(l.edgeLabel)
    #    res += l.edgeLabel + '\n'
示例#29
0
class TestSuffixTree(unittest.TestCase):
    """Tests run against one shared tree built from 'abracadabra'."""

    # Created once when the class body executes and shared by all test methods.
    suffix_tree = SuffixTree('abracadabra')

    def test_get_suffix_array(self):
        """'abra' must be contained in the value returned by ``get_suffix_array``."""
        suffix_array = self.suffix_tree.get_suffix_array()
        self.assertIn('abra', suffix_array)
示例#30
0
def test_contains(text_and_search_term):
    """Every suffix of the search term must satisfy ``in`` once the text is inserted.

    The parameter is a ``(text, search_term)`` pair.
    """
    text, search_term = text_and_search_term
    tree = SuffixTree()
    tree.insert_string(text)
    for tail in suffixes(search_term):
        assert tail in tree