Пример #1
0
def get_partial_suffix_array(text, k):
    """Build the partial suffix array of *text*.

    Only suffix start positions divisible by *k* are kept, each paired
    with its index in the full suffix array.

    Args:
        text: the input string to index.
        k: sampling step; every suffix position that is a multiple of
           k is retained.

    Returns:
        List of (suffix_array_index, suffix_position) tuples.
    """
    tree = SuffixTree(len(text))
    for char in text:
        tree.add_char(char)
    suffix_array = get_suffix_array(tree)
    # enumerate() replaces the manual range(len(...)) index loop.
    return [(i, pos) for i, pos in enumerate(suffix_array) if pos % k == 0]
Пример #2
0
    def _addToTree(self, key):
        """Add *key* into one of the SuffixTrees and return the string
        index assigned to it.

        There's a reason why this insertion is a little complicated:
        a SuffixTree structure can hold, at most, strmat.MAXNUMSTR
        strings and strmat.MAXNUMNODES nodes.  If we go beyond that,
        the add() will fail.  To solve this, we use multiple trees.

        Raises:
            RuntimeError: if the key cannot be added even to a freshly
                created, empty tree (should never happen in practice).
        """
        # Reserve the next global string index for this key.
        (index, self._next_index) = (self._next_index, self._next_index + 1)
        if self._trees[-1].add(key, index):
            return index
        # The current tree is full; start a new one and retry there.
        self.debug("Now expanding the tree:")
        self._trees.append(SuffixTree())
        if not self._trees[-1].add(key, index):
            # Raise explicitly instead of `assert`, which is stripped
            # when Python runs with -O and would hide this failure.
            raise RuntimeError("add() failed on a freshly created SuffixTree")
        return index
Пример #3
0
 def test_empty_string(self):
     """A tree built from '' must report every query as absent."""
     tree = SuffixTree('')
     for needle in ('not there', ''):
         self.assertEqual(tree.find_substring(needle), -1)
     for needle in ('not there', ''):
         self.assertFalse(tree.has_substring(needle))
Пример #4
0
 def test_case_sensitivity(self):
     """With case_insensitive=True, lookups must ignore letter case."""
     # Context manager closes the fixture file; the original leaked
     # the open handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read(), case_insensitive=True)
     self.assertEqual(st.find_substring('ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 1830)
Пример #5
0
 def test_long_string(self):
     """Exact-case lookups in a large text; wrong case must not match."""
     # Context manager closes the fixture file; the original leaked
     # the open handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read())
     self.assertEqual(st.find_substring('Ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 11131)
     self.assertFalse(st.has_substring('ukkonen'))
Пример #6
0
    def test_repeated_string(self):
        """Every prefix of "aaa" is found at offset 0; anything else is absent."""
        st = SuffixTree("aaa")
        for present in ('a', 'aa', 'aaa'):
            self.assertEqual(st.find_substring(present), 0)
        self.assertEqual(st.find_substring('b'), -1)
        for present in ('a', 'aa', 'aaa'):
            self.assertTrue(st.has_substring(present))

        for absent in ('aaaa', 'b'):
            self.assertFalse(st.has_substring(absent))
        # Case sensitive by default.
        self.assertFalse(st.has_substring('A'))
Пример #7
0
## PS4 problem 1: suffix bwt algorithm / Burrows Wheeler Transform (BWT)
## The aim of BWT is to map millions of short reads to the genome in O(m) time
## where m is the length of the short read

from SuffixTree import SuffixTree

if __name__ == "__main__":

    genome = "MANOLISKELLIS"
    bwt = SuffixTree(genome)
    sp, ep, bwtsp, bwtep = bwt.findMatch("OLAS")

    if sp > 0 and ep > 0:

        print("Table ptr locations:")
        print("sp = ", sp, ", ep = ", ep)
        print("-----")

        locations = bwt.getGenomeLocations(bwtsp, bwtep)

        print("Genome locations:")
        print("bwtsp = ", bwtsp, ", bwtep = ", bwtep)
        print(genome)
        # Draw a caret under each matched genome column; stop once every
        # reported location has been marked.
        hit = 0
        for col in range(len(genome)):
            if hit >= len(locations):
                break
            if locations[hit] == col:
                print("^", end='')
                hit += 1
            else:
                print(" ", end='')
Пример #8
0
 def __init__(self, debug_flag=0):
     """Start with one empty SuffixTree; string indices are handed out from 1."""
     self._debug_flag = debug_flag
     self._dict = {}
     self._trees = [SuffixTree()]
     self._next_index = 1
Пример #9
0
def get_suffix_array_from_text(text):
    """Feed *text* into a suffix tree one character at a time and
    return the resulting suffix array."""
    st = SuffixTree(len(text))
    for ch in text:
        st.add_char(ch)
    return get_suffix_array(st)
Пример #10
0
        edges.sort()
        for e in edges:
            n = tree.nodes[e[1]]
            child_depth = depth + n.end - n.start
            if len(n.edges) == 0:
                suffix_array.append(n.start - depth)

            build_suffix_array(n, child_depth)

    build_suffix_array(tree.nodes[tree.root], 0)

    return suffix_array


def get_suffix_array_from_text(text):
    """Return the suffix array of *text* via an incrementally built suffix tree."""
    builder = SuffixTree(len(text))
    for symbol in text:
        builder.add_char(symbol)
    return get_suffix_array(builder)


if __name__ == "__main__":
    # The first line of the input file is the text to index.
    with open(sys.argv[1]) as fh:
        text = next(fh).strip()

    tree = SuffixTree(len(text))
    for char in text:
        tree.add_char(char)

    # The original bare `print ", ".join(...)` is a Python 2 print
    # statement and a syntax error on Python 3; the parenthesized call
    # prints the same text on both versions.
    print(", ".join(str(x) for x in get_suffix_array(tree)))
Пример #11
0
from Regice import Regice
from SuffixTree import SuffixTree
regice = Regice()
filepath = './sample/index.html'
# Build a suffix tree over the tokenized reference document.  Reuse the
# `filepath` variable instead of repeating the literal (the original
# defined it and then never used it).
tokens = regice.tokenize_from_file(filepath)
st = SuffixTree(tokens)
tokens2 = regice.tokenize_from_file('./sample/test.html')
print(tokens2)
num_token = len(tokens2)
res = []
# Slide token windows of growing length (at least 3) over the second
# document and report every occurrence of each window in the tree.
for i in range(3, num_token):
    for j in range(num_token - i):
        res.append(tokens2[j:j+i])
        print(j, i)
        print([pattern for pattern in st.search_pattern_all(tokens2[j:j+i])])

# regice.analyze(filepath)
# regice.all_similaries()
Пример #12
0
from DNAHelper.helper_functions import *
from SuffixTree import SuffixTree

alphabet = 'AGTCN'
text = read_genome('../Dataset/phix.fa')
reads, qualities = read_fastq('../Dataset/phix.fastq')

# One generalized suffix tree over the whole reference genome.
tree = SuffixTree(alphabet)
tree.build_generalized_suffix_tree([text])

# Count reads whose 30-base prefix (or its reverse complement) occurs
# in the genome.  The dead `matches = []` initializer before the loop
# was removed: it was unconditionally reassigned on every iteration.
count = 0
for read in reads:
    prefix = read[:30]
    matches = tree.find(prefix)
    matches.extend(tree.find(reverse_complement(prefix)))
    if matches:
        count += 1
print(count, '/', len(reads), 'matched!')
Пример #13
0
def map_dna(args):
    """Map FASTA reads onto a reference sequence via a BWT/FM-index.

    Args:
        args: argv-style list — args[1] reference FASTA path,
            args[2] reads file path, args[3] k-mer length.

    Side effects:
        Writes one SAM record per read through a SAM writer object.
    """
    # Parse input
    parser = Parser()
    sequence_name, sequence = parser.parse_fasta_sequence(args[1])
    read_file = args[2]
    k = int(args[3])

    # Build the index: suffix tree -> suffix array -> BWT, plus the
    # first-occurrence and counts tables used by the FM-index search.
    print("Building Suffix Tree...", )
    suffix_tree = SuffixTree(sequence)
    print("DONE")

    print("Building Suffix Array...", )
    suffix_array = suffix_array_from_suffix_tree(suffix_tree)
    print("DONE")

    # The tree is only needed to derive the suffix array; free it early.
    del suffix_tree

    print("Building BWT...", )
    bwt = bwt_from_suffix_array(suffix_array, sequence)
    print("DONE")

    print("Building First Occurrence Table...")
    first_occurrences = build_first_occurrence(bwt)
    print("DONE")

    print("Building Counts Table...")
    counts = build_counts(bwt)
    print("DONE")

    # NOTE(review): "%Y-%d-%H-%M" has no month field — looks like a typo
    # for "%Y-%m-%d-%H-%M"; kept as-is to preserve output file names.
    timestamp = datetime.now().strftime("%Y-%d-%H-%M")
    file_name = "%s %s.SAM" % (sequence_name, timestamp)
    sam = SAM(filename=file_name,
              sequence_name=sequence_name,
              sequence_length=len(sequence))

    reads = parser.parse_fasta_reads(read_file)

    # Map DNA sequence
    print("**Beginning Mapping Process")
    counter = 0
    matches = {}  # k-mer -> list of match positions, memoized across reads

    # .iteritems() is Python 2 only; the function already uses Python 3
    # print() calls, so use .items().  (Unused locals `kmer_indices`,
    # `results` and `read_file_name` were removed.)
    for read_name, read in reads.items():
        counter += 1
        print("**Mapping Read " + str(counter) + " ")
        max_index = -1   # best candidate mapping position so far
        max_score = -1   # votes received by that position
        candidate_mapping_indices = {}

        kmer_start_index = 0
        kmer_end_index = kmer_start_index + k

        # NOTE(review): this bound admits kmer_end_index == len(read)+1,
        # i.e. one final window shorter than k — confirm intentional.
        while kmer_end_index - 1 <= len(read):
            kmer = read[kmer_start_index:kmer_end_index]
            relative_index = kmer_start_index

            if kmer not in matches:
                matches[kmer] = find_pattern_matches(kmer, suffix_array, bwt,
                                                     first_occurrences, counts)

            if matches[kmer]:

                # Each genome hit of this k-mer votes for the candidate
                # read position matching_index - relative_index.
                for matching_index in matches[kmer]:
                    potential_read_index = matching_index - relative_index

                    if potential_read_index >= 0:
                        if potential_read_index not in candidate_mapping_indices:
                            candidate_mapping_indices[potential_read_index] = 0

                        candidate_mapping_indices[potential_read_index] += 1

                        if candidate_mapping_indices[
                                potential_read_index] > max_score:
                            max_index = potential_read_index
                            max_score = candidate_mapping_indices[
                                potential_read_index]

                # Hit: jump a whole k-mer ahead.
                kmer_start_index += k
                kmer_end_index += k

            else:
                # Miss: slide one base and retry.
                kmer_start_index += 1
                kmer_end_index += 1

        # SAM positions are 1-based; an unmapped read (max_index == -1)
        # is emitted with position 0.
        sam.append_sam_output(read_name=read_name,
                              cigar="%dM" % len(read),
                              sequence_name=sequence_name,
                              position=max_index + 1)

    print("**MAPPING COMPLETE**")
Пример #14
0
 def __init__(self, code):
     """Parse *code* as HTML, tokenize its body and index the token values."""
     soup = bs4.BeautifulSoup(code, 'html.parser')
     self.bs = soup
     token_stream = self.tokenize(soup.body)
     self.tokens = token_stream
     self.start_tags, self.tokens_val = self.extract_tokens(token_stream)
     self.st = SuffixTree(self.tokens_val)