Пример #1
0
    def transform(self, s):
        """Return the Burrows-Wheeler transform of ``s`` built via a SuffixTree.

        ``self.EOS`` is appended to ``s`` as an end-of-text marker, the marked
        string is added to a fresh ``SuffixTree`` and ``self._walk`` is run on
        the tree root.

        NOTE(review): ``_walk`` is not visible here; per the original comments
        it is expected to return the length of every suffix, in sorted-suffix
        order -- confirm against its implementation.

        :param s: input text; must not contain ``self.EOS``.
        :returns: the transformed text, one character per suffix length
            returned by ``_walk``.
        :raises AssertionError: if ``s`` already contains ``self.EOS``.
        """
        assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

        # add end of text marker
        s += self.EOS

        # original author's note: construction is O(n * log n) here and can
        # also be done in O(n) time
        st = SuffixTree()
        st.add(s)

        # walk the tree to obtain the suffix lengths
        lens = self._walk(st.root)

        # The last-column letter sits immediately left of the suffix, i.e.
        # len(suffix) + 1 characters from the end of the marked string.  A
        # length equal to the number of entries has no character to its left
        # in ``s``, so the marker itself is emitted for it.
        total = len(lens)
        return ''.join(
            self.EOS if length == total else s[-length - 1]
            for length in lens
        )
Пример #2
0
    def transform(self, s):
        """Compute the Burrows-Wheeler transform of ``s`` with a SuffixTree.

        The method appends ``self.EOS`` to ``s``, adds the result to a new
        ``SuffixTree`` and calls ``self._walk`` on the tree root to obtain a
        list of suffix lengths.

        NOTE(review): the ordering of the values returned by ``_walk`` is not
        visible in this block; the original comments say it yields the sorted
        suffixes' lengths -- confirm.

        :param s: text to transform; must not already contain ``self.EOS``.
        :returns: a string with one character per value returned by
            ``_walk``.
        :raises AssertionError: if ``self.EOS`` occurs in ``s``.
        """
        assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

        # add end of text marker
        s += self.EOS

        # original author's note: tree construction is O(n * log n) and can
        # also be done in O(n) time
        st = SuffixTree()
        st.add(s)

        # only the length of each suffix is needed
        lens = self._walk(st.root)

        # The last-column letter is the one just left of the suffix, so it is
        # found len(suffix) + 1 positions from the end of ``s``.  When the
        # length equals the number of entries there is nothing to the left,
        # and the end-of-text marker is used instead.
        count = len(lens)
        last_column = [
            self.EOS if length == count else s[-length - 1]
            for length in lens
        ]
        return ''.join(last_column)
Пример #3
0
 def __init__(self, dna):
     """Keep the sequence, create the suffix tree and build the lookup tables.

     :param dna: the sequence to index; its length is passed to the
         ``SuffixTree`` constructor.
     """
     self.dna = dna
     self.suffix_tree = SuffixTree(len(dna))
     # All tables start out empty; init_self() is invoked last, after every
     # attribute exists.
     self.suffix_array, self.first_col = [], []
     self.bwt, self.ltof = [], []
     self.init_self()
Пример #4
0
def suffix_search_lcs(a, b):
    """Return the length of the longest substring shared by ``a`` and ``b``.

    A ``SuffixTree`` is built over ``a`` and queried with windows of ``b``.
    The candidate length is first doubled until a length fails (or exceeds
    ``len(a)``), then a right-most binary search pins down the largest
    length for which some window of ``b`` is found in the tree.

    NOTE(review): the meaning of the first ``SuffixTree`` argument
    (``False``) is not visible here -- presumably a case-handling flag;
    confirm against the library.

    :param a: sequence indexed by the suffix tree.
    :param b: sequence whose windows are looked up in the tree.
    :returns: the longest common substring length; ``0`` when nothing is
        shared (never negative).
    """
    # left and right bounds, max sizes
    len_a, len_b = len(a) + 1, len(b) + 1
    short = min(len(a), len(b))

    tree = SuffixTree(False, [a])

    # returns if there is a common substring of length m between a, b
    def found_common(m):
        # The empty substring is common to any two strings.  Answering this
        # locally avoids querying the tree with '' (whose result is
        # library-dependent) and keeps the search from ending at -1.
        if m == 0:
            return True
        return any(tree.findStringIdx(b[i - m:i]) for i in range(m, len_b))

    # exponentially increase the bounds: lo - 1 is the last length known to
    # succeed, hi is the first length that failed or overshot
    lo, hi = 0, 1
    while hi < len_a and found_common(hi):
        lo, hi = hi + 1, hi * 2
    hi = min(hi, short)

    # right-most binary search on if substring length is possible
    while lo <= hi:
        mid = (lo + hi) // 2

        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    return hi
Пример #5
0
class Notebook(object):
    """
    Notebook object that represents a single text document that is loaded
    into memory to allow the program to query and report information about
    its contents. The core logic of the program should be found within this
    class.
    """

    def __init__(self, file_path):
        """Remember ``file_path``, create an empty suffix tree and parse the file.

        :param file_path: path of the text document to load.
        """
        self.file_path = file_path

        self.suffix_tree = SuffixTree()
        self._parse_file(self.file_path)

    def reload(self, file_path=None):
        """Discard the indexed contents and parse the document again.

        :param file_path: optional new path; when truthy it replaces the
            stored ``file_path`` before parsing.
        """
        if file_path:
            self.file_path = file_path

        self.suffix_tree.root = Node()  # Restart the suffix tree
        self._parse_file(self.file_path)

    def _parse_file(self, file_path):
        """Split every line of ``file_path`` into words and index them.

        A word is a maximal run of characters that are neither in
        ``WHITE_KEYS`` nor in ``SPECIAL_CHARS``.  Each word is handed to
        ``_add_suffixes`` together with its starting column and its
        zero-based line number.

        :param file_path: path of the file to read.
        """
        with open(file_path, 'r') as notebook:
            # iterate the file lazily instead of materialising all lines
            for line_no, line in enumerate(notebook):
                start = -1  # column where the current word began, -1 = none

                for col, char in enumerate(line):
                    if char in WHITE_KEYS or char in SPECIAL_CHARS:
                        if start != -1:
                            self._add_suffixes(line[start:col], start, line_no)
                            start = -1
                    elif start == -1:
                        start = col

                # A word that runs to the end of the line (e.g. the last
                # line of a file without a trailing newline) has no
                # delimiter after it; flush it so it is not lost.
                if start != -1:
                    self._add_suffixes(line[start:], start, line_no)

    # Adds a word and all of its suffixes
    def _add_suffixes(self, word, position, line_no, whole_word=True):
        """Add the lower-cased ``word`` to the suffix tree.

        Empty words are ignored.  The payload stored with the word is
        ``((position, line_no), whole_word)``.

        :param word: the word to add.
        :param position: column at which the word starts in its line.
        :param line_no: zero-based line number of the word.
        :param whole_word: flag stored alongside the location.
        """
        if not word:
            return

        self.suffix_tree.add_word(word.lower(), ((position, line_no), whole_word))
Пример #6
0
def suffix_search_lcs(a, b):
    """Return the longest-common-substring length of ``a`` and ``b``, tracing progress.

    A ``SuffixTree`` is built over ``a`` and probed with windows of ``b``.
    The candidate length is doubled until it fails (or exceeds ``len(a)``),
    then a right-most binary search finds the largest workable length.
    Progress and the resulting common substrings are printed to stdout.

    NOTE(review): the meaning of the first ``SuffixTree`` argument
    (``True``) is not visible here -- presumably a case-handling flag;
    confirm against the library.

    :param a: sequence indexed by the suffix tree.
    :param b: sequence whose windows are looked up in the tree.
    :returns: the longest common substring length; ``0`` when nothing is
        shared (never negative).
    """
    # left and right bounds, max sizes
    len_a, len_b = len(a) + 1, len(b) + 1
    short = min(len(a), len(b))

    tree = SuffixTree(True, [a])
    print('Completed suffix tree')

    # returns if there is a common substring of length m between a, b
    def found_common(m):
        # The empty substring is always common.  Answering locally avoids
        # querying the tree with '' (library-dependent result) and keeps the
        # binary search from finishing at -1.
        if m == 0:
            return True
        return any(tree.findStringIdx(b[i - m:i]) for i in range(m, len_b))

    # exponentially increase the bounds: lo - 1 is the last length known to
    # succeed, hi is the first length that failed or overshot
    lo, hi = 0, 1
    print(lo, hi)
    while hi < len_a and found_common(hi):
        lo, hi = hi + 1, hi * 2
        print(lo, hi)
    hi = min(hi, short)
    print(lo, hi)

    # right-most binary search on if substring length is possible
    while lo <= hi:
        mid = (lo + hi) // 2
        print(mid)

        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    # report every distinct window of b with the winning length that the
    # tree contains
    print('Longest Common Substrings:')
    print('\n'.join(
        set(b[i - hi:i] for i in range(hi, len_b)
            if tree.findStringIdx(b[i - hi:i]))))

    return hi
Пример #7
0
@dataclass
class Point:
    """A hashable pair of coordinates, ``lat`` and ``lon``.

    Two points are equal when both fields are equal; the hash is derived
    from the same two fields so equal points always hash alike.
    """

    lat: float
    lon: float

    def __hash__(self) -> int:
        """Return a hash built from the ``(lat, lon)`` tuple."""
        return hash((self.lat, self.lon))

    def __eq__(self, o: object) -> bool:
        """Compare field by field.

        Comparing hashes is not enough: distinct values can share a hash
        (in CPython ``hash(-1) == hash(-2)``), which would make different
        points compare equal.  Non-``Point`` operands yield
        ``NotImplemented`` so Python can try the reflected comparison.
        """
        if not isinstance(o, Point):
            return NotImplemented
        return (self.lat, self.lon) == (o.lat, o.lon)

    def __str__(self) -> str:
        """Return the point formatted as ``"lat,lon"``."""
        return f'{self.lat},{self.lon}'


# Build a suffix tree over a sequence of Point objects rather than over
# characters.  The sequence deliberately repeats the pair
# Point(1,1), Point(1,0) so the tree has a shared prefix to represent.
s = SuffixTree()
s.generate(
    (
        Point(1,1),
        Point(1,0),
        Point(0,1),
        Point(1,1),
        Point(1,0),
        Point(0,0),
    )
)
# s.generate('MISSISSIPPI$')

# annotate graph for visualization
# Visits ids 1 .. s.order() - 1 and looks up each one's parent id.
# NOTE(review): id 0 is skipped -- presumably the root, which has no parent;
# confirm against SuffixTree.
for i in range(1, s.order()):
    parent = s.parent_id(i)
Пример #8
0
'''
Created on Oct 23, 2018

@author: ckennington
'''
from stlm import STLM
from suffixtree import SuffixTree
from sequence import Sequence

# Demo script: feed a short token sequence into a SuffixTree, print it,
# then wrap it in an STLM and iterate over a few test token sequences.
trie = SuffixTree()

# whitespace-separated tokens: ['c', 'a', 'c', 'a', 'o']
text = 'c a c a o'.split()

# add the tokens to the trie one at a time, logging each addition
for w in text:
    print('adding', w)
    trie.add(w)

print('\n')
trie.print_tree()
print('\n')

# called once, after all tokens have been added and before the STLM is built
trie.update_all_counts()

stlm = STLM(trie)

# token sequences to run through the model, each a list of tokens
tests = [
    'c a'.split(), 'c a o'.split(), 'a o'.split(), 'o'.split(), 'c'.split()
]

# a fresh Sequence is created for every test sequence
for test in tests:
    seq = Sequence()
Пример #9
0
class Mapper:
    """Index a DNA string so that patterns can be located with BWT backward search.

    The constructor builds a suffix tree over ``dna``, derives the suffix
    array from it and then the first column, the BWT (last column) and the
    last-to-first mapping used by ``map``.
    """

    def __init__(self, dna):
        """Store ``dna`` and build every lookup table.

        :param dna: the sequence to index; its length is passed to the
            ``SuffixTree`` constructor.
        """
        self.dna = dna
        self.suffix_tree = SuffixTree(len(dna))
        self.suffix_array = []
        self.first_col = []
        self.bwt = []
        self.ltof = []
        self.init_self()

    def init_self(self):
        """initializes lists needed for mapping"""
        for c in self.dna:
            self.suffix_tree.add_char(c)
        root = self.suffix_tree.nodes[self.suffix_tree.root]
        self.traverse_tree(root, root.start)
        self.first_col = create_subscripts(
            [self.dna[x] for x in self.suffix_array])
        self.bwt = create_subscripts([
            self.dna[x - 1] if x > 0 else self.dna[-1]
            for x in self.suffix_array
        ])
        # Map every BWT entry to the index of its first occurrence in the
        # first column.  A dictionary built once replaces a linear
        # ``list.index`` scan per entry (quadratic overall); ``setdefault``
        # keeps the first index, matching ``list.index``.
        first_index = {}
        for idx, label in enumerate(self.first_col):
            first_index.setdefault(label, idx)
        self.ltof = [first_index[label] for label in self.bwt]

    def traverse_tree(self, node, char_depth):
        """traverse the tree recursively to generate suffix array

        Leaves (nodes without edges) append ``node.start - char_depth - 1``
        to ``self.suffix_array``.  Children are visited in the fixed order
        ``$, A, C, G, T``; characters without an outgoing edge are skipped.

        :param node: the tree node to expand.
        :param char_depth: depth value accumulated on the path to ``node``.
        """
        if not node.edges:
            self.suffix_array.append(node.start - char_depth - 1)
            return
        for char in ('$', 'A', 'C', 'G', 'T'):
            # Only the edge lookup is guarded, so an unrelated KeyError
            # raised elsewhere is not silently swallowed.
            try:
                next_node = self.suffix_tree.nodes[node.edges[char]]
            except KeyError:
                continue
            self.traverse_tree(next_node,
                               char_depth + (node.end - node.start))

    def get_position_range(self, char, list_type, start, end):
        """returns range of positions for a char in specified type of list

        Scans ``list_type[start..end]`` forwards for the first entry that
        starts with ``char`` and backwards (from ``end`` down to
        ``start + 1``) for the last one.

        :returns: ``[]`` when nothing matches, otherwise a list whose first
            and last elements are the first and last matching indices (the
            list holds one or two items).
        """
        positions = []
        for index in range(start, end + 1):
            if list_type[index].startswith(char):
                positions.append(index)
                break
        for index in range(end, start, -1):
            if list_type[index].startswith(char):
                positions.append(index)
                break
        return positions

    def map(self, pattern):
        """find a given pattern in the genome

        Performs a backward search: the pattern is consumed from its last
        character to its first, narrowing a range of suffix-array rows.

        :param pattern: the string to look for.
        :returns: list of suffix-array values (positions in ``dna``) for
            every row in the final range; ``[]`` when the pattern is empty
            or does not occur.
        """
        found_positions = []
        if not pattern:
            # nothing to search for; avoids indexing into an empty string
            return found_positions

        rev_pattern = pattern[::-1]
        bounds = self.get_position_range(rev_pattern[0], self.first_col, 0,
                                         len(self.first_col) - 1)
        if not bounds:
            return found_positions
        # get_position_range only reports the first and last matching row.
        # Expand to the full contiguous range so that a one-character
        # pattern reports every occurrence (and none twice).
        current_positions = range(bounds[0], bounds[-1] + 1)

        for i in range(1, len(rev_pattern)):
            bwt_positions = self.get_position_range(
                rev_pattern[i], self.bwt, current_positions[0],
                current_positions[-1])
            if not bwt_positions:
                # may not be able to find, simply return so don't have to
                # go through entire pattern
                return found_positions
            current_positions = range(self.ltof[bwt_positions[0]],
                                      self.ltof[bwt_positions[-1]] + 1)

        # after last char, push every position in SA to found_positions
        found_positions.extend(self.suffix_array[i] for i in current_positions)
        return found_positions
Пример #10
0
 def __init__(self, file_path):
     """Record the document path, create an empty suffix tree and parse the file.

     :param file_path: path of the document handed to ``_parse_file``.
     """
     # The tree must exist before parsing starts, so it is created first.
     self.suffix_tree = SuffixTree()
     self.file_path = file_path
     self._parse_file(self.file_path)