예제 #1
0
 def test_find_5(self, _, builder):
     # Count the matches ``find_all`` reports for a single-symbol query in
     # texts that consist of one repeated symbol.
     texts = {'A': 'aaaaa', 'B': 'bbbb'}
     tree = Tree(texts, builder=builder)
     for symbol, expected_hits in (('a', 5), ('b', 4)):
         self.assertEqual(len(tree.find_all(symbol)), expected_hits)
예제 #2
0
 def test_find_3(self, _, builder):
     # Gusfield1997 Figure 7.1 Page 129
     # Substrings of the indexed text must be found; 'ay' never occurs in it.
     tree = Tree({'A': 'xyxaxaxa'}, builder=builder)
     for present in ('xyxaxaxa', 'xax', 'axa'):
         self.assertTrue(tree.find(present))
     self.assertFalse(tree.find('ay'))
예제 #3
0
def generate_maximal_repeats_sequence(df, tictoc, min_length=1):
    """Return the maximal repeats found across the event variants of ``df``.

    The distinct values of ``df.event`` (space-separated symbol strings) are
    inserted into a suffix tree, and every maximal repeat the tree reports is
    returned in its string form.

    Args:
        df: DataFrame with an ``event`` column of space-separated symbols.
        tictoc: Timer object; ``tic()`` is called at the start and ``toc()``
            supplies the elapsed seconds for the progress message.
        min_length: Minimum number of space-separated elements a repeat must
            have to be kept. The default of 1 keeps every repeat, which is
            what the previous hard-coded ``> 0`` check did.
            NOTE(review): the original comment said "two or more elements";
            pass ``min_length=2`` if that is the intended filter.

    Returns:
        List of repeat strings, in the sorted order of the
        ``(count, path)`` pairs yielded by ``tree.maximal_repeats()``.
    """
    # Local import so the block does not depend on a module-level ``pd``.
    import pandas as pd

    tictoc.tic()

    # One row per distinct variant together with its number of occurrences.
    df_tree = df.event.value_counts().reset_index()
    df_tree.columns = ['event', 'count']

    # The tree is built from variants instead of individual traces. A variant
    # that occurs more than once is inserted exactly twice: two copies are
    # sufficient for its maximal repeats to be discovered.
    df_to_duplicate = df_tree[df_tree['count'] > 1]
    if not df_to_duplicate.empty:
        # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` is the
        # supported equivalent.
        df_tree = pd.concat([df_tree, df_to_duplicate], ignore_index=True)

    dict_tree = {
        index: variant.split(" ")
        for index, variant in enumerate(df_tree['event'])
    }

    tree = Tree(dict_tree)

    maximal_repeats_sequence = []
    for _count, path in sorted(tree.maximal_repeats()):
        sfx = str(path)
        if len(sfx.split(' ')) >= min_length:
            maximal_repeats_sequence.append(sfx)

    print("Suffix Tree generated.\t Processing Time: %.2fs" % (tictoc.toc()))

    return maximal_repeats_sequence
예제 #4
0
 def test_to_dot(self):
     # Smoke test: rendering the tree to DOT and writing it out must not
     # raise. The file is left behind so it can be inspected by hand.
     # Alternative inputs that have been used here:
     #   Tree ({ 'A' : 'aaaaa' })
     #   Tree ({ 'A' : 'ababcabcdabdeaef' })
     import os
     import tempfile

     tree = Tree({'A': 'xabxac', 'B': 'awyawxawxz'})
     dot = tree.to_dot()
     # Use the platform's temp directory instead of a hard-coded '/tmp'.
     dot_path = os.path.join(tempfile.gettempdir(), 'suffix_tree.dot')
     with open(dot_path, 'w') as tmp:
         tmp.write(dot)
예제 #5
0
def main():
    """Print the longest byte string shared by the ``sample.*`` files.

    Each matching file is read in binary mode and stored under its file name;
    the resulting mapping is turned into a generalized suffix tree, and the
    result of ``longest`` on the tree root is printed.
    """
    search_dir = ''
    contents_by_name = {}

    pattern = os.path.join(search_dir, 'sample.*')
    for filename in glob.glob(pattern):
        absolute = os.path.join(os.getcwd(), filename)
        with open(absolute, 'rb') as handle:
            contents_by_name[filename] = handle.read()
            print(filename)

    # To inspect the length of each file:
    #     for i in contents_by_name.values():
    #         print(len(list(i)))

    # Generalized suffix tree keyed by file name.
    t2 = Tree(contents_by_name)

    # For the original sample data the author recorded:
    #   length of the longest common byte substring: 27648
    #   files: sample.3 and sample.2
    #   starting offsets in those files: 17408, 3072
    result = longest(t2.root)
    print('the length of the longest common substring(strand):', result[0][0])
    print('the files containing the strings are :', result[1])
    print('the starting offset in the respective files are:', result[2])
def melody_identifier(pm, note_shift):
    """Run the melody-identification pipeline on ``pm``.

    The note dictionary of ``pm`` is indexed in a suffix tree; the tree's
    paths are collected and filtered, turned into a weighted sequence using
    ``note_shift``, and the final answer comes from
    ``get_melody_instrument`` together with the integer note dictionary.
    """
    note_symbols = get_notes_dict(pm)
    note_ints = get_notes_dict_int(pm)
    kept_paths = filter_paths(grab_paths(Tree(note_symbols)))
    return get_melody_instrument(
        construct_weighted_seq(kept_paths, note_shift), note_ints)
예제 #7
0
def get_BFMT(file=INPUT, n_docs=N_DOCS):
    """Build a suffix tree over the normalised documents of a CSV file.

    Column 5 of the first ``n_docs`` rows of ``file`` is read, each entry is
    lower-cased and every character with a code point of 128 or above is
    replaced by a space.

    Returns:
        ``(tree, docs)`` where ``docs`` is the list of normalised strings and
        ``tree`` indexes them under their list position.
    """
    def _normalise(text):
        # Lower-case first, then blank out anything outside 7-bit ASCII.
        lowered = text.lower()
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in lowered)

    column = pd.read_csv(file, header=None, engine='python', nrows=n_docs)[5]
    docs = list(column.apply(_normalise))
    tree = Tree({position: doc for position, doc in enumerate(docs)})
    return tree, docs
예제 #8
0
 def test_1(self):
     """
     Test the longest string common to 2 strings
     """
     tree = Tree({'1': 'xabzxa', '2': 'yabz'})
     result = longest(tree.root)
     # First entry of each result slot; presumably length, string ids and
     # start offsets, in that order.
     expected_firsts = (3, ['1', '2'], [1, 1])
     for slot, expected in enumerate(expected_firsts):
         self.assertEqual(result[slot][0], expected)
예제 #9
0
 def test_2(self):
     """
     Test the longest string common to 2 strings where string 1 repeats part of itself
     """
     tree = Tree({'1': 'axabxzaxabx', '2': 'ybxzxda'})
     result = longest(tree.root)
     # First entry of each result slot; presumably length, string ids and
     # start offsets, in that order.
     expected_firsts = (3, ['1', '2'], [3, 1])
     for slot, expected in enumerate(expected_firsts):
         self.assertEqual(result[slot][0], expected)
예제 #10
0
 def test_3(self):
     """
     Test the longest string common to 3 strings
     """
     tree = Tree({'1': 'xabzxabxab', '2': 'yabzdxabx', "3": "zavxabx"})
     result = longest(tree.root)
     # First entry of each result slot; presumably length, string ids and
     # start offsets, in that order.
     expected_firsts = (4, ['1', '2', '3'], [4, 5, 3])
     for slot, expected in enumerate(expected_firsts):
         self.assertEqual(result[slot][0], expected)
예제 #11
0
 def test_lca(self):
     # Each case names two nodes via ``tree.nodemap[id][index]`` and the id
     # expected on their lowest common ancestor.
     tree = Tree({'A': 'xabxac', 'B': 'awyawxawxz'})
     tree.prepare_lca()
     cases = (
         (('A', 1), ('B', 3), 8),
         (('A', 0), ('B', 8), 2),
         (('B', 1), ('B', 7), 19),
         (('A', 0), ('B', 7), 1),
     )
     for (first_id, first_idx), (second_id, second_idx), ancestor_id in cases:
         ancestor = tree.lca(tree.nodemap[first_id][first_idx],
                             tree.nodemap[second_id][second_idx])
         self.assertEqual(ancestor.id, ancestor_id)
예제 #12
0
 def test_find_7(self, _, builder):
     # Every text is a suffix of 'abcde', so a shorter suffix is expected to
     # be matched in more of the texts.
     texts = {
         'A': 'abcde',
         'B': 'bcde',
         'C': 'cde',
         'D': 'de',
         'E': 'e',
     }
     tree = Tree(texts, builder=builder)
     expected_hits = (
         ('abcde', 1),
         ('bcde', 2),
         ('cde', 3),
         ('de', 4),
         ('e', 5),
     )
     for query, hits in expected_hits:
         self.assertEqual(len(tree.find_all(query)), hits)
예제 #13
0
 def test_find_6(self, _, builder):
     # Every text is a prefix of 'abcde', so a shorter prefix is expected to
     # be matched in more of the texts.
     texts = {
         'A': 'a',
         'B': 'ab',
         'C': 'abc',
         'D': 'abcd',
         'E': 'abcde',
     }
     tree = Tree(texts, builder=builder)
     expected_hits = (
         ('abcde', 1),
         ('abcd', 2),
         ('abc', 3),
         ('ab', 4),
         ('a', 5),
     )
     for query, hits in expected_hits:
         self.assertEqual(len(tree.find_all(query)), hits)
예제 #14
0
    def test_maximal_repeats_2(self, _, builder):
        # Five texts sharing 'abc'; the expected repeats are handed to the
        # ``maximal_repeats`` helper of this test case for comparison.
        texts = {
            'A': '1abc2abc3',
            'B': '4abc5abc6',
            'C': '7abc',
            'D': '7abc',
            'E': 'abc',
        }
        expected = ['2 7 a b c', '5 a b c']

        tree = Tree(texts, builder=builder)
        self.maximal_repeats(tree, expected)
예제 #15
0
 def test_find_4(self, _, builder):
     # Sequences of multi-character symbols: each text is a list of tokens
     # and queries are token lists as well.
     raw_texts = {
         'A': ('232 020b 092 093 039 061 102 135 098 '
               '099 039 040 039 040 044 141 140 098'),
         'B': '097 098 039 040 041 129 043',
         'C': ('097 098 039 040 020a 022 023 097 095 '
               '094 098 043 044 112 039 020b 039 098'),
     }
     tokenised = {key: text.split() for key, text in raw_texts.items()}
     tree = Tree(tokenised, builder=builder)

     present = (
         '039 040 041',
         '039 040 039 040',
         '020a 022 023',
         '232 020b 092',
         '097 098 039 040',
         '141 140 098',
     )
     for query in present:
         self.assertTrue(tree.find(query.split()))
     self.assertFalse(tree.find('039 040 042'.split()))
예제 #16
0
    def test_find_1(self, _, builder):
        # Gusfield1997 Figure 5.1 Page  91
        tree = Tree({'A': 'xabxac'}, builder=builder)

        # Every prefix of the text, then every remaining suffix.
        prefixes = ('x', 'xa', 'xab', 'xabx', 'xabxa', 'xabxac')
        suffixes = ('abxac', 'bxac', 'xac', 'ac', 'c')
        for query in prefixes + suffixes:
            self.assertTrue(tree.find(query))

        # Strings that do not occur anywhere in the text.
        for query in ('d', 'xx', 'xabxaa'):
            self.assertFalse(tree.find(query))
class LongestPalindromeFinder(tk.Frame):
    """Tk frame that finds the longest palindrome in a typed or loaded sequence.

    The sequence (string id 1) and its reverse (string id 2) are indexed in
    one suffix tree. For every palindrome centre, the lowest common ancestor
    of the leaf that reads rightwards from the centre and the leaf that reads
    leftwards from it yields the palindrome's arm.
    """

    def __init__(self, master=None):
        super().__init__(master=master)

        # Row 0-1: label and the text box holding the sequence.
        tk.Label(self, text='Sequence').grid(row=0, column=0, sticky='w')
        self.sequence_text = tk.Text(self)
        self.sequence_text.grid(row=1, column=0, columnspan=2, sticky='nsew')

        # Row 2: load the sequence from a file.
        self.sequence_load_btn = tk.Button(
            self, text='Choose file', command=self.choose_sequence_file)
        self.sequence_load_btn.grid(row=2, column=1, sticky='e')

        # Row 3: action buttons, grouped in their own frame.
        buttons_frame = tk.Frame(self)
        self.run_btn = tk.Button(master=buttons_frame,
                                 text='Run', command=self.run)
        self.run_btn.grid(row=0, column=1)

        self.export_btn = tk.Button(
            master=buttons_frame, text='Export Tree', command=self.export_tree)
        self.export_btn.grid(row=0, column=0)
        buttons_frame.grid(row=3, column=0, columnspan=2, sticky='e')

        for i in range(2):
            self.grid_columnconfigure(i, weight=1)

        for i in range(4):
            self.grid_rowconfigure(i, weight=1)

        # Set by load_sequence() / construct_tree().
        self.sequence = None
        self.tree = None

    def choose_sequence_file(self):
        """Replace the text box content with the content of a chosen file.

        The text box is only touched once the file has been read, so
        cancelling the dialog or picking a missing file keeps the current
        input.
        """
        filename = filedialog.askopenfilename(parent=self)
        if not filename:
            # Dialog cancelled.
            messagebox.showwarning(
                title='Bad file', message='No files selected')
            return
        try:
            with open(filename, 'r') as sequence_file:
                contents = sequence_file.read()
        except FileNotFoundError:
            messagebox.showwarning(
                title='Bad file', message='No files selected')
            return
        self.sequence_text.delete(1.0, tk.END)
        self.sequence_text.insert(tk.END, contents)

    def load_sequence(self):
        """Copy the text box content into ``self.sequence``, stripping
        surrounding newlines and spaces."""
        self.sequence = self.sequence_text.get(1.0, tk.END).strip("\n ")

    def construct_tree(self):
        """Index the sequence (id 1) and its reverse (id 2) in one tree."""
        self.tree = Tree({1: self.sequence, 2: reversed(self.sequence)})

    def initialize(self):
        """Load the sequence and build the tree.

        An empty sequence is reported in an error dialog and no tree is
        built; callers can detect this by checking ``self.sequence``.
        """
        self.load_sequence()
        if not self.sequence:
            messagebox.showerror(
                title='Bad input', message='Invalid input sequence')
            return
        self.construct_tree()

    def run(self):
        """Find the longest palindrome and pass it to ``output``."""
        self.initialize()
        if not self.sequence:
            # initialize() already reported the problem; there is no tree
            # for the current input to search.
            return
        result = str(self.find_the_longest_palindrome()).replace(' ', '')
        output(result, 'longest_palindrome_found.txt')

    def build_neighbor_leaves(self):
        """Pair up the leaves that belong to each palindrome centre.

        Returns:
            ``(odd, even)``. ``odd`` holds ``(leaf1, leaf2)`` for every
            position ``i``: the leaf of the sequence starting at ``i`` and the
            leaf of the reversed sequence starting at the mirrored position.
            ``even`` holds ``(middle, leaf1, leaf2)`` for every ``i`` where
            ``sequence[i - 1] == sequence[i]``; ``middle`` is that repeated
            symbol and the leaves start just outside the equal pair.
        """
        leaves = {}

        def f(node):
            # Index leaves by (string id, start offset of their path).
            if node.is_leaf():
                leaves[(node.str_id, node.path.start)] = node

        self.tree.root.post_order(f)
        n = len(self.sequence)
        odd_neighbor_leaves = [
            (leaves[1, i], leaves[2, n - 1 - i]) for i in range(n)]
        even_neighbor_leaves = [
            (self.sequence[i], leaves[1, i + 1], leaves[2, n + 1 - i])
            for i in range(1, n - 1)
            if self.sequence[i - 1] == self.sequence[i]]
        return odd_neighbor_leaves, even_neighbor_leaves

    def find_the_longest_odd_palindrome(self, neighbor_leaves):
        """Return the longest odd-length palindrome as a string.

        The deepest lowest common ancestor over all leaf pairs gives the arm
        including the centre symbol; the result mirrors it around the centre.
        Returns ``''`` when there are no leaf pairs.
        """
        candidate_node = None
        for node1, node2 in neighbor_leaves:
            node = self.tree.lca(node1, node2)
            if candidate_node is None or len(node) > len(candidate_node):
                candidate_node = node
        if candidate_node is None:
            # Avoid stringifying None into a bogus 'None' palindrome.
            return ''
        result = str(candidate_node).replace(' ', '')
        # Mirror the arm without repeating the centre symbol.
        result = (result[::-1])[:-1] + result
        return result

    def find_the_longest_even_palindrome(self, neighbor_leaves):
        """Return the longest even-length palindrome as a string.

        Returns ``''`` when the sequence has no two equal adjacent symbols
        among the examined positions.
        """
        if not neighbor_leaves:
            return ''
        candidate_node = None
        middle = None
        for mid, node1, node2 in neighbor_leaves:
            node = self.tree.lca(node1, node2)
            if candidate_node is None or len(node) > len(candidate_node):
                candidate_node = node
                middle = mid
        if candidate_node == self.tree.root:
            # Nothing matches beyond the equal pair itself.
            candidate_node = ''
        result = str(candidate_node).replace(' ', '')
        result = result[::-1] + (middle * 2) + result
        return result

    def find_the_longest_palindrome(self):
        """Return the longer of the best odd and best even palindrome
        (the odd one on a tie)."""
        odd_neighbor_leaves, even_neighbor_leaves = self.build_neighbor_leaves()
        self.tree.prepare_lca()
        odd_palindrome = self.find_the_longest_odd_palindrome(odd_neighbor_leaves)
        even_palindrome = self.find_the_longest_even_palindrome(even_neighbor_leaves)

        return max(odd_palindrome, even_palindrome, key=len)

    def export_tree(self):
        """Render the suffix tree of the current sequence to a PDF chosen by
        the user and open it."""
        self.initialize()
        if not self.sequence:
            # Invalid input was already reported; nothing to export.
            return
        filename = filedialog.asksaveasfilename(parent=self.master)
        if not filename:
            # Save dialog cancelled.
            return
        graphviz.Source(self.tree.to_dot()).render(
            filename=filename, format='pdf', view=True, cleanup=True)
예제 #18
0
# def pysequitur_map(str_seq_unique, win_len):
#     print(str_seq_unique[0:20])
#     total_str = np.concatenate([str_seq_unique[0:-win_len//2:-1], str_seq_unique, str_seq_unique[-win_len//2:-1:-1]]).tolist()
#     print(len(str_seq_unique))
#     print(len(total_str))
#
#     for i in range(win_len//2, len(total_str)-win_len//2):
#         temp_str = total_str[i-win_len//2:i+win_len//2]
#         print(temp_str)
#         s_temp = psq.Sequencer2(temp_str)
#         print(s_temp.get())
#         print(s_temp.resolve())
# psq.print_grammar(s_temp)

# Build a suffix tree over the first 20 entries of ch1_word_lst (string id
# "A") and prepare it for lowest-common-ancestor queries.
suffixtree = Tree({"A": ch1_word_lst[:20]})
suffixtree.prepare_lca()

from graphviz import Source

# Render the tree's DOT description to "test.gv" and open the result in a
# viewer (view=True).
dot_object = suffixtree.to_dot()
s = Source(dot_object)
s.render("test.gv", view=True)

# pysequitur_map(ch1_word_lst, 20)

# Pass the space-separated symbols and ch1_count_lst through
# removeShortSegments, then run-length encode the result.
# NOTE(review): both helpers are defined outside this view; their exact
# semantics are inferred from their names only.
ch1_clean_str = removeShortSegments(ch1_connotated_lst_unique.split(" "),
                                    ch1_count_lst)
ch1_clean_str_unique = runLengthEncoding(ch1_clean_str)

# print(ch1_connotated_lst)
예제 #19
0
 def test_maximal_repeats_1(self, _, builder):
     # See [Gusfield1997]_ §7.12.1, 144ff.
     texts = {'A': 'xabxac', 'B': 'awyawxawxz'}
     expected = ['2 x a']
     tree = Tree(texts, builder=builder)
     self.maximal_repeats(tree, expected)
예제 #20
0
 def test_find_2(self, _, builder):
     # Gusfield1997 Figure 5.2 Page  92
     tree = Tree({'A': 'awyawxawxz'}, builder=builder)
     for present in ('awx', 'awy'):
         self.assertTrue(tree.find(present))
     self.assertFalse(tree.find('awz'))
 def construct_tree(self):
     # Index the sequence (id 1) together with its reverse (id 2) in one tree.
     forward = self.sequence
     self.tree = Tree({1: forward, 2: reversed(forward)})
예제 #22
0
    def test_maximal_repeats_3(self, _, builder):
        # pylint: disable=line-too-long
        # Each witness is written as one whitespace-separated string and
        # split into its list of symbols before the tree is built.
        raw_witnesses = {
            'berlin-sb-lat-qu-931': '157 165 167 165',
            'cava-dei-tirreni-bdb-4': (
                '232 020b 092 093 039 061 102 135 '
                '098 099 039 040 039 040 044 141 '
                '140 098 121 098 090 095 134 135 '
                '088 104 139 140 067 091 094 158 '
                '159 161 162 163 164 165 158 165 '
                '161'),
            'gotha-flb-memb-i-84': (
                'A000 176 039 040 020a 022 023 097 '
                '095 094 098 043 044 112 039 020b '
                '039 098 140 141 138 139 A000 196 '
                '112 105 134 157 166 163 175 165 '
                '191 192 193 209 210 228 213 212 '
                '216 217'),
            'heiligenkreuz-sb-217': (
                '249 068 134 157 165 167 201 165 '
                '188 191 193 188 192 016 015 013 '
                '055 056 042 052 066 020a 078 191 '
                '131 192 195 266 242 293 273b 287 '
                '094 040 020a 023 095 094 098 020b '
                '157 163 165 196'),
            'ivrea-bc-xxxiii': (
                '068 020a 022 023 095 094 021 097 '
                '098 039 040 039 040 041 129 043 '
                '044 112 088 104 134 135 139'),
            'ivrea-bc-xxxiv': (
                '022 020a 022 023 095 094 021 097 '
                '098 039 040 039 040 041 129 043 '
                '098 099 101 100 020a 044 112 088 '
                '134 135 060 140 141 138 139 157 '
                '165 164 163 093 201 159'),
            'modena-bc-o-i-2': (
                '163 039 040 020a 022 023 097 095 '
                '094 098 043 044 112 039 020b 039 '
                '098 157 163 165 191 192 193 196 '
                '216 217'),
            'muenchen-bsb-lat-19416': (
                '020a 093 020a 022 023 095 046 067 '
                '047 039 040 039 040 041 129 043 '
                '044 112 099 097 098 103 093 105 '
                '191 192 193 201'),
            'muenchen-bsb-lat-29555-1': (
                'M024 163 175 165 M015 196 193 209 '
                '210 228 273 224'),
            'muenchen-bsb-lat-3853': (
                '249 068 134 157 165 167 201 165 '
                '188 191 193 188 192 016 015 013 '
                '055 056 042 052 066 020a 078 191 '
                '131 192 195 252a 266 242 293 273b '
                '287 094 040 020a 023 095 094 098 '
                '020b 157 163 165 196'),
            'muenchen-bsb-lat-6360': '157 165 167 201 165',
            'paris-bn-lat-3878': (
                '249 157 165 167 188 191 293 165 '
                '196 252a'),
            'paris-bn-lat-4613': (
                '020a 094 095 096 025 098 039 040 '
                '022 023 105 033 121 141 191 192 '
                '158 165 201 215 214 219'),
            'salzburg-bea-st-peter-a-ix-32': (
                '252 191 201 M023 215 210 093 112 '
                '293 252a 010 011 293'),
            'st-gallen-sb-733': '020a 089 022 023 097 094',
            'st-paul-abs-4-1': (
                '092 093 020a 094 095 088 044 046 '
                '067 098 099 041 043 044 039 098 '
                '134 090 113 139 140 141 140 139 '
                '158 140 165 181'),
            'vatikan-bav-chigi-f-iv-75': (
                '232 020b 092 093 039 061 102 135 '
                '098 099 039 040 039 040 044 141 '
                '140 098 090 134 135 088 104 139 '
                '140 067 091 094 095 158 159 098 '
                '159 162 163 164 165 161 093 201 '
                '020a 095 088 044 046 099 041 043 '
                '044 141 158 208'),
            'vatikan-bav-reg-lat-1000b': '157 165',
            'vatikan-bav-reg-lat-263': (
                '202 087 044 224 A000 163 093 224 '
                '225 232 020b'),
            'vatikan-bav-vat-lat-5359': '201 165',
            'vercelli-bce-clxxiv': '167 092 093 165 163 166',
            'vercelli-bce-clxxv': '163',
            'wien-oenb-ser-n-3761': '157',
            'wolfenbuettel-hab-blankenb-130': (
                '022 043 044 060 103 097 095 067 '
                '040 020a 112 038 023 094 041 020a '
                '105 098 057 014 114 039 040 039 '
                '040 139 140 156 178 143 150 138 '
                '179 174 157 160 165 228 164 163 '
                '093 166 159 102 201 158 161 093 '
                '191 192 193 201 158 180 210 211 '
                '214 219 202 022 023 095 098 041 '
                '129 043 044 094 215 131'),
        }
        witnesses = {
            name: symbols.split()
            for name, symbols in raw_witnesses.items()
        }
        tree = Tree(witnesses, builder=builder)

        expected = [
            '4 022 023 095',
            '4 023 095 094',
            '4 041 129 043',
            '4 043 044 112',
            '4 092 093',
            '4 095 094 098',
            '4 098 099',
            '4 134 135',
            '4 139 140',
            '4 191 192 193',
            '4 201 165',
            '5 020a 022 023',
            '5 040 020a',
            '5 044 112',
            '5 157 165 167',
            '5 191 192',
            '6 023 095',
            '6 039 040 039 040',
            '6 095 094',
            '7 043 044',
            '7 157 165',
            '8 022 023',
            '9 039 040',
        ]
        self.maximal_repeats(tree, expected, min_cv=4)