Exemplo n.º 1
0
 def test_find_5(self, _, builder):
     """Check how many matches ``find_all`` reports for a repeated symbol.

     'a' is searched in a tree holding 'aaaaa' (5 matches expected) and
     'b' in the same tree holding 'bbbb' (4 matches expected).
     """
     texts = {'A': 'aaaaa', 'B': 'bbbb'}
     tree = Tree(texts, builder=builder)
     for symbol, expected_count in (('a', 5), ('b', 4)):
         matches = tree.find_all(symbol)
         self.assertEqual(len(matches), expected_count)
Exemplo n.º 2
0
 def test_find_3(self, _, builder):
     """Look up substrings of 'xyxaxaxa' (Gusfield1997, Figure 7.1, page 129)."""
     tree = Tree({'A': 'xyxaxaxa'}, builder=builder)
     for present in ('xyxaxaxa', 'xax', 'axa'):
         self.assertTrue(tree.find(present))
     # 'ay' does not occur anywhere in the indexed string.
     self.assertFalse(tree.find('ay'))
Exemplo n.º 3
0
def main():
    """Index every ``sample.*`` file and print their longest common strand.

    Each file matched by the glob pattern is read as raw bytes and stored
    in a dict keyed by the file name returned by ``glob``.  The dict is
    handed to ``Tree`` to build a generalized suffix tree, and ``longest``
    is run on the tree root.  Three parts of its result are printed.
    """
    search_dir = ''
    pattern = os.path.join(search_dir, 'sample.*')

    # File name -> raw file content.
    contents = {}
    for name in glob.glob(pattern):
        full_path = os.path.join(os.getcwd(), name)
        with open(full_path, 'rb') as handle:
            contents[name] = handle.read()
            print(name)

    # Debug aid kept from the original author (prints each file's length):
    #     for i in data.values():
    #         print(len(list(i)))

    # Generalized suffix tree over all files (names and contents).
    suffix_tree = Tree(contents)

    # Result reported by the original author for their sample data:
    #   length of the longest common byte substring: 27648
    #   files: sample.3 and sample.2
    #   starting offsets in those files: 17408, 3072
    result = longest(suffix_tree.root)
    strand_length = result[0][0]
    print('the length of the longest common substring(strand):', strand_length)
    print('the files containing the strings are :', result[1])
    print('the starting offset in the respective files are:', result[2])
Exemplo n.º 4
0
def generate_maximal_repeats_sequence(df, tictoc):
    """Return the maximal repeats found across the trace variants of ``df``.

    The distinct values of ``df.event`` (variants) are counted, and every
    variant that occurs more than once is added a second time.  Two copies
    are enough for the maximal repeats to be discovered, so the tree can be
    built from variants instead of from every single trace.

    Args:
        df: DataFrame with an ``event`` column whose values are strings of
            space-separated symbols.
        tictoc: Timer object providing ``tic()`` and ``toc()``; the value
            returned by ``toc()`` is printed with ``%.2f``.

    Returns:
        A list with ``str(path)`` for each entry yielded by
        ``tree.maximal_repeats()``, in sorted order of those entries.
    """
    # Local import keeps this block self-contained; pandas is already
    # required because ``df`` is a DataFrame.
    import pandas as pd

    tictoc.tic()

    # One row per variant with its number of occurrences.  The explicit
    # column assignment makes the names independent of the pandas version.
    df_tree = df.event.value_counts().reset_index()
    df_tree.columns = ['event', 'count']

    # Duplicate the variants seen more than once.  DataFrame.append was
    # removed in pandas 2.0; pd.concat produces the same frame.
    repeated_variants = df_tree[df_tree['count'] > 1]
    df_tree = pd.concat([df_tree, repeated_variants], ignore_index=True)

    # Row index -> list of symbols for that variant.
    dict_tree = {
        index: events.split(" ")
        for index, events in df_tree['event'].items()
    }

    tree = Tree(dict_tree)

    maximal_repeats_sequence = []
    for _, path in sorted(tree.maximal_repeats()):
        sfx = str(path)
        # NOTE(review): the original comment says "only repeats with two or
        # more elements", but ``split`` always returns at least one item, so
        # this check never filters anything.  Behaviour is kept as-is;
        # confirm whether ``> 1`` was intended before tightening it.
        if len(sfx.split(' ')) > 0:
            maximal_repeats_sequence.append(sfx)

    print("Suffix Tree generated.\t Processing Time: %.2fs" % (tictoc.toc()))

    return maximal_repeats_sequence
def melody_identifier(pm, note_shift):
    """Run the melody-identification pipeline on ``pm``.

    Steps, in order: build the note dictionaries with ``get_notes_dict``
    and ``get_notes_dict_int``, index the first one in a ``Tree``, collect
    and filter its paths, weight the filtered paths using ``note_shift``,
    and pass the weighted sequence together with the integer note
    dictionary to ``get_melody_instrument``.

    Returns:
        Whatever ``get_melody_instrument`` returns.
    """
    symbol_notes = get_notes_dict(pm)
    integer_notes = get_notes_dict_int(pm)

    candidate_paths = filter_paths(grab_paths(Tree(symbol_notes)))
    weighted = construct_weighted_seq(candidate_paths, note_shift)

    return get_melody_instrument(weighted, integer_notes)
Exemplo n.º 6
0
 def test_to_dot(self):
     """Smoke test: export a two-string tree with ``to_dot`` and save it."""
     # Alternative inputs the original author tried here:
     #   Tree({'A': 'aaaaa'})
     #   Tree({'A': 'ababcabcdabdeaef'})
     dot_source = Tree({'A': 'xabxac', 'B': 'awyawxawxz'}).to_dot()
     with open('/tmp/suffix_tree.dot', 'w') as out:
         out.write(dot_source)
     # Reaching this line without an exception is the whole test.
     self.assertTrue(1 == 1)
Exemplo n.º 7
0
def get_BFMT(file=INPUT, n_docs=N_DOCS):
    """Read documents from a header-less CSV and index them in a ``Tree``.

    Column 5 of the file is taken as the document text.  Each document is
    lower-cased and every character with a code point of 128 or above is
    replaced by a space.

    Args:
        file: Path (or buffer) passed to ``pandas.read_csv``.
        n_docs: Maximum number of rows to read.

    Returns:
        A ``(tree, docs)`` tuple: ``docs`` is the list of cleaned documents
        and ``tree`` is a ``Tree`` built from them, keyed by list position.
    """
    def _blank_non_ascii(text):
        # Keep characters below code point 128; swap the rest for a space.
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

    frame = pd.read_csv(file, header=None, engine='python', nrows=n_docs)
    lowered = frame[5].apply(lambda text: text.lower())
    docs = list(lowered.apply(_blank_non_ascii))

    tree = Tree(dict(enumerate(docs)))
    return tree, docs
Exemplo n.º 8
0
 def test_1(self):
     """Check ``longest`` on two strings sharing 'abz'.

     The first entries of result[0], result[1] and result[2] must be 3,
     the two string ids, and the offsets [1, 1] respectively.
     """
     tree = Tree({'1': 'xabzxa', '2': 'yabz'})
     result = longest(tree.root)
     expected_heads = (3, ['1', '2'], [1, 1])
     for slot, expected in enumerate(expected_heads):
         self.assertEqual(result[slot][0], expected)
Exemplo n.º 9
0
 def test_3(self):
     """Check ``longest`` on three strings sharing 'xabx'.

     The first entries of result[0], result[1] and result[2] must be 4,
     the three string ids, and the offsets [4, 5, 3] respectively.
     """
     tree = Tree({'1': 'xabzxabxab', '2': 'yabzdxabx', "3": "zavxabx"})
     result = longest(tree.root)
     expected_heads = (4, ['1', '2', '3'], [4, 5, 3])
     for slot, expected in enumerate(expected_heads):
         self.assertEqual(result[slot][0], expected)
Exemplo n.º 10
0
 def test_2(self):
     """Check ``longest`` on two strings where string '1' repeats itself.

     String '1' contains 'axabx' twice, but the part shared with string
     '2' is only 'bxz'.  The first entries of result[0], result[1] and
     result[2] must be 3, the two string ids, and the offsets [3, 1].
     """
     tree = Tree({'1': 'axabxzaxabx', '2': 'ybxzxda'})
     result = longest(tree.root)
     expected_heads = (3, ['1', '2'], [3, 1])
     for slot, expected in enumerate(expected_heads):
         self.assertEqual(result[slot][0], expected)
Exemplo n.º 11
0
 def test_lca(self):
     """Check the node ids returned by ``tree.lca`` for four node pairs.

     Each case names two ``tree.nodemap`` entries as (string id, index)
     and the id expected on the node that ``tree.lca`` returns for them.
     """
     tree = Tree({'A': 'xabxac', 'B': 'awyawxawxz'})
     tree.prepare_lca()

     cases = (
         (('A', 1), ('B', 3), 8),
         (('A', 0), ('B', 8), 2),
         (('B', 1), ('B', 7), 19),
         (('A', 0), ('B', 7), 1),
     )
     for (first_id, first_pos), (second_id, second_pos), expected_id in cases:
         first = tree.nodemap[first_id][first_pos]
         second = tree.nodemap[second_id][second_pos]
         self.assertEqual(tree.lca(first, second).id, expected_id)
Exemplo n.º 12
0
 def test_find_7(self, _, builder):
     """Count ``find_all`` matches when each string is a suffix of 'abcde'.

     The five indexed strings are 'abcde' and its successively shorter
     suffixes, so each shorter pattern is expected one more time.
     """
     texts = {
         'A': 'abcde',
         'B': 'bcde',
         'C': 'cde',
         'D': 'de',
         'E': 'e',
     }
     tree = Tree(texts, builder=builder)
     expectations = (
         ('abcde', 1),
         ('bcde', 2),
         ('cde', 3),
         ('de', 4),
         ('e', 5),
     )
     for pattern, expected_count in expectations:
         self.assertEqual(len(tree.find_all(pattern)), expected_count)
Exemplo n.º 13
0
 def test_find_6(self, _, builder):
     """Count ``find_all`` matches when each string is a prefix of 'abcde'.

     The five indexed strings are 'a', 'ab', ... up to 'abcde', so each
     shorter pattern is expected one more time.
     """
     texts = {
         'A': 'a',
         'B': 'ab',
         'C': 'abc',
         'D': 'abcd',
         'E': 'abcde',
     }
     tree = Tree(texts, builder=builder)
     expectations = (
         ('abcde', 1),
         ('abcd', 2),
         ('abc', 3),
         ('ab', 4),
         ('a', 5),
     )
     for pattern, expected_count in expectations:
         self.assertEqual(len(tree.find_all(pattern)), expected_count)
Exemplo n.º 14
0
 def test_find_4(self, _, builder):
     """Exercise ``find`` on lists of string tokens instead of characters.

     Every indexed sequence and every query is produced by splitting a
     whitespace-separated string into tokens.
     """
     sequences = {
         'A': ('232 020b 092 093 039 061 102 135 098 099 '
               '039 040 039 040 044 141 140 098').split(),
         'B': '097 098 039 040 041 129 043'.split(),
         'C': ('097 098 039 040 020a 022 023 097 095 094 098 '
               '043 044 112 039 020b 039 098').split(),
     }
     tree = Tree(sequences, builder=builder)

     present_queries = (
         '039 040 041',
         '039 040 039 040',
         '020a 022 023',
         '232 020b 092',
         '097 098 039 040',
         '141 140 098',
     )
     for query in present_queries:
         self.assertTrue(tree.find(query.split()))

     # The token sequence 039 040 042 appears in none of the inputs.
     self.assertFalse(tree.find('039 040 042'.split()))
Exemplo n.º 15
0
    def test_maximal_repeats_2(self, _, builder):
        """Compare the maximal repeats of five 'abc'-based strings.

        The tree and the expected entries are handed to the
        ``self.maximal_repeats`` helper, which performs the comparison.
        """
        texts = {
            'A': '1abc2abc3',
            'B': '4abc5abc6',
            'C': '7abc',
            'D': '7abc',
            'E': 'abc',
        }
        expected = [
            '2 7 a b c',
            '5 a b c',
        ]
        tree = Tree(texts, builder=builder)
        self.maximal_repeats(tree, expected)
Exemplo n.º 16
0
    def test_find_1(self, _, builder):
        """Look up prefixes, suffixes and misses of 'xabxac'.

        Text taken from Gusfield1997, Figure 5.1, page 91.
        """
        text = 'xabxac'
        tree = Tree({'A': text}, builder=builder)

        # Every non-empty prefix: 'x', 'xa', ..., 'xabxac'.
        for end in range(1, len(text) + 1):
            self.assertTrue(tree.find(text[:end]))

        # Every proper non-empty suffix: 'abxac', 'bxac', ..., 'c'.
        for start in range(1, len(text)):
            self.assertTrue(tree.find(text[start:]))

        # Strings that are not substrings of the text.
        for absent in ('d', 'xx', 'xabxaa'):
            self.assertFalse(tree.find(absent))
Exemplo n.º 17
0
# Disabled experiment: sliding-window grammar extraction using the ``psq``
# module.  Kept commented out by the original author.
# def pysequitur_map(str_seq_unique, win_len):
#     print(str_seq_unique[0:20])
#     total_str = np.concatenate([str_seq_unique[0:-win_len//2:-1], str_seq_unique, str_seq_unique[-win_len//2:-1:-1]]).tolist()
#     print(len(str_seq_unique))
#     print(len(total_str))
#
#     for i in range(win_len//2, len(total_str)-win_len//2):
#         temp_str = total_str[i-win_len//2:i+win_len//2]
#         print(temp_str)
#         s_temp = psq.Sequencer2(temp_str)
#         print(s_temp.get())
#         print(s_temp.resolve())
# psq.print_grammar(s_temp)

# Build a suffix tree over the first 20 items of ch1_word_lst (defined
# outside this view) and run its prepare_lca() step.
suffixtree = Tree({"A": ch1_word_lst[:20]})
suffixtree.prepare_lca()

from graphviz import Source

# Export the tree as DOT text and render it with graphviz; view=True asks
# graphviz to open the rendered "test.gv" output in a viewer.
dot_object = suffixtree.to_dot()
s = Source(dot_object)
s.render("test.gv", view=True)

# pysequitur_map(ch1_word_lst, 20)

# Split the space-separated string into a list and pass it, together with
# ch1_count_lst, to removeShortSegments; the result is then passed through
# runLengthEncoding.  Both helpers are defined outside this view.
ch1_clean_str = removeShortSegments(ch1_connotated_lst_unique.split(" "),
                                    ch1_count_lst)
ch1_clean_str_unique = runLengthEncoding(ch1_clean_str)

# print(ch1_connotated_lst)
Exemplo n.º 18
0
 def test_maximal_repeats_1(self, _, builder):
     """Compare the maximal repeats of 'xabxac' and 'awyawxawxz'.

     See [Gusfield1997]_ §7.12.1, 144ff.  The comparison itself is done
     by the ``self.maximal_repeats`` helper.
     """
     texts = {'A': 'xabxac', 'B': 'awyawxawxz'}
     expected = ['2 x a']
     self.maximal_repeats(Tree(texts, builder=builder), expected)
Exemplo n.º 19
0
 def test_find_2(self, _, builder):
     """Look up substrings of 'awyawxawxz' (Gusfield1997, Figure 5.2, page 92)."""
     tree = Tree({'A': 'awyawxawxz'}, builder=builder)
     for present in ('awx', 'awy'):
         self.assertTrue(tree.find(present))
     # 'awz' is not a substring of the indexed text.
     self.assertFalse(tree.find('awz'))
 def construct_tree(self):
     """Build ``self.tree`` from ``self.sequence`` and its reversal.

     The forward sequence is stored under key 1 and ``reversed(...)`` of
     it under key 2.
     """
     inputs = {}
     inputs[1] = self.sequence
     inputs[2] = reversed(self.sequence)
     self.tree = Tree(inputs)
Exemplo n.º 21
0
    def test_maximal_repeats_3(self, _, builder):
        """Compare maximal repeats over a large fixture of token sequences.

        Each dict key names one sequence and each value is a list of string
        tokens.  The tree, the expected entries and ``min_cv=4`` are handed
        to the ``self.maximal_repeats`` helper, which does the comparison.
        """
        # pylint: disable=line-too-long
        tree = Tree(
            {
                'berlin-sb-lat-qu-931': ['157', '165', '167', '165'],
                'cava-dei-tirreni-bdb-4': [
                    '232', '020b', '092', '093', '039', '061', '102', '135',
                    '098', '099', '039', '040', '039', '040', '044', '141',
                    '140', '098', '121', '098', '090', '095', '134', '135',
                    '088', '104', '139', '140', '067', '091', '094', '158',
                    '159', '161', '162', '163', '164', '165', '158', '165',
                    '161'
                ],
                'gotha-flb-memb-i-84': [
                    'A000', '176', '039', '040', '020a', '022', '023', '097',
                    '095', '094', '098', '043', '044', '112', '039', '020b',
                    '039', '098', '140', '141', '138', '139', 'A000', '196',
                    '112', '105', '134', '157', '166', '163', '175', '165',
                    '191', '192', '193', '209', '210', '228', '213', '212',
                    '216', '217'
                ],
                'heiligenkreuz-sb-217': [
                    '249', '068', '134', '157', '165', '167', '201', '165',
                    '188', '191', '193', '188', '192', '016', '015', '013',
                    '055', '056', '042', '052', '066', '020a', '078', '191',
                    '131', '192', '195', '266', '242', '293', '273b', '287',
                    '094', '040', '020a', '023', '095', '094', '098', '020b',
                    '157', '163', '165', '196'
                ],
                'ivrea-bc-xxxiii': [
                    '068', '020a', '022', '023', '095', '094', '021', '097',
                    '098', '039', '040', '039', '040', '041', '129', '043',
                    '044', '112', '088', '104', '134', '135', '139'
                ],
                'ivrea-bc-xxxiv': [
                    '022', '020a', '022', '023', '095', '094', '021', '097',
                    '098', '039', '040', '039', '040', '041', '129', '043',
                    '098', '099', '101', '100', '020a', '044', '112', '088',
                    '134', '135', '060', '140', '141', '138', '139', '157',
                    '165', '164', '163', '093', '201', '159'
                ],
                'modena-bc-o-i-2': [
                    '163', '039', '040', '020a', '022', '023', '097', '095',
                    '094', '098', '043', '044', '112', '039', '020b', '039',
                    '098', '157', '163', '165', '191', '192', '193', '196',
                    '216', '217'
                ],
                'muenchen-bsb-lat-19416': [
                    '020a', '093', '020a', '022', '023', '095', '046', '067',
                    '047', '039', '040', '039', '040', '041', '129', '043',
                    '044', '112', '099', '097', '098', '103', '093', '105',
                    '191', '192', '193', '201'
                ],
                'muenchen-bsb-lat-29555-1': [
                    'M024', '163', '175', '165', 'M015', '196', '193', '209',
                    '210', '228', '273', '224'
                ],
                'muenchen-bsb-lat-3853': [
                    '249', '068', '134', '157', '165', '167', '201', '165',
                    '188', '191', '193', '188', '192', '016', '015', '013',
                    '055', '056', '042', '052', '066', '020a', '078', '191',
                    '131', '192', '195', '252a', '266', '242', '293', '273b',
                    '287', '094', '040', '020a', '023', '095', '094', '098',
                    '020b', '157', '163', '165', '196'
                ],
                'muenchen-bsb-lat-6360': ['157', '165', '167', '201', '165'],
                'paris-bn-lat-3878': [
                    '249', '157', '165', '167', '188', '191', '293', '165',
                    '196', '252a'
                ],
                'paris-bn-lat-4613': [
                    '020a', '094', '095', '096', '025', '098', '039', '040',
                    '022', '023', '105', '033', '121', '141', '191', '192',
                    '158', '165', '201', '215', '214', '219'
                ],
                'salzburg-bea-st-peter-a-ix-32': [
                    '252', '191', '201', 'M023', '215', '210', '093', '112',
                    '293', '252a', '010', '011', '293'
                ],
                'st-gallen-sb-733':
                ['020a', '089', '022', '023', '097', '094'],
                'st-paul-abs-4-1': [
                    '092', '093', '020a', '094', '095', '088', '044', '046',
                    '067', '098', '099', '041', '043', '044', '039', '098',
                    '134', '090', '113', '139', '140', '141', '140', '139',
                    '158', '140', '165', '181'
                ],
                'vatikan-bav-chigi-f-iv-75': [
                    '232', '020b', '092', '093', '039', '061', '102', '135',
                    '098', '099', '039', '040', '039', '040', '044', '141',
                    '140', '098', '090', '134', '135', '088', '104', '139',
                    '140', '067', '091', '094', '095', '158', '159', '098',
                    '159', '162', '163', '164', '165', '161', '093', '201',
                    '020a', '095', '088', '044', '046', '099', '041', '043',
                    '044', '141', '158', '208'
                ],
                'vatikan-bav-reg-lat-1000b': ['157', '165'],
                'vatikan-bav-reg-lat-263': [
                    '202', '087', '044', '224', 'A000', '163', '093', '224',
                    '225', '232', '020b'
                ],
                'vatikan-bav-vat-lat-5359': ['201', '165'],
                'vercelli-bce-clxxiv':
                ['167', '092', '093', '165', '163', '166'],
                'vercelli-bce-clxxv': ['163'],
                'wien-oenb-ser-n-3761': ['157'],
                'wolfenbuettel-hab-blankenb-130': [
                    '022', '043', '044', '060', '103', '097', '095', '067',
                    '040', '020a', '112', '038', '023', '094', '041', '020a',
                    '105', '098', '057', '014', '114', '039', '040', '039',
                    '040', '139', '140', '156', '178', '143', '150', '138',
                    '179', '174', '157', '160', '165', '228', '164', '163',
                    '093', '166', '159', '102', '201', '158', '161', '093',
                    '191', '192', '193', '201', '158', '180', '210', '211',
                    '214', '219', '202', '022', '023', '095', '098', '041',
                    '129', '043', '044', '094', '215', '131'
                ]
            },
            builder=builder)

        # Expected entries: a leading number followed by the repeated tokens.
        # NOTE(review): presumably the leading number is the value that
        # min_cv=4 thresholds on (no entry below 4 is listed) - confirm
        # against the maximal_repeats helper.
        self.maximal_repeats(tree, [
            '4 022 023 095',
            '4 023 095 094',
            '4 041 129 043',
            '4 043 044 112',
            '4 092 093',
            '4 095 094 098',
            '4 098 099',
            '4 134 135',
            '4 139 140',
            '4 191 192 193',
            '4 201 165',
            '5 020a 022 023',
            '5 040 020a',
            '5 044 112',
            '5 157 165 167',
            '5 191 192',
            '6 023 095',
            '6 039 040 039 040',
            '6 095 094',
            '7 043 044',
            '7 157 165',
            '8 022 023',
            '9 039 040',
        ],
                             min_cv=4)