forked from peterwilliams97/strings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_repeated_substrings_rolling_hash.py
395 lines (334 loc) · 16.2 KB
/
find_repeated_substrings_rolling_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
"""Find the longest substring that is repeated a specified minimum number of times a list of
strings. The number of repeats may different for each string.
In the sample code, the strings are files with the minimum number of occurrences of the
substring encoded in their names.
MAIN FUNCTIONS
--------------
find_repeated_substrings()
Find longest substring that is repeated specified number of times in a list of strings
find_and_show_substrings()
Reads a list of files, calls find_repeated_substrings() and prints the longest substring
SKETCH OF ALGORITHM
-------------------
for k = 4 to K
allowed_substrings = None
for s = shortest to longest string
r = number of repeats required for s
all_substrings = all substrings of length k in s that occur >= r times
if allowed_substrings
allowed_substrings = intersection(all_substrings, allowed_substrings)
else
allowed_substrings = all_substrings
offsets[s] = offsets of allowed_substrings in s
PERFORMANCE
-----------
There are several aspects of the code that give good typical runtimes
- len(allowed_substrings) cannot increase for a given k and tends not to grow very much
  as k increases. If the first string searched is short enough then len(allowed_substrings)
can start at around 100-200 and stay below 300.
- for k > 4 the length k+1 substrings are generated from the length k strings by searching 1
  character forward and back. This is
running_time <= 2*len(allowed_substrings)*number of strings*(K-4)*string_match(K)
For typical values of
starting len(allowed_substrings) 100
number of strings 60
K 40
this gives
running_time <= 2 * 100 * 60 * 40 * 40 = 19,200,000
"""
import sys
import glob
import re
import os
import time
import random
import common
from common import H, unH
import rolling_hash
def filter_junk_strings(substrings):
    """Return a copy of <substrings> with all junk keys (per common.is_junk) removed."""
    return dict((key, count) for key, count in substrings.items()
                if not common.is_junk(key))
def get_substrings(string, k, allowed_substrings):
    """Return all substrings of length <k> in <string> as a dict of
    substring:count where count is the number of occurrences of substring in <string>.

    If <allowed_substrings> is non-empty, only substrings in <allowed_substrings>
    are counted.  Junk substrings (per common.is_junk) are always excluded.

    Performance
    -----------
    The returned substring:count dict will be no longer than
    <allowed_substrings> so the best way to guarantee performance is to find
    short key sets.
    """
    common.report('get_substrings:k=%2d,allowed_substrings=%5d,size=%7d' %
        (k, len(allowed_substrings) if allowed_substrings else -1, len(string)))
    substrings = {}
    n = len(string)
    # Bug fix: was range(n-k), which skipped the final length-k substring
    # starting at offset n-k.
    for i in range(n - k + 1):
        pattern = string[i:i+k]
        if common.is_junk(pattern):
            continue
        if allowed_substrings and pattern not in allowed_substrings:
            continue
        # Count occurrences of each surviving pattern
        substrings[pattern] = substrings.get(pattern, 0) + 1
    return substrings
def filter_repeats(substrings, min_repeats):
    """Return the entries of dict <substrings> whose value is at least <min_repeats>."""
    return dict((pattern, count) for pattern, count in substrings.items()
                if count >= min_repeats)
def get_matching_offsets(file_names, offsets_dict, substring):
    """Return a dict mapping each name in <file_names> whose entry in
    <offsets_dict> contains <substring> to that substring's offsets.

    Names whose offsets dict does not contain <substring> are omitted.
    """
    matching_offsets = {}
    for name in file_names:
        # Direct membership test on the dict.  The original built and sorted a
        # full key list per name (O(n log n) per lookup) and pre-initialized
        # matching_offsets[name] = {} only to overwrite it on the next line.
        if substring in offsets_dict[name]:
            matching_offsets[name] = offsets_dict[name][substring]
    return matching_offsets
def is_offsets_greater(file_names, matching_offsets1, matching_offsets2):
    """Return True if <matching_offsets1> has at least as many offsets as
    <matching_offsets2> for every name in <file_names>.

    Bug fix: the original compared len(matching_offsets1) against itself
    (never indexing by name), so it always returned True.
    Names missing from either dict are treated as having zero offsets.
    """
    for name in file_names:
        if len(matching_offsets1.get(name, ())) < len(matching_offsets2.get(name, ())):
            return False
    return True
def validate_child_offsets(file_names, offsets_dict, child_offsets_dict, k):
"""Check that a length k+1 substring is repeated no more than the length k substrings it
contains"""
substrings_list = _od_substrings(offsets_dict)
child_substrings_list = _od_substrings(child_offsets_dict)
common.report('validate_child_offsets(%d)' % k)
for child_substring in child_substrings_list:
assert(len(child_substring) == k+1)
child_matching_offsets = get_matching_offsets(file_names, child_offsets_dict, child_substring)
substring1 = child_substring[:-1]
substring2 = child_substring[1:]
if not substring1 in substrings_list:
print 'No match for parent"%s", child="%s" in parent list' % (H(substring1), H(child_substring))
return False
if not substring2 in substrings_list:
print 'No match for parent"%s", child="%s" in parent list' % (H(substring2), H(child_substring))
return False
matching_offsets1 = get_matching_offsets(file_names, offsets_dict, substring1)
matching_offsets2 = get_matching_offsets(file_names, offsets_dict, substring1)
if not is_offsets_greater(file_names, matching_offsets1, child_matching_offsets):
print 'Mismatch on parent="%s", child="%s"' % (H(substring1), H(child_substring))
return False
if not is_offsets_greater(file_names, matching_offsets2, child_matching_offsets):
print 'Mismatch on parent="%s", child="%s"' % (H(substring2), H(child_substring))
return False
return True
def get_child_offsets(file_names, test_files, offsets_dict, k):
    """Given a set of substrings of length <k> defined by offsets into a set of
    test_files, return the corresponding offsets dict for substrings of length k+1.

    where
        offsets_dict[<filename>][<substring>] is the set of offsets of <substring>
            in test_files[<filename>]
        <file_names> is keys of test_files in the desired sort order (shorter first)

    Returns None if any file ends up with no valid length k+1 substrings.

    Performance
    -----------
    This is the inner loop of the program.
    The returned dict will be no longer than offsets_dict and string searches are
    on existing substrings + 1 character to left or right so there is not that
    much text to search.
    """
    common.report('get_child_offsets(file_names=%d,test_files=%d,%d,substrings=%d,k=%d)' %
        (len(file_names), len(test_files), len(offsets_dict), len(offsets_dict.values()[0]), k))
    # Length k substrings of the shortest file; children must have both of their
    # length k sub-substrings in this set.
    parent_substrings = offsets_dict[file_names[0]].keys()
    child_offsets_dict = {}
    allowed_substrings = None
    for name in file_names:
        x = test_files[name]
        child_offsets_dict[name] = {}
        for key, ofs_set in offsets_dict[name].items():
            # Use a list which unlike a set can be indexed and sorted
            ofs_list = sorted(ofs_set)
            # Remove parent offsets that would truncate substrings of length k+1:
            # offset 0 has no character to the left, and an offset whose k+1
            # extension would run past the end of the text has none to the right.
            if ofs_list[0] == 0:
                del(ofs_list[0])
            if ofs_list[-1]+k+1 == len(x['text']):
                del(ofs_list[-1])
            # Create the child length k+1 substrings and add them to the child
            # offsets dict.  ofs1 is the offset of the k+1 substring key1:
            # ofs-1 extends one character left, ofs extends one character right.
            for ofs in ofs_list:
                for ofs1 in [ofs-1, ofs]:
                    key1 = x['text'][ofs1:ofs1+k+1]
                    #if len(key1) != k+1:
                    #    print 'key="%s", key1="%s"' % (key, key1)
                    assert(len(key1) == k+1)
                    # allowed_substrings is None only on the first (shortest) file
                    if allowed_substrings:
                        if not key1 in allowed_substrings:
                            continue
                    # Only allow keys with valid parents
                    if not key1[1:] in parent_substrings or not key1[:-1] in parent_substrings:
                        continue
                    # Get rid of the junk too
                    if common.is_junk(key1):
                        continue
                    # Got through all the filters. Add the new offset to the child dict
                    if not key1 in child_offsets_dict[name].keys():
                        child_offsets_dict[name][key1] = set([])
                    child_offsets_dict[name][key1].add(ofs1)
        # Prune the entries with insufficient repeats.
        # NOTE: Python 2 code — items()/keys() return lists here, so deleting
        # entries while looping is safe; under Python 3 this would raise.
        unpruned_len = len(child_offsets_dict[name].keys())
        for key, ofs_set in child_offsets_dict[name].items():
            if len(ofs_set) < x['repeats']:
                del(child_offsets_dict[name][key])
        # allowed_substrings is used as a filter in all but first pass through this loop
        allowed_substrings = child_offsets_dict[name].keys()
        common.report(' allowed_substrings=%3d,%3d,size=%7d' %
            (unpruned_len, len(allowed_substrings), len(x['text'])))
    # Need to go back and trim the substrings lists to allowed_substrings
    # (the surviving set after the last, most restrictive file).
    # If this results in a zero length list for any file then return None.
    for name in file_names:
        for key in child_offsets_dict[name].keys():
            if not key in allowed_substrings:
                del(child_offsets_dict[name][key])
        if len(child_offsets_dict[name]) == 0:
            return None
    common.dump_dict('dumpfile_%03d' % (k+1), file_names, test_files, child_offsets_dict)
    # Optional expensive consistency check (see validate_child_offsets)
    if common.is_validate():
        if not validate_child_offsets(file_names, offsets_dict, child_offsets_dict, k):
            raise ValueError
    for name in file_names:
        common.report('before=%3d,after=%3d,file=%s' % (len(offsets_dict[name]),
            len(child_offsets_dict[name]),name))
    return child_offsets_dict
def get_offsets_from_texts(file_names, test_files, k):
common.note_time('get_offsets_from_texts k=%d' % k)
allowed_substrings = None
for name in file_names:
x = test_files[name]
substrings = get_substrings(x['text'], k, allowed_substrings)
substrings = filter_repeats(substrings, x['repeats'])
substrings = filter_junk_strings(substrings)
if not substrings:
print 'No %d character string works!' % k
return None
allowed_substrings = substrings.keys()
# Remove all the substrings that are no longer used
for name in file_names:
for key in substrings.keys():
if not key in allowed_substrings:
del(substrings[name][key])
#report('k=%d:\n\substrings=%d:%s' % (k, len(allowed_substrings), sorted(allowed_substrings)))
note_time('got substrings')
# From now on work with offsets
# offsets_dict[<filename>][<substring>] = list of offsets of <substring> in file with name <filename>
offsets_dict = {}
for name in file_names:
x = test_files[name]
offsets_dict[name] = {}
for key in substrings.keys():
offsets_dict[name][key] = common.get_substring_offsets(x['text'], key)
return [offsets_dict[name] for name in file_names]
def test_files_to_text_repeats(file_names, test_files):
text_list = [test_files[name]['text'] for name in file_names]
min_repeats_list = [test_files[name]['repeats'] for name in file_names]
return text_list, min_repeats_list
def text_repeats_to_test_files(file_names, test_files, text_list, min_repeats_list):
    """Inverse of test_files_to_text_repeats(): write the parallel lists of
    texts and repeat counts back into <test_files>, ordered by <file_names>."""
    for name, text, repeats in zip(file_names, text_list, min_repeats_list):
        test_files[name]['text'] = text
        test_files[name]['repeats'] = repeats
_MIN_K = 4     # Starting (shortest) substring length searched
_MAX_K = 2000  # Maximum substring length searched
_JUNK_KEY_THRESHOLD = 500  # Substrings that occur this many times in a file are considered junk
def find_repeated_substrings(test_files):
    """Return the longest substring(s) s that is repeated in <test_files>
    according to rule:
        For each x in test_files:
            s occurs at least x['repeats'] times in x['text']
    test_files[name] = {'text':text, 'repeats':repeats}
    """
    common.note_time('start searching strings')
    common.report('find_repeated_substrings(%d,%d,%d)' % (len(test_files.keys()), _MIN_K, _MAX_K))
    if not test_files:
        print 'no test files'
        return
    # Find the substrings that are repeated >= k times in files with k repeats
    # It is important to test shorter files first
    file_names = [x for x in test_files.keys()]
    file_names.sort(key = lambda x: len(test_files[x]['text']))
    common.report('file_names:\n%s' % '\n'.join(['%8d:%3d: %s' %
        (len(test_files[name]['text']),test_files[name]['repeats'],name) for name in file_names]))
    # Start by finding all substrings of length _MIN_K which is typically 4
    k = _MIN_K
    # NOTE(review): dead branch — the pure-Python path is disabled; the Cython
    # rolling-hash path below is always taken.
    if False:
        print 'Pure Python'
        pattern_offsets_list = get_offsets_from_texts(file_names, test_files, k)
    else:
        print 'Cython rolling hash'
        text_list, repeats_list = test_files_to_text_repeats(file_names, test_files)
        # Get rid of expensive references to big strings
        for name in file_names:
            test_files[name]['text'] = None
        pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list, k,
            _JUNK_KEY_THRESHOLD)
        text_repeats_to_test_files(file_names, test_files, text_list, repeats_list)
        text_list = None
        # NOTE(review): this binds a new name; the list built above is called
        # repeats_list, so this assignment releases nothing — presumably a typo.
        min_repeats_list = None
        # NOTE(review): second dead branch; would also crash since text_list
        # was just set to None.
        if False:
            # Does not work. !@#$ Find out why
            while k >= _MIN_K:
                pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list, k,
                    _JUNK_KEY_THRESHOLD)
                if pattern_offsets_list[0]:
                    break
                print 'reducing k %d=>%d' % (k, k // 2)
                k = k // 2
    common.note_time('got substrings')
    # pattern_offsets_list is in file_names order; re-key it by file name
    offsets_dict = dict(zip(file_names, pattern_offsets_list))
    # Work in increasing length of substrings, +1 per round.
    # offsets_dict_dict keeps each round's result so we can walk back later.
    offsets_dict_dict = {}
    k_list = []
    for k in range(_MIN_K, _MAX_K):
        # sparsify the text: keep only the regions around current offsets
        for name in file_names:
            test_files[name]['text'] = common.sparsify_by_offsets(test_files[name]['text'],
                offsets_dict[name], k)
        offsets_dict_dict[k] = offsets_dict
        k_list.append(k)
        common.note_time('found %3d substrings of length >= %3d' % (len(offsets_dict[file_names[0]]), k))
        child_offsets_dict = get_child_offsets(file_names, test_files, offsets_dict, k)
        # get_child_offsets() returns None when no k+1 substring survives;
        # the longest repeated substrings have length k.
        if not child_offsets_dict:
            break
        offsets_dict = child_offsets_dict
    # The offsets dict may have too many repeats
    # Walk back through the offsets list (longest k first) to find the first
    # one without excess repeats.
    k_list.reverse()
    print '$' * 60
    print 'k_list', k_list
    for k in k_list:
        print '-' * 60
        offsets_dict = offsets_dict_dict[k]
        # Report every substring whose per-file count differs from the target
        for key in sorted(offsets_dict[file_names[0]].keys()):
            print '%s:' % H(key),
            for name in file_names:
                if len(offsets_dict[name][key]) != test_files[name]['repeats']:
                    print '"%s":%d,%d' % (name, len(offsets_dict[name][key]), test_files[name]['repeats']),
            print
        # Drop (from every file) any substring that occurs too often in some file
        for name in file_names:
            for key, ofs_set in offsets_dict[name].items():
                if len(ofs_set) > test_files[name]['repeats']:
                    for n in file_names:
                        del(offsets_dict[n][key])
        #for name in file_names:
        #    if not offsets_dict[name]:
        #        del(offsets_dict[name])
        print 'k=%d, offsets_dict=' % (k)
        for name in file_names:
            print '  ', name, ['"%s":%d'%(key, len(val)) for (key,val) in offsets_dict[name].items()]
        if all(offsets_dict.values()):
            break
        # NOTE(review): unconditional exit() — terminates the whole process on
        # the first k whose dict has an empty entry; looks like a debug leftover.
        exit()
    # return last non-empty dict of offsets
    return offsets_dict