def compare_string_subsets(mask, subset_fraction, num_tests):
    """Find the longest repeated substrings in several random subsets of the
    files matched by <mask> and check the offsets reported for each subset.

    The filename encodes the number of repeats as defined in
    name_to_repeats() above. The substrings found for each subset are
    compared at the end and the outcome is reported.

    mask: file mask handed to get_test_files()
    subset_fraction: fraction of the matched files that goes into each subset
    num_tests: number of random subsets to test

    Note: print is always called with one parenthesized argument so that the
    output is identical whether print is a statement or a function.
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # List all candidate files, smallest text first
    file_names = sorted(test_files, key=lambda name: len(test_files[name]['text']))
    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('=' * 60)

    # Clamp to at least one file: an empty subset has no first file to take
    # the substring list from.
    subset_size = min(len(file_names), max(1, int(len(file_names) * subset_fraction)))
    print('Testing %d subsets of size %d from total of %d' % (num_tests, subset_size, len(test_files)))

    # Fixed seed so that the same subsets are chosen on every run
    random.seed(111)
    subset_substring_list_list = []
    for test in range(num_tests):
        shuffled_names = file_names[:]
        random.shuffle(shuffled_names)
        subset_names = shuffled_names[:subset_size]
        if not common.is_quiet():
            for i, name in enumerate(subset_names):
                x = test_files[name]
                print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
            print('-' * 60)

        # Search only the files in this subset. (Previously the full set of
        # files was searched on every pass, so the subsets were never tested.)
        subset_files = dict((name, test_files[name]) for name in subset_names)
        offsets_dict = FRS.find_repeated_substrings(subset_files)

        # Every file's entry is indexed by every substring below, so any file
        # in the subset can supply the list of substrings.
        substring_list = offsets_dict[subset_names[0]].keys()
        subset_substring_list_list.append(substring_list)

        print('Found %d substrings' % len(substring_list))
        for i, substring in enumerate(substring_list):
            print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))
            for name in subset_names:
                x = test_files[name]
                offsets = sorted(offsets_dict[name][substring])
                print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                    x['repeats'], len(offsets), offsets))
                # Cross-check against a direct search of the file's text
                offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
                assert offsets2 == offsets, 'offsets mismatch in "%s"' % name

    # Compare the results from each subset. This only reports; it does not
    # assert, since the subsets contain different files.
    distinct_results = set(tuple(sorted(substrings)) for substrings in subset_substring_list_list)
    if len(distinct_results) <= 1:
        print('All %d subsets gave the same substrings' % len(subset_substring_list_list))
    else:
        print('%d subsets gave %d different substring lists' % (
            len(subset_substring_list_list), len(distinct_results)))
def find_and_show_substrings(mask):
    """Find the longest substring that is repeated in a list of files matched
    by <mask> in which the filename encodes the number of repeats as defined
    in name_to_repeats() above, then print a summary and a detailed listing.

    For every substring found, the offsets reported for each file are checked
    against a direct search of that file's text.

    mask: file mask handed to get_test_files()

    Note: print is always called with one parenthesized argument so that the
    output is identical whether print is a statement or a function.
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # List the files, smallest text first
    file_names = sorted(test_files, key=lambda name: len(test_files[name]['text']))
    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('-' * 60)

    offsets_dict = FRS.find_repeated_substrings(test_files)

    # Print out the results
    # Every file's entry is indexed by every substring below, so the first
    # file can supply the list of substrings.
    substring_list = offsets_dict[file_names[0]].keys()

    print('Substrings summary:')
    print('Found %d substrings' % len(substring_list))
    for i, substring in enumerate(substring_list):
        print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))

    print('=' * 80)
    print('Substrings in detail:')
    # Enumerate here so the heading shows this substring's own index rather
    # than a value left over from an earlier loop.
    for i, substring in enumerate(substring_list):
        print('Substring %2d %s' % (i, '-' * 60))
        print('len=%3d, substring="%s"' % (len(substring), substring))
        print('hex = %s' % H(substring))
        print('Offsets of substring in test files:')
        for name in file_names:
            x = test_files[name]
            offsets = sorted(offsets_dict[name][substring])
            print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                x['repeats'], len(offsets), offsets))
            # Cross-check against a direct search of the file's text
            offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
            assert offsets2 == offsets, 'offsets mismatch in "%s"' % name