Python find_repeated_substrings示例

编程语言: Python

命名空间/包名称: find_repeated_substrings_rolling_hash

方法/功能: find_repeated_substrings

hotexamples.com的示例: 2

Python find_repeated_substrings - 已找到2个示例。以下是从开源项目中提取的、评价最高的find_repeated_substrings_rolling_hash.find_repeated_substrings真实Python示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： test_find_repeated_substrings.py 项目： alepharchives/strings

def compare_string_subsets(mask, subset_fraction, num_tests):
    """Find the repeated substrings in several random subsets of a list of files.

    The files are those matched by <mask>; the filename encodes the number of
    repeats as defined in name_to_repeats(). For each of <num_tests> random
    subsets the repeated substrings are computed, printed, and the reported
    offsets are verified against common.get_substring_offsets().

    The results from each subset should be the same.
    NOTE(review): the per-subset results are collected below but this function
    does not compare them with each other.

    Args:
        mask: file mask handed to get_test_files().
        subset_fraction: fraction of the matched files to use in each subset.
        num_tests: number of random subsets to test.

    Returns:
        None. All results are printed.

    Raises:
        AssertionError: if the offsets reported for a substring differ from
            the offsets found by common.get_substring_offsets().
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # File names ordered by the length of their text, shortest first
    file_names = sorted(test_files, key=lambda name: len(test_files[name]['text']))

    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('=' * 60)

    subset_size = int(len(test_files) * subset_fraction)

    print('Testing %d subsets of size %d from total of %d' % (num_tests, subset_size, len(test_files)))
    if subset_size < 1:
        # Nothing to search in; also avoids indexing into an empty subset below
        print('subset is empty, nothing to test')
        return

    # Fixed seed so that every run tests the same sequence of subsets
    random.seed(111)

    subset_substring_list_list = []
    for test in range(num_tests):
        test_file_names = file_names[:]
        random.shuffle(test_file_names)
        test_files_subset = test_file_names[:subset_size]
        if not common.is_quiet():
            for i, name in enumerate(test_files_subset):
                x = test_files[name]
                print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
            print('-' * 60)

        # Search only the files of this subset. The original code passed the
        # whole of test_files here, so every "subset" test was the same run.
        subset_files = dict((name, test_files[name]) for name in test_files_subset)
        offsets_dict = FRS.find_repeated_substrings(subset_files)

        # Every file's entry is looked up with these same substrings below, so
        # the first file of the subset supplies the key list.
        substring_list = list(offsets_dict[test_files_subset[0]].keys())
        subset_substring_list_list.append(substring_list)
        print('Found %d substrings' % len(substring_list))
        for i, substring in enumerate(substring_list):
            print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))
            for name in test_files_subset:
                x = test_files[name]
                offsets = sorted(offsets_dict[name][substring])
                print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                    x['repeats'], len(offsets), offsets))
                # Cross-check the reported offsets against an independent search
                offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
                assert offsets2 == offsets, 'offsets mismatch in "%s"' % name

示例#2

显示文件

文件： test_find_repeated_substrings.py 项目： alepharchives/strings

def find_and_show_substrings(mask):
    """Find and print the substrings that are repeated in a list of files.

    The files are those matched by <mask>; the filename encodes the number of
    repeats as defined in name_to_repeats(). A summary of the substrings is
    printed first, followed by the offsets of each substring in every file.
    The reported offsets are verified against common.get_substring_offsets().

    Args:
        mask: file mask handed to get_test_files().

    Returns:
        None. All results are printed.

    Raises:
        AssertionError: if the offsets reported for a substring differ from
            the offsets found by common.get_substring_offsets().
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)

    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # File names ordered by the length of their text, shortest first
    file_names = sorted(test_files, key=lambda name: len(test_files[name]['text']))

    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('-' * 60)

    offsets_dict = FRS.find_repeated_substrings(test_files)

    # Print out the results

    # The substrings are taken from the entry of the first (shortest) file
    substring_list = list(offsets_dict[file_names[0]].keys())

    print('Substrings summary:')
    print('Found %d substrings' % len(substring_list))
    for i, substring in enumerate(substring_list):
        print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))

    print('=' * 80)
    print('Substrings in detail:')
    # enumerate() keeps the heading number in step with the summary above; the
    # original printed a stale index left over from other loops.
    for i, substring in enumerate(substring_list):
        print('Substring %2d %s' % (i, '-' * 60))
        print('len=%3d, substring="%s"' % (len(substring), substring))
        print('hex = %s' % H(substring))
        print('Offsets of substring in test files:')
        for name in file_names:
            x = test_files[name]
            offsets = sorted(offsets_dict[name][substring])
            print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                x['repeats'], len(offsets), offsets))
            # Cross-check the reported offsets against an independent search
            offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
            assert offsets2 == offsets, 'offsets mismatch in "%s"' % name