Python is_junk示例，common.is_junk Python示例

示例#1

0

显示文件

文件： find_repeated_substrings_rolling_hash.py 项目： alepharchives/strings

def filter_junk_strings(substrings):
    """Return a copy of <substrings> without the entries whose key is junk.

        <substrings> is a dict keyed by substring. An entry is dropped when
        common.is_junk() is true for its key; every other key:value pair is
        copied into a new dict. <substrings> itself is not modified.
    """
    return dict((text, value)
                for text, value in substrings.items()
                if not common.is_junk(text))

示例#2

0

显示文件

文件： find_repeated_substrings_rolling_hash.py 项目： alepharchives/strings

def get_substrings(string, k, allowed_substrings):
    """Count the occurrences of every substring of length <k> in <string>.

        Returns a dict of substring:count where count is the number of
        (possibly overlapping) positions in <string> at which substring
        starts. Every window of length <k> is visited, including the last
        one, string[len(string)-k:].

        Substrings for which common.is_junk() is true are skipped.
        If <allowed_substrings> is non-empty then only substrings contained
        in it are counted. None or an empty collection disables this filter.

        Performance
        -----------
        When <allowed_substrings> is given the returned substring:count dict
        will be no longer than <allowed_substrings>, so the best way to
        guarantee performance is to pass short key sets.
    """
    common.report('get_substrings:k=%2d,allowed_substrings=%5d,size=%7d' % 
        (k, len(allowed_substrings) if allowed_substrings else -1, len(string)))

    # Membership is tested once per window, so turn a list/tuple filter into
    # a hashed set up front. Other container types are used as passed in.
    if not allowed_substrings:
        allowed = None
    elif isinstance(allowed_substrings, (list, tuple)):
        allowed = frozenset(allowed_substrings)
    else:
        allowed = allowed_substrings

    substrings = {}
    # There are len(string)-k+1 windows of length k. The +1 makes sure the
    # final window, which ends on the last character, is counted as well.
    for i in range(len(string) - k + 1):
        pattern = string[i:i+k]
        if common.is_junk(pattern):
            continue
        if allowed is not None and pattern not in allowed:
            continue
        substrings[pattern] = substrings.get(pattern, 0) + 1
    return substrings

示例#3

0

显示文件

文件： find_repeated_substrings_rolling_hash.py 项目： alepharchives/strings

def get_child_offsets(file_names, test_files, offsets_dict, k):
    """ Given a set of substrings of length <k> defined by offsets in a set of 
        test_files, return a dict of substrings of length k+1
        where
            offsets_dict[<filename>][<substring>] is the set of offsets of <substring>
            in test_files[<filename>]
        <file_names> is keys of test_files in the desired sort order (shorter first)

        test_files[<filename>] is indexed with 'text' (the string that is searched)
        and 'repeats' (a child substring is kept for that file only if it has at
        least this many offsets).

        Returns a dict with the same layout as <offsets_dict> but keyed by the
        length k+1 substrings, or None if any file ends up with no substrings.
        Raises ValueError if common.is_validate() is true and
        validate_child_offsets() rejects the result.

        Performance
        -----------
        This is the inner loop of the program.
        The returned dict will be no longer than offsets_dict and string searches are on
        existing substrings + 1 character to left or right so there is not that much text
        to search.
    """
    # next(iter(...)) rather than values()[0]: dict views cannot be indexed
    common.report('get_child_offsets(file_names=%d,test_files=%d,%d,substrings=%d,k=%d)' % 
        (len(file_names), len(test_files), len(offsets_dict),
         len(next(iter(offsets_dict.values()))), k))

    # Sets give constant time membership tests in the inner loop below
    parent_substrings = set(offsets_dict[file_names[0]])
    child_offsets_dict = {}
    allowed_substrings = None

    for name in file_names:
        x = test_files[name]
        text = x['text']
        text_len = len(text)
        children = child_offsets_dict[name] = {}

        for key, ofs_set in offsets_dict[name].items():
            # Use a list which unlike a set can be indexed and sorted
            ofs_list = sorted(ofs_set)
            # Remove parent offsets that would truncate substrings of length k+1.
            # The list may be empty, or become empty after the first removal,
            # so check before indexing.
            if ofs_list and ofs_list[0] == 0:
                del ofs_list[0]
            if ofs_list and ofs_list[-1]+k+1 == text_len:
                del ofs_list[-1]

            # Create the child length k+1 substrings and add them to the child offsets dict
            # ofs1 is the offset of the k+1 substring key1
            for ofs in ofs_list:
                for ofs1 in (ofs-1, ofs):
                    key1 = text[ofs1:ofs1+k+1]
                    assert len(key1) == k+1, 'key="%s", key1="%s"' % (key, key1)
                    # allowed_substrings is empty/None on the first pass: no filtering
                    if allowed_substrings and key1 not in allowed_substrings:
                        continue
                    # Only allow keys with valid parents
                    if key1[1:] not in parent_substrings or key1[:-1] not in parent_substrings:
                        continue
                    # Get rid of the junk too
                    if common.is_junk(key1):
                        continue

                    # Got through all the filters. Add the new offset to the child dict
                    children.setdefault(key1, set()).add(ofs1)

        # Prune the entries with insufficient repeats.
        # Collect the keys first; a dict must not change size while it is iterated.
        unpruned_len = len(children)
        sparse_keys = [key1 for key1, ofs_set in children.items()
                       if len(ofs_set) < x['repeats']]
        for key1 in sparse_keys:
            del children[key1]

        # allowed_substrings is used as a filter in all but first pass through this loop
        allowed_substrings = set(children)
        common.report('  allowed_substrings=%3d,%3d,size=%7d' % 
            (unpruned_len, len(allowed_substrings), len(x['text'])))

    # Need to go back and trim the substrings lists to allowed_substrings,
    # which by now holds the keys that survived in the last file.
    # If this results in a zero length list for any file then returns
    for name in file_names:
        children = child_offsets_dict[name]
        for key1 in list(children):
            if key1 not in allowed_substrings:
                del children[key1]
        if len(children) == 0:
            return None

    common.dump_dict('dumpfile_%03d' % (k+1), file_names, test_files, child_offsets_dict)

    if common.is_validate():
        if not validate_child_offsets(file_names, offsets_dict, child_offsets_dict, k):
            raise ValueError

    for name in file_names:
        common.report('before=%3d,after=%3d,file=%s' % (len(offsets_dict[name]),
            len(child_offsets_dict[name]),name))

    return child_offsets_dict