def filter_junk_strings(substrings):
    """Return a copy of <substrings> without the entries whose key is junk

    <substrings> is a dict keyed by substring. A key is dropped when
    common.is_junk(key) is true. Values are carried over unchanged and
    the input dict is not modified.
    """
    return {
        key: value
        for key, value in substrings.items()
        if not common.is_junk(key)
    }
def get_substrings(string, k, allowed_substrings):
    """Count the non-junk substrings of length exactly <k> in <string>

    Returns a dict of substring:count where count is the number of
    windows of length <k> in <string> that equal substring.

    If <allowed_substrings> is non-empty then only substrings that are
    in <allowed_substrings> are counted. A None or empty
    <allowed_substrings> disables this filter.

    NOTE(review): windows start at offsets 0 .. len(string)-k-1, so the
    final window string[len(string)-k:] is never counted. Confirm this
    is intended before changing the range.

    Performance
    -----------
    The returned substring:count dict will be no longer than
    <allowed_substrings> when that filter is in use, so the best way to
    guarantee performance is to pass a short set of allowed substrings.
    """
    if allowed_substrings:
        num_allowed = len(allowed_substrings)
    else:
        num_allowed = -1
    common.report('get_substrings:k=%2d,allowed_substrings=%5d,size=%7d' %
                  (k, num_allowed, len(string)))

    counts = {}
    for start in range(len(string) - k):
        window = string[start:start + k]
        if common.is_junk(window):
            continue
        if allowed_substrings and window not in allowed_substrings:
            continue
        counts[window] = counts.get(window, 0) + 1
    return counts
def get_child_offsets(file_names, test_files, offsets_dict, k):
    """Grow the length <k> substrings in <offsets_dict> into length k+1 substrings

    Each parent substring of length <k> at offset ofs gives two candidate
    children of length k+1: the one starting at ofs-1 (one character
    added on the left) and the one starting at ofs (one character added
    on the right).

    Arguments
    ---------
    <file_names> is the keys of <test_files> in the desired processing
        order (shorter first).
    <test_files>[<filename>] is a dict with at least the keys
        'text' (the text that is searched) and
        'repeats' (the minimum number of offsets a substring must have
        in that file to be kept).
    <offsets_dict>[<filename>][<substring>] is the set of offsets of the
        length <k> <substring> in <test_files>[<filename>]['text'].
    <k> is the length of the parent substrings.

    Returns
    -------
    A dict with the same layout as <offsets_dict> but for substrings of
    length k+1, or None if any file is left with no substrings.

    Raises ValueError if common.is_validate() is true and
    validate_child_offsets() rejects the result.

    Filters
    -------
    A candidate child is kept only if
      - it is in the surviving children of the previously processed
        file (skipped for the first file, and whenever that set is
        empty),
      - both of its length <k> sub-strings (first character removed and
        last character removed) are parents in <file_names>[0],
      - common.is_junk() is false for it.
    After a file is processed, children with fewer than 'repeats'
    offsets are pruned. Finally every file is trimmed to the children
    that survived in the last file.

    Boundary handling: a parent at offset 0, or at offset
    len(text)-k-1, is dropped entirely so neither of its children is
    generated. Offsets are assumed never to exceed len(text)-k-1; the
    assert below fires if a child would be truncated.

    Performance
    -----------
    This is the inner loop of the program. The returned dict will be no
    longer than <offsets_dict> and string searches are on existing
    substrings + 1 character to left or right so there is not that much
    text to search. Membership tests use sets so they do not scan lists.
    """
    common.report('get_child_offsets(file_names=%d,test_files=%d,%d,substrings=%d,k=%d)' %
                  (len(file_names), len(test_files), len(offsets_dict),
                   len(list(offsets_dict.values())[0]), k))

    # A set gives O(1) membership tests in the inner loop
    parent_substrings = set(offsets_dict[file_names[0]])

    child_offsets_dict = {}
    allowed_substrings = None

    for name in file_names:
        x = test_files[name]
        text = x['text']
        text_len = len(text)
        children = {}
        child_offsets_dict[name] = children

        for ofs_set in offsets_dict[name].values():
            # Use a list which unlike a set can be indexed and sorted
            ofs_list = sorted(ofs_set)

            # Remove parent offsets that would truncate substrings of
            # length k+1. The emptiness checks stop an IndexError when a
            # parent has no offsets or its only offset is removed here.
            if ofs_list and ofs_list[0] == 0:
                del ofs_list[0]
            if ofs_list and ofs_list[-1] + k + 1 == text_len:
                del ofs_list[-1]

            # Create the child length k+1 substrings and add them to the
            # child offsets dict.
            # ofs1 is the offset of the k+1 substring key1
            for ofs in ofs_list:
                for ofs1 in (ofs - 1, ofs):
                    key1 = text[ofs1:ofs1 + k + 1]
                    assert len(key1) == k + 1

                    if allowed_substrings and key1 not in allowed_substrings:
                        continue
                    # Only allow keys with valid parents
                    if (key1[1:] not in parent_substrings
                            or key1[:-1] not in parent_substrings):
                        continue
                    # Get rid of the junk too
                    if common.is_junk(key1):
                        continue

                    # Got through all the filters.
                    # Add the new offset to the child dict
                    children.setdefault(key1, set()).add(ofs1)

        # Prune the entries with insufficient repeats.
        # Iterate over a snapshot because entries are deleted in the loop.
        unpruned_len = len(children)
        for key, ofs_set in list(children.items()):
            if len(ofs_set) < x['repeats']:
                del children[key]

        # allowed_substrings is used as a filter in all but first pass
        # through this loop
        allowed_substrings = set(children)
        common.report(' allowed_substrings=%3d,%3d,size=%7d' %
                      (unpruned_len, len(allowed_substrings), text_len))

    # Need to go back and trim the substrings lists to allowed_substrings
    # If this results in a zero length list for any file then returns
    for name in file_names:
        children = child_offsets_dict[name]
        for key in list(children):
            if key not in allowed_substrings:
                del children[key]
        if len(children) == 0:
            return None

    common.dump_dict('dumpfile_%03d' % (k + 1), file_names, test_files,
                     child_offsets_dict)

    if common.is_validate():
        if not validate_child_offsets(file_names, offsets_dict,
                                      child_offsets_dict, k):
            raise ValueError

    for name in file_names:
        common.report('before=%3d,after=%3d,file=%s' %
                      (len(offsets_dict[name]),
                       len(child_offsets_dict[name]), name))

    return child_offsets_dict