def construct_subhash_vectors(fname, dup_map):
    """Collect the set of checksums per file, substituting numeric ids
    (fno, hno) for text values.

    FnameMap and ChecksumMap are reset first, then ``fname`` is read line
    by line.  Each line is parsed with parse_md5deep_subfile_entry into a
    (checksum, name) pair; consecutive lines sharing the same name are
    grouped and handed to construct_vector together with ``dup_map``.

    Returns the list of vectors for which construct_vector gave a truthy
    result; rejected groups are omitted.  An empty input file yields [].
    """
    result = []
    FnameMap.reset()  # initialize mapping tables
    ChecksumMap.reset()
    last_name = ""
    hash_set = []
    # The context manager closes the file even if parsing raises.
    with open(fname) as fd:
        for text in fd:
            (val, name) = parse_md5deep_subfile_entry(text)
            if name != last_name:
                # Name changed: flush the group collected so far.
                vec = construct_vector(last_name, hash_set, dup_map)
                if vec:
                    result.append(vec)
                last_name = name
                hash_set = []
            hash_set.append(val)
    # Flush the trailing group.  last_name equals the most recently parsed
    # name here, and stays "" for an empty file (which construct_vector
    # rejects) instead of referencing an unbound loop variable.
    vec = construct_vector(last_name, hash_set, dup_map)
    if vec:
        result.append(vec)
    return result
def build_graph_from_vectors(vector_set, show_subgraph=False):
    """Create the top-level graph from a set of vectors.

    For every (fno, hset) pair in ``vector_set`` this adds one node keyed
    by FnameMap.encode(fno) with attribute bipartite=0, one node per
    checksum keyed by ChecksumMap.encode(hno) with attribute bipartite=1,
    and an edge between the file node and each of its checksum nodes.

    ``show_subgraph`` is accepted but not used by this function.

    Returns the populated nx.Graph.
    """
    B = nx.Graph()
    for fno, hset in vector_set:
        file_node = FnameMap.encode(fno)
        B.add_node(file_node, bipartite=0)
        for hno in hset:
            csum_node = ChecksumMap.encode(hno)
            # Nodes are stored under their encoded ids, so membership has
            # to be tested with the encoded key rather than the raw hno.
            if csum_node not in B:
                B.add_node(csum_node, bipartite=1)
            B.add_edge(file_node, csum_node)
    return B
def construct_vector(name, hash_set, dup_map):
    """Build the [file id, [checksum ids]] vector for one file.

    Returns False (no vector) when ``name`` is empty, when ``name`` is
    listed in ``dup_map`` (duplicate), or when ``hash_set`` holds fewer
    than two entries (empty or singleton).  Otherwise returns a two-item
    list: FnameMap.get_id(name) followed by the list of
    ChecksumMap.get_id(...) values, one per entry of ``hash_set``.
    """
    # Checks run in this order and short-circuit: no file, duplicate,
    # then too few checksums.
    if name == "" or name in dup_map or len(hash_set) < 2:
        return False

    file_id = FnameMap.get_id(name)
    checksum_ids = []
    for hval in hash_set:
        checksum_ids.append(ChecksumMap.get_id(hval))
    return [file_id, checksum_ids]
def prune_vectors(vector_set, min_blocks):
    """Keep only shared checksums, and only vectors with enough of them.

    For each (fno, hset) pair, checksums whose ChecksumMap.get_count(hno)
    is not greater than 1 are dropped.  The pair is kept, as
    [fno, remaining checksums], only if at least ``min_blocks`` checksums
    remain.

    Returns a new list; ``vector_set`` is not modified.
    """
    kept = []
    for fno, hset in vector_set:
        shared = [hno for hno in hset if ChecksumMap.get_count(hno) > 1]
        if len(shared) < min_blocks:
            continue
        kept.append([fno, shared])
    return kept
def find_conflicting_checksums(csums, graph):
    """Find those block checksums that map to the same file region.

    Each checksum id in ``csums`` is grouped by the value returned from
    ChecksumMap.get_range_using_encoded_id(hno).

    ``graph`` is accepted but not used by this function.

    Returns a 3-tuple (compatible, conflicting, ranges):
      compatible  -- checksum ids that are alone in their range
      conflicting -- flat list of checksum ids sharing a range with at
                     least one other id, in grouping order
      ranges      -- dict mapping each contested range to its list of ids
    """
    range_sets = {}
    for hno in csums:
        range_val = ChecksumMap.get_range_using_encoded_id(hno)
        range_sets.setdefault(range_val, []).append(hno)

    compatible = [group[0] for group in range_sets.values() if len(group) == 1]
    ranges = {key: group for key, group in range_sets.items() if len(group) > 1}
    # Flatten in a single linear pass; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition.
    conflicting = [hno for group in ranges.values() for hno in group]
    return compatible, conflicting, ranges
def resolve_csums(csums):
    """Return a list with ChecksumMap.get_hval_using_encoded_id(hno)
    applied to every id in ``csums``, in the same order."""
    hvals = []
    for hno in csums:
        hvals.append(ChecksumMap.get_hval_using_encoded_id(hno))
    return hvals