def _prep_iterable(self, obj, parent, parents_ids=EMPTY_FROZENSET):
    """Build the pre-hash string for an iterable.

    Each element is hashed through ``self._hash``. Elements rejected by
    ``self._skip_this`` and elements whose id is already in ``parents_ids``
    (i.e. reference loops) are left out.

    Args:
        obj: The iterable to process.
        parent: Path of ``obj`` in the report (for example ``"root"``).
        parents_ids: Frozen set of ids of the containers currently being
            processed above ``obj``.

    Returns:
        ``KEY_TO_VAL_STR`` formatted with the type name of ``obj`` and the
        comma-joined, sorted element hashes. When ``self.ignore_repetition``
        is false every element hash is suffixed with ``|<occurrences>``.
    """
    result = defaultdict(int)

    for i, item in enumerate(obj):
        # Path of this very element, e.g. "root[0]".
        item_path = "{}[{}]".format(parent, i)
        if self._skip_this(item, parent=item_path):
            continue

        item_id = get_id(item)
        if parents_ids and item_id in parents_ids:
            continue

        parents_ids_added = add_to_frozen_set(parents_ids, item_id)
        # Hand the element's own path down (not the container's) so that
        # anything nested below it is reported under "parent[i]...".
        hashed = self._hash(item, parent=item_path, parents_ids=parents_ids_added)
        # counting repetitions
        result[hashed] += 1

    if self.ignore_repetition:
        result = list(result.keys())
    else:
        result = [
            '{}|{}'.format(hashed, occurrences)
            for hashed, occurrences in result.items()
        ]

    # making sure the result items are string and sorted so join command works.
    result = sorted(map(str, result))
    result = ','.join(result)
    return KEY_TO_VAL_STR.format(type(obj).__name__, result)
def _prep_dict(self, obj, parent, parents_ids=EMPTY_FROZENSET, print_as_attribute=False, original_type=None):
    """Build the pre-hash string for a mapping.

    Keys and values are both hashed through ``self._hash``. Each surviving
    pair is rendered with ``KEY_TO_VAL_STR``; the pairs are sorted, joined
    with ``;`` and wrapped as ``"<type>:{<pairs>}"``.

    Args:
        obj: The mapping to process.
        parent: Path of ``obj`` in the report.
        parents_ids: Frozen set of ids of the containers being processed
            above ``obj``; values whose id is in it are left out.
        print_as_attribute: Selects the path template from
            ``INDEX_VS_ATTRIBUTE`` and switches the type label from the
            literal ``'dict'`` to a type-derived name.
        original_type: Type used for the label instead of ``type(obj)``
            when ``print_as_attribute`` is true.

    Returns:
        The pre-hash string of the mapping.
    """
    path_template = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
    pairs = []

    for key, item in obj.items():
        if not print_as_attribute and isinstance(key, strings):
            shown_key = "'%s'" % key
        else:
            shown_key = key
        item_path = path_template % (parent, shown_key)

        # The key is hashed before the value is checked, so the key is
        # recorded even when its value ends up being left out.
        key_hash = self._hash(key, parent=item_path, parents_ids=parents_ids)

        item_id = get_id(item)
        if parents_ids and item_id in parents_ids:
            continue
        if self._skip_this(item, parent=item_path):
            continue

        value_hash = self._hash(
            item,
            parent=item_path,
            parents_ids=add_to_frozen_set(parents_ids, item_id))
        pairs.append(KEY_TO_VAL_STR.format(key_hash, value_hash))

    body = ';'.join(sorted(pairs))

    if not print_as_attribute:
        type_str = 'dict'
    else:
        type_ = original_type or type(obj)
        type_str = type_.__name__
        # The first matching group replaces the label with all of its names.
        for type_group in self.ignore_type_in_groups:
            if self.type_check_func(type_, type_group):
                type_str = ','.join(member.__name__ for member in type_group)
                break

    return "%s:{%s}" % (type_str, body)
def __contains__(self, obj):
    """Tell whether an entry is stored for ``obj``.

    Hashable objects are looked up as they are; objects for which
    ``hash()`` raises ``TypeError`` are looked up under ``get_id(obj)``.
    """
    lookup_key = obj
    try:
        hash(obj)
    except TypeError:
        # Unhashable objects cannot be keys themselves.
        lookup_key = get_id(obj)
    return super().__contains__(lookup_key)
def __getitem__(self, obj):
    """Return the entry stored for ``obj``.

    The object itself is tried as the key first. When that lookup raises
    ``TypeError`` or ``KeyError`` the entry is fetched under
    ``get_id(obj)`` instead.

    Raises:
        KeyError: If neither lookup finds an entry.
    """
    try:
        return super().__getitem__(obj)
    except (TypeError, KeyError):
        fallback_key = get_id(obj)
        try:
            return super().__getitem__(fallback_key)
        except KeyError:
            raise KeyError('{} is not one of the hashed items.'.format(obj)) from None
def __getitem__(self, obj):
    """Look up the stored entry of ``obj``, by value first and then by id.

    A ``TypeError`` or ``KeyError`` from the direct lookup triggers a second
    lookup under ``get_id(obj)``.

    Raises:
        KeyError: If the second lookup misses as well.
    """
    fetch = super().__getitem__
    try:
        found = fetch(obj)
    except (TypeError, KeyError):
        try:
            found = fetch(get_id(obj))
        except KeyError:
            message = '{} is not one of the hashed items.'.format(obj)
            raise KeyError(message) from None
    return found
def _prep_dict(self, obj, parent, parents_ids=EMPTY_FROZENSET, print_as_attribute=False, original_type=None):
    """Build the pre-hash string for a mapping and count what was visited.

    Args:
        obj: The mapping to process.
        parent: Path of ``obj`` in the report.
        parents_ids: Frozen set of ids of the containers being processed
            above ``obj``; values whose id is in it are left out.
        print_as_attribute: Selects the path template from
            ``INDEX_VS_ATTRIBUTE`` and switches the type label from the
            literal ``'dict'`` to a type-derived name.
        original_type: Type used for the label instead of ``type(obj)``
            when ``print_as_attribute`` is true.

    Returns:
        A ``(prepared_string, counts)`` tuple. ``counts`` starts at 1, grows
        by 1 for every key seen (skipped or not) and by the count that
        ``self._hash`` reports for every value that was hashed.
    """
    path_template = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
    entries = []
    counts = 1

    for key, item in obj.items():
        # Every key is counted, including the ones skipped further down.
        counts += 1

        # ignore private variables
        if self.ignore_private_variables:
            if isinstance(key, str) and key.startswith('__'):
                continue

        if not print_as_attribute and isinstance(key, strings):
            shown_key = "'%s'" % key
        else:
            shown_key = key
        item_path = path_template % (parent, shown_key)

        key_hash, _ = self._hash(key, parent=item_path, parents_ids=parents_ids)
        if not key_hash:
            # No usable hash came back for the key: drop the whole pair.
            continue

        item_id = get_id(item)
        if parents_ids and item_id in parents_ids:
            continue
        if self._skip_this(item, parent=item_path):
            continue

        value_hash, value_count = self._hash(
            item,
            parent=item_path,
            parents_ids=add_to_frozen_set(parents_ids, item_id))
        entries.append(KEY_TO_VAL_STR.format(key_hash, value_hash))
        counts += value_count

    body = ';'.join(sorted(entries))

    if not print_as_attribute:
        type_str = 'dict'
    else:
        type_ = original_type or type(obj)
        type_str = type_.__name__
        # The first matching group replaces the label with all of its names.
        for type_group in self.ignore_type_in_groups:
            if self.type_check_func(type_, type_group):
                type_str = ','.join(member.__name__ for member in type_group)
                break

    return "{}:{{{}}}".format(type_str, body), counts
def test_dict1(self):
    """DeepHash of a small dict with sha1, ignoring string type changes."""
    text_value = "a"
    text_key = "key1"
    obj = {text_key: text_value, 1: 10, 2: 20}

    # Entries expected for the integers found as keys and values.
    expected_result = {
        1: DeepHash.sha1hex('int:1'),
        10: DeepHash.sha1hex('int:10'),
        2: DeepHash.sha1hex('int:2'),
        20: DeepHash.sha1hex('int:20'),
    }
    # Entries expected for the two strings and for the dict itself.
    expected_result[text_key] = '1073ab6cda4b991cd29f9e83a307f34004ae9327'
    expected_result[text_value] = '86f7e437faa5a7fce15d1ddcb9eaeaea377667b8'
    expected_result[get_id(obj)] = '11e23f096df81b1ccab0c309cdf8b4ba5a0a6895'

    result = DeepHash(
        obj,
        ignore_string_type_changes=True,
        hasher=DeepHash.sha1hex,
    )
    assert expected_result == result
def do_list_or_tuple(self, func, func_str):
    """Shared check for DeepHashPrep on a list or tuple of ``"a", 10, 20``.

    Args:
        func: Container constructor applied to the sample items.
        func_str: Type label expected at the start of the container entry.
    """
    text_item = "a"
    container = func([text_item, 10, 20])
    # A list is expected under its id; any other container under itself.
    container_key = get_id(container) if func is list else container

    prepped_text = prep_str(text_item)
    container_entry = '{}:{},int:10,int:20'.format(func_str, prepped_text)
    expected_result = {
        10: 'int:10',
        20: 'int:20',
        text_item: prepped_text,
        container_key: container_entry,
    }

    result = DeepHashPrep(container, ignore_string_type_changes=True)
    assert expected_result == result
def test_dict_hash(self):
    """DeepHashPrep of a small dict, ignoring string type changes."""
    text_value = "a"
    prepped_value = prep_str(text_value)
    text_key = "key1"
    prepped_key = prep_str(text_key)
    obj = {text_key: text_value, 1: 10, 2: 20}

    # The dict entry is built from the raw strings, not the prepped ones.
    dict_entry = 'dict:{int:1:int:10;int:2:int:20;%s:%s}' % (text_key, text_value)
    expected_result = {
        1: 'int:1',
        10: 'int:10',
        2: 'int:2',
        20: 'int:20',
        text_key: prepped_key,
        text_value: prepped_value,
        get_id(obj): dict_entry,
    }

    result = DeepHashPrep(obj, ignore_string_type_changes=True)
    assert expected_result == result
def _prep_dict(self, obj, parent, parents_ids=EMPTY_FROZENSET, print_as_attribute=False, original_type=None):
    """Build the pre-hash string for a mapping.

    Keys and values are hashed through ``self._hash``; the rendered pairs are
    sorted, joined with ``;`` and wrapped as ``"<type>:{<pairs>}"``.

    Args:
        obj: The mapping to process.
        parent: Path of ``obj`` in the report.
        parents_ids: Frozen set of ids of the containers being processed
            above ``obj``; values whose id is in it are left out.
        print_as_attribute: Selects the path template from
            ``INDEX_VS_ATTRIBUTE`` and switches the type label from the
            literal ``'dict'`` to a type-derived name.
        original_type: Type used for the label instead of ``type(obj)``
            when ``print_as_attribute`` is true.

    Returns:
        The pre-hash string of the mapping.
    """
    path_template = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
    rendered = []

    for key, item in obj.items():
        quote_key = not print_as_attribute and isinstance(key, strings)
        shown_key = "'%s'" % key if quote_key else key
        item_path = path_template % (parent, shown_key)

        # Hashing the key first records it even if the value is left out.
        key_hash = self._hash(key, parent=item_path, parents_ids=parents_ids)

        item_id = get_id(item)
        already_a_parent = parents_ids and item_id in parents_ids
        if already_a_parent or self._skip_this(item, parent=item_path):
            continue

        deeper_ids = add_to_frozen_set(parents_ids, item_id)
        value_hash = self._hash(item, parent=item_path, parents_ids=deeper_ids)
        rendered.append(KEY_TO_VAL_STR.format(key_hash, value_hash))

    rendered.sort()
    body = ';'.join(rendered)

    type_str = 'dict'
    if print_as_attribute:
        type_ = original_type or type(obj)
        type_str = type_.__name__
        # The first group containing the type replaces the label with all
        # of the group's names.
        for type_group in self.ignore_type_in_groups:
            if type_ in type_group:
                type_str = ','.join(member.__name__ for member in type_group)
                break

    return "%s:{%s}" % (type_str, body)
def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
    """Return the hash of ``obj``, computing and storing it when needed.

    An entry already stored for ``obj`` is returned as is. Otherwise the
    object is turned into a prepared string by the branch matching its type,
    optionally run through ``self.hasher`` (when ``self.apply_hash`` is
    true), stored in ``self`` and returned.

    Args:
        obj: The object to hash.
        parent: Path of ``obj`` in the report; passed to ``self._skip_this``
            and on to the container helpers.
        parents_ids: Frozen set of ids of the containers being processed
            above ``obj``.

    Returns:
        The stored or freshly computed result, or ``None`` when
        ``self._skip_this`` rejects ``obj``.
    """
    if isinstance(obj, bool):
        # Real booleans are swapped for the value returned by _prep_bool
        # before the lookup (presumably a BoolObj member, so that True/False
        # do not share the entries of 1/0 -- confirm against _prep_bool).
        obj = self._prep_bool(obj)

    try:
        result = self[obj]
    except (TypeError, KeyError):
        pass
    else:
        return result

    result = not_hashed

    if self._skip_this(obj, parent):
        return None
    elif obj is None:
        result = 'NONE'
    elif isinstance(obj, strings):
        result = prepare_string_for_hashing(
            obj,
            ignore_string_type_changes=self.ignore_string_type_changes,
            ignore_string_case=self.ignore_string_case)
    elif isinstance(obj, numbers):
        result = self._prep_number(obj)
    elif isinstance(obj, MutableMapping):
        result = self._prep_dict(obj=obj, parent=parent, parents_ids=parents_ids)
    elif isinstance(obj, tuple):
        result = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)
    elif isinstance(obj, Iterable):
        result = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
    elif obj is BoolObj.TRUE or obj is BoolObj.FALSE:
        # Identity checks on purpose: a set membership test would hash
        # ``obj`` and raise TypeError for unhashable objects (e.g. instances
        # of a class defining __eq__ without __hash__) before they could
        # reach _prep_obj below.
        result = 'bool:true' if obj is BoolObj.TRUE else 'bool:false'
    else:
        result = self._prep_obj(obj=obj, parent=parent, parents_ids=parents_ids)

    if result is not_hashed:  # pragma: no cover
        self[UNPROCESSED].append(obj)
    elif result is unprocessed:
        pass
    elif self.apply_hash:
        if isinstance(obj, strings):
            # Strings were already prepared by the branch above.
            result_cleaned = result
        else:
            result_cleaned = prepare_string_for_hashing(
                result,
                ignore_string_type_changes=self.ignore_string_type_changes,
                ignore_string_case=self.ignore_string_case)
        result = self.hasher(result_cleaned)

    # It is important to keep the hash of all objects.
    # The hashes will be later used for comparing the objects.
    try:
        self[obj] = result
    except TypeError:
        # Unhashable objects are stored under their id instead.
        obj_id = get_id(obj)
        self[obj_id] = result

    return result
def test_prep_iterable_with_excluded_type(self):
    """A Logger inside a list gets no entry when Logger is excluded."""
    excluded_item = logging.getLogger("test")
    items = [1, excluded_item]

    prepped = DeepHashPrep(items, exclude_types={logging.Logger})

    assert get_id(excluded_item) not in prepped
def test_prep_iterable_with_loop(self):
    """A list containing itself is prepared without the self-reference."""
    looped = [1]
    looped.append(looped)

    result = DeepHashPrep(looped)

    expected_result = {
        get_id(looped): 'list:int:1',
        1: 'int:1',
    }
    assert expected_result == result
def test_prep_dic_with_loop(self):
    """A dict holding itself as a value is prepared without that value."""
    looped = {2: 1337}
    looped[1] = looped

    result = DeepHashPrep(looped)

    # Key 1 still has its own entry although the value under it is left out
    # of the dict's prepared string.
    expected_result = {
        get_id(looped): 'dict:{int:2:int:1337}',
        1: 'int:1',
        2: 'int:2',
        1337: 'int:1337',
    }
    assert expected_result == result
def test_skip_type(self):
    """A Logger stored as a dict value gets no entry when Logger is excluded."""
    excluded_item = logging.getLogger("test")
    mapping = {"log": excluded_item, 2: 1337}

    prepped = DeepHashPrep(mapping, exclude_types={logging.Logger})

    assert get_id(excluded_item) not in prepped
def test_dictionary(self):
    """DeepHash of ``{1: 1}`` holds exactly two keys: 1 and the dict's id."""
    mapping = {1: 1}
    hashes = DeepHash(mapping)
    expected_keys = {1, get_id(mapping)}
    assert set(hashes.keys()) == expected_keys
def __init__(self,
             obj,
             *,
             hashes=None,
             exclude_types=None,
             exclude_paths=None,
             exclude_regex_paths=None,
             hasher=None,
             ignore_repetition=True,
             significant_digits=None,
             number_format_notation="f",
             apply_hash=True,
             ignore_type_in_groups=None,
             ignore_string_type_changes=False,
             ignore_numeric_type_changes=False,
             ignore_type_subclasses=False,
             ignore_string_case=False,
             number_to_string_func=None,
             **kwargs):
    """Hash ``obj`` and store the resulting entries in ``self``.

    Args:
        obj: The object to hash; also kept as ``self.obj``.
        hashes: Optional mapping of existing entries merged into ``self``
            before hashing starts.
        exclude_types: Optional iterable of types; kept as a tuple in
            ``self.exclude_types_tuple``.
        exclude_paths: Normalized by
            ``convert_item_or_items_into_set_else_none``.
        exclude_regex_paths: Normalized by
            ``convert_item_or_items_into_compiled_regexes_else_none``.
        hasher: Callable stored as ``self.hasher``. Defaults to
            ``self.murmur3_128bit`` when ``mmh3`` is truthy, otherwise to
            ``self.sha256hex``.
        ignore_repetition: Stored as ``self.ignore_repetition``.
        significant_digits: Passed through ``self.get_significant_digits``
            together with ``ignore_numeric_type_changes``.
        number_format_notation: Stored as ``self.number_format_notation``.
        apply_hash: Stored as ``self.apply_hash``; per the original note it
            should only be False when testing the per-type preparation.
        ignore_type_in_groups: Passed to ``self.get_ignore_types_in_groups``
            together with the three ``ignore_*`` type flags.
        ignore_string_type_changes: Stored on the instance.
        ignore_numeric_type_changes: Stored on the instance.
        ignore_type_subclasses: Selects ``type_is_subclass_of_type_group``
            over ``type_in_type_group`` as ``self.type_check_func``.
        ignore_string_case: Stored on the instance.
        number_to_string_func: Replacement for the module-level
            ``number_to_string``; stored as ``self.number_to_string``.
        **kwargs: Not accepted; present only to report unknown names.

    Raises:
        ValueError: If any unknown keyword argument is given.
    """
    if kwargs:
        raise ValueError(
            ("The following parameter(s) are not valid: %s\n"
             "The valid parameters are obj, hashes, exclude_types, significant_digits, "
             "exclude_paths, exclude_regex_paths, hasher, ignore_repetition, "
             "number_format_notation, apply_hash, ignore_type_in_groups, ignore_string_type_changes, "
             "ignore_numeric_type_changes, ignore_type_subclasses, ignore_string_case, "
             "number_to_string_func") % ', '.join(kwargs.keys()))

    self.obj = obj
    exclude_types = set() if exclude_types is None else set(exclude_types)
    self.exclude_types_tuple = tuple(exclude_types)  # we need tuple for checking isinstance
    self.ignore_repetition = ignore_repetition
    self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
    self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)

    default_hasher = self.murmur3_128bit if mmh3 else self.sha256hex
    self.hasher = default_hasher if hasher is None else hasher

    hashes = hashes if hashes else {}
    self.update(hashes)
    # Collects the objects that could not be hashed; removed again at the
    # end when it stays empty.
    self[UNPROCESSED] = []

    self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
    self.number_format_notation = number_format_notation
    self.ignore_type_in_groups = self.get_ignore_types_in_groups(
        ignore_type_in_groups=ignore_type_in_groups,
        ignore_string_type_changes=ignore_string_type_changes,
        ignore_numeric_type_changes=ignore_numeric_type_changes,
        ignore_type_subclasses=ignore_type_subclasses)
    self.ignore_string_type_changes = ignore_string_type_changes
    self.ignore_numeric_type_changes = ignore_numeric_type_changes
    self.ignore_string_case = ignore_string_case
    # makes the hash return constant size result if true
    # the only time it should be set to False is when
    # testing the individual hash functions for different types of objects.
    self.apply_hash = apply_hash
    self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
    self.number_to_string = number_to_string_func or number_to_string

    # The root object's own id is seeded as a parent id.
    self._hash(obj, parent="root", parents_ids=frozenset({get_id(obj)}))

    if self[UNPROCESSED]:
        logger.warning("Can not hash the following items: {}.".format(self[UNPROCESSED]))
    else:
        del self[UNPROCESSED]
def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
    """Return the hash of ``obj``, computing and storing it when needed.

    An entry already stored for ``obj`` is returned as is. Otherwise the
    object is turned into a prepared string by the branch matching its type,
    run through ``self.hasher`` when ``self.apply_hash`` is true, stored in
    ``self`` and returned.

    Args:
        obj: The object to hash.
        parent: Path of ``obj`` in the report.
        parents_ids: Frozen set of ids of the containers being processed
            above ``obj``.

    Returns:
        The stored or freshly computed result, or ``None`` when
        ``self._skip_this`` rejects ``obj``.
    """
    # Reuse a stored entry; unhashable objects raise TypeError here.
    try:
        cached = self[obj]
    except (TypeError, KeyError):
        pass
    else:
        return cached

    if self._skip_this(obj, parent):
        return None

    string_options = {
        'ignore_string_type_changes': self.ignore_string_type_changes,
        'ignore_string_case': self.ignore_string_case,
    }
    container_args = {'obj': obj, 'parent': parent, 'parents_ids': parents_ids}

    if obj is None:
        result = 'NONE'
    elif isinstance(obj, strings):
        result = prepare_string_for_hashing(obj, **string_options)
    elif isinstance(obj, numbers):
        result = self._prep_number(obj)
    elif isinstance(obj, MutableMapping):
        result = self._prep_dict(**container_args)
    elif isinstance(obj, tuple):
        result = self._prep_tuple(**container_args)
    elif isinstance(obj, Iterable):
        result = self._prep_iterable(**container_args)
    else:
        result = self._prep_obj(**container_args)

    if result is not_hashed:  # pragma: no cover
        self[UNPROCESSED].append(obj)
    elif result is not unprocessed and self.apply_hash:
        if isinstance(obj, strings):
            # Strings were already prepared by the branch above.
            to_hash = result
        else:
            to_hash = prepare_string_for_hashing(result, **string_options)
        result = self.hasher(to_hash)

    # It is important to keep the hash of all objects.
    # The hashes will be later used for comparing the objects.
    try:
        self[obj] = result
    except TypeError:
        # Unhashable objects are stored under their id instead.
        self[get_id(obj)] = result

    return result
def __init__(self,
             obj,
             *,
             hashes=None,
             exclude_types=None,
             exclude_paths=None,
             exclude_regex_paths=None,
             hasher=None,
             ignore_repetition=True,
             significant_digits=None,
             number_format_notation="f",
             apply_hash=True,
             ignore_type_in_groups=None,
             ignore_string_type_changes=False,
             ignore_numeric_type_changes=False,
             ignore_type_subclasses=False,
             ignore_string_case=False,
             number_to_string_func=None,
             **kwargs):
    """Set up the options, then hash ``obj`` into ``self``.

    Args:
        obj: The object to hash; also kept as ``self.obj``.
        hashes: Optional mapping of existing entries merged into ``self``
            before hashing starts.
        exclude_types: Optional iterable of types; kept as a tuple in
            ``self.exclude_types_tuple``.
        exclude_paths: Normalized by
            ``convert_item_or_items_into_set_else_none``.
        exclude_regex_paths: Normalized by
            ``convert_item_or_items_into_compiled_regexes_else_none``.
        hasher: Callable stored as ``self.hasher``. Defaults to
            ``self.murmur3_128bit`` when ``mmh3`` is truthy, otherwise to
            ``self.sha256hex``.
        ignore_repetition: Stored as ``self.ignore_repetition``.
        significant_digits: Passed through ``self.get_significant_digits``
            together with ``ignore_numeric_type_changes``.
        number_format_notation: Stored as ``self.number_format_notation``.
        apply_hash: Stored as ``self.apply_hash``; per the original note it
            should only be False when testing the per-type preparation.
        ignore_type_in_groups: Passed to ``self.get_ignore_types_in_groups``
            together with the three ``ignore_*`` type flags.
        ignore_string_type_changes: Stored on the instance.
        ignore_numeric_type_changes: Stored on the instance.
        ignore_type_subclasses: Selects ``type_is_subclass_of_type_group``
            over ``type_in_type_group`` as ``self.type_check_func``.
        ignore_string_case: Stored on the instance.
        number_to_string_func: Replacement for the module-level
            ``number_to_string``; stored as ``self.number_to_string``.
        **kwargs: Not accepted; present only to report unknown names.

    Raises:
        ValueError: If any unknown keyword argument is given.
    """
    if kwargs:
        unknown_names = ', '.join(kwargs.keys())
        raise ValueError(
            ("The following parameter(s) are not valid: %s\n"
             "The valid parameters are obj, hashes, exclude_types, significant_digits, "
             "exclude_paths, exclude_regex_paths, hasher, ignore_repetition, "
             "number_format_notation, apply_hash, ignore_type_in_groups, ignore_string_type_changes, "
             "ignore_numeric_type_changes, ignore_type_subclasses, ignore_string_case, "
             "number_to_string_func") % unknown_names)

    self.obj = obj
    exclude_types = set() if exclude_types is None else set(exclude_types)
    self.exclude_types_tuple = tuple(exclude_types)  # we need tuple for checking isinstance
    self.ignore_repetition = ignore_repetition
    self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
    self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)

    default_hasher = self.murmur3_128bit if mmh3 else self.sha256hex
    self.hasher = default_hasher if hasher is None else hasher

    hashes = hashes if hashes else {}
    self.update(hashes)
    # Collects the objects that could not be hashed; removed again at the
    # end when it stays empty.
    self[UNPROCESSED] = []

    self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
    self.number_format_notation = number_format_notation
    self.ignore_type_in_groups = self.get_ignore_types_in_groups(
        ignore_type_in_groups=ignore_type_in_groups,
        ignore_string_type_changes=ignore_string_type_changes,
        ignore_numeric_type_changes=ignore_numeric_type_changes,
        ignore_type_subclasses=ignore_type_subclasses)
    self.ignore_string_type_changes = ignore_string_type_changes
    self.ignore_numeric_type_changes = ignore_numeric_type_changes
    self.ignore_string_case = ignore_string_case
    # makes the hash return constant size result if true
    # the only time it should be set to False is when
    # testing the individual hash functions for different types of objects.
    self.apply_hash = apply_hash
    self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
    self.number_to_string = number_to_string_func or number_to_string

    # The root object's own id is seeded as a parent id.
    self._hash(obj, parent="root", parents_ids=frozenset({get_id(obj)}))

    if self[UNPROCESSED]:
        logger.warning("Can not hash the following items: {}.".format(self[UNPROCESSED]))
    else:
        del self[UNPROCESSED]