def test_dict_sketch_str_value(self):
    """Check sketch summaries of an SArray holding string-valued dicts.

    The input has three dict rows (two of them identical) plus one None
    row.  The test covers:

    * frequent-item counts of the whole-dict sketch,
    * the dict key / dict value summaries, compared against a sketch of
      the flattened ``dict_keys()`` / ``dict_values()`` output,
    * sub sketches for one key, for a key that is absent from every
      row, for an explicit list of keys, and for all keys (no argument
      passed to ``element_sub_sketch``).
    """
    # NOTE(review): another method with this same name is visible nearby;
    # if both live in one class only the later definition runs -- confirm
    # and deduplicate.

    # Dict value sketch type should be auto inferred
    dict_data = [
        {'a': 'b', 'b': 'c'},
        {'a': 'b', 'b': 'c'},
        {'a': 'd', 'b': '4'},
        None,
    ]
    sa = SArray(data=dict_data)
    self.__validate_nested_sketch_result(sa)

    sketch = sa.sketch_summary()
    fi = sketch.frequent_items()
    self.assertEqual(len(fi), 2)
    self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
    self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

    # Get dict key sketch
    key_summary = sketch.dict_key_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
    self.__validate_sketch_result(key_summary, SArray(another_rep))

    # Get dict value sketch
    value_summary = sketch.dict_value_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_values().dropna())))
    self.__validate_sketch_result(value_summary, SArray(another_rep))

    # The unpacked columns serve as the expected data for every per-key
    # sub sketch below, so unpack once instead of once per key.
    unpacked = sa.unpack(column_name_prefix="")

    # sub sketch with one key
    s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
    self.__validate_sketch_result(s, unpacked['a'])

    # a key that appears in no row: every element counts as undefined
    s = sa.sketch_summary(
        sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
    self.assertEqual(s.num_undefined(), len(sa))

    # sub sketch with multiple keys
    keys = ['a', 'b']
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
    self.assertEqual(len(s), len(keys))
    for key in keys:
        # `in` replaces has_key(), which no longer exists in Python 3
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])

    # allow pass in empty keys, which will retrieve all keys
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
    self.assertEqual(len(s), len(keys))
    for key in keys:
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])
def test_dict_sketch_int_value(self):
    """Check sketch summaries of an SArray holding integer-valued dicts.

    The input mixes an empty dict, dicts with differing key sets, a
    repeated dict and a None row.  The test covers:

    * the unique count and frequent-item counts of the whole-dict sketch,
    * the dict key / dict value summaries, compared against a sketch of
      the flattened ``dict_keys()`` / ``dict_values()`` output,
    * sub sketches for one key, for a key that is absent from every
      row, and for an explicit list of keys.
    """
    # NOTE(review): another method with this same name is visible nearby;
    # if both live in one class only the later definition runs -- confirm
    # and deduplicate.
    dict_data = [
        {},
        {'a': 1, 'b': 2},
        {'a': 1, 'b': 2},
        {'a': 3, 'c': 1},
        {'a': 1, 'b': 2, 'c': 3},
        None,
    ]
    sa = SArray(data=dict_data)
    self.__validate_nested_sketch_result(sa)

    sketch = sa.sketch_summary()
    self.assertEqual(sketch.num_unique(), 4)
    fi = sketch.frequent_items()
    self.assertEqual(len(fi), 4)
    self.assertEqual(fi['{"a":1, "b":2}'], 2)
    self.assertEqual(fi['{"a":3, "c":1}'], 1)

    # Get dict key sketch
    key_summary = sketch.dict_key_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
    self.__validate_sketch_result(key_summary, SArray(another_rep))

    # Get dict value sketch
    value_summary = sketch.dict_value_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_values().dropna())))
    self.__validate_sketch_result(value_summary, SArray(another_rep))

    # The unpacked columns serve as the expected data for every per-key
    # sub sketch below, so unpack once instead of once per key.
    unpacked = sa.unpack(column_name_prefix="")

    # sub sketch with one key
    s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
    self.__validate_sketch_result(s, unpacked['a'])

    # a key that appears in no row: every element counts as undefined
    s = sa.sketch_summary(
        sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
    self.assertEqual(s.num_undefined(), len(sa))

    # sub sketch with multiple keys
    keys = ['a', 'b']
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
    self.assertEqual(len(s), len(keys))
    for key in keys:
        # `in` replaces has_key(), which no longer exists in Python 3
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])
def test_dict_sketch_str_value(self):
    """Check sketch summaries of an SArray holding string-valued dicts.

    The input has three dict rows (two of them identical) plus one None
    row.  The test covers:

    * frequent-item counts of the whole-dict sketch,
    * the dict key / dict value summaries, compared against a sketch of
      the flattened ``dict_keys()`` / ``dict_values()`` output,
    * sub sketches for one key, for a key that is absent from every
      row, for an explicit list of keys, and for all keys (no argument
      passed to ``element_sub_sketch``).
    """
    # NOTE(review): another method with this same name is visible nearby;
    # if both live in one class only the later definition runs -- confirm
    # and deduplicate.

    # Dict value sketch type should be auto inferred
    dict_data = [
        {'a': 'b', 'b': 'c'},
        {'a': 'b', 'b': 'c'},
        {'a': 'd', 'b': '4'},
        None,
    ]
    sa = SArray(data=dict_data)
    self.__validate_nested_sketch_result(sa)

    sketch = sa.sketch_summary()
    fi = sketch.frequent_items()
    self.assertEqual(len(fi), 2)
    self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
    self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

    # Get dict key sketch
    key_summary = sketch.dict_key_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
    self.__validate_sketch_result(key_summary, SArray(another_rep))

    # Get dict value sketch
    value_summary = sketch.dict_value_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_values().dropna())))
    self.__validate_sketch_result(value_summary, SArray(another_rep))

    # The unpacked columns serve as the expected data for every per-key
    # sub sketch below, so unpack once instead of once per key.
    unpacked = sa.unpack(column_name_prefix="")

    # sub sketch with one key
    s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
    self.__validate_sketch_result(s, unpacked['a'])

    # a key that appears in no row: every element counts as undefined
    s = sa.sketch_summary(
        sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
    self.assertEqual(s.num_undefined(), len(sa))

    # sub sketch with multiple keys
    keys = ['a', 'b']
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
    self.assertEqual(len(s), len(keys))
    for key in keys:
        # `in` replaces has_key(), which no longer exists in Python 3
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])

    # allow pass in empty keys, which will retrieve all keys
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
    self.assertEqual(len(s), len(keys))
    for key in keys:
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])
def test_dict_sketch_int_value(self):
    """Check sketch summaries of an SArray holding integer-valued dicts.

    The input mixes an empty dict, dicts with differing key sets, a
    repeated dict and a None row.  The test covers:

    * the unique count and frequent-item counts of the whole-dict sketch,
    * the dict key / dict value summaries, compared against a sketch of
      the flattened ``dict_keys()`` / ``dict_values()`` output,
    * sub sketches for one key, for a key that is absent from every
      row, and for an explicit list of keys.
    """
    # NOTE(review): another method with this same name is visible nearby;
    # if both live in one class only the later definition runs -- confirm
    # and deduplicate.
    dict_data = [
        {},
        {'a': 1, 'b': 2},
        {'a': 1, 'b': 2},
        {'a': 3, 'c': 1},
        {'a': 1, 'b': 2, 'c': 3},
        None,
    ]
    sa = SArray(data=dict_data)
    self.__validate_nested_sketch_result(sa)

    sketch = sa.sketch_summary()
    self.assertEqual(sketch.num_unique(), 4)
    fi = sketch.frequent_items()
    self.assertEqual(len(fi), 4)
    self.assertEqual(fi['{"a":1, "b":2}'], 2)
    self.assertEqual(fi['{"a":3, "c":1}'], 1)

    # Get dict key sketch
    key_summary = sketch.dict_key_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
    self.__validate_sketch_result(key_summary, SArray(another_rep))

    # Get dict value sketch
    value_summary = sketch.dict_value_summary()
    another_rep = list(
        itertools.chain.from_iterable(list(sa.dict_values().dropna())))
    self.__validate_sketch_result(value_summary, SArray(another_rep))

    # The unpacked columns serve as the expected data for every per-key
    # sub sketch below, so unpack once instead of once per key.
    unpacked = sa.unpack(column_name_prefix="")

    # sub sketch with one key
    s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
    self.__validate_sketch_result(s, unpacked['a'])

    # a key that appears in no row: every element counts as undefined
    s = sa.sketch_summary(
        sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
    self.assertEqual(s.num_undefined(), len(sa))

    # sub sketch with multiple keys
    keys = ['a', 'b']
    s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
    self.assertEqual(len(s), len(keys))
    for key in keys:
        # `in` replaces has_key(), which no longer exists in Python 3
        self.assertIn(key, s)
        self.__validate_sketch_result(s[key], unpacked[key])