def test_group_by_advanced():
    """
    Collect records by user, with and without de-duplication, and summarize.

    The fixture holds five records for "bob" (values 1, 2, 1, 2, 1), six for
    "alice" (3, 4, 3, 4, 5, 5) and two for "eve" (6, 7). The user names must be
    real, distinct values: the assertions below look the groups up by name.

    `summarize` is defined elsewhere in the test module; the assertions expect
    it to yield a dict with "count", "min" and "max" for each group.
    """
    data = [
        {"user": "bob", "value": 1},
        {"user": "bob", "value": 2},
        {"user": "bob", "value": 1},
        {"user": "bob", "value": 2},
        {"user": "bob", "value": 1},
        {"user": "alice", "value": 3},
        {"user": "alice", "value": 4},
        {"user": "alice", "value": 3},
        {"user": "alice", "value": 4},
        {"user": "alice", "value": 5},
        {"user": "alice", "value": 5},
        {"user": "eve", "value": 6},
        {"user": "eve", "value": 7},
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)

    # don't deduplicate: every record is counted
    groups = ds.collect_set("user", dedupe=False).apply(summarize)
    assert groups["bob"] == {"count": 5, "min": 1, "max": 2}, groups["bob"]
    assert groups["alice"] == {"count": 6, "min": 3, "max": 5}
    assert groups["eve"] == {"count": 2, "min": 6, "max": 7}

    # deduplicate: repeated values within a user collapse to one record
    groups = ds.collect_set("user", dedupe=True).apply(summarize)
    assert groups["bob"] == {"count": 2, "min": 1, "max": 2}
    assert groups["alice"] == {"count": 3, "min": 3, "max": 5}
    assert groups["eve"] == {"count": 2, "min": 6, "max": 7}
def test_group_by():
    """
    Collect records by user and check group sizes and simple aggregations.

    The fixture holds two records for "bob" (values 1, 2), three for "alice"
    (3, 4, 5) and two for "eve" (6, 7). The user names must be real, distinct
    values: the assertions below look the groups up by name.
    """
    data = [
        {"user": "bob", "value": 1},
        {"user": "bob", "value": 2},
        {"user": "alice", "value": 3},
        {"user": "alice", "value": 4},
        {"user": "alice", "value": 5},
        {"user": "eve", "value": 6},
        {"user": "eve", "value": 7},
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    groups = ds.collect_set("user")

    # the right number of groups
    assert len(groups) == 3

    # the groups have the right number of records
    assert groups.count("bob") == 2
    assert groups.count("alice") == 3
    assert groups.count("eve") == 2

    # the aggregations work
    assert groups.aggregate("value", max).get("bob") == 2
    assert groups.aggregate("value", min).get("alice") == 3
    assert groups.aggregate("value", sum).get("eve") == 13
class SubCollection:
    """
    A group of records held in an in-memory DictSet.

    Supports selector-style access (`collection["field"]` or
    `collection["a", "b"]`), `len()` and a None-returning `get()`.
    """

    __slots__ = ["values"]

    def __init__(self, values):
        """
        Parameters:
            values: iterable of records (or a falsy value)
                The records in this collection; a falsy value is stored as an
                empty DictSet.
        """
        self.values = DictSet(values or [], storage_class=STORAGE_CLASS.MEMORY)

    def __getitem__(self, item):
        """
        Selector access to a value in a collection, support arrays

        A tuple selects several fields at once and returns a list built from
        `DictSet.select`; any other selector is passed to
        `DictSet.collect_list`.
        """
        if isinstance(item, tuple):
            return list(self.values.select(*item))
        return self.values.collect_list(item)

    def __len__(self):
        """Number of records, as reported by the backing DictSet's `count()`."""
        return self.values.count()

    def __repr__(self):
        return f"SubCollection of {len(self)} items"

    def get(self, item):
        """
        Like `self[item]`, but returns None when the lookup produced nothing.

        Returns:
            The selected values, or None if the lookup returned None or an
            empty result.
        """
        values = self[item]
        # test for None first: len(None) raises TypeError, which previously
        # made the None guard unreachable
        if values is None or len(values) == 0:
            return None
        return values
def test_gappy_set():
    """
    Group-by averaging over data with gaps (None) in both the grouping
    column and the averaged column.

    The "one" group contains keys 1 and None and is expected to average to
    1.0; the record whose "value" is None is expected to form its own group.
    """
    rows = [
        (1, "one", 2),
        (2, "two", 3),
        (None, "one", 4),
        (4, "two", 5),
        (4, None, 5),
    ]
    data = [
        {"key": key, "value": value, "plus1": plus1}
        for key, value, plus1 in rows
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)

    averages = list(ds.group_by("value").average("key"))

    # compared as a list, so the order of the groups is part of the expectation
    expected = [
        {"AVG(key)": 4.0, "value": None},
        {"AVG(key)": 3.0, "value": "two"},
        {"AVG(key)": 1.0, "value": "one"},
    ]
    assert averages == expected, averages
def __init__(self, dictset: DictSet, column: str, dedupe: bool = False):
    """
    Collection functionality for Iterables of Dictionaries

    Builds a mapping from each distinct value of `column` to the list of
    records carrying that value (with `column` itself removed from each
    record) and stores it on `self._collections`.

    Parameters:
        dictset: Iterable of dictionaries
            The dataset to perform the Collection on
        column: string
            The name of the field to collect by; records without this field
            are collected under the key None
        dedupe: bool (optional)
            Remove duplicate values from the collections

    Raises:
        TypeError: when `dedupe` is set and a record holds an unhashable
            value, because records are compared via a frozenset of their
            items.

    Warning:
        The 'Collection' object holds the entire dataset in memory so is
        unsuitable for large datasets.
    """
    collections: dict = {}

    # when de-duplicating, start from the distinct records of the dataset
    groups = dictset.distinct() if dedupe else dictset

    for item in groups:
        # records exposing as_dict() are converted through it, anything else
        # is shallow-copied, so popping the key works on our own object
        my_item = item.as_dict() if hasattr(item, "as_dict") else item.copy()
        key = my_item.pop(column, None)
        collections.setdefault(key, []).append(my_item)

    if dedupe:
        # with the key column removed, records within a group may have become
        # identical; keep one per distinct set of items. Materialise to a list
        # so both code paths store the same container type.
        collections = {
            key: list({frozenset(member.items()): member for member in members}.values())
            for key, members in collections.items()
        }

    self._collections = collections
def do_read():
    """
    Workload driver: repeatedly group a 130,000-record in-memory dataset by
    "user" and take the minimum "value" of each group.

    The project root is put on `sys.path` (two levels above `sys.path[0]`)
    before the library is imported, so the function can be run from a script
    inside the source tree.
    """
    import os
    import sys

    sys.path.insert(1, os.path.join(sys.path[0], "../.."))

    from mabel.data.internals.group_by import GroupBy
    from mabel.data.internals.dictset import STORAGE_CLASS, DictSet

    # thirteen base records, repeated 10,000 times
    values = (1, 2, 1, 2, 1, 3, 4, 3, 4, 5, 5, 6, 7)
    data = [{"user": "******", "value": value} for value in values] * 10000

    for _ in range(50):
        ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
        # list() forces the aggregation to run to completion; the result
        # itself is not needed
        list(GroupBy(ds, "user").min("value"))
def __init__(self, values):
    """
    Wrap the given records in an in-memory DictSet stored on `self.values`.

    Parameters:
        values: iterable of records (or a falsy value)
            A falsy argument (None, empty list, ...) is replaced by an
            empty list.
    """
    records = values if values else []
    self.values = DictSet(records, storage_class=STORAGE_CLASS.MEMORY)
def test_combined_group_by():
    """
    Test grouping on a compound key ("fname", "sname") with each of the
    supported aggregations.

    Each check builds a fresh DictSet and GroupBy. Results are compared as
    lists, so the order of the groups is part of every expectation.
    """
    rows = [
        ("bob", "smith", 1, 1),
        ("bob", "jones", 2, 4),
        ("bob", "smith", 1, 3),
        ("bob", "jones", 2, 2),
        ("bob", "smith", 1, 1),
        ("alice", "jones", 3, 4),
        ("alice", "smith", 4, 3),
        ("alice", "jones", 3, 2),
        ("alice", "smith", 4, 1),
        ("alice", "jones", 5, 4),
        ("alice", "smith", 5, 3),
        ("eve", "jones", 6, 2),
        ("eve", "smith", 7, 1),
    ]
    data = [
        {"fname": fname, "sname": sname, "value": value, "cost": cost}
        for fname, sname, value, cost in rows
    ]

    def grouped():
        # a new DictSet and GroupBy for every check
        ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
        return GroupBy(ds, ("fname", "sname"))

    # fmt:off

    # the distinct groups
    gs = list(grouped().groups())
    expected = [
        {"fname": "bob", "sname": "smith"},
        {"fname": "bob", "sname": "jones"},
        {"fname": "alice", "sname": "jones"},
        {"fname": "alice", "sname": "smith"},
        {"fname": "eve", "sname": "jones"},
        {"fname": "eve", "sname": "smith"},
    ]
    assert gs == expected, gs

    # record counts per group
    gs = list(grouped().count())
    expected = [
        {"COUNT(*)": 2, "fname": "bob", "sname": "jones"},
        {"COUNT(*)": 3, "fname": "bob", "sname": "smith"},
        {"COUNT(*)": 3, "fname": "alice", "sname": "smith"},
        {"COUNT(*)": 1, "fname": "eve", "sname": "smith"},
        {"COUNT(*)": 1, "fname": "eve", "sname": "jones"},
        {"COUNT(*)": 3, "fname": "alice", "sname": "jones"},
    ]
    assert gs == expected, gs

    # average of a single column: value
    gs = list(grouped().average("value"))
    expected = [
        {"AVG(value)": Decimal("2.0"), "fname": "bob", "sname": "jones"},
        {"AVG(value)": Decimal("1.0"), "fname": "bob", "sname": "smith"},
        {"AVG(value)": Decimal("4.333333333333333333333333333"), "fname": "alice", "sname": "smith"},
        {"AVG(value)": Decimal("7.0"), "fname": "eve", "sname": "smith"},
        {"AVG(value)": Decimal("6.0"), "fname": "eve", "sname": "jones"},
        {"AVG(value)": Decimal("3.666666666666666666666666667"), "fname": "alice", "sname": "jones"},
    ]
    assert gs == expected, gs

    # average of a single column: cost
    gs = list(grouped().average("cost"))
    expected = [
        {"AVG(cost)": Decimal("3.0"), "fname": "bob", "sname": "jones"},
        {"AVG(cost)": Decimal("1.666666666666666666666666667"), "fname": "bob", "sname": "smith"},
        {"AVG(cost)": Decimal("2.333333333333333333333333333"), "fname": "alice", "sname": "smith"},
        {"AVG(cost)": Decimal("1.0"), "fname": "eve", "sname": "smith"},
        {"AVG(cost)": Decimal("2.0"), "fname": "eve", "sname": "jones"},
        {"AVG(cost)": Decimal("3.333333333333333333333333333"), "fname": "alice", "sname": "jones"},
    ]
    assert gs == expected, gs

    # average of two columns at once
    gs = list(grouped().average(("cost", "value",)))
    expected = [
        {"AVG(cost)": Decimal("3"), "AVG(value)": Decimal("2"), "fname": "bob", "sname": "jones"},
        {"AVG(cost)": Decimal("1.666666666666666666666666667"), "AVG(value)": Decimal("1"), "fname": "bob", "sname": "smith"},
        {"AVG(cost)": Decimal("2.333333333333333333333333333"), "AVG(value)": Decimal("4.333333333333333333333333333"), "fname": "alice", "sname": "smith"},
        {"AVG(cost)": 1.0, "AVG(value)": 7.0, "fname": "eve", "sname": "smith"},
        {"AVG(cost)": 2.0, "AVG(value)": 6.0, "fname": "eve", "sname": "jones"},
        {"AVG(cost)": Decimal("3.333333333333333333333333333"), "AVG(value)": Decimal("3.666666666666666666666666667"), "fname": "alice", "sname": "jones"},
    ]
    assert gs == expected

    # different aggregations on different columns
    gs = list(grouped().aggregate([("MAX", "value"), ("MIN", "cost")]))
    expected = [
        {"MAX(value)": 2, "MIN(cost)": 2, "fname": "bob", "sname": "jones"},
        {"MAX(value)": 1, "MIN(cost)": 1, "fname": "bob", "sname": "smith"},
        {"MAX(value)": 5, "MIN(cost)": 1, "fname": "alice", "sname": "smith"},
        {"MAX(value)": 7, "MIN(cost)": 1, "fname": "eve", "sname": "smith"},
        {"MAX(value)": 6, "MIN(cost)": 2, "fname": "eve", "sname": "jones"},
        {"MAX(value)": 5, "MIN(cost)": 2, "fname": "alice", "sname": "jones"},
    ]
    assert gs == expected, gs
def test_group_by():
    """
    Test simple grouping and aggregation

    The fixture holds five records for "bob" (values 1, 2, 1, 2, 1), six for
    "alice" (3, 4, 3, 4, 5, 5) and two for "eve" (6, 7). The user names must be
    real, distinct values, otherwise every record falls into a single group.

    The aggregate expectations are compared as lists, so they also pin the
    order in which GroupBy yields its groups (alice, bob, eve). The plain
    `groups()` listing is compared without regard to order.
    """
    data = [
        {"user": "bob", "value": 1},
        {"user": "bob", "value": 2},
        {"user": "bob", "value": 1},
        {"user": "bob", "value": 2},
        {"user": "bob", "value": 1},
        {"user": "alice", "value": 3},
        {"user": "alice", "value": 4},
        {"user": "alice", "value": 3},
        {"user": "alice", "value": 4},
        {"user": "alice", "value": 5},
        {"user": "alice", "value": 5},
        {"user": "eve", "value": 6},
        {"user": "eve", "value": 7},
    ]

    # fmt:off

    # the distinct groups; order is deliberately not asserted here
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    found = sorted(GroupBy(ds, "user").groups(), key=lambda group: group["user"])
    assert found == [{'user': 'alice'}, {'user': 'bob'}, {'user': 'eve'}], found

    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    assert list(GroupBy(ds, "user").count()) == [
        {'COUNT(*)': 6, 'user': 'alice'},
        {'COUNT(*)': 5, 'user': 'bob'},
        {'COUNT(*)': 2, 'user': 'eve'},
    ]

    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gb = list(GroupBy(ds, "user").average("value"))
    assert gb == [
        {'AVG(value)': Decimal("4.0"), 'user': 'alice'},
        {'AVG(value)': Decimal("1.4"), 'user': 'bob'},
        {'AVG(value)': Decimal("6.5"), 'user': 'eve'},
    ], gb

    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    assert list(GroupBy(ds, "user").max("value")) == [
        {'MAX(value)': 5, 'user': 'alice'},
        {'MAX(value)': 2, 'user': 'bob'},
        {'MAX(value)': 7, 'user': 'eve'},
    ]

    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    assert list(GroupBy(ds, "user").min("value")) == [
        {'MIN(value)': 3, 'user': 'alice'},
        {'MIN(value)': 1, 'user': 'bob'},
        {'MIN(value)': 6, 'user': 'eve'},
    ]