示例#1
0
def test_group_by_advanced():
    """
    Collect records by "user" and summarize each group, with and without
    de-duplication of the collected records.
    """
    # NOTE(review): every record carries the same literal user value while the
    # assertions below look up "bob", "alice" and "eve" - confirm the fixture.
    values = (1, 2, 1, 2, 1, 3, 4, 3, 4, 5, 5, 6, 7)
    data = [{"user": "******", "value": value} for value in values]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)

    # keep repeated records in each group
    with_repeats = ds.collect_set("user", dedupe=False).apply(summarize)
    assert with_repeats["bob"] == {"count": 5, "min": 1, "max": 2}, with_repeats["bob"]
    assert with_repeats["alice"] == {"count": 6, "min": 3, "max": 5}
    assert with_repeats["eve"] == {"count": 2, "min": 6, "max": 7}

    # drop repeated records from each group
    without_repeats = ds.collect_set("user", dedupe=True).apply(summarize)
    assert without_repeats["bob"] == {"count": 2, "min": 1, "max": 2}
    assert without_repeats["alice"] == {"count": 3, "min": 3, "max": 5}
    assert without_repeats["eve"] == {"count": 2, "min": 6, "max": 7}
示例#2
0
def test_group_by():
    """
    Collect records by "user", then check the group count, the size of each
    group and a few aggregations over the "value" field.
    """
    # NOTE(review): every record carries the same literal user value while the
    # assertions below look up "bob", "alice" and "eve" - confirm the fixture.
    data = [{"user": "******", "value": value} for value in range(1, 8)]

    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    groups = ds.collect_set("user")

    # the right number of groups
    assert len(groups) == 3

    # the groups have the right number of records
    for user, expected_size in (("bob", 2), ("alice", 3), ("eve", 2)):
        assert groups.count(user) == expected_size

    # the aggregations work
    highest = groups.aggregate("value", max)
    assert highest.get("bob") == 2
    lowest = groups.aggregate("value", min)
    assert lowest.get("alice") == 3
    totals = groups.aggregate("value", sum)
    assert totals.get("eve") == 13
示例#3
0
class SubCollection:
    """
    Wrapper around the records of a single collection.

    The records are held in a DictSet created with STORAGE_CLASS.MEMORY and
    are exposed through selector-style access (`collection[field]`).
    """

    __slots__ = ["values"]

    def __init__(self, values):
        """
        Parameters:
            values: iterable of dictionaries
                The records of the collection; a falsy value (None, empty)
                is replaced with an empty list.
        """
        self.values = DictSet(values or [], storage_class=STORAGE_CLASS.MEMORY)

    def __getitem__(self, item):
        """
        Selector access to a value in a collection, support arrays

        A tuple selects several fields at once (the result of `select` is
        materialized as a list); any other selector is passed to
        `collect_list`.
        """
        if isinstance(item, tuple):
            return list(self.values.select(*item))
        return self.values.collect_list(item)

    def __len__(self):
        """Number of records, as reported by the underlying DictSet."""
        return self.values.count()

    def __repr__(self):
        return f"SubCollection of {len(self)} items"

    def get(self, item):
        """
        Like `collection[item]`, but returns None when nothing was collected.

        Returns:
            The collected values, or None if the lookup produced None or an
            empty result.
        """
        values = self[item]
        # None must be tested first: len(None) raises TypeError, which made
        # the None branch unreachable when it was checked after len().
        if values is None or len(values) == 0:
            return None
        return values
示例#4
0
def test_gappy_set():
    """
    Average "key" per "value" group when some records hold None in either
    the grouping field or the aggregated field.
    """
    data = [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 2, "value": "two", "plus1": 3},
        {"key": None, "value": "one", "plus1": 4},
        {"key": 4, "value": "two", "plus1": 5},
        {"key": 4, "value": None, "plus1": 5},
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    averages = list(ds.group_by("value").average("key"))

    expected = [
        {"AVG(key)": 4.0, "value": None},
        {"AVG(key)": 3.0, "value": "two"},
        {"AVG(key)": 1.0, "value": "one"},
    ]
    assert averages == expected, averages
示例#5
0
    def __init__(self, dictset: DictSet, column: str, dedupe: bool = False):
        """
        Collection functionality for Iterables of Dictionaries

        Groups the records of `dictset` by the value found in `column`. The
        column is removed from each stored record; records without the column
        are grouped under the key None.

        Parameters:
            dictset: Iterable of dictionaries
                The dataset to perform the Collection on
            column: string
                The name of the field to collect by
            dedupe: bool (optional)
                Remove duplicate values from the collections

        Warning:
            The 'Collection' object holds the entire dataset in memory so is
            unsuitable for large datasets.
        """
        collections: dict = {}

        # when de-duplicating, repeated records are dropped up front
        groups = dictset.distinct() if dedupe else dictset

        for item in groups:
            # work on a separate dict so popping the column leaves the source
            # record alone (assumes as_dict() returns a fresh dict - TODO confirm)
            if hasattr(item, "as_dict"):
                my_item = item.as_dict()
            else:
                my_item = item.copy()
            key = my_item.pop(column, None)
            collections.setdefault(key, []).append(my_item)

        if dedupe:
            # records can become equal once the grouping column is removed;
            # keying on their items keeps one of each. The result is
            # materialized as a list so groups have the same type whether or
            # not dedupe is set (previously a dict_values view was stored).
            # Requires the record values to be hashable.
            for key in collections:
                unique = {
                    frozenset(record.items()): record
                    for record in collections[key]
                }
                collections[key] = list(unique.values())
        self._collections = collections
示例#6
0
def do_read():
    """
    Repeatedly build a DictSet from a large fixture and run a grouped MIN
    over it (50 rounds over 13 records repeated 10000 times).
    """
    import os
    import sys

    # make the package two directories up importable before importing mabel
    sys.path.insert(1, os.path.join(sys.path[0], "../.."))
    from mabel.data.internals.group_by import GroupBy
    from mabel.data.internals.dictset import STORAGE_CLASS, DictSet

    values = (1, 2, 1, 2, 1, 3, 4, 3, 4, 5, 5, 6, 7)
    data = [{"user": "******", "value": value} for value in values] * 10000

    for _ in range(50):
        dataset = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
        # list() forces the grouped result to be fully evaluated
        minimums = list(GroupBy(dataset, "user").min("value"))
示例#7
0
 def __init__(self, values):
     """
     Store the supplied records in a DictSet created with
     STORAGE_CLASS.MEMORY; a falsy `values` is replaced with an empty list.
     """
     records = values if values else []
     self.values = DictSet(records, storage_class=STORAGE_CLASS.MEMORY)
示例#8
0
def test_combined_group_by():
    """
    Group on two columns ("fname", "sname") and check the groups, counts,
    averages and mixed aggregations against fixed expectations.
    """
    fields = ("fname", "sname", "value", "cost")
    rows = [
        ("bob", "smith", 1, 1),
        ("bob", "jones", 2, 4),
        ("bob", "smith", 1, 3),
        ("bob", "jones", 2, 2),
        ("bob", "smith", 1, 1),
        ("alice", "jones", 3, 4),
        ("alice", "smith", 4, 3),
        ("alice", "jones", 3, 2),
        ("alice", "smith", 4, 1),
        ("alice", "jones", 5, 4),
        ("alice", "smith", 5, 3),
        ("eve", "jones", 6, 2),
        ("eve", "smith", 7, 1),
    ]
    data = [dict(zip(fields, row)) for row in rows]
    group_keys = ("fname", "sname")

    # fmt:off
    # the distinct (fname, sname) combinations
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gs = list(GroupBy(ds, group_keys).groups())
    expected = [
        {'fname': 'bob', 'sname': 'smith'},
        {'fname': 'bob', 'sname': 'jones'},
        {'fname': 'alice', 'sname': 'jones'},
        {'fname': 'alice', 'sname': 'smith'},
        {'fname': 'eve', 'sname': 'jones'},
        {'fname': 'eve', 'sname': 'smith'},
    ]
    assert gs == expected, gs

    # record count per combination
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gs = list(GroupBy(ds, group_keys).count())
    expected = [
        {'COUNT(*)': 2, 'fname': 'bob', 'sname': 'jones'},
        {'COUNT(*)': 3, 'fname': 'bob', 'sname': 'smith'},
        {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'smith'},
        {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'smith'},
        {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'jones'},
        {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'jones'},
    ]
    assert gs == expected, gs

    # average of a single column: value
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gs = list(GroupBy(ds, group_keys).average('value'))
    expected = [
        {'AVG(value)': Decimal('2.0'), 'fname': 'bob', 'sname': 'jones'},
        {'AVG(value)': Decimal('1.0'), 'fname': 'bob', 'sname': 'smith'},
        {'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'},
        {'AVG(value)': Decimal('7.0'), 'fname': 'eve', 'sname': 'smith'},
        {'AVG(value)': Decimal('6.0'), 'fname': 'eve', 'sname': 'jones'},
        {'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'},
    ]
    assert gs == expected, gs

    # average of a single column: cost
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gs = list(GroupBy(ds, group_keys).average('cost'))
    expected = [
        {'AVG(cost)': Decimal('3.0'), 'fname': 'bob', 'sname': 'jones'},
        {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'fname': 'bob', 'sname': 'smith'},
        {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'},
        {'AVG(cost)': Decimal('1.0'), 'fname': 'eve', 'sname': 'smith'},
        {'AVG(cost)': Decimal('2.0'), 'fname': 'eve', 'sname': 'jones'},
        {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'fname': 'alice', 'sname': 'jones'},
    ]
    assert gs == expected, gs

    # averages of two columns in one pass
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    averaged_columns = ('cost', 'value')
    gs = list(GroupBy(ds, group_keys).average(averaged_columns))
    expected = [
        {'AVG(cost)': Decimal('3'), 'AVG(value)': Decimal('2'), 'fname': 'bob', 'sname': 'jones'},
        {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'AVG(value)': Decimal('1'), 'fname': 'bob', 'sname': 'smith'},
        {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'},
        {'AVG(cost)': 1.0, 'AVG(value)': 7.0, 'fname': 'eve', 'sname': 'smith'},
        {'AVG(cost)': 2.0, 'AVG(value)': 6.0, 'fname': 'eve', 'sname': 'jones'},
        {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'},
    ]
    assert gs == expected

    # different aggregations on different columns
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    requested = [('MAX', 'value'), ('MIN', 'cost')]
    gs = list(GroupBy(ds, group_keys).aggregate(requested))
    expected = [
        {'MAX(value)': 2, 'MIN(cost)': 2, 'fname': 'bob', 'sname': 'jones'},
        {'MAX(value)': 1, 'MIN(cost)': 1, 'fname': 'bob', 'sname': 'smith'},
        {'MAX(value)': 5, 'MIN(cost)': 1, 'fname': 'alice', 'sname': 'smith'},
        {'MAX(value)': 7, 'MIN(cost)': 1, 'fname': 'eve', 'sname': 'smith'},
        {'MAX(value)': 6, 'MIN(cost)': 2, 'fname': 'eve', 'sname': 'jones'},
        {'MAX(value)': 5, 'MIN(cost)': 2, 'fname': 'alice', 'sname': 'jones'},
    ]
    assert gs == expected, gs
示例#9
0
def test_group_by():
    """
    Group on the single "user" column and check the groups, counts, averages,
    maxima and minima against fixed expectations.
    """
    # NOTE(review): the user value is the same literal in every fixture record
    # and in every expected row - confirm this is the intended fixture.
    values = (1, 2, 1, 2, 1, 3, 4, 3, 4, 5, 5, 6, 7)
    data = [{"user": "******", "value": value} for value in values]

    # fmt:off
    # the distinct groups
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    expected = [
        {'user': '******'},
        {'user': '******'},
        {'user': '******'},
    ]
    assert list(GroupBy(ds, "user").groups()) == expected

    # record count per group
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    expected = [
        {'COUNT(*)': 6, 'user': '******'},
        {'COUNT(*)': 5, 'user': '******'},
        {'COUNT(*)': 2, 'user': '******'},
    ]
    assert list(GroupBy(ds, "user").count()) == expected

    # average per group
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    gb = list(GroupBy(ds, "user").average("value"))
    expected = [
        {'AVG(value)': Decimal("4.0"), 'user': '******'},
        {'AVG(value)': Decimal("1.4"), 'user': '******'},
        {'AVG(value)': Decimal("6.5"), 'user': '******'},
    ]
    assert gb == expected, gb

    # maximum per group
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    expected = [
        {'MAX(value)': 5, 'user': '******'},
        {'MAX(value)': 2, 'user': '******'},
        {'MAX(value)': 7, 'user': '******'},
    ]
    assert list(GroupBy(ds, "user").max("value")) == expected

    # minimum per group
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    expected = [
        {'MIN(value)': 3, 'user': '******'},
        {'MIN(value)': 1, 'user': '******'},
        {'MIN(value)': 6, 'user': '******'},
    ]
    assert list(GroupBy(ds, "user").min("value")) == expected