示例#1
0
def test_simple_equals_expressions():
    """Single-predicate filters: equality, boolean, null-test and LIKE."""
    dataset = DictSet(TEST_DATA, storage_class=STORAGE_CLASS.MEMORY)

    # (filter expression, expected number of matching records)
    cases = [
        ("name == 'James Potter'", 2),
        ("age == 10", 1),
        ("alive == true", 3),
        ("affiliations is none", 4),
        ("name like '%Potter%'", 4),
    ]
    for expression, expected_count in cases:
        assert dataset.filter(expression).count() == expected_count, expression
示例#2
0
def test_items():
    """get_items(0, 2) returns the records at the requested indices (0 and 2)."""
    data = [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 2, "value": "two", "plus1": 3},
        {"key": 3, "value": "three", "plus1": 4},
        {"key": 4, "value": "four", "plus1": 5},
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.DISK)
    # fix: the comprehension already builds a list - the extra list() call
    # was redundant (flake8-comprehensions C411)
    items = [i.as_dict() for i in ds.get_items(0, 2)]
    assert items == [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 3, "value": "three", "plus1": 4},
    ], items
示例#3
0
def test_simple_compound_expressions():
    """AND / OR combinations of predicates in filter expressions."""
    dataset = DictSet(TEST_DATA, storage_class=STORAGE_CLASS.MEMORY)

    conjunction = dataset.filter("name like '%Potter' and alive == true")
    disjunction = dataset.filter("name like '%Potter' or alive == true")
    assert conjunction.count() == 2
    assert disjunction.count() == 5
示例#4
0
def test_take():
    """take(n) yields the first n records; spendable sets must not over-consume."""
    data = [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 2, "value": "two", "plus1": 3},
        {"key": 3, "value": "three", "plus1": 4},
        {"key": 4, "value": "four", "plus1": 5},
    ]
    ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY)
    assert ds.take(1).collect_list().pop() == {
        "key": 1,
        "value": "one",
        "plus1": 2,
    }, ds.take(1).collect_list()
    assert ds.take(2).count() == 2, ds.take(2).count()

    # we had a problem where spendable dictsets were taking n+1 records,
    # but only returning n
    burnable = DictSet(
        (record for record in ds.collect_list()),
        storage_class=STORAGE_CLASS.NO_PERSISTANCE,
    )
    # consuming 3 of the 4 records must leave exactly one behind
    print(burnable.take(3))
    # fix: renamed ambiguous single-character local `l` (PEP 8 / E741)
    remaining = len(list(burnable.take(10)))
    assert remaining == 1, remaining
示例#5
0
def SqlReader(sql_statement: str, **kwargs):
    """
    Use basic SQL queries to filter Reader.

    Parameters:
        sql_statement: string
            The SQL statement to parse and evaluate.
        kwargs: parameters to pass to the Reader

    Note:
        `select` is taken from SQL SELECT
        `dataset` is taken from SQL FROM
        `filters` is taken from SQL WHERE
    """

    # some imports here to remove cyclic imports
    from mabel import DictSet, Reader

    sql = SqlParser(sql_statement)
    get_logger().info(repr(sql))

    # Normalize the projection: no SELECT expression means "*"; an explicit
    # projection is widened with ", *" so the raw columns stay available for
    # the later GROUP BY / HAVING / rename stages.
    actual_select = sql.select_expression
    if sql.select_expression is None:
        actual_select = "*"
    elif sql.select_expression != "*":
        actual_select = sql.select_expression + ", *"

    # NOTE(review): `reducer` is assigned but never referenced later in this
    # function - confirm whether this is dead code or consumed elsewhere.
    reducer = None
    if sql.select_expression == "COUNT(*)":
        reducer = lambda x: {"*": "*"}

    # FROM clause
    # WHERE clause
    if isinstance(sql.dataset, list):
        # it's a list if it's been parsed into a SQL statement,
        # this is how subqueries are interpreted - the parser
        # doesn't extract a dataset name - it collects parts of
        # a SQL statement which it can then pass to a SqlReader
        # to get back a dataset - which we then use as the
        # dataset for the outer query.
        reader = SqlReader("".join(sql.dataset), **kwargs)
    else:
        reader = Reader(
            select=actual_select,
            dataset=sql.dataset,
            filters=sql.where_expression,
            **kwargs,
        )

    # GROUP BY clause
    # Grouping is triggered either by an explicit GROUP BY or by any
    # aggregate function appearing in the SELECT token stream.
    if sql.group_by or any(
        [t["type"] == TOKENS.AGGREGATOR
         for t in sql.select_evaluator.tokens]  # type:ignore
    ):
        from ...internals.group_by import GroupBy

        # convert the clause into something we can pass to GroupBy
        if sql.group_by:
            groups = [
                group.strip() for group in sql.group_by.split(",")
                if group.strip() != ""
            ]
        else:
            groups = ["*"]  # we're not really grouping

        aggregations = []
        # NOTE(review): this `renames` list is shadowed by the `renames` dict
        # built before the SELECT clause below and is never read - the appends
        # here look like dead stores; confirm intent.
        renames = []
        for t in sql.select_evaluator.tokens:  # type:ignore
            if t["type"] == TOKENS.AGGREGATOR:
                # record (aggregate-function-name, column-it-applies-to)
                aggregations.append((t["value"], t["parameters"][0]["value"]))
                if t["as"]:
                    t["raw"] = get_function_name(t)
                    renames.append(t)
            elif t["type"] == TOKENS.VARIABLE and t["value"] not in groups:
                # plain columns in an aggregated SELECT must be group keys
                raise InvalidSqlError(
                    "Invalid SQL - SELECT clause in a statement with a GROUP BY clause must be made of aggregations or items from the GROUP BY clause."
                )

        if aggregations:
            grouped = GroupBy(reader, groups).aggregate(aggregations)
        else:
            grouped = GroupBy(reader, groups).groups()

        # there could be 250000 groups, so we're not going to load them into memory
        reader = DictSet(grouped)

    # HAVING clause
    # if we have a HAVING clause, filter the grouped data by it
    if sql.having:
        reader = reader.filter(sql.having)

    # SELECT clause
    # map the generated column names (e.g. "SUM(x)") to their AS aliases
    renames = {}  # type:ignore
    for t in sql.select_evaluator.tokens:  # type:ignore
        if t["as"]:
            renames[get_function_name(t)] = t["as"]

    def _perform_renames(row):
        # iterate over a snapshot of the items so the row can be mutated in place
        for k, v in [(k, v) for k, v in row.items()]:
            if k in renames:
                row[renames[k]] = row.pop(k, row.get(renames[k]))
        return row

    if renames:
        reader = DictSet(map(_perform_renames, reader))

    reader = reader.select(sql.select_evaluator.fields())  # type:ignore
    # distinct now we have only the columns we're interested in
    if sql.distinct:
        reader = reader.distinct()

    # ORDER BY clause
    if sql.order_by:
        take = 10000  # the Query UI is currently set to 2000
        if sql.limit:
            take = int(sql.limit)
        reader = DictSet(
            reader.sort_and_take(column=sql.order_by,
                                 take=take,
                                 descending=sql.order_descending))

    # LIMIT clause
    if sql.limit:
        reader = reader.take(sql.limit)

    return reader
示例#6
0
            yield from lines
        if carry_forward:
            yield carry_forward


def en(ls):
    """Lazily decode each JSONL line into a dictionary."""
    for line in ls:
        yield orjson.loads(line)


# Load the tweets fixture once; `lines` is reused by every timing pass below.
lines = list(en(read_file_clear("tests/data/formats/jsonl/tweets.jsonl")))
print(len(lines))
os.makedirs("_temp", exist_ok=True)

with Timer("MEMORY"):
    # build and fully iterate an in-memory DictSet, reporting the last index
    memory_set = DictSet(lines, storage_class=STORAGE_CLASS.MEMORY)
    for index, _record in enumerate(memory_set):
        pass
    print(index)

with Timer("DISK WRITE"):
    # building a DISK-backed DictSet persists the records as it is created
    disk_set = DictSet(lines, storage_class=STORAGE_CLASS.DISK)
    print(disk_set.count())

with Timer("DISK READ"):
    # re-read the set written above, reporting the last index
    for index, _record in enumerate(disk_set):
        pass
    print(index)

with Timer("COMBINED"):
    t = DictSet(lines, storage_class=STORAGE_CLASS.DISK)