Пример #1
0
def test_frame_record():
    """Frame.records() yields per-row dicts under each dtype-mapping mode."""
    schema = Schema(timestamp="timestamp*",
                    date="date",
                    float_val="float",
                    int_val="int")
    frm = Frame(schema, {
        "timestamp": [1589455901, 1589455902, 1589455903, 1589455904],
        "date": [1, 2, 3, 4],
        "float_val": [1, 2, 3, 4],
        "int_val": [1, 2, 3, 4],
    })

    # One entry per mapping mode: (map_dtype, expected first, expected last)
    cases = [
        ("default",
         {"timestamp": datetime(2020, 5, 14, 11, 31, 41),
          "date": date(1970, 1, 2), "float_val": 1.0, "int_val": 1},
         {"timestamp": datetime(2020, 5, 14, 11, 31, 44),
          "date": date(1970, 1, 5), "float_val": 4.0, "int_val": 4}),
        (None,
         {"timestamp": datetime64("2020-05-14T11:31:41"),
          "date": datetime64("1970-01-02"), "float_val": 1.0, "int_val": 1},
         {"timestamp": datetime64("2020-05-14T11:31:44"),
          "date": datetime64("1970-01-05"), "float_val": 4.0, "int_val": 4}),
        ("epoch",
         {"timestamp": 1589455901, "date": 86400,
          "float_val": 1.0, "int_val": 1},
         {"timestamp": 1589455904, "date": 345600,
          "float_val": 4.0, "int_val": 4}),
    ]
    for map_dtype, first, last in cases:
        records = list(frm.records(map_dtype=map_dtype))
        assert len(records) == len(frm)
        assert records[0] == first
        assert records[-1] == last
Пример #2
0
def test_df_conversion():
    """A pandas DataFrame survives a round-trip through a Frame."""
    df = DataFrame({"category": NAMES, "value": VALUES})
    # Convert to lakota frame and back to df
    frm = Frame(base_schema, df)
    round_trip = frm.df()
    for col in frm:
        assert all(round_trip[col] == df[col])
Пример #3
0
def test_spill_write(series, how):
    """Write a frame spilling past the existing data, then read it back
    with every combination of bounds and `closed` flags."""
    if how == "left":
        ts = [1589455902, 1589455903, 1589455904, 1589455905]
        vals = [22, 33, 44, 55]
    else:
        ts = [1589455903, 1589455904, 1589455905, 1589455906]
        vals = [33, 44, 55, 66]

    frm = Frame(schema, {"timestamp": ts, "value": vals})
    series.write(frm)

    lo, hi = min(ts), max(ts)

    def check(expected, cases):
        # Each case is a (start, stop, closed) triple
        for start, stop, closed in cases:
            assert series.frame(start=start, stop=stop, closed=closed) == expected

    # Full reads
    check(frm, [
        (None, None, "b"),       # closed is both (default)
        (lo, hi, "b"),
        (None, hi, "b"),
        (lo, None, "b"),
        (lo - 1, hi, "r"),       # open on left
        (lo, hi + 1, "l"),       # open on right
        (lo - 1, hi + 1, "n"),   # full open
    ])

    # Partial reads over the two middle rows
    expected = Frame(schema, {
        "timestamp": [1589455903, 1589455904],
        "value": [33, 44],
    })
    check(expected, [
        (1589455903, 1589455904, "b"),  # closed is both (default)
        (1589455902, 1589455904, "r"),  # open on left
        (1589455903, 1589455905, "l"),  # open on right
        (1589455902, 1589455905, "n"),  # open on both
    ])
Пример #4
0
def test_alias():
    """The (as ...) operator attaches a name to an evaluated array."""
    result = AST.parse("(as (asarray (list 1 2 3)) 'new_name')").eval()
    assert all(result.value == asarray([1, 2, 3]))
    assert result.name == "new_name"

    # Aliasing also works inside Frame.reduce
    aliased = Frame(schema, values).reduce("(as self.timestamp 'ts')")
    assert all(aliased["ts"] == asarray(values["timestamp"], "M"))
Пример #5
0
def test_concat(frm):
    """Frame.concat merges frames and keeps every column sorted."""
    doubled = Frame.concat(frm, frm)
    for name in frm:
        original = list(frm[name])
        assert list(doubled[name]) == sorted(original + original)

    # Degenerate cases: a single frame, and no frame at all
    assert Frame.concat(frm) == frm
    assert Frame.concat() is None
Пример #6
0
def test_mask():
    """Frame.mask filters rows with a boolean array or an s-expression."""
    # with an array
    schema = Schema(x="int*")
    frm = Frame(schema, {"x": [1, 2, 3, 4, 5, 5, 5, 6]})
    frm2 = frm.mask(array([True, False] * 4))
    assert all(frm2["x"] == [1, 3, 5, 5])

    # with an expression (keep even values). Fixed: the original
    # expression string was missing its closing parenthesis.
    frm2 = frm.mask("(= (% self.x 2) 0)")
    assert all(frm2["x"] == [2, 4, 6])
Пример #7
0
def test_reduce_agg():
    """Every aggregate produces the expected values when reducing,
    both grouped by category and binned by day."""
    schema = Schema(timestamp="timestamp*", category="str*", value="int")
    frm = Frame(schema, {
        "timestamp": [1589455901, 1589455901, 1589455902, 1589455902],
        "category": list("abab"),
        "value": [1, 2, 3, 4],
    })

    # Expected "value" column per aggregate, grouped by category
    by_category = {
        "min": [1, 2], "max": [3, 4], "sum": [4, 6],
        "mean": [2, 3], "average": [2, 3],
        "first": [1, 2], "last": [3, 4],
        "count": [2, 2], "len": [2, 2],
    }
    # Expected "value" column per aggregate, binned to a single day
    by_day = {
        "min": [1], "max": [4], "sum": [10],
        "mean": [2.5], "average": [2.5],
        "first": [1], "last": [4],
        "count": [4], "len": [4],
    }

    for op in AST.aggregates:
        if op == "quantile":
            # quantile not avail with binning
            continue
        if op not in by_category:
            raise ValueError(f'op "{op}" not tested')
        agg = f"({op} self.value)"
        grouped = frm.reduce(category="category", value=agg)
        assert list(grouped["value"]) == by_category[op]
        binned = frm.reduce(timestamp='(floor self.timestamp "D")', value=agg)
        assert list(binned["value"]) == by_day[op]
Пример #8
0
def test_getitem():
    """Frame indexing supports both slices and boolean masks."""
    frm = Frame(Schema(x="int*"), {"x": [1, 2, 3, 4, 5, 5, 5, 6]})

    # Slicing keeps the tail rows
    tail = frm[5:]
    assert all(tail["x"] == [5, 5, 5, 6])

    # A boolean mask keeps every other row
    masked = frm[array([True, False] * 4)]
    assert all(masked["x"] == [1, 3, 5, 5])
Пример #9
0
def test_with_frame():
    """Expressions can reference a frame bound in the eval environment."""
    env = {"frm": Frame(schema, values), "floor": floor}

    # Floor timestamps down to the year
    res = AST.parse("(floor frm.timestamp 'Y')").eval(env)
    assert all(res == asarray(["2020", "2020", "2020"], dtype="datetime64[Y]"))

    # Floor timestamps down to the hour
    res = AST.parse("(floor frm.timestamp 'h')").eval(env)
    expect = asarray(["2020-01-01T11", "2020-01-02T12", "2020-01-03T13"],
                     dtype="datetime64")
    assert all(res == expect)
Пример #10
0
def test_adjacent_write(series, how):
    """Single-row writes adjacent to existing data merge on either side."""
    if how == "left":
        stamps, weights = [1589455901, 1589455902], [1.1, 2.2]
    else:
        stamps, weights = [1589455906, 1589455907], [6.6, 7.7]

    # Two writes of one-row frames (should trigger more corner cases)
    for stamp, weight in zip(stamps, weights):
        series.write(Frame(schema, {"timestamp": [stamp], "value": [weight]}))

    # Full read
    frm_copy = series.frame()
    if how == "left":
        assert all(
            frm_copy["timestamp"]
            == [1589455901, 1589455902, 1589455903, 1589455904, 1589455905]
        )
        assert all(frm_copy["value"] == [1.1, 2.2, 3.3, 4.4, 5.5])
    else:
        assert all(
            frm_copy["timestamp"]
            == [1589455903, 1589455904, 1589455905, 1589455906, 1589455907]
        )
        assert all(frm_copy["value"] == [3.3, 4.4, 5.5, 6.6, 7.7])

    # Slice read - left slice
    frm_copy = series[1589455902:1589455903].frame(closed="b")
    if how == "left":
        assert all(frm_copy["timestamp"] == [1589455902, 1589455903])
        assert all(frm_copy["value"] == [2.2, 3.3])
    else:
        assert all(frm_copy["timestamp"] == [1589455903])
        assert all(frm_copy["value"] == [3.3])

    # Slice read - right slice
    frm_copy = series[1589455905:1589455906].frame(closed="b")
    if how == "left":
        assert all(frm_copy["timestamp"] == [1589455905])
        assert all(frm_copy["value"] == [5.5])
    else:
        assert all(frm_copy["timestamp"] == [1589455905, 1589455906])
        assert all(frm_copy["value"] == [5.5, 6.6])
Пример #11
0
def test_reduce_without_agg():
    """Frame.reduce without aggregates maps expressions over columns."""
    schema = Schema(timestamp="timestamp*", category="str*", value="int")
    frm = Frame(schema, {
        "timestamp": [1589455901, 1589455901, 1589455902, 1589455902],
        "category": list("abab"),
        "value": [1, 2, 3, 4],
    })

    # Identity mapping leaves the frame untouched
    identity = frm.reduce(timestamp="timestamp",
                          category="category",
                          value="value")
    assert identity == frm

    # Mapping on one column
    assert list(frm.reduce(value="(% self.value 2)")["value"]) == [1, 0, 1, 0]

    # Mapping over two columns
    combined = frm.reduce(new_col="(+ self.value self.timestamp)")
    assert all(combined["new_col"] == frm["timestamp"] + frm["value"])
Пример #12
0
def test_sort():
    """Frame.sorted orders rows by the index column(s) and updates
    is_sorted accordingly.

    Fixed idiom: `assert x == False` / `assert x == True` replaced with
    `assert not x` / `assert x` (PEP 8 / flake8 E712).
    """
    # One index column
    category = ["b", "a", "c"]
    value = [2, 1, 3]
    frm = Frame(
        base_schema,
        {
            "category": category,
            "value": value,
        },
    )
    assert not frm.is_sorted()

    frm = frm.sorted()
    assert all(frm["category"] == sorted(category))
    assert all(frm["value"] == sorted(value))
    assert frm.is_sorted()

    # multi-index
    timestamp = ["2020-01-02", "2020-01-03", "2020-01-02"]
    frm = Frame(
        multi_idx_schema,
        {
            "timestamp": timestamp,
            "category": category,
            "value": value,
        },
    )
    assert not frm.is_sorted()

    # Sort the expected index columns in lockstep
    timestamp, category = zip(*sorted(zip(timestamp, category)))
    frm = frm.sorted()
    assert all(frm["timestamp"] == asarray(timestamp, "M"))
    assert all(frm["category"] == category)
    assert all(frm["value"] == [2, 3, 1])
    assert frm.is_sorted()
Пример #13
0
def insert(args):
    """Write one year of per-minute random values into a series.

    *args* is a (pod token, series label, year) triple; returns the
    number of rows written.
    """
    token, label, year = args
    repo = Repo(pod=POD.from_token(token))
    series = repo / "my_collection" / label
    # One timestamp per minute over the whole year (end excluded)
    ts = date_range(f"{year}-01-01",
                    f"{year + 1}-01-01",
                    freq="1min",
                    closed="left")
    df = DataFrame({
        "timestamp": ts,
        "value": numpy.round(numpy.random.random(len(ts)) * 1000, decimals=0),
    })
    sgm = Frame(schema, df)
    series.write(sgm)
    return len(sgm)
Пример #14
0
def test_short_cover(series, how):
    """A short frame overwrites only the rows it covers."""
    if how == "left":
        stamps, weights = [1589455904, 1589455905], [44, 55]
    else:
        stamps, weights = [1589455903, 1589455904], [33, 44]

    series.write(Frame(schema, {"timestamp": stamps, "value": weights}))

    frm_copy = series.frame()
    assert all(frm_copy["timestamp"] == [1589455903, 1589455904, 1589455905])
    if how == "left":
        # Only the first pre-existing value survives
        assert all(frm_copy["value"] == [3.3, 44, 55])
    else:
        # Only the last pre-existing value survives
        assert all(frm_copy["value"] == [33, 44, 5.5])
Пример #15
0
def test_fragmented_write(series, direction, sgm_size):
    """Data written in overlapping segments, in any order, reads back whole."""
    ts = [1589455901, 1589455902, 1589455903, 1589455904, 1589455905, 1589455906]
    vals = [11, 22, 33, 44, 55, 66]

    # Choose the order of segment start positions
    positions = list(range(len(ts)))
    if direction == "bwd":
        positions.reverse()
    elif direction != "fwd":
        shuffle(positions)

    for pos in positions:
        chunk = Frame(
            schema,
            {
                "timestamp": ts[pos : pos + sgm_size],
                "value": vals[pos : pos + sgm_size],
            },
        )
        series.write(chunk)

    frm = series.frame()
    assert all(frm["timestamp"] == ts)
    assert all(frm["value"] == vals)
Пример #16
0
def frm(frame_values):
    """Fixture: a Frame over the base schema built from the fixture values."""
    return Frame(base_schema, frame_values)
Пример #17
0
def test_pull(threaded, large):
    """Pull/push a collection between repos, covering small and large
    payloads, pre-existing series, conflicting data and schema mismatch."""
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Create 10 series of size N
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # for small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False) /
        s_label)
    # NOTE(review): `lseries` rebuilt above is never asserted — the line
    # below re-checks `oseries` (already asserted earlier). Presumably it
    # should be `assert lseries.frame() == expected`; confirm intent.
    assert oseries.frame() == expected

    # Test with existing series with existing data: a pull must not
    # clobber data already present locally
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with another schema: pulling must fail
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label

    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
Пример #18
0
def test_index_slice():
    """index_slice honours the `closed` flag on both bounds."""
    frm = Frame(Schema(x="int*"), {"x": [1, 2, 3, 4, 5, 5, 5, 6]})

    def sliced(start, stop, closed):
        # Resolve index bounds, then slice and return the "x" column
        return frm.slice(*frm.index_slice(start, stop, closed=closed))["x"]

    # Both bounds included
    assert all(sliced([2], [4], "b") == [2, 3, 4])
    # Only the left bound included
    assert all(sliced([2], [4], "l") == [2, 3])
    # Only the right bound included
    assert all(sliced([2], [4], "r") == [3, 4])
    # Equal bounds with repeated values: every matching row is kept
    assert all(sliced([5], [5], "b") == [5, 5, 5])
    assert all(sliced([1], [1], "b") == [1])
    assert all(sliced([6], [6], "b") == [6])