示例#1
0
def test_string():
    """Round-trip a string column containing one ``None`` through the protocol.

    The interchange column's size, null count, dtype kind and null description
    are checked for the full frame and for a slice of it, both on the exported
    column and after converting back with ``_from_dataframe_to_vaex``.
    """

    def column_a(frame):
        # Build a fresh interchange column "A" for ``frame`` on every call.
        return frame.__dataframe__().get_column_by_name("A")

    original = vaex.from_dict({"A": ["a", None, "cdef", "", "g"]})
    exported = column_a(original)

    assert exported._col.tolist() == original.A.tolist()
    assert exported.size == 5
    assert exported.null_count == 1
    assert exported.dtype[0] == _DtypeKind.STRING
    assert exported.describe_null == (3, 0)

    # Same checks after converting the interchange object back to vaex.
    restored = _from_dataframe_to_vaex(original.__dataframe__())
    assert restored.A.tolist() == original.A.tolist()
    assert column_a(restored).null_count == 1
    assert column_a(restored).describe_null == (3, 0)
    assert column_a(restored).dtype[0] == _DtypeKind.STRING

    # Dropping the first row leaves four values, one of which is still None.
    tail = original[1:]
    exported = column_a(tail)
    assert exported.size == 4
    assert exported.null_count == 1
    assert exported.dtype[0] == _DtypeKind.STRING
    assert exported.describe_null == (3, 0)

    restored = _from_dataframe_to_vaex(tail.__dataframe__())
    assert restored.A.tolist() == tail.A.tolist()
    assert column_a(restored).null_count == 1
    assert column_a(restored).describe_null == (3, 0)
    assert column_a(restored).dtype[0] == _DtypeKind.STRING
示例#2
0
def test_mixed_missing(df_factory_arrow):
    """Round-trip bool, int and float columns that each contain ``None``.

    The frame comes from the ``df_factory_arrow`` fixture. After the round
    trip every column must keep its values and dtype, must not be masked, and
    must report the expected number of nulls.
    """
    source = df_factory_arrow(
        x=np.array([True, None, False, None, True]),
        y=np.array([None, 2, 0, 1, 2]),
        z=np.array([9.2, 10.5, None, 11.8, None]),
    )

    restored = _from_dataframe_to_vaex(source.__dataframe__())

    assert source.__dataframe__().metadata == restored.__dataframe__().metadata

    # (column name, number of None entries placed in that column above)
    expected_nulls = (("x", 2), ("y", 1), ("z", 2))
    for name, nulls in expected_nulls:
        assert source[name].tolist() == restored[name].tolist()
        assert not restored[name].is_masked
        assert restored.__dataframe__().get_column_by_name(name).null_count == nulls
        assert source[name].dtype == restored[name].dtype

    assert_dataframe_equal(source.__dataframe__(), source)
示例#3
0
def test_mixed_intfloatbool(df_factory):
    """Round-trip bool, int and float columns without missing values.

    Besides comparing values and null counts, this exercises a few
    ``_VaexColumn`` attributes directly (``_allow_copy``, ``size``,
    ``offset``, ``dtype``) and checks that asking a non-categorical column
    for ``describe_categorical`` raises ``TypeError``.
    """
    df = df_factory(x=np.array([True, True, False]),
                    y=np.array([1, 2, 0]),
                    z=np.array([9.2, 10.5, 11.8]))
    df2 = _from_dataframe_to_vaex(df.__dataframe__())

    assert df2.x.tolist() == df.x.tolist()
    assert df2.y.tolist() == df.y.tolist()
    assert df2.z.tolist() == df.z.tolist()
    assert df2.__dataframe__().get_column_by_name("x").null_count == 0
    assert df2.__dataframe__().get_column_by_name("y").null_count == 0
    assert df2.__dataframe__().get_column_by_name("z").null_count == 0

    # Additional tests for _VaexColumn
    assert df2.__dataframe__().get_column_by_name("x")._allow_copy is True
    assert df2.__dataframe__().get_column_by_name("x").size == 3
    assert df2.__dataframe__().get_column_by_name("x").offset == 0

    # dtype[0] is the kind code; 2 is FLOAT in the interchange protocol's
    # DtypeKind enum.
    assert df2.__dataframe__().get_column_by_name("z").dtype[0] == 2
    # dtype[1] is the bit width of the element type.
    assert df2.__dataframe__().get_column_by_name("z").dtype[1] == 64
    assert df2.__dataframe__().get_column_by_name("z").dtype == (
        2, 64, "<f8", "=")

    # Read the property with a plain statement rather than ``assert``:
    # asserts are stripped under ``python -O``, which would skip the access
    # and make ``pytest.raises`` fail with "did not raise".
    with pytest.raises(TypeError):
        _ = df2.__dataframe__().get_column_by_name("y").describe_categorical

    # The expected null description depends on the column's backing storage.
    if df2['y'].dtype.is_arrow:
        assert df2.__dataframe__().get_column_by_name("y").describe_null == (
            3, 0)
    else:
        assert df2.__dataframe__().get_column_by_name("y").describe_null == (
            0, None)

    assert_dataframe_equal(df.__dataframe__(), df)
示例#4
0
def test_virtual_column():
    """A virtual column keeps its values after the protocol round trip."""
    frame = vaex.from_arrays(
        x=np.array([True, True, False]),
        y=np.array([1, 2, 0]),
        z=np.array([9.2, 10.5, 11.8]),
    )
    # "r" is defined by an expression over y and z.
    frame.add_virtual_column("r", "sqrt(y**2 + z**2)")

    restored = _from_dataframe_to_vaex(frame.__dataframe__())

    assert restored.r.tolist() == frame.r.tolist()
示例#5
0
def test_arrow_dictionary():
    """Round-trip an arrow dictionary-encoded column without nulls.

    The exported column must be reported as categorical with the dictionary
    entries as its categories, and the round trip must preserve the values.
    """
    codes = pa.array([0, 1, 0, 1, 2, 0, 1, 2])
    labels = pa.array(["foo", "bar", "baz"])
    source = vaex.from_arrays(x=pa.DictionaryArray.from_arrays(codes, labels))

    # Some detailed testing for correctness of dtype and null handling:
    exported = source.__dataframe__().get_column_by_name("x")
    assert exported.dtype[0] == _DtypeKind.CATEGORICAL

    expected_categories = {0: "foo", 1: "bar", 2: "baz"}
    assert exported.describe_categorical == (False, True, expected_categories)

    # The expected null description depends on the column's backing storage.
    expected_null = (3, 0) if source['x'].dtype.is_arrow else (0, None)
    assert exported.describe_null == expected_null
    assert exported.dtype == (23, 64, "u", "=")

    restored = _from_dataframe_to_vaex(source.__dataframe__())
    assert restored.x.tolist() == source.x.tolist()
    assert restored.__dataframe__().get_column_by_name("x").null_count == 0

    assert_dataframe_equal(source.__dataframe__(), source)
示例#6
0
def test_missing_from_masked(df_factory_numpy):
    """Round-trip int, float and bool columns built from numpy masked arrays.

    The frame comes from the ``df_factory_numpy`` fixture. After the round
    trip every column must keep its values and dtype, must not be masked, and
    must report one null per masked entry.
    """
    source = df_factory_numpy(
        x=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int),
        y=np.ma.array(
            [1.5, 2.5, 3.5, 4.5, 0],
            mask=[False, True, True, True, False],
            dtype=float,
        ),
        z=np.ma.array(
            [True, False, True, True, True],
            mask=[1, 0, 0, 1, 0],
            dtype=bool,
        ),
    )

    restored = _from_dataframe_to_vaex(source.__dataframe__())

    assert source.__dataframe__().metadata == restored.__dataframe__().metadata

    # (column name, number of masked entries in that column above)
    expected_nulls = (("x", 2), ("y", 3), ("z", 2))
    for name, nulls in expected_nulls:
        assert source[name].tolist() == restored[name].tolist()
        assert not restored[name].is_masked
        assert restored.__dataframe__().get_column_by_name(name).null_count == nulls
        assert source[name].dtype == restored[name].dtype

    assert_dataframe_equal(source.__dataframe__(), source)
示例#7
0
def test_mixed_intfloat(df_factory):
    """Round-trip an int column and a float column without missing values."""
    source = df_factory(x=[1, 2, 0], y=[9.2, 10.5, 11.8])
    restored = _from_dataframe_to_vaex(source.__dataframe__())

    # Values first, for both columns ...
    for name in ("x", "y"):
        assert getattr(restored, name).tolist() == getattr(source, name).tolist()

    # ... then the null counts reported by the re-exported columns.
    for name in ("x", "y"):
        assert restored.__dataframe__().get_column_by_name(name).null_count == 0

    assert_dataframe_equal(source.__dataframe__(), source)
示例#8
0
def test_categorical():
    """Round-trip two columns marked categorical via ``categorize``.

    ``year`` is categorized by a min/max range and ``weekday`` by explicit
    labels; both must be exported as categorical columns with the expected
    code-to-category mapping, and converting back must yield the category
    values.
    """
    weekday_names = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

    frame = vaex.from_arrays(year=[2012, 2013, 2015, 2019], weekday=[0, 1, 4, 6])
    frame = frame.categorize("year", min_value=2012, max_value=2019)
    frame = frame.categorize("weekday", labels=list(weekday_names))

    # Some detailed testing for correctness of dtype and null handling:
    year_col = frame.__dataframe__().get_column_by_name("year")
    assert year_col.dtype[0] == _DtypeKind.CATEGORICAL
    # Codes 0..7 map onto every year from 2012 through 2019.
    year_categories = dict(enumerate(range(2012, 2020)))
    assert year_col.describe_categorical == (False, True, year_categories)
    assert year_col.describe_null == (0, None)
    assert year_col.dtype == (23, 64, "u", "=")

    weekday_col = frame.__dataframe__().get_column_by_name("weekday")
    assert weekday_col.dtype[0] == _DtypeKind.CATEGORICAL
    # Codes 0..6 map onto the labels in the order they were given.
    weekday_categories = dict(enumerate(weekday_names))
    assert weekday_col.describe_categorical == (False, True, weekday_categories)
    assert weekday_col.describe_null == (0, None)
    assert weekday_col.dtype == (23, 64, "u", "=")

    restored = _from_dataframe_to_vaex(frame.__dataframe__())
    assert restored["year"].tolist() == [2012, 2013, 2015, 2019]
    assert restored["weekday"].tolist() == ["Mon", "Tue", "Fri", "Sun"]

    assert_dataframe_equal(frame.__dataframe__(), frame)
示例#9
0
def test_arrow_dictionary_missing():
    """Round-trip an arrow dictionary-encoded column with masked indices.

    Two of the five indices are masked out; the round trip must preserve the
    values, report two nulls and keep the dictionary's index type.
    """
    null_mask = np.array([0, 1, 1, 0, 0], dtype=bool)
    codes = pa.array([0, 1, 2, 0, 1], mask=null_mask)
    labels = pa.array(["aap", "noot", "mies"])
    source = vaex.from_arrays(x=pa.DictionaryArray.from_arrays(codes, labels))

    # Some detailed testing for correctness of dtype and null handling:
    exported = source.__dataframe__().get_column_by_name("x")
    assert exported.dtype[0] == _DtypeKind.CATEGORICAL
    expected_categories = {0: "aap", 1: "noot", 2: "mies"}
    assert exported.describe_categorical == (False, True, expected_categories)

    restored = _from_dataframe_to_vaex(source.__dataframe__())
    assert restored.x.tolist() == source.x.tolist()
    assert restored.__dataframe__().get_column_by_name("x").null_count == 2
    assert source["x"].dtype.index_type == restored["x"].dtype.index_type

    assert_dataframe_equal(source.__dataframe__(), source)
示例#10
0
def test_no_mem_copy():
    """Check that the round trip shares memory with the source buffers.

    A numpy column and a hand-assembled ``large_utf8`` arrow array are
    converted through the protocol; the underlying buffers are then mutated
    in place and the change must be visible in the converted frame.
    """
    expected = ["a", "", "cdef", "", "g"]

    # Raw buffers for the string array above: the UTF-8 bytes of "acdefg" and
    # the int64 offsets delimiting each of the five strings.
    char_bytes = np.array([97, 99, 100, 101, 102, 103], dtype='uint8')
    offsets = np.array([0, 1, 1, 5, 5, 6], dtype='int64')
    # No validity buffer is supplied (first entry is None).
    arrow_buffers = [None, pa.py_buffer(offsets), pa.py_buffer(char_bytes)]
    s = pa.Array.from_buffers(pa.large_utf8(), len(expected), arrow_buffers)

    x = np.arange(0, 5)
    source = vaex.from_arrays(x=x, s=s)
    restored = _from_dataframe_to_vaex(source.__dataframe__())

    # primitive data: a write to the numpy array shows up in the result
    x[0] = 999
    assert restored.x.tolist() == [999, 1, 2, 3, 4]

    # strings
    assert source.s.tolist() == expected
    assert restored.s.tolist() == expected

    # Mutate the underlying byte buffer in place (something neither arrow nor
    # vaex actually supports or wants): "a" (97) becomes "b" (98).
    expected[0] = "b"
    char_bytes[0] += 1
    assert source.s.tolist() == expected
    assert restored.s.tolist() == expected