Пример #1
0
def test_count_cat():
    """Ordinal-encode a string column with an explicit value order and check
    that binned counts follow that order.

    The original had dead code: ``ds0`` was built twice (the first overwritten)
    and two ``ordinal_encode`` results were discarded; removed here.
    """
    ds0 = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'], names=['apple', 'apple', 'berry', 'apple'])

    # Explicit ordering: red=0, green=1, blue=2
    ds = ds0.ordinal_encode('colors', ['red', 'green', 'blue'])
    assert ds.count(binby=ds.colors).tolist() == [1, 2, 1]
    # A different value order permutes the per-bin counts accordingly.
    ds = ds0.ordinal_encode('colors', ['red', 'blue', 'green'], inplace=True)
    assert ds.count(binby=ds.colors).tolist() == [1, 1, 2]
Пример #2
0
def test_cat_missing_values():
    """A masked entry must land in the trailing edge bin when edges=True."""
    values = np.ma.array(
        ['red', 'green', 'blue', 'green', 'MISSING'],
        mask=[False, False, False, False, True])
    df = vaex.from_arrays(colors=values)
    encoded = df.ordinal_encode('colors', ['red', 'green', 'blue'])
    assert encoded.count(binby=encoded.colors, edges=True).tolist() == [1, 0, 1, 2, 1, 0]
Пример #3
0
def test_expr():
    """Counting on an expression must work on a strided (non-contiguous) column."""
    backing = np.zeros((10, 2)).reshape(20)
    strided = backing[::2]          # every other element — non-contiguous view
    strided[:] = np.arange(10)
    df = vaex.from_arrays(x=strided)
    counts = df.count('x*2', binby='x*2', limits=[-0.5, 19.5], shape=10)
    assert counts.tolist() == np.ones(10).tolist()
Пример #4
0
def test_groupby_count_string():
    """Group by a string column and count a string column."""
    groups = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2])
    labels = np.array([str(v) for v in [0, 0, 0, 0, 1, 1, 1, 1, 2, 2]])
    df = vaex.from_arrays(g=groups, s=labels)
    dfg = df.groupby('s').agg({'c': vaex.agg.count('s')})
    assert dfg.s.tolist() == ['0', '1', '2']
    assert dfg.c.tolist() == [4, 4, 2]
Пример #5
0
def test_big_endian_binning_non_contiguous():
    """Binning must handle big-endian dtypes on strided (non-contiguous) views."""
    x = np.arange(20, dtype='>f8')[::2]
    x[:] = np.arange(10, dtype='>f8')
    y = np.arange(20, dtype='>f8')[::2]
    y[:] = np.arange(10, dtype='>f8')
    df = vaex.from_arrays(x=x, y=y)
    grid = df.count(binby=[df.x, df.y], limits=[[-0.5, 9.5], [-0.5, 9.5]], shape=[10, 10])
    # x == y for every row, so all counts lie on the diagonal.
    assert np.diagonal(grid).tolist() == np.ones(10).tolist()
Пример #6
0
def test_count_1d():
    """Low-level scalar binner + count aggregation over a 1d float column."""
    data = np.array([-1, -2, 0.5, 1.5, 4.5, 5], dtype='f8')
    df = vaex.from_arrays(x=data)

    n_bins = 5
    scalar_binner = df._binner_scalar('x', [0, 5], n_bins)
    grid = vaex.superagg.Grid([scalar_binner])
    grid = df._agg(vaex.agg.count(), grid)
    # 8 entries for 5 bins: expected values match [null, underflow(-1,-2),
    # bins 0..4 (0.5, 1.5, -, -, 4.5), overflow(5)].
    assert grid.tolist() == [0, 2, 1, 1, 0, 0, 1, 1]
Пример #7
0
def test_count_1d_ordinal():
    """Low-level ordinal binner + count aggregation over a 1d int column."""
    data = np.array([-1, -2, 0, 1, 4, 5], dtype='i8')
    df = vaex.from_arrays(x=data)

    n_bins = 5
    ordinal_binner = df._binner_ordinal('x', n_bins)
    grid = vaex.superagg.Grid([ordinal_binner])
    grid = df._agg(vaex.agg.count(), grid)
    # Same edge layout as the scalar binner: values below 0 underflow,
    # 5 overflows the 5 ordinal bins.
    assert grid.tolist() == [0, 2, 1, 1, 0, 0, 1, 1]
Пример #8
0
def test_groupby_datetime_quarter():
    """Group one year of daily data by quarter."""
    dates = np.arange('2015-01-01', '2016-01-02', dtype=np.datetime64)
    values = np.arange(len(dates))
    df = vaex.from_arrays(t=dates, y=values)

    dfg = df.groupby(vaex.BinnerTime.per_quarter(df.t)).agg({'y': 'sum'})
    per_quarter = dfg.y.tolist()
    # 2015-01-01 .. 2016-01-02 touches five quarters; no value is lost.
    assert len(per_quarter) == 5
    assert sum(per_quarter) == sum(values)
Пример #9
0
def test_plain_strings():
    """Length, trimming and slicing of a plain object-dtype string column."""
    words = np.array(['a', 'bb', 'ccc', 'dddd'], dtype='object')
    n_rows = len(words)
    df = vaex.from_arrays(x=words)

    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4].tolist()
    assert len(df) == n_rows
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3].tolist()
Пример #10
0
def test_arrow_strings():
    """Same checks as the plain-string test, but via vaex.string_column."""
    words = ['a', 'bb', 'ccc', 'dddd']
    column = vaex.string_column(words)
    df = vaex.from_arrays(x=column)

    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]
    assert len(df) == len(words)
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3]
Пример #11
0
def test_cat_string():
    """ordinal_encode on strings: full vocabulary vs. an explicit subset."""
    source = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])

    # Without explicit values, every distinct string becomes a category.
    encoded = source.ordinal_encode('colors')
    assert encoded.is_category('colors')
    assert encoded.limits('colors', shape=128) == ([-0.5, 2.5], 3)

    # With an explicit subset, unlisted values ('blue') become missing.
    encoded = source.ordinal_encode('colors', values=['red', 'green'])
    assert encoded.is_category('colors')
    assert encoded.limits('colors', shape=128) == ([-0.5, 1.5], 2)
    assert encoded.data.colors.tolist() == [0, 1, None, 1]

    # Category metadata must survive a copy.
    assert encoded.copy().is_category(encoded.colors)
Пример #12
0
def test_vrange():
    """A virtual range column behaves like real data without materializing it."""
    n = 1000**3  # a billion rows — virtual, so no memory is allocated
    df = vaex.from_arrays(x=vaex.vrange(0, n))
    assert len(df.columns['x']) == n

    piece = df.columns['x'].trim(2, 4)
    assert piece.start == 2
    assert piece.stop == 4

    assert len(df) == n
    assert len(df[0:10]) == 10
    assert df[1:11].x.tolist() == np.arange(1, 11.).tolist()

    # Virtual columns computed from the range also evaluate lazily.
    df['y'] = df.x**2
    assert df[1:11].y.tolist() == (np.arange(1, 11)**2).tolist()
Пример #13
0
def test_groupby_count():
    """groupby().agg() and the one-shot groupby(by, agg) form must agree."""
    groups = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype='int32')
    labels = np.array([str(v) for v in [0, 0, 0, 0, 1, 1, 1, 1, 2, 2]])
    df = vaex.from_arrays(g=groups, s=labels)

    dfg = df.groupby('s').agg({'g': 'mean'}).sort('s')
    assert dfg.s.tolist() == ['0', '1', '2']
    assert dfg.g.tolist() == [0, 1, 2]

    dfg2 = df.groupby('s', {'g': 'mean'}).sort('s')
    assert dfg._equals(dfg2)
Пример #14
0
def test_groupby_datetime():
    """Weekly time binning: agg= keyword and chained .agg() must agree."""
    dates = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    values = np.arange(len(dates))
    df = vaex.from_arrays(t=dates, y=values)
    weekly_sums = [values[week * 7:(week + 1) * 7].sum() for week in range(5)]

    dfg = df.groupby(vaex.BinnerTime.per_week(df.t), agg={'y': 'sum'})
    assert dfg.y.tolist() == weekly_sums

    # Equivalent two-step syntax.
    dfg = df.groupby(vaex.BinnerTime.per_week(df.t)).agg({'y': 'sum'})
    assert dfg.y.tolist() == weekly_sums
Пример #15
0
def test_groupby_count():
    """Variant with a fractional group mean (group '2' mixes g=0 and g=1)."""
    groups = np.array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1], dtype='int32')
    labels = np.array([str(v) for v in [0, 0, 0, 0, 1, 1, 1, 1, 2, 2]])
    df = vaex.from_arrays(g=groups, s=labels)

    dfg = df.groupby('s').agg({'g': 'mean'}).sort('s')
    assert dfg.s.tolist() == ['0', '1', '2']
    assert dfg.g.tolist() == [0, 1, 0.5]

    # One-shot form must produce the identical frame.
    dfg2 = df.groupby('s', {'g': 'mean'}).sort('s')
    assert dfg._equals(dfg2)
Пример #16
0
 def dfs(alpha, delta, pm_a, pm_d, radians=radians):
     """Build two frames for the equatorial->galactic proper-motion transform.

     Returns ``(ds_1, ds_many)``: ``ds_1`` is a single-row frame with fixed
     uncertainties, transformed with ``propagate_uncertainties=True``;
     ``ds_many`` holds N samples drawn around the same point (matching the
     single row's sigmas), transformed in place without error propagation.
     Presumably ds_many is used as a Monte-Carlo check of the propagated
     errors — TODO confirm against the caller.
     """
     ds_1 = vaex.from_scalars(alpha=alpha, delta=delta, pm_a=pm_a, pm_d=pm_d, alpha_e=0.01, delta_e=0.02, pm_a_e=0.003, pm_d_e=0.004)
     ds_1 = ds_1.astro.pm_eq2gal("alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b", propagate_uncertainties=True, radians=radians)
     N = 100000
     # distance
     # Sample scatter matches the *_e uncertainties given to ds_1 above.
     alpha =        np.random.normal(0, 0.01, N)  + alpha
     delta =        np.random.normal(0, 0.02, N)  + delta
     pm_a =         np.random.normal(0, 0.003, N)  + pm_a
     pm_d =         np.random.normal(0, 0.004, N)  + pm_d
     ds_many = vaex.from_arrays(alpha=alpha, delta=delta, pm_a=pm_a, pm_d=pm_d)
     ds_many.astro.pm_eq2gal("alpha", "delta", "pm_a", "pm_d", "pm_l", "pm_b", radians=radians, inplace=True)
     return ds_1, ds_many
Пример #17
0
def test_format():
    """printf-style formatting of int, float and string expressions."""
    ints = np.array([1, 2, 3], dtype=np.int32)
    floats = np.array([1.1, 2.2, 3.3], dtype=np.float32)
    words = ['Here', 'we', 'go']
    df = vaex.from_arrays(num1=ints, num2=floats, text=words)

    assert df.num1.format("%d").tolist() == ['1', '2', '3']
    assert df.num1.format("%04d").tolist() == ['0001', '0002', '0003']
    assert df.num2.format('%f').tolist() == ['1.100000', '2.200000', '3.300000']
    assert df.num2.format('%05.2f').tolist() == ['01.10', '02.20', '03.30']
    assert df.text.format('pre-%s-post').tolist() == ['pre-%s-post' % w for w in words]
Пример #18
0
def test_vrange():
    """Duplicate of the vrange test: virtual range column acts like real data."""
    total = 1000**3
    df = vaex.from_arrays(x=vaex.vrange(0, total))
    assert len(df.columns['x']) == total

    window = df.columns['x'].trim(2, 4)
    assert window.start == 2
    assert window.stop == 4

    assert len(df) == total
    assert len(df[0:10]) == 10
    assert df[1:11].x.tolist() == np.arange(1, 11.).tolist()

    df['y'] = df.x**2
    assert df[1:11].y.tolist() == (np.arange(1, 11)**2).tolist()
Пример #19
0
def test_countna():
    """NA counting (countna / countnan / countmissing) vs pandas isna."""
    # x: object array mixing numbers, empty strings (valid values), None,
    # and two explicit np.nan entries.
    x = np.array([5, '', 1, 4, None, 6, np.nan, np.nan, 10, '', 0, 0, -13.5])
    y_data = np.array([np.nan, 2, None, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
    y_mask = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1])
    y = np.ma.MaskedArray(data=y_data, mask=y_mask)
    df = vaex.from_arrays(x=x, y=y)
    pandas_df = df.to_pandas_df()

    # countna must agree with pandas' notion of NA for both columns.
    assert df.x.countna() == pandas_df.x.isna().sum()
    assert df.y.countna() == pandas_df.y.isna().sum()
    assert df.x.countnan() == 2  # the two explicit np.nan values in x
    # 5 masked entries plus, presumably, the unmasked None in y_data — TODO confirm
    assert df.y.countmissing() == 6
Пример #20
0
def test_export_json():
    """Round-trip a DataFrame through write_json_or_yaml and plain json.

    Fix: the temp file was only removed on the success path; a failing
    ``json.load`` would leak ``./temp.json`` and break reruns. Cleanup now
    happens in a ``finally`` block.
    """
    temp_path = './temp.json'
    ds = vaex.from_arrays(**{'A': [1, 2, 3], 'B': ['1', '2', '3']})
    vaex.utils.write_json_or_yaml(temp_path, ds.to_dict(array_type='python'))

    try:
        with open(temp_path, 'r') as f:
            data = json.load(f)
    finally:
        os.remove(temp_path)

    assert 'A' in data
    assert len(data['A']) == 3
    assert data['B'][0] == '1'
Пример #21
0
def test_merge_same_aggregation_tasks():
    """Identical aggregations over the same binners must share one task."""
    df = vaex.from_arrays(x=[1, 2], y=[2, 3])
    binners = df._create_binners('x', [0.5, 2.5], 2)
    # NOTE(review): binners2 is unused — presumably kept to show that
    # creating identical binners twice is harmless; confirm or remove.
    binners2 = df._create_binners('x', [0.5, 2.5], 2)
    assert len(binners) == 1
    # these two aggregations should be merged into 1 subtask
    [task1], result1 = vaex.agg.count().add_tasks(df, binners)
    [task2], result2 = vaex.agg.count().add_tasks(df, binners)
    assert len(df.executor.tasks) == 1
    df.execute()
    # Merging means the *same* task object is handed back for both requests.
    assert task1 is task2
    assert np.all(result1.get() == result2.get())
Пример #22
0
def test_groupby_datetime():
    """Duplicate weekly-binning test: both groupby syntaxes give equal sums."""
    dates = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    values = np.arange(len(dates))
    df = vaex.from_arrays(t=dates, y=values)
    expected = [values[i * 7:(i + 1) * 7].sum() for i in range(5)]

    result = df.groupby(vaex.BinnerTime.per_week(df.t), agg={'y': 'sum'})
    assert result.y.tolist() == expected

    # other syntax
    result = df.groupby(vaex.BinnerTime.per_week(df.t)).agg({'y': 'sum'})
    assert result.y.tolist() == expected
Пример #23
0
def test_agg_filtered_df_invalid_data():
    """A user function applied on a filtered frame must never see filtered rows."""
    def custom_func(x):
        # The row with x == 4 is filtered out and must not reach us.
        assert 4 not in x
        return x**2

    df = vaex.from_arrays(x=np.arange(10))
    filtered = df[df.x != 4]
    filtered.add_function('custom_function', custom_func)
    filtered['y'] = filtered.func.custom_function(filtered.x)
    # 10 rows minus the one filtered out.
    assert filtered.count(filtered.y) == 9
Пример #24
0
def test_cat_string():
    """Duplicate categorical-string test: full vocabulary vs explicit subset."""
    base = vaex.from_arrays(colors=['red', 'green', 'blue', 'green'])

    # Auto-detected categories: three distinct colors.
    result = base.ordinal_encode('colors')
    assert result.is_category('colors')
    assert result.limits('colors', shape=128) == ([-0.5, 2.5], 3)

    # Restricting the vocabulary turns 'blue' into a missing value.
    result = base.ordinal_encode('colors', values=['red', 'green'])
    assert result.is_category('colors')
    assert result.limits('colors', shape=128) == ([-0.5, 1.5], 2)
    assert result.data.colors.tolist() == [0, 1, None, 1]

    assert result.copy().is_category(result.colors)
Пример #25
0
def test_export_json():
    """Round-trip a DataFrame through write_json_or_yaml (default array type).

    Fix: cleanup of the temp file now runs in ``finally`` so a failing
    ``json.load`` cannot leak ``./temp.json`` and poison reruns.
    """
    temp_path = './temp.json'
    ds = vaex.from_arrays(**{'A': [1, 2, 3], 'B': ['1', '2', '3']})
    vaex.utils.write_json_or_yaml(temp_path, ds.to_dict())

    try:
        with open(temp_path, 'r') as f:
            data = json.load(f)
    finally:
        os.remove(temp_path)

    assert 'A' in data
    assert len(data['A']) == 3
    assert data['B'][0] == '1'
Пример #26
0
def test_cyclical_transformer(tmpdir):
    """CycleTransformer: fit/transform plus state round-trip to a new frame."""
    df_train = vaex.from_arrays(hour=[0, 3, 6])
    df_test = vaex.from_arrays(hour=[12, 24, 21, 15])

    # n=24: a full day maps onto the unit circle; both x/y columns share
    # the same prefix here.
    trans = vaex.ml.CycleTransformer(n=24,
                                     features=['hour'],
                                     prefix_x='pref_',
                                     prefix_y='pref_')
    df_train = trans.fit_transform(df_train)
    # cos/sin of 0, 3, 6 hours (0, pi/4, pi/2 radians).
    np.testing.assert_array_almost_equal(df_train.pref_hour_x.values,
                                         [1, 0.707107, 0])
    np.testing.assert_array_almost_equal(df_train.pref_hour_y.values,
                                         [0, 0.707107, 1])

    # The transformation must survive a state write/load onto a fresh frame.
    state_path = str(tmpdir.join('state.json'))
    df_train.state_write(state_path)
    df_test.state_load(state_path)
    np.testing.assert_array_almost_equal(df_test.pref_hour_x.values,
                                         [-1, 1, 0.707107, -0.707107])
    np.testing.assert_array_almost_equal(df_test.pref_hour_y.values,
                                         [0, 0, -0.707107, -0.707107])
Пример #27
0
def test_count_basics(df):
    """Binned count/sum, with and without an active selection, in 1/2/3 dims.

    ``df`` is a fixture; the first section assumes x covers [0, 10) with one
    row per unit bin — presumably guaranteed by the fixture, TODO confirm.
    """
    # df = df_l
    y = df.y.to_numpy()
    x = df.x.to_numpy()
    counts = df.count(binby=df.x, limits=[0,10], shape=10)
    assert len(counts) == 10
    assert all(counts == 1), "counts is %r" % counts

    sums = df["y"].sum(binby=df.x, limits=[0,10], shape=10)
    assert len(sums) == 10
    assert(all(sums == y))

    # Activate a selection; binned stats with selection=True must only
    # see rows where x < 5.
    df.select("x < 5")
    mask = x < 5

    counts = df["x"].count(binby=df.x, limits=[0,10], shape=10, selection=True)
    mod_counts = counts * 1.
    mod_counts[~mask] = 0
    assert(all(counts == mod_counts))

    mod_sums = y * 1.
    mod_sums[~mask] = 0
    sums = df["y"].sum(binby=df.x, limits=[0,10], shape=10, selection=True)
    assert(all(sums == mod_sums))

    # TODO: we may want to test this for a remote df
    # 2d
    x = np.array([0, 1, 0, 1])
    y = np.array([0, 0, 1, 1])
    df = vaex.from_arrays(x=x, y=y)
    counts = df.count(binby=[df.x, df.y], limits=[[0.,2.], [0.,2.]], shape=2)
    assert np.all(counts == 1)

    # 3d
    x = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    y = np.array([0, 0, 1, 1, 0, 0, 1, 1])
    z = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    df = vaex.from_arrays(x=x, y=y, z=z)
    counts = df.count(binby=[df.x, df.y, df.z], limits=[[0.,2.], [0.,2.], [0.,2.]], shape=2)
    assert np.all(counts == 1)
Пример #28
0
async def test_auto_execute():
    """Delayed aggregations awaited inside auto_execute() resolve without
    an explicit df.execute() call."""
    df = vaex.from_arrays(x=[2, 4])

    async def means():
        # Two delayed tasks gathered together, then a third awaited alone;
        # the auto-execute context must trigger execution for both awaits.
        count, sum = await asyncio.gather(df.x.count(delay=True),
                                          df.x.sum(delay=True))
        mean = await df.x.mean(delay=True)
        return sum / count, mean

    async with df.executor.auto_execute():
        mean1, mean2 = await means()
        # mean of [2, 4] both ways
        assert mean1 == 3
        assert mean2 == 3
Пример #29
0
def test_nunique():
    """agg.nunique on strings and floats, with dropmissing / dropnan."""
    s = ['aap', 'aap', 'noot', 'mies', None, 'mies', 'kees', 'mies', 'aap']
    x = [0,     0,     0,      0,      0,     1,      1,     1,      2]
    df = vaex.from_arrays(x=x, s=s)
    # Group 0 has {'aap', 'noot', 'mies', None} -> 4 uniques (None counts).
    dfg = df.groupby(df.x, agg={'nunique': vaex.agg.nunique(df.s)}).sort(df.x)
    items = list(zip(dfg.x.values, dfg.nunique.values))
    assert items == [(0, 4), (1, 2), (2, 1)]

    # dropmissing=True excludes the None from group 0.
    dfg = df.groupby(df.x, agg={'nunique': vaex.agg.nunique(df.s, dropmissing=True)}).sort(df.x)
    items = list(zip(dfg.x.values, dfg.nunique.values))
    assert items == [(0, 3), (1, 2), (2, 1)]

    # Same data as floats, with None mapped to NaN.
    mapping = {'aap': 1.2, 'noot': 2.5, 'mies': 3.7, 'kees': 4.8, None: np.nan}
    s = np.array([mapping[k] for k in s], dtype=np.float64)
    df = vaex.from_arrays(x=x, s=s)
    dfg = df.groupby(df.x, agg={'nunique': vaex.agg.nunique(df.s)}).sort(df.x)
    items = list(zip(dfg.x.values, dfg.nunique.values))
    assert items == [(0, 4), (1, 2), (2, 1)]

    # dropnan=True is the float analogue of dropmissing.
    dfg = df.groupby(df.x, agg={'nunique': vaex.agg.nunique(df.s, dropnan=True)}).sort(df.x)
    items = list(zip(dfg.x.values, dfg.nunique.values))
    assert items == [(0, 3), (1, 2), (2, 1)]
Пример #30
0
def test_weight_of_evidence_encoder_edge_cases():
    """WoE encoding where some categories are all-positive or all-negative."""
    y = [1, 0, 1, 0, 1, 0, 0, 0, 1, 1]
    x = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
    df = vaex.from_arrays(x=x, y=y)

    woe_encoder = vaex.ml.WeightOfEvidenceEncoder(features=['x'], target='y', unseen='zero')
    df = woe_encoder.fit_transform(df)

    # 'c' (all zeros) and 'd' (all ones) hit the clipped extremes
    # (+-13.8155); 'a' and 'b' are ln(2) and -ln(2) respectively.
    expected_values = [0.69314, 0.69314, 0.69314, -0.69314, -0.69314,
                      -0.69314, -13.81550, -13.81550, 13.81551, 13.81551]
    np.testing.assert_array_almost_equal(df.woe_encoded_x.tolist(),
                                         expected_values,
                                         decimal=5)
Пример #31
0
def test_delay_ordinal(binby):
    """Two delayed ordinal aggregations must complete in a single data pass.

    ``binby`` is a (parametrized) flag choosing between the binby and
    groupby code paths.
    """
    df = vaex.from_arrays(x=[1, 2, 2, 3, 3, 3], s=["aap", "aap", "aap", "noot", "noot", "mies"])
    df.ordinal_encode("x", inplace=True)
    df.ordinal_encode("s", inplace=True)
    df.executor.passes = 0
    # The results are intentionally kept in locals: they hold the delayed
    # tasks alive until df.execute() runs them.
    if binby:
        ar1 = df.binby('x', agg='count', delay=True)
        ar2 = df.binby('s', agg='count', delay=True)
    else:
        df1 = df.groupby('x', agg='count', delay=True)
        df2 = df.groupby('s', agg='count', delay=True)
    df.execute()
    assert df.executor.passes == 1
Пример #32
0
def test_is_missing():
    """ismissing/notmissing across float, masked, string and object columns."""
    strings = vaex.string_column(["aap", None, "noot", "mies"])
    objects = ["aap", None, "noot", np.nan]
    floats = np.arange(4, dtype=np.float64)
    floats[2] = floats[3] = np.nan
    masked = np.ma.array(floats, mask=[0, 1, 0, 1])
    df = vaex.from_arrays(x=floats, m=masked, s=strings, o=objects)

    # NaN in a plain float column is not "missing"; masked/None entries are.
    assert df.x.ismissing().tolist() == [False, False, False, False]
    assert df.m.ismissing().tolist() == [False, True, False, True]
    assert df.s.ismissing().tolist() == [False, True, False, False]
    assert df.o.ismissing().tolist() == [False, True, False, False]

    # notmissing is the element-wise complement of ismissing.
    assert df.m.notmissing().tolist() == [True, False, True, False]
Пример #33
0
def test_df_selection_references_virtual_column():
    """Frames whose virtual column 'z' differs (x+y vs x-y) must produce
    different fingerprints, even with identical selections and dependencies.

    Fix: the original built df2's column from ``df1.x - df1.y`` (copy-paste);
    df2 now references its own columns. Presumably equivalent at runtime since
    only the expression appears to matter — TODO confirm against vaex docs.
    """
    x = np.arange(10, dtype='i4')
    y = x**2
    df = vaex.from_arrays(x=x, y=y)
    df1 = df.copy()
    df1['z'] = df1.x + df1.y
    df2 = df.copy()
    df2['z'] = df2.x - df2.y
    df1f = df1[df1.z < 5]
    df2f = df2[df2.z < 5]
    fp1 = df1f.fingerprint(dependencies=['x', 'y'])
    fp2 = df2f.fingerprint(dependencies=['x', 'y'])
    assert fp1 != fp2
Пример #34
0
def webserver():
    """Yield-style fixture: a threaded vaex webserver serving one frame 'df'.

    Serves on ``test_port`` with both a plain and a trusted token; the
    server is stopped again after the consuming test finishes.
    """
    webserver = vaex.webserver.WebServer(datasets=[],
                                         port=test_port,
                                         cache_byte_size=0,
                                         token='token',
                                         token_trusted='token_trusted')
    x = np.arange(10)
    df = vaex.from_arrays(x=x)
    df.name = 'df'  # the name clients use to address this dataset
    webserver.set_datasets([df])
    webserver.serve_threaded()
    yield webserver
    webserver.stop_serving()
Пример #35
0
def test_label_encoder():
    """LabelEncoder: fit on train, transform test, handle unseen categories.

    Fix: the final assertion checked ``mypref_x`` for rows with ``y == 4``;
    it clearly intends to verify the *y* encoding (``mypref_y``).
    """
    # Create sample data
    x1 = np.array(['dog', 'cat', 'mouse', 'mouse', 'dog', 'dog', 'dog', 'cat', 'cat', 'mouse', 'dog'])
    x2 = np.array(['dog', 'dog', 'cat', 'cat', 'mouse'])
    x3 = np.array(['mouse', 'dragon', 'dog', 'dragon'])  # unseen value 'dragon'
    y1 = np.array([1, 2, 2, 2, 3, 1, 2, 3, 5, 5, 1])
    y2 = np.array([3, 3, 1, 3, 2])
    y3 = np.array([3, 2, 1, 4])  # unseen value 4

    # Create
    df_train = vaex.from_arrays(x=x1, y=y1)
    df_test = vaex.from_arrays(x=x2, y=y2)
    df_unseen = vaex.from_arrays(x=x3, y=y3)

    # Label Encode with vaex.ml
    label_encoder = df_train.ml.label_encoder(features=['x', 'y'], prefix='mypref_')

    # Assertions: make sure that the categories are correctly identified
    assert set(list(label_encoder.labels_['x'].keys())) == set(np.unique(x1))
    assert set(list(label_encoder.labels_['y'].keys())) == set(np.unique(y1))

    # Transform
    df_train = label_encoder.transform(df_train)
    df_test = label_encoder.transform(df_test)

    # Check correctness by "manually" applying the fitted labels
    assert df_test.x.apply(lambda elem: label_encoder.labels_['x'][elem]).tolist() == df_test.mypref_x.tolist()
    assert df_test.y.apply(lambda elem: label_encoder.labels_['y'][elem]).tolist() == df_test.mypref_y.tolist()

    # Unseen categories must raise by default
    with pytest.raises(ValueError):
        label_encoder.transform(df_unseen)

    # With allow_unseen=True, unseen categories encode to -1
    label_encoder = df_train.ml.label_encoder(features=['x', 'y'], prefix='mypref_', allow_unseen=True)
    df_unseen = label_encoder.transform(df_unseen)
    assert set(df_unseen[df_unseen.x == 'dragon'].mypref_x.tolist()) == {-1}
    # BUG FIX: was mypref_x; the unseen *y* value must be checked in mypref_y.
    assert set(df_unseen[df_unseen.y == 4].mypref_y.tolist()) == {-1}
Пример #36
0
def test_agg_binary():
    """Arithmetic on aggregation objects: repr and grouped evaluation."""
    values = np.arange(5)
    df = vaex.from_arrays(x=values, y=values + 1, g=values // 4)
    # groups: g==0 -> sum(x)=6, sum(y)=10; g==1 -> sum(x)=4, sum(y)=5

    ratio = vaex.agg.sum('x') / vaex.agg.sum('y')
    assert repr(ratio) == "(vaex.agg.sum('x') / vaex.agg.sum('y'))"
    assert df.groupby('g', agg={'total': ratio})['total'].tolist() == [6 / 10, 4 / 5]

    plus_const = vaex.agg.sum('x') + 99
    assert repr(plus_const) == "(vaex.agg.sum('x') + 99)"
    assert df.groupby('g', agg={'total': plus_const})['total'].tolist() == [6 + 99, 4 + 99]

    const_plus = 99 + vaex.agg.sum('y')
    assert repr(const_plus) == "(99 + vaex.agg.sum('y'))"
    assert df.groupby('g', agg={'total': const_plus})['total'].tolist() == [99 + 10, 99 + 5]

    assert df.groupby('g', agg={'total': vaex.agg.sum('x') / 2})['total'].tolist() == [6 / 2, 4 / 2]
    assert df.groupby('g', agg={'total': 2 / vaex.agg.sum('x')})['total'].tolist() == [2 / 6, 2 / 4]
Пример #37
0
def test_passes_mixed_filtering():
    """Delayed sums on a filtered and an unfiltered frame share one pass."""
    x = np.arange(10)
    df = vaex.from_arrays(x=x, y=x**2)
    filtered = df[df.x < 4]
    unfiltered = df

    executor = df.executor
    executor.passes = 0
    sum_filtered = filtered.sum('x', delay=True)
    sum_all = unfiltered.sum('x', delay=True)
    df.execute()

    # Both delayed aggregations must run inside a single pass over the data.
    assert executor.passes == 1
    assert sum_filtered.get() == 1 + 2 + 3
    assert sum_all.get() == 45
Пример #38
0
def test_vconstant(value):
    """A virtual constant column: slicing, length and filtering behavior."""
    length = 100
    constant = vaex.vconstant(value=value, length=length)
    df = vaex.from_arrays(x=constant, y=vaex.vrange(0, length))

    assert len(df.columns['x']) == length
    assert df.x[:3].tolist() == [value, value, value]
    assert len(constant[:]) == 100
    assert len(constant[10:]) == 90
    assert len(constant[:20]) == 20

    # Filtering on another column must keep the constant column consistent.
    filtered = df[df.y < 31]
    assert len(filtered) == 31
    assert filtered.x[:3].tolist() == [value, value, value]
Пример #39
0
def test_agg_selections_equal():
    """Grouped count with an equality selection alongside a plain count."""
    df = vaex.from_arrays(
        x=np.array([0, 0, 0, 1, 1, 2, 2]),
        y=np.array([1, 3, 5, 1, 7, 1, -1]),
        z=np.array([0, 2, 3, 4, 5, 6, 7]),
        w=np.array(['dog', 'cat', 'mouse', 'dog', 'dog', 'mouse', 'cat']))

    grouped = df.groupby(df.x).agg({
        'counts': vaex.agg.count(),
        'sel_counts': vaex.agg.count(selection=df.y == 1.),
    })
    # Every x-group contains exactly one row with y == 1.
    assert grouped['counts'].tolist() == [3, 2, 2]
    assert grouped['sel_counts'].tolist() == [1, 1, 1]
Пример #40
0
def test_map_basics():
    """Expression.map with dicts: strings, numbers, NaN keys, defaults,
    missing-key errors and allow_missing."""
    # Generate the test data
    colour = ['red', 'red', 'blue', 'red', 'green', 'green', 'red', 'blue', 'blue', 'green']
    animal = np.array(['dog', 'cat', 'dog', 'dog', 'dog', 'dog', 'cat', 'dog', 'dog', np.nan], dtype='O')
    number = [10, 20, 30, 10, 20, 30, 30, 30, 10, 20]
    floats = [10., 20., 30., 10., 20., 30., 30., 30., 10., np.nan]
    ds = vaex.from_arrays(colour=colour, animal=animal, number=number, floats=floats)
    df = pd.DataFrame(data=np.array([colour, animal, number, floats]).T, columns=['colour', 'animal', 'number', 'floats'])

    # Create a mapper - dictionary
    mapper = {}
    mapper['colour'] = {'red': 1, 'blue': 2, 'green': 3}
    mapper['animal'] = {'dog': 5, 'cat': -1, 'dolphin': 0}
    mapper['number'] = {10: 1, 20: 2, 30: 3}
    mapper['floats'] = {10.: -1, 20.: -2, 30.: -3, np.nan: -4}

    # Map the functions in vaex
    ds['colour_'] = ds.colour.map(mapper['colour'])
    ds['animal_'] = ds.animal.map(mapper['animal'])
    # ds['number_'] = ds.number.map(lambda x: mapper['number'][x])  # test with a function, not just with a dict
    # nan_value lets NaN act as a regular key (mapped to -4 above)
    ds['floats_'] = ds.floats.map(mapper['floats'], nan_value=np.nan)

    # Map in pandas
    df['colour_'] = df.colour.map(mapper['colour'])
    df['animal_'] = df.animal.map(mapper['animal'])

    # Make assertions - compare to pandas for string columns
    # we deviate from pandas, we can map nan to something
    assert ds.colour_.values.tolist()[:-1] == df.colour_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[:-1] == df.animal_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[-1] is None
    # Make assertions - compare to the expected values for numeric type
    # assert ds.number_.values.tolist() == (np.array(number)/10).tolist()
    assert ds.floats_.values.tolist()[:-1] == (np.array(floats)/-10.).tolist()[:-1]
    assert ds.floats_.values.tolist()[-1] == -4

    # missing keys
    with pytest.raises(ValueError):
        ds.colour.map({'ret': 1, 'blue': 2, 'green': 3})
    with pytest.raises(ValueError):
        ds.colour.map({'blue': 2, 'green': 3})
    # missing keys but user-handled
    ds['colour_unmapped'] = ds.colour.map({'blue': 2, 'green': 3}, default_value=-1)
    assert ds.colour_unmapped.values.tolist() == [-1, -1, 2, -1, 3, 3, -1, 2, 2, 3]
    # extra is ok
    ds.colour.map({'red': 1, 'blue': 2, 'green': 3, 'orange': 4})

    # check masked arrays
    # allow_missing=True yields None (masked) for unmapped keys
    assert ds.colour.map({'blue': 2, 'green': 3}, allow_missing=True).tolist() == [None, None, 2, None, 3, 3, None, 2, 2, 3]
Пример #41
0
def test_timedelta_methods():
    """Each .td accessor must agree with the pandas .dt equivalent."""
    deltas = np.array([187201, 1449339, 11264958, -181614], dtype='timedelta64[s]')
    df = vaex.from_arrays(delta=deltas)
    pdf = pd.DataFrame({'delta': pd.Series(deltas, dtype=deltas.dtype)})

    assert df.delta.td.days.tolist() == pdf.delta.dt.days.tolist()
    assert df.delta.td.seconds.tolist() == pdf.delta.dt.seconds.tolist()
    assert df.delta.td.microseconds.tolist() == pdf.delta.dt.microseconds.tolist()
    assert df.delta.td.nanoseconds.tolist() == pdf.delta.dt.nanoseconds.tolist()
    assert df.delta.td.total_seconds().tolist() == pdf.delta.dt.total_seconds().tolist()
Пример #42
0
def test_count_cat(lazy):
    """Counting by an ordinal-encoded column, eager or lazy encoding."""
    source = vaex.from_arrays(
        colors=['red', 'green', 'blue', 'green'],
        names=['apple', 'apple', 'berry', 'apple'])

    encoded = source.ordinal_encode('colors', ['red', 'green', 'blue'], lazy=lazy)
    assert encoded.count(binby=encoded.colors).tolist() == [1, 2, 1]

    # A different value order permutes the per-bin counts.
    encoded = source.ordinal_encode('colors', ['red', 'blue', 'green'],
                                    inplace=True, lazy=lazy)
    assert encoded.count(binby=encoded.colors).tolist() == [1, 1, 2]
Пример #43
0
def test_dropinf():
    """dropinf must drop rows containing +-inf in the inspected columns.

    Fix: the original's four comparison statements lacked ``assert`` — the
    expressions were evaluated and discarded, so the test verified nothing.
    """
    x = [1, 2, np.inf]
    y = [10, -np.inf, 2]
    z = [1, 2, 3]

    df = vaex.from_arrays(x=x, y=y, z=z)

    # All columns considered: only the first row is inf-free.
    df_filter = df.dropinf()
    assert df_filter.shape == (1, 3)
    assert df_filter.values.tolist() == [[1.0, 10.0, 1.0]]

    # Restricted to 'x': only the row with x == inf is dropped.
    df_filter = df.dropinf(column_names=['x'])
    assert df_filter.shape == (2, 3)
    assert df_filter.values.tolist() == [[1.0, 10.0, 1.0], [2.0, -np.inf, 2.0]]
Пример #44
0
def test_join_filtered_inner():
    """Inner join where the left side is a filtered frame.

    ``df_a`` / ``df_b`` are module-level fixtures defined elsewhere —
    presumably frames sharing an 'x' key column; TODO confirm.
    """
    df_a_filtered = df_a[df_a.y > 0]
    df_joined = df_a_filtered.join(other=df_b,
                                   on='x',
                                   how='inner',
                                   rsuffix='_',
                                   allow_duplication=True)
    # An inner join on the key must not change the filtered row count here.
    assert len(df_joined) == len(df_a_filtered)

    # Self-join of a filtered frame; repr() once triggered a selection-cache
    # bug, so merely rendering it is the regression check.
    x = np.arange(20)
    df = vaex.from_arrays(x=x, y=x**2)
    df = df[df.x > 5]
    dfj = df.join(df, on='x', rsuffix='right_', how='inner')
    repr(dfj)  # trigger issue with selection cache
Пример #45
0
def test_string_operations_from_mmap_file(tmpdir):
    """String ops on a read-only memory-mapped HDF5 column (incl. a null)."""
    # if we write the file to disk and mmap it read only, we trigger invalid memory writes
    # see https://github.com/vaexio/vaex/pull/459
    x = np.arange(5)
    y = np.array(['This', 'is', 'a', None, 'test'])
    df = vaex.from_arrays(x=x, y=y)
    filename = str(tmpdir / 'test.hdf5')
    df.export_hdf5(filename)
    df_from_file = vaex.open(filename)
    # slice and upper must work on the mmapped column; None passes through.
    assert df_from_file.y.str.slice(
        start=0, stop=2).tolist() == ['Th', 'is', 'a', None, 'te']
    assert df_from_file.y.str.upper().tolist() == [
        'THIS', 'IS', 'A', None, 'TEST'
    ]
Пример #46
0
def test_groupby_same_result():
    """groupby counts must match value_counts, also under a small buffer.

    ``small_buffer`` is a helper context manager defined elsewhere —
    presumably shrinking the executor chunk size; TODO confirm.
    """
    h = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int)
    df = vaex.from_arrays(h=h)

    # Compare value_counts with the groupby counts for the hour column
    vc = df.h.value_counts()

    with small_buffer(df):
        group = df.groupby(by=df.h).agg({'h': 'count'})
        # second time it uses a new set, this caused a bug
        # see https://github.com/vaexio/vaex/pull/233
        group = df.groupby(by=df.h).agg({'h': 'count'})
        group_sort = group.sort(by='count', ascending=False)

        assert vc.values.tolist() == group_sort['count'].values.tolist(), 'counts are not correct.'
        assert vc.index.tolist() == group_sort['h'].values.tolist(), 'the indices of the counts are not correct.'
Пример #47
0
def from_dict(data):
    """Create an in-memory dataset from a dict mapping column names to
    lists/numpy arrays.

    Thin convenience wrapper: unpacks the dict into :func:`vaex.from_arrays`.

    Example

    >>> data = {'A':[1,2,3],'B':['a','b','c']}
    >>> vaex.from_dict(data)
      #    A    B
      0    1   'a'
      1    2   'b'
      2    3   'c'

    :param data: A dict of {columns:[value, value,...]}
    :rtype: DataFrame

    """
    return vaex.from_arrays(**data)
Пример #48
0
def test_groupby_options():
    """Exercise every accepted shape of the groupby ``agg`` argument."""
    t = np.arange('2015-01-01', '2015-02-01', dtype=np.datetime64)
    y = np.arange(len(t))
    sum_answer = [y[k*7:(k+1)*7].sum() for k in range(5)]
    mean_answer = [y[k*7:(k+1)*7].mean() for k in range(5)]

    df = vaex.from_arrays(t=t, y=y)
    by = vaex.BinnerTime.per_week(df.t)

    # agg values: aggregation name as string, aggregation class, instance
    dfg = df.groupby(by, agg={'y': 'sum'})
    assert dfg.y.tolist() == sum_answer
    dfg = df.groupby(by, agg={'y': vaex.agg.sum})
    assert dfg.y.tolist() == sum_answer

    # dict key names the output column
    dfg = df.groupby(by, agg={'z': vaex.agg.sum('y')})
    assert dfg.z.tolist() == sum_answer

    # list form auto-names columns as <col>_<agg>
    dfg = df.groupby(by, agg=[vaex.agg.sum('y')])
    assert dfg.y_sum.tolist() == sum_answer

    dfg = df.groupby(by, agg=[vaex.agg.sum('y'), vaex.agg.mean('y')])
    assert dfg.y_sum.tolist() == sum_answer
    assert dfg.y_mean.tolist() == mean_answer

    # dict key + list value: key becomes the column-name prefix
    dfg = df.groupby(by, agg={'z': [vaex.agg.sum('y'), vaex.agg.mean('y')]})
    assert dfg.z_sum.tolist() == sum_answer
    assert dfg.z_mean.tolist() == mean_answer

    # default is to do all columns
    dfg = df.groupby(by, agg=[vaex.agg.sum, vaex.agg.mean])
    assert dfg.y_sum.tolist() == sum_answer
    assert dfg.y_mean.tolist() == mean_answer

    # a bare aggregation class applies to all columns except the groupby key
    dfg = df.groupby(by, agg=vaex.agg.sum)
    assert dfg.y_sum.tolist() == sum_answer
    assert "t_sum" not in dfg.get_column_names()

    # chained .agg() form
    dfg = df.groupby(by).agg({'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]

    dfg = df.groupby(by).agg({'y': 'sum'})
    assert dfg.y.tolist() == [y[k*7:(k+1)*7].sum() for k in range(5)]

    # positional shorthand
    dfg = df.groupby(by, 'sum')
    assert dfg.y_sum.tolist() == sum_answer
Пример #49
0
def test_map():
    """Expression.map with dicts, compared against pandas where possible.

    NOTE(review): this uses ``nan_mapping=`` while the near-identical
    test_map_basics uses ``nan_value=`` for the same purpose — one of the
    two presumably targets an older API; verify which keyword is current.
    """
    # Generate the test data
    colour = ['red', 'red', 'blue', 'red', 'green', 'green', 'red', 'blue', 'blue', 'green']
    animal = np.array(['dog', 'cat', 'dog', 'dog', 'dog', 'dog', 'cat', 'dog', 'dog', np.nan], dtype='O')
    number = [10, 20, 30, 10, 20, 30, 30, 30, 10, 20]
    floats = [10., 20., 30., 10., 20., 30., 30., 30., 10., np.nan]
    ds = vaex.from_arrays(colour=colour, animal=animal, number=number, floats=floats)
    df = pd.DataFrame(data=np.array([colour, animal, number, floats]).T, columns=['colour', 'animal', 'number', 'floats'])

    # Create a mapper - dictionary
    mapper = {}
    mapper['colour'] = {'red': 1, 'blue': 2, 'green': 3}
    mapper['animal'] = {'dog': 5, 'cat': -1, 'dolphin': 0}
    mapper['number'] = {10: 1, 20: 2, 30: 3}
    mapper['floats'] = {10.: -1, 20.: -2, 30.: -3, np.nan: -4}

    # Map the functions in vaex
    ds['colour_'] = ds.colour.map(mapper['colour'])
    ds['animal_'] = ds.animal.map(mapper['animal'])
    # ds['number_'] = ds.number.map(lambda x: mapper['number'][x])  # test with a function, not just with a dict
    ds['floats_'] = ds.floats.map(mapper['floats'], nan_mapping=np.nan)

    # Map in pandas
    df['colour_'] = df.colour.map(mapper['colour'])
    df['animal_'] = df.animal.map(mapper['animal'])

    # Make assertions - compare to pandas for string columns
    # we deviate from pandas, we can map nan to something
    assert ds.colour_.values.tolist()[:-1] == df.colour_.values.tolist()[:-1]
    assert ds.animal_.values.tolist()[:-1] == df.animal_.values.tolist()[:-1]
    assert ds.animal_.values[-1] is None
    # Make assertions - compare to the expected values for numeric type
    # assert ds.number_.values.tolist() == (np.array(number)/10).tolist()
    assert ds.floats_.values.tolist()[:-1] == (np.array(floats)/-10.).tolist()[:-1]
    assert ds.floats_.values.tolist()[-1] == -4

    # missing keys
    with pytest.raises(ValueError):
        ds.colour.map({'ret': 1, 'blue': 2, 'green': 3})
    with pytest.raises(ValueError):
        ds.colour.map({'blue': 2, 'green': 3})
    # extra is ok
    ds.colour.map({'red': 1, 'blue': 2, 'green': 3, 'orange': 4})
Пример #50
0
def test_big_endian_binning():
    """2d binning must handle big-endian ('>f8') input arrays."""
    xs = np.arange(10, dtype='>f8')
    ys = np.zeros(10, dtype='>f8')
    df = vaex.from_arrays(x=xs, y=ys)
    grid = df.count(binby=[df.x, df.y], limits=[[-0.5, 9.5], [-0.5, 0.5]], shape=[10, 1])
    # One sample per x-bin, all inside the single y-bin.
    assert grid.ravel().tolist() == np.ones(10).tolist()
Пример #51
0
def test_categorize():
    """categorize attaches labels to an integer column in place."""
    df = vaex.from_arrays(c=[0, 1, 1, 3])
    df.categorize('c', ['a', 'b', 'c', 'd'])
    assert df.is_category(df.c)
    assert df.category_labels(df.c) == ['a', 'b', 'c', 'd']
    assert df.category_count(df.c) == 4
Пример #52
0
# CLI options for the benchmark run (row counts are given as log10 exponents).
parser.add_argument('--number', "-n", dest="n", type=float, default=7, help="log number of rows to use")
parser.add_argument('--nmax', type=int, default=9, help="number of rows for test dataset")
parser.add_argument('--partitions', type=int, default=multiprocessing.cpu_count()*2, help="number of partitions to split (default: 2x number cores)")
parser.add_argument('--npandas', dest="npandas", type=float, default=7, help="number of rows to use for pandas")
parser.add_argument('--filter', dest="filter", default=None, help="filter for benchmark")
parser.add_argument('--filename', default=default_filename, help='filename to use for benchmark export/reading')
args = parser.parse_args(argv[1:])

use_dask = False


# Generate and export the benchmark dataset once; subsequent runs reuse it.
if not os.path.exists(args.filename):
    x = np.arange(0, int(10**args.nmax))
    xs = x.astype(str)
    s = xs#vaex.string_column(xs)
    df_vaex = vaex.from_arrays(x=s, s=s)
    df_vaex.export(args.filename, progress=True, shuffle=True)

# Slice the requested row counts for the vaex and pandas frames.
df = vaex.open(args.filename)
df_vaex = df[0:int(10**args.n)]
df_vaex.executor.buffer_size = len(df_vaex)//args.partitions
df_pandas = df[:int(10**args.npandas)].to_pandas_df()

if use_dask:
    df_dask = dd.from_pandas(df_pandas, npartitions=4)
timings = {}
def mytimeit(expr, N, scope):
    times = []
    for i in range(N):
        t0 = time.time()
        eval(expr, scope)
Пример #53
0
def test_map_to_string():
    """Map numeric codes (including NaN as a key) to string labels."""
    df = vaex.from_arrays(type=[0, 1, 2, 2, 2, np.nan])
    role_by_code = {0: 'admin', 1: 'maintainer', 2: 'user', np.nan: 'unknown'}
    df['role'] = df['type'].map(role_by_code)
    assert df['role'].tolist() == ['admin', 'maintainer', 'user', 'user', 'user', 'unknown']