示例#1
0
def test_array_rename():
    """Check renaming of columns on a hashed ``DatasetArrays``.

    Covers: the renamed dataset exposes the very same array objects, the
    chunk iterator uses the new name, a rename changes equality, renaming
    back restores equality, and consecutive renames do not nest.
    """
    col_x = np.arange(10)
    col_y = col_x**2
    base = dataset.DatasetArrays(x=col_x, y=col_y).hashed()
    renamed = base.renamed({'x': 'z'})
    # the arrays themselves are passed through, only the name differs
    assert renamed['y'] is col_y
    assert renamed['z'] is col_x

    # the last element of the first yielded item should contain the new name
    first_item = list(renamed.chunk_iterator(['z']))[0]
    assert 'z' in first_item[-1]

    assert base != renamed
    assert rebuild(base) != rebuild(renamed)

    restored = renamed.renamed({'z': 'x'})
    assert restored.original is base, "no nested renaming"
    assert restored['y'] is col_y
    assert restored['x'] is col_x

    # after renaming back, the dataset compares equal to the starting point
    assert base == restored
    assert rebuild(base) == rebuild(restored)

    # testing that
    # {'a': 'x', 'b': 'y'} and {'x': 'a', 'b': 'z', 'c': 'q'} -> {'b': 'z', 'c': 'q'}
    base = dataset.DatasetArrays(a=col_x, b=col_y, c=col_x + col_y)
    renamed = base.renamed({'a': 'x', 'b': 'y'})
    restored = renamed.renamed({'x': 'a', 'b': 'z', 'c': 'q'})
    assert restored.original is base
    assert restored.renaming == {'b': 'z', 'c': 'q'}
示例#2
0
def test_slice(rebuild_dataset):
    """Check hashing, row counts and content of sliced datasets.

    ``rebuild_dataset`` is a fixture supplied by the test suite; it is
    applied to datasets before comparing them.
    """
    x = np.arange(10)
    y = x**2
    full = dataset.DatasetArrays(x=x, y=y)
    part = full[1:8]
    part_same = full[1:8]
    part_longer = full[1:9]
    # identical slice ranges hash the same, different ranges do not
    assert full.hashed() != part.hashed()
    assert part.hashed() == part_same.hashed()
    assert part.hashed() != part_longer.hashed()
    assert full.row_count == 10
    assert part.row_count == 7
    assert part_same.row_count == 7
    assert part_longer.row_count == 8
    assert part['x'].tolist() == x[1:8].tolist()

    # a dataset built directly from pre-sliced arrays is expected to hash
    # differently from a slice of the full dataset
    presliced = dataset.DatasetArrays(x=x[1:8], y=y[1:8])

    assert part.hashed() != presliced.hashed()
    assert rebuild_dataset(full.hashed()) != rebuild_dataset(part.hashed())
    # TODO: support unhashed?
    assert rebuild_dataset(full).hashed() != rebuild_dataset(part).hashed()

    assert rebuild_with_skip(part.hashed(), full.hashed()) == part.hashed()
示例#3
0
def test_project():
    """Projecting onto one column should hash like a one-column dataset."""
    x = np.arange(10)
    y = x**2
    two_columns = dataset.DatasetArrays(x=x, y=y)
    projected = two_columns.project('x')
    only_x = dataset.DatasetArrays(x=x)
    assert two_columns.hashed() != projected.hashed()
    assert projected.hashed() == only_x.hashed()
    # the equality must also hold after both sides went through rebuild()
    assert rebuild(projected).hashed() == rebuild(only_x).hashed()
示例#4
0
def test_concat():
    """Concatenating two halves yields the row count of the whole."""
    x = np.arange(10)
    y = x**2
    whole = dataset.DatasetArrays(x=x, y=y)
    split_at = 4
    head = dataset.DatasetArrays(x=x[:split_at], y=y[:split_at])
    # keyword order is deliberately swapped: order should not matter
    tail = dataset.DatasetArrays(y=y[split_at:], x=x[split_at:])
    combined = head.concat(tail)
    assert whole.row_count == combined.row_count
    assert combined.row_count == head.row_count + tail.row_count
示例#5
0
def test_merge():
    """Merging single-column datasets should equal the two-column dataset."""
    x = np.arange(10)
    y = x**2
    both = dataset.DatasetArrays(x=x, y=y).hashed()
    only_x = dataset.DatasetArrays(x=x)
    only_y = dataset.DatasetArrays(y=y)
    merged = only_x.merged(only_y).hashed()

    assert both == merged
    assert rebuild(both) == rebuild(merged)

    # merging in a column name that is already present must be rejected
    with pytest.raises(NameError):
        merged.merged(only_x)
示例#6
0
def test_no_hash():
    """Datasets created with ``hashed=False`` only compare equal once hashed."""
    first_x = np.arange(10)
    first_y = first_x**2
    first = dataset.DatasetArrays(x=first_x, y=first_y, hashed=False)

    second_x = np.arange(10)
    second_y = second_x**2
    second = dataset.DatasetArrays(x=second_x, y=second_y, hashed=False)

    # as long as at least one side is unhashed, inequality is expected
    assert first != second
    assert first != second.hashed()
    assert first.hashed() != second
    # with both sides hashed, equal content is expected to compare equal
    assert first.hashed() == second.hashed()
示例#7
0
def test_concat():
    """Check row counts of a concatenation and chunking with no columns."""
    x = np.arange(10)
    y = x**2
    whole = dataset.DatasetArrays(x=x, y=y)
    split_at = 4
    head = dataset.DatasetArrays(x=x[:split_at], y=y[:split_at])
    # keyword order is deliberately swapped: order should not matter
    tail = dataset.DatasetArrays(y=y[split_at:], x=x[split_at:])
    combined = head.concat(tail)
    assert whole.row_count == combined.row_count
    assert combined.row_count == head.row_count + tail.row_count

    # an empty list of columns follows a different codepath
    all_rows = list(combined.chunk_iterator([]))
    assert all_rows == [(0, 10, {})]
    last_five = list(combined.chunk_iterator([], start=5, end=10))
    assert last_five == [(0, 5, {})]
示例#8
0
def test_drop(rebuild_dataset):
    """Check dropping columns, re-merging the parts, and fingerprints.

    ``rebuild_dataset`` is a fixture supplied by the test suite.
    """
    x = np.arange(10)
    y = x**2
    full = dataset.DatasetArrays(x=x, y=y)
    without_x = full.dropped('x')
    assert 'x' not in without_x
    without_y = full.dropped('y')
    assert 'y' not in without_y
    # merging the two complementary parts should hash like the full dataset
    assert full.hashed() == without_x.merged(without_y).hashed()
    assert rebuild_dataset(full).hashed() == rebuild_dataset(without_x.merged(without_y)).hashed()

    # building or dropping a second time should give matching fingerprints
    full_again = dataset.DatasetArrays(x=x, y=y)
    assert full.fingerprint == full_again.fingerprint
    without_x_again = full.dropped('x')
    assert without_x.fingerprint == without_x_again.fingerprint
示例#9
0
def test_array_eq():
    """Hashed datasets built from equal but distinct arrays compare equal."""
    first_x = np.arange(10)
    first_y = first_x**2
    first = dataset.DatasetArrays(x=first_x, y=first_y).hashed()
    # the dataset hands back the very same array objects it was given
    assert first['x'] is first_x
    assert first['y'] is first_y

    second_x = np.arange(10)
    second_y = second_x**2
    second = dataset.DatasetArrays(x=second_x, y=second_y).hashed()
    assert second['x'] is second_x
    assert second['y'] is second_y

    # distinct array objects with the same values: equality is expected
    assert first == second
    assert first == rebuild(second)
示例#10
0
def test_no_hash():
    """Comparing datasets without a hash must raise; hashed ones compare equal.

    Any comparison where at least one side has not been hashed is expected
    to raise a ``ValueError`` whose message mentions "hash". Once both sides
    are hashed, datasets built from equal values must compare equal.
    """
    x1 = np.arange(10)
    y1 = x1**2
    ds1 = dataset.DatasetArrays(x=x1, y=y1)

    x2 = np.arange(10)
    y2 = x2**2
    ds2 = dataset.DatasetArrays(x=x2, y=y2)

    # the bare comparisons below are intentional: only the raised error matters
    with pytest.raises(ValueError, match='.*hash.*'):
        ds1 == ds2
    with pytest.raises(ValueError, match='.*hash.*'):
        ds1 == ds2.hashed()
    with pytest.raises(ValueError, match='.*hash.*'):
        ds1.hashed() == ds2
    # this used to be a bare expression whose result was thrown away, so the
    # positive case was never actually verified
    assert ds1.hashed() == ds2.hashed()
示例#11
0
def test_merge(rebuild_dataset, array_factory):
    """Check merging of single-column datasets and the leafs of the result.

    ``rebuild_dataset`` and ``array_factory`` are fixtures supplied by the
    test suite; ``array_factory`` wraps the numpy input arrays.
    """
    x = array_factory(np.arange(10))
    y = array_factory(np.arange(10)**2)
    both = dataset.DatasetArrays(x=x, y=y).hashed()
    only_x = dataset.DatasetArrays(x=x)
    only_y = dataset.DatasetArrays(y=y)
    merged = only_x.merged(only_y).hashed()
    # the merged dataset should report its two hashed inputs as leafs
    assert set(merged.leafs()) == {only_x.hashed(), only_y.hashed()}

    assert both == merged
    assert rebuild_dataset(both) == rebuild_dataset(merged)

    # merging in a column name that is already present must be rejected
    with pytest.raises(NameError):
        merged.merged(only_x)

    assert rebuild_with_skip(merged, only_x.hashed()) == both
示例#12
0
def test_slice_column():
    """Slicing a dataset of column-type data should keep the column type."""
    x = np.arange(10)
    y = x**2
    source = dataset.DatasetArrays(x=x, y=y)
    picked_rows = np.array([1, 2, 5, 7, 9])
    taken = source.take(picked_rows)
    taken_slice = taken[1:3]
    # after take() followed by a slice, 'x' must still be a ColumnIndexed
    assert isinstance(taken_slice['x'], vaex.column.ColumnIndexed)
示例#13
0
def test_drop():
    """Dropping columns and merging the parts should restore the hash."""
    x = np.arange(10)
    y = x**2
    full = dataset.DatasetArrays(x=x, y=y)
    without_x = full.dropped('x')
    assert 'x' not in without_x
    without_y = full.dropped('y')
    assert 'y' not in without_y
    # the two complementary parts together should hash like the full dataset
    assert full.hashed() == without_x.merged(without_y).hashed()
    assert rebuild(full).hashed() == rebuild(without_x.merged(without_y)).hashed()
示例#14
0
def test_slice():
    """Check hashing, row counts and content of sliced datasets."""
    x = np.arange(10)
    y = x**2
    full = dataset.DatasetArrays(x=x, y=y)
    part = full[1:8]
    part_same = full[1:8]
    part_longer = full[1:9]
    # identical slice ranges hash the same, different ranges do not
    assert full.hashed() != part.hashed()
    assert part.hashed() == part_same.hashed()
    assert part.hashed() != part_longer.hashed()
    assert full.row_count == 10
    assert part.row_count == 7
    assert part_same.row_count == 7
    assert part_longer.row_count == 8
    assert part['x'].tolist() == x[1:8].tolist()

    # a dataset built directly from pre-sliced arrays is expected to hash
    # differently from a slice of the full dataset
    presliced = dataset.DatasetArrays(x=x[1:8], y=y[1:8])

    assert part.hashed() != presliced.hashed()
    assert rebuild(full).hashed() != rebuild(part).hashed()
示例#15
0
def test_hashable():
    """Datasets, including rebuilt copies, must be usable as dict keys."""
    x = np.arange(10)
    y = x**2
    arrays_ds = dataset.DatasetArrays(x=x, y=y).hashed()
    example_df = vaex.example()
    lookup = {arrays_ds: '1', example_df.dataset: '2'}
    assert lookup[arrays_ds] == '1'
    assert lookup[example_df.dataset] == '2'

    # a rebuilt dataset has to find the entry stored under the original
    assert lookup[rebuild(arrays_ds)] == '1'
    assert lookup[rebuild(example_df.dataset)] == '2'
示例#16
0
def test_take():
    """Check hashing, row counts and content of datasets made with take()."""
    x = np.arange(10)
    y = x**2
    full = dataset.DatasetArrays(x=x, y=y)
    rows = np.array([1, 2, 5])
    rows_other = np.array([1, 2, 6])
    taken = full.take(rows)
    taken_same = full.take(rows)
    taken_other = full.take(rows_other)
    # the same indices hash the same, different indices do not
    assert full.hashed() != taken.hashed()
    assert taken.hashed() == taken_same.hashed()
    assert taken.hashed() != taken_other.hashed()
    assert full.row_count == 10
    assert taken.row_count == len(rows)
    assert taken_same.row_count == len(rows)
    assert taken_other.row_count == len(rows_other)
    assert taken['x'].tolist() == x[rows].tolist()

    # a dataset built directly from pre-indexed arrays is expected to hash
    # differently from take() on the full dataset
    preindexed = dataset.DatasetArrays(x=x[rows], y=y[rows])

    assert taken.hashed() != preindexed.hashed()
    assert rebuild(full).hashed() != rebuild(taken).hashed()
示例#17
0
def test_array_rename():
    """Renaming a column changes equality; renaming it back restores it."""
    col_x = np.arange(10)
    col_y = col_x**2
    base = dataset.DatasetArrays(x=col_x, y=col_y).hashed()
    renamed = base.renamed({'x': 'z'})
    # the arrays themselves are passed through, only the name differs
    assert renamed['y'] is col_y
    assert renamed['z'] is col_x

    assert base != renamed
    assert rebuild(base) != rebuild(renamed)

    restored = renamed.renamed({'z': 'x'})
    assert restored['y'] is col_y
    assert restored['x'] is col_x

    # after renaming back, the dataset compares equal to the starting point
    assert base == restored
    assert rebuild(base) == rebuild(restored)
示例#18
0
def test_chunk_iterator():
    """Iterating 10 rows in chunks of 4 should give ranges 0-4, 4-8, 8-10."""
    x = np.arange(10)
    y = x**2
    ds = dataset.DatasetArrays(x=x, y=y)
    chunk_it = ds.chunk_iterator(['y'], chunk_size=4)

    # (start, stop) row range expected for each successive chunk; the last
    # chunk is shorter because only two rows remain
    expected_ranges = [(0, 4), (4, 8), (8, 10)]
    for start, stop in expected_ranges:
        # next() is called explicitly so a missing chunk raises StopIteration
        i1, i2, chunk = next(chunk_it)
        assert chunk['y'].tolist() == y[start:stop].tolist()
        assert i1 == start
        assert i2 == stop
示例#19
0
def test_filter(rebuild_dataset):
    """Check hashing, row counts, slicing and chunking of ``DatasetFiltered``.

    ``rebuild_dataset`` is a fixture supplied by the test suite. The second
    half checks that a filter pushed down from a DataFrame produces a
    dataset that compares equal to its rebuilt copy.
    """
    x = np.arange(10)
    y = x**2
    # named `mask` rather than `filter` to avoid shadowing the builtin
    mask = (x % 2) == 1
    ds = dataset.DatasetArrays(x=x, y=y)
    ds1 = dataset.DatasetFiltered(ds, filter=mask)
    ds1c = dataset.DatasetFiltered(ds, filter=(x % 2) == 1)
    ds2 = dataset.DatasetFiltered(ds, filter=(x % 3) == 1)
    assert ds1.hashed() != ds2.hashed()
    assert ds1.hashed() == ds1c.hashed()
    assert ds1.row_count == 5
    assert ds2.row_count == 3
    assert ds1.slice(0, 1).row_count == 1
    assert ds1.slice(1, 3).row_count == 2
    chunk_it = ds1.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(3):
        i1, i2, chunks = next(chunk_it)
        assert i1 == i * 2
        assert i2 == min(5, (i + 1) * 2)
        # these comparisons previously had no `assert`, so the chunk
        # contents were never actually checked
        assert chunks['x'].tolist() == x[mask][i1:i2].tolist()
        assert chunks['y'].tolist() == y[mask][i1:i2].tolist()

    assert ds1['x'].tolist() == x[mask].tolist()
    # TODO: also support the unhashed variant:
    #   rebuild_dataset(ds1).hashed() != rebuild_dataset(ds2).hashed()
    assert rebuild_dataset(ds1.hashed()) != rebuild_dataset(ds2.hashed())
    assert rebuild_with_skip(ds2.hashed(), ds.hashed()) == ds2.hashed()

    # testing the encoding of the expression instead of the array
    df = vaex.from_arrays(x=x, y=y)
    df['z'] = df.x % 2
    dff = df[df.z == 1]
    dff._push_down_filter()
    ds = dff.dataset.hashed()
    assert ds == rebuild_dataset(ds)
示例#20
0
def test_filter():
    """Check hashing, row counts, slicing and chunking of ``DatasetFiltered``."""
    x = np.arange(10)
    y = x**2
    # named `mask` rather than `filter` to avoid shadowing the builtin
    mask = (x % 2) == 1
    ds = dataset.DatasetArrays(x=x, y=y)
    ds1 = dataset.DatasetFiltered(ds, filter=mask)
    ds1c = dataset.DatasetFiltered(ds, filter=(x % 2) == 1)
    ds2 = dataset.DatasetFiltered(ds, filter=(x % 3) == 1)
    assert ds1.hashed() != ds2.hashed()
    assert ds1.hashed() == ds1c.hashed()
    assert ds1.row_count == 5
    assert ds2.row_count == 3
    assert ds1.slice(0, 1).row_count == 1
    assert ds1.slice(1, 3).row_count == 2
    chunk_it = ds1.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(3):
        i1, i2, chunks = next(chunk_it)
        assert i1 == i * 2
        assert i2 == min(5, (i + 1) * 2)
        # these comparisons previously had no `assert`, so the chunk
        # contents were never actually checked
        assert chunks['x'].tolist() == x[mask][i1:i2].tolist()
        assert chunks['y'].tolist() == y[mask][i1:i2].tolist()

    assert ds1['x'].tolist() == x[mask].tolist()
    assert rebuild(ds1).hashed() != rebuild(ds2).hashed()
示例#21
0
def test_array_pickle():
    """A hashed ``DatasetArrays`` should equal the result of ``rebuild``."""
    values = np.arange(10)
    squares = values**2
    hashed_ds = dataset.DatasetArrays(x=values, y=squares).hashed()
    assert hashed_ds == rebuild(hashed_ds)
示例#22
0
def test_array_rebuild_dataset(rebuild_dataset):
    """A hashed ``DatasetArrays`` should equal its ``rebuild_dataset`` copy.

    ``rebuild_dataset`` is a fixture supplied by the test suite.
    """
    values = np.arange(10)
    squares = values**2
    hashed_ds = dataset.DatasetArrays(x=values, y=squares).hashed()
    assert hashed_ds == rebuild_dataset(hashed_ds)