Example #1
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert odo(b, list) == odo(logs, list)
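
The snippets on this page are shown without their import preambles. A minimal sketch of the imports the tests here appear to assume, based on the odo and dask code bases (exact module paths may vary by version):

from odo import odo, convert, resource, chunks
from odo.utils import filetexts
from odo.backends.text import TextFile
from dask.bag import Bag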
Example #2
def test_append_chunks():
    tbl = resource("sqlite:///:memory:::test", dshape="var * {a: int, b: int}")
    res = odo(
        chunks(np.ndarray)((
            np.array([[0, 1], [2, 3]]),
            np.array([[4, 5], [6, 7]]),
        )),
        tbl,
    )
    assert res is tbl
    assert (
        odo(tbl, np.ndarray) == np.array(
            [(0, 1), (2, 3), (4, 5), (6, 7)],
            dtype=[("a", "<i4"), ("b", "<i4")],
        )
    ).all()
Example #3
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = convert(Bag, logs)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert convert(list, b) == convert(list, logs)
Example #4
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert (list(map(methodcaller('strip'), odo(b, list))) ==
                list(map(methodcaller('strip'), odo(logs, list))))
Example #5
def test_glob():
    d = {"accounts1.csv": "name,when\nAlice,100\nBob,200", "accounts2.csv": "name,when\nAlice,300\nBob,400"}
    with filetexts(d) as fns:
        r = resource("accounts*.csv", has_header=True)
        assert convert(list, r) == [("Alice", 100), ("Bob", 200),
                                    ("Alice", 300), ("Bob", 400)]

        r = resource("*.csv")
        assert isinstance(r, chunks(CSV))
Example #6
    def _load(self):
        seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame), chunksize=65536)
        # dshape=schema_to_dshape(self.schema))

        print("concatenating df chunks")
        df = pandas.concat(seq, ignore_index=True)
        print("typechecking and sorting")
        return df
Example #7
    def _load(self):
        seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame),
            chunksize=65536)
            #dshape=schema_to_dshape(self.schema))

        print('concatenating df chunks')
        df = pandas.concat(seq, ignore_index=True)
        print('typechecking and sorting')
        return df
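Examples #6 and #7 are the same project method captured twice, once reflowed. A self-contained sketch of the pattern they illustrate, pulling a source into chunks(pandas.DataFrame) and concatenating; the file name and contents here are illustrative stand-ins for the project's odo_target:

import pandas
import odo

# Write a small stand-in file so the sketch runs on its own.
with open('big.csv', 'w') as f:
    f.write('a,b\n1,2\n3,4\n')

seq = odo.odo('big.csv', odo.chunks(pandas.DataFrame), chunksize=65536)
df = pandas.concat(seq, ignore_index=True)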
Example #8
def test_glob():
    d = {'accounts1.csv': 'name,when\nAlice,100\nBob,200',
         'accounts2.csv': 'name,when\nAlice,300\nBob,400'}
    with filetexts(d) as fns:
        r = resource('accounts*.csv', has_header=True)
        assert convert(list, r) == [('Alice', 100), ('Bob', 200),
                                    ('Alice', 300), ('Bob', 400)]

        r = resource('*.csv')
        assert isinstance(r, chunks(CSV))
Example #9
def test_globbed_csv_to_chunks_of_dataframe():
    header = 'a,b,c\n'
    d = {'a-1.csv': header + '1,2,3\n4,5,6\n',
         'a-2.csv': header + '7,8,9\n10,11,12\n'}

    with filetexts(d):
        dfs = list(odo('a-*.csv', chunks(pd.DataFrame)))

    assert len(dfs) == 2
    columns = 'a', 'b', 'c'
    tm.assert_frame_equal(dfs[0],
                          pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns))
    tm.assert_frame_equal(dfs[1],
                          pd.DataFrame([[7, 8, 9], [10, 11, 12]], columns=columns))
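A follow-up to the globbed test above: the chunk iterator can be concatenated into a single frame in one step. A sketch using the same filetexts helper; file names and values are illustrative:

import pandas as pd
from odo import odo, chunks
from odo.utils import filetexts

with filetexts({'b-1.csv': 'a,b\n1,2\n', 'b-2.csv': 'a,b\n3,4\n'}):
    df = pd.concat(odo('b-*.csv', chunks(pd.DataFrame)), ignore_index=True)

assert list(df['a']) == [1, 3]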
Example #10
def test_append_chunks():
    tbl = resource('sqlite:///:memory:::test', dshape='var * {a: int, b: int}')
    res = odo(
        chunks(np.ndarray)((
            np.array([[0, 1], [2, 3]]),
            np.array([[4, 5], [6, 7]]),
        )),
        tbl,
    )
    assert res is tbl
    assert (odo(tbl, np.ndarray) == np.array(
        [(0, 1), (2, 3), (4, 5), (6, 7)],
        dtype=[('a', '<i4'), ('b', '<i4')],
    )).all()
Example #11
    def load(self):
        seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame),
            chunksize=CHUNK_SIZE,
            dshape=schema_to_dshape(self.schema))
        
        def conv_chunks(chunks):
            for chunk in chunks:
                print('typechecking a chunk')
                self.schema.conform_df(chunk, skip_sort=True)
                yield chunk

        print('concatenating df chunks')
        df = from_chunks(conv_chunks(seq))
        
        return df
Example #12
    def load(self):
        seq = odo.odo(self.odo_target,
                      odo.chunks(pandas.DataFrame),
                      chunksize=CHUNK_SIZE,
                      dshape=schema_to_dshape(self.schema))

        def conv_chunks(chunks):
            for chunk in chunks:
                print('typechecking a chunk')
                self.schema.conform_df(chunk, skip_sort=True)
                yield chunk

        print('concatenating df chunks')
        df = from_chunks(conv_chunks(seq))

        return df
Example #13
def test_append_chunks():
    tbl = resource('sqlite:///:memory:::test', dshape='var * {a: int, b: int}')
    res = odo(
        chunks(np.ndarray)((
            np.array([[0, 1], [2, 3]]),
            np.array([[4, 5], [6, 7]]),
        )),
        tbl,
    )
    assert res is tbl
    assert (
        odo(tbl, np.ndarray) == np.array(
            [(0, 1),
             (2, 3),
             (4, 5),
             (6, 7)],
            dtype=[('a', '<i4'), ('b', '<i4')],
        )
    ).all()
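Example #13 repeats Example #10 with different wrapping. Either way, the chunks(np.ndarray) argument is only a thin, re-iterable wrapper around the tuple of arrays, as this small sketch shows (values mirror the test):

import numpy as np
from odo import chunks

c = chunks(np.ndarray)((np.array([[0, 1], [2, 3]]), np.array([[4, 5], [6, 7]])))
assert sum(len(chunk) for chunk in c) == 4  # first pass
assert sum(len(chunk) for chunk in c) == 4  # container is not consumed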
Example #14
from odo import chunks
from blaze import discover, into, compute, symbol
from datashape.predicates import iscollection


L = [1, 2, 3, 4, 5, 6]
cL = chunks(list)([[1., 2., 3.], [4., 5., 6.]])
s = symbol('s', discover(cL))


def test_chunks_compute():
    exprs = [s, s + 1, s.max(), s.mean() + 1, s.head()]
    for e in exprs:
        result = compute(e, {s: cL})
        expected = compute(e, {s: L})
        if iscollection(e.dshape):
            result = into(list, result)
            expected = into(list, expected)
        assert result == expected


def test_chunks_head():
    assert compute(s.head(2), cL) == (1., 2.)


def test_pmap_default():
    flag = [0]

    def mymap(func, seq):
        flag[0] = True
        return map(func, seq)
Example #15
def pre_compute(expr, data, **kwargs):
    return into(chunks(pd.DataFrame), data, **kwargs)
Example #16

@convert.register(float, Array, cost=10.0)
def dask_to_float(x, **kwargs):
    return x.compute()


@append.register(tuple(arrays), Array)
def store_Array_in_ooc_data(out, arr, inplace=False, **kwargs):
    if not inplace:
        # Resize output dataset to accept new data
        assert out.shape[1:] == arr.shape[1:]
        resize(out, out.shape[0] + arr.shape[0])  # elongate
    arr.store(out)
    return out


@convert.register(Iterator, Bag)
def bag_to_iterator(x, **kwargs):
    return iter(x)


@convert.register(Bag, chunks(TextFile))
def textfile_chunks_to_bag(x, **kwargs):
    # Build the Bag from the underlying file paths rather than iterating in Python.
    return db.from_filenames([tf.path for tf in x])


@convert.register(Bag, list)
def list_to_bag(x, **kwargs):
    return db.from_sequence(x, **filter_kwargs(db.from_sequence, kwargs))
Example #17
from __future__ import absolute_import, division, print_function

from datashape import dshape
from odo.chunks import *
from toolz import first

CL = chunks(list)


def test_chunks_basics():
    assert isinstance(CL, type)
    assert issubclass(CL, Chunks)


def test_chunks_isnt_consumable():
    cl = CL([[1, 2, 3], [4, 5, 6]])

    assert next(iter(cl)) == [1, 2, 3]
    assert next(iter(cl)) == [1, 2, 3]


def test_chunks_is_memoized():
    assert chunks(list) is chunks(list)


def test_callables():
    cl = CL(lambda: (list(range(3)) for i in range(3)))

    assert first(cl) == [0, 1, 2]
    assert first(cl) == [0, 1, 2]
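
Note why test_callables passes a lambda rather than a bare generator: a generator is consumed on its first pass, while a factory gives Chunks a fresh iterator each time. A short contrasting sketch (not from the original test file):

from odo.chunks import chunks

CL = chunks(list)
consumable = CL(list(range(3)) for _ in range(2))            # one pass only
reiterable = CL(lambda: (list(range(3)) for _ in range(2)))  # fresh each pass

assert next(iter(reiterable)) == [0, 1, 2]
assert next(iter(reiterable)) == [0, 1, 2]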
Example #18
def test_chunks_is_memoized():
    assert chunks(list) is chunks(list)
Example #20
@convert.register(float, Array, cost=10.0)
def dask_to_float(x, **kwargs):
    return x.compute()


@append.register(tuple(arrays), Array)
def store_Array_in_ooc_data(out, arr, inplace=False, **kwargs):
    if not inplace:
        # Resize output dataset to accept new data
        assert out.shape[1:] == arr.shape[1:]
        resize(out, out.shape[0] + arr.shape[0])  # elongate
    return arr.store(out)

############
# dask.bag #
############

@convert.register(Iterator, Bag)
def bag_to_iterator(x, **kwargs):
    return iter(x)


@convert.register(Bag, chunks(TextFile))
def textfile_chunks_to_bag(x, **kwargs):
    # Build the Bag from the underlying file paths rather than iterating in Python.
    return Bag.from_filenames([tf.path for tf in x])


@convert.register(Bag, list)
def list_to_bag(x, **kwargs):
    # Forward only the keyword arguments that Bag.from_sequence accepts.
    keys = keywords(Bag.from_sequence)
    kwargs2 = dict((k, v) for k, v in kwargs.items() if k in keys)
    return Bag.from_sequence(x, **kwargs2)
Example #22
from __future__ import absolute_import, division, print_function

from odo.chunks import *
from toolz import first


CL = chunks(list)


def test_chunks_basics():
    assert isinstance(CL, type)
    assert issubclass(CL, Chunks)


def test_chunks_isnt_consumable():
    cl = CL([[1, 2, 3], [4, 5, 6]])

    assert next(iter(cl)) == [1, 2, 3]
    assert next(iter(cl)) == [1, 2, 3]


def test_chunks_is_memoized():
    assert chunks(list) is chunks(list)


def test_callables():
    cl = CL(lambda: (list(range(3)) for i in range(3)))

    assert first(cl) == [0, 1, 2]
    assert first(cl) == [0, 1, 2]