from operator import methodcaller

from dask.bag import Bag
from odo import chunks, odo
from odo.backends.text import TextFile
from odo.utils import filetexts


def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        # Compare stripped lines so trailing-newline handling cannot
        # cause a spurious mismatch.
        assert (list(map(methodcaller('strip'), odo(b, list))) ==
                list(map(methodcaller('strip'), odo(logs, list))))
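# The pattern under test, in miniature: a chunks(T) container is just an
# iterable of T chunks, and odo converts such containers chunk by chunk.
# A minimal sketch (assumes odo and toolz are installed; the data here is
# illustrative, not from the test above):
from toolz import concat

CL = chunks(list)
data = CL([['Hello', 'World'], ['Hola', 'Mundo']])
assert list(concat(data)) == ['Hello', 'World', 'Hola', 'Mundo']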
import numpy as np
from odo import chunks, odo, resource


def test_append_chunks():
    tbl = resource('sqlite:///:memory:::test', dshape='var * {a: int, b: int}')
    res = odo(
        chunks(np.ndarray)((
            np.array([[0, 1], [2, 3]]),
            np.array([[4, 5], [6, 7]]),
        )),
        tbl,
    )
    assert res is tbl
    assert (
        odo(tbl, np.ndarray) ==
        np.array([(0, 1), (2, 3), (4, 5), (6, 7)],
                 dtype=[('a', '<i4'), ('b', '<i4')])
    ).all()
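# The datashape odo discovers for a chunked source (sketch): a chunks
# container reports var * <chunk subshape>, i.e. a variable-length
# collection of rows shaped like one chunk's rows. Assumes odo, numpy,
# and datashape are installed:
from datashape import discover

c = chunks(np.ndarray)((np.array([[0, 1], [2, 3]]),
                        np.array([[4, 5], [6, 7]])))
print(discover(c))  # e.g. var * 2 * int64 on a 64-bit platform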
from odo import chunks, convert, resource
from odo.backends.csv import CSV
from odo.utils import filetexts


def test_glob():
    d = {'accounts1.csv': 'name,when\nAlice,100\nBob,200',
         'accounts2.csv': 'name,when\nAlice,300\nBob,400'}
    with filetexts(d) as fns:
        r = resource('accounts*.csv', has_header=True)
        assert convert(list, r) == [('Alice', 100), ('Bob', 200),
                                    ('Alice', 300), ('Bob', 400)]
        r = resource('*.csv')
        assert isinstance(r, chunks(CSV))
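# What the glob resource yields, element by element (sketch; assumes the
# two accounts*.csv files from the test above exist in the working
# directory): a chunks(CSV) container holding one CSV resource per
# matching file.
r = resource('accounts*.csv', has_header=True)
assert isinstance(r, chunks(CSV))
assert all(isinstance(csv_file, CSV) for csv_file in r)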
def _load(self):
    # Read the odo target in bounded chunks rather than all at once.
    seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame),
                  chunksize=65536)
    # dshape=schema_to_dshape(self.schema))
    print('concatenating df chunks')
    df = pandas.concat(seq, ignore_index=True)
    print('typechecking and sorting')
    return df
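# The same load pattern outside the class (sketch; 'data.csv' is a
# hypothetical input file). Reading 65536 rows per chunk bounds peak
# memory during the read; pandas.concat materializes the full frame once
# at the end:
import pandas
import odo

seq = odo.odo('data.csv', odo.chunks(pandas.DataFrame), chunksize=65536)
df = pandas.concat(seq, ignore_index=True)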
import pandas as pd
import pandas.util.testing as tm
from odo import chunks, odo
from odo.utils import filetexts


def test_globbed_csv_to_chunks_of_dataframe():
    header = 'a,b,c\n'
    d = {'a-1.csv': header + '1,2,3\n4,5,6\n',
         'a-2.csv': header + '7,8,9\n10,11,12\n'}
    with filetexts(d):
        dfs = list(odo('a-*.csv', chunks(pd.DataFrame)))
        assert len(dfs) == 2
        columns = 'a', 'b', 'c'
        tm.assert_frame_equal(dfs[0],
                              pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                                           columns=columns))
        tm.assert_frame_equal(dfs[1],
                              pd.DataFrame([[7, 8, 9], [10, 11, 12]],
                                           columns=columns))
def load(self):
    seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame),
                  chunksize=CHUNK_SIZE,
                  dshape=schema_to_dshape(self.schema))

    def conv_chunks(chunks):
        for chunk in chunks:
            print('typechecking a chunk')
            self.schema.conform_df(chunk, skip_sort=True)
            yield chunk

    print('concatenating df chunks')
    df = from_chunks(conv_chunks(seq))
    return df
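# Generic shape of the streaming pattern above (sketch): apply per-chunk
# work lazily so nothing is materialized until the consumer iterates.
# from_chunks and conform_df above are project-local helpers; the
# stand-ins here are plain pandas:
import pandas


def transform_chunks(seq, func):
    for chunk in seq:
        yield func(chunk)  # lazily transform one chunk at a time


parts = (pandas.DataFrame({'a': [i]}) for i in range(3))
df = pandas.concat(transform_chunks(parts, lambda c: c * 2),
                   ignore_index=True)
assert list(df['a']) == [0, 2, 4]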
from odo import chunks
from blaze import discover, into, compute, symbol
from datashape.predicates import iscollection

L = [1, 2, 3, 4, 5, 6]
cL = chunks(list)([[1., 2., 3.], [4., 5., 6.]])
s = symbol('s', discover(cL))


def test_chunks_compute():
    exprs = [s, s + 1, s.max(), s.mean() + 1, s.head()]
    for e in exprs:
        result = compute(e, {s: cL})
        expected = compute(e, {s: L})
        if iscollection(e.dshape):
            result = into(list, result)
            expected = into(list, expected)
        assert result == expected


def test_chunks_head():
    assert compute(s.head(2), cL) == (1., 2.)


def test_pmap_default():
    flag = [0]

    def mymap(func, seq):
        flag[0] = True
        return map(func, seq)
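# The idea behind chunked reductions like s.mean() above (sketch): compute
# a small summary per chunk, then combine the summaries, so no chunk list
# is ever concatenated into one in-memory sequence:
chunk_data = [[1., 2., 3.], [4., 5., 6.]]
total = sum(sum(c) for c in chunk_data)
count = sum(len(c) for c in chunk_data)
assert total / count == 3.5  # matches s.mean() over cL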
def pre_compute(expr, data, **kwargs):
    return into(chunks(pd.DataFrame), data, **kwargs)
@convert.register(float, Array, cost=10.0)
def dask_to_float(x, **kwargs):
    return x.compute()


@append.register(tuple(arrays), Array)
def store_Array_in_ooc_data(out, arr, inplace=False, **kwargs):
    if not inplace:
        # Resize output dataset to accept new data
        assert out.shape[1:] == arr.shape[1:]
        resize(out, out.shape[0] + arr.shape[0])  # elongate
    arr.store(out)
    return out


@convert.register(Iterator, Bag)
def bag_to_iterator(x, **kwargs):
    return iter(x)


# Distinct names below: dispatch goes through the registration decorator,
# but reusing one name would shadow the earlier definitions in the module.
@convert.register(Bag, chunks(TextFile))
def chunked_textfiles_to_bag(x, **kwargs):
    return db.from_filenames([tf.path for tf in x])


@convert.register(Bag, list)
def list_to_bag(x, **kwargs):
    return db.from_sequence(x, **filter_kwargs(db.from_sequence, kwargs))
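# Quick check of the Bag round trip these registrations build on (sketch;
# assumes dask is installed):
import dask.bag as db

b = db.from_sequence([1, 2, 3], npartitions=2)
assert b.compute() == [1, 2, 3]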
from __future__ import absolute_import, division, print_function

from odo.chunks import *
from toolz import first

CL = chunks(list)


def test_chunks_basics():
    assert isinstance(CL, type)
    assert issubclass(CL, Chunks)


def test_chunks_isnt_consumable():
    cl = CL([[1, 2, 3], [4, 5, 6]])
    assert next(iter(cl)) == [1, 2, 3]
    assert next(iter(cl)) == [1, 2, 3]


def test_chunks_is_memoized():
    assert chunks(list) is chunks(list)


def test_callables():
    cl = CL(lambda: (list(range(3)) for i in range(3)))
    assert first(cl) == [0, 1, 2]
    assert first(cl) == [0, 1, 2]
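# Why the callable form stays re-iterable (sketch): Chunks.__iter__ calls a
# stored callable afresh on every pass, so each iteration gets a new
# generator, whereas a bare generator is exhausted after one pass:
gen = (i for i in range(3))
assert list(gen) == [0, 1, 2]
assert list(gen) == []  # one-shot

cl = CL(lambda: (i for i in range(3)))
assert list(cl) == [0, 1, 2]
assert list(cl) == [0, 1, 2]  # fresh generator each time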