def test_chunks():
    """Splitting ``a`` with chunksize=2 yields two pieces that round-trip."""
    chunked = convert(chunks(np.ndarray), a, chunksize=2)
    assert isinstance(chunked, chunks(np.ndarray))
    pieces = list(chunked)
    assert len(pieces) == 2
    assert eq(pieces[1], [3, 4])
    # Converting the chunked form back recovers the full array.
    assert eq(convert(np.ndarray, chunked), a[:])
def test_chunks_numpy_pandas():
    """Chunked record arrays convert to chunked DataFrames and back."""
    x = np.array([('Alice', 100), ('Bob', 200)],
                 dtype=[('name', 'S7'), ('amount', 'i4')])
    chunked = chunks(np.ndarray)([x, x])

    as_frames = convert(chunks(pd.DataFrame), chunked)
    assert isinstance(as_frames, chunks(pd.DataFrame))
    assert all(isinstance(piece, pd.DataFrame) for piece in as_frames)

    back_to_arrays = convert(chunks(np.ndarray), as_frames)
    assert isinstance(back_to_arrays, chunks(np.ndarray))
    assert all(isinstance(piece, np.ndarray) for piece in back_to_arrays)
def pre_compute(expr, data, comfortable_memory=None, chunksize=2**18, **kwargs):
    """Prepare a CSV-backed source for computation.

    Loads ``data.path`` into pandas, chunking the read when the file is
    larger than ``comfortable_memory`` bytes, and pushes a trailing
    projection/field selection down into the read via ``usecols`` so only
    the required columns are parsed.

    Parameters
    ----------
    expr :
        Expression to be computed against ``data``.
    data :
        CSV-like source exposing a ``.path`` attribute.
    comfortable_memory : number, optional
        Size threshold in bytes; defaults to ``min(1e9,
        available_memory() / 4)``.
    chunksize : int, optional
        Rows per chunk when a chunked read is used (default ``2**18``).
    **kwargs :
        Accepted but ignored.  The original implementation rebound the
        local name ``kwargs`` to an empty dict, silently discarding any
        caller-supplied options; that behavior is preserved here but made
        explicit with a separately named dict.

    Returns
    -------
    pd.DataFrame or chunks(pd.DataFrame)
        Chunked when the file exceeds the memory threshold.
    """
    comfortable_memory = comfortable_memory or min(1e9, available_memory() / 4)

    # Options actually forwarded to the reader.  Built from scratch: the
    # original shadowed the ``kwargs`` parameter here, dropping incoming
    # keyword arguments — intentional or not, callers' kwargs never reach
    # ``into``; renaming avoids the confusing shadow without changing that.
    read_kwargs = {}

    # Chunk if the file is large
    if os.path.getsize(data.path) > comfortable_memory:
        read_kwargs['chunksize'] = chunksize
    else:
        chunksize = None

    # Insert projection into read_csv: if the optimized expression ends in
    # a Projection/Field, restrict parsing to just those columns.
    oexpr = optimize(expr, data)
    leaf = oexpr._leaves()[0]
    pth = list(path(oexpr, leaf))
    if len(pth) >= 2 and isinstance(pth[-2], (Projection, Field)):
        read_kwargs['usecols'] = pth[-2].fields

    if chunksize:
        return into(chunks(pd.DataFrame), data, dshape=leaf.dshape,
                    **read_kwargs)
    else:
        return into(pd.DataFrame, data, dshape=leaf.dshape, **read_kwargs)
def test_iterator_and_numpy_chunks():
    """An iterator chunks into numpy arrays and converts back to a list."""
    chunked = iterator_to_numpy_chunks([1, 2, 3], chunksize=2)
    assert isinstance(chunked, chunks(np.ndarray))
    assert all(isinstance(piece, np.ndarray) for piece in chunked)

    # Rebuild: the iterator-backed chunks were consumed by the loop above.
    chunked = iterator_to_numpy_chunks([1, 2, 3], chunksize=2)
    recovered = convert(list, chunked)
    assert recovered == [1, 2, 3]
def test_pandas_and_chunks_pandas():
    """A DataFrame splits into two chunks and reassembles unchanged."""
    frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1., 2., 3., 4.]})
    chunked = dataframe_to_chunks_dataframe(frame, chunksize=2)
    assert isinstance(chunked, chunks(pd.DataFrame))
    assert len(list(chunked)) == 2
    rebuilt = chunks_dataframe_to_dataframe(chunked)
    # String rendering equality, matching the original comparison.
    assert str(rebuilt) == str(frame)
def test_append_chunks():
    """Appending two chunked copies of ``df`` triples the row count."""
    with file(df) as (fn, f, dset):
        append(dset, chunks(pd.DataFrame)([df, df]))
        expected_rows = len(df) * 3
        assert discover(dset).shape[0] == expected_rows
def test_chunks_of_lists_and_iterators():
    """Chunked lists flatten to lists, iterators, and chunked iterators."""
    data = [1, 2], [3, 4]
    chunked = chunks(list)(data)
    assert convert(list, chunked) == [1, 2, 3, 4]
    assert list(convert(Iterator, chunked)) == [1, 2, 3, 4]
    iterator_chunks = list(convert(chunks(Iterator), chunked))
    assert len(iterator_chunks) == 2
def test_numpy_to_chunks_numpy():
    """100 elements at chunksize=10 give 10 chunks; first matches x[:10]."""
    arr = np.arange(100)
    chunked = numpy_to_chunks_numpy(arr, chunksize=10)
    assert isinstance(chunked, chunks(np.ndarray))
    pieces = list(chunked)
    assert len(pieces) == 10
    assert eq(pieces[0], arr[:10])
def test_convert_chunks():
    """Converting the dataset to chunked DataFrames round-trips to ``df``."""
    with file(df) as (fn, f, dset):
        # Floor division: ``len(df) / 2`` is a float under Python 3 true
        # division, and the chunksize must be integral.
        c = convert(chunks(pd.DataFrame), dset, chunksize=len(df) // 2)
        assert len(list(c)) == 2
        assert eq(convert(pd.DataFrame, c), df)
def test_pre_compute_on_large_csv_gives_chunked_reader():
    """With a tiny memory budget, pre_compute returns a chunked reader."""
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s.species, csv, comfortable_memory=10)
    chunked_types = (chunks(pd.DataFrame), pd.io.parsers.TextFileReader)
    assert isinstance(result, chunked_types)
def test_append_chunks():
    """Appending two chunked copies of ``x`` triples the dataset length."""
    with file(x) as (fn, f, dset):
        append(dset, chunks(np.ndarray)([x, x]))
        expected = len(x) * 3
        assert len(dset) == expected
def test_append_chunks():
    """Appending two chunked copies of ``x`` to a carray triples its length."""
    arr = carray(x)
    append(arr, chunks(np.ndarray)([x, x]))
    expected = len(x) * 3
    assert len(arr) == expected
def test_chunks():
    """The dataset converts to chunked DataFrames and back, matching ``df``."""
    with file(df) as (fn, f, dset):
        chunked = convert(chunks(pd.DataFrame), dset)
        assert eq(convert(np.ndarray, chunked), df)
def test_chunks():
    """The dataset converts to chunked ndarrays and back, matching ``x``."""
    with file(x) as (fn, f, dset):
        chunked = convert(chunks(np.ndarray), dset)
        assert eq(convert(np.ndarray, chunked), x)