def test_nonzero():
    """Check that ``nonzero`` of a 2-d tensor returns two outputs that can be tiled."""
    source = tensor([[1, 0, 0], [0, 2, 0], [1, 1, 0]], chunk_size=2)
    indices = nonzero(source)

    assert len(indices) == 2
    # Tiling the first output only has to complete without raising.
    tile(indices[0])
def test_cum_reduction():
    """Check shape/dtype of tiled ``cumsum``/``cumprod`` and out-of-range axes."""

    def tiled_cumsum(x, *args, **kwargs):
        return tile(x.cumsum(*args, **kwargs))

    def tiled_cumprod(x, *args, **kwargs):
        return tile(x.cumprod(*args, **kwargs))

    # 2-d input, accumulate along the first axis.
    sums = tiled_cumsum(ones((10, 8), chunk_size=3), axis=0)
    prods = tiled_cumprod(ones((10, 8), chunk_size=3), axis=0)
    assert sums.shape == (10, 8)
    assert sums.dtype is not None
    assert prods.shape == (10, 8)
    assert prods.dtype is not None

    # 3-d input, same axis addressed positively and negatively.
    for axis in (1, -2):
        sums = tiled_cumsum(ones((10, 8, 8), chunk_size=3), axis=axis)
        prods = tiled_cumprod(ones((10, 8, 8), chunk_size=3), axis=axis)
        assert sums.shape == (10, 8, 8)
        assert prods.shape == (10, 8, 8)

    # Axes outside the valid range of a 2-d tensor are rejected.
    for bad_axis in (2, -3):
        with pytest.raises(np.AxisError):
            tiled_cumsum(ones((10, 8), chunk_size=3), axis=bad_axis)
def test_append():
    """Check metadata and chunk layout of ``append`` with and without ``ignore_index``."""
    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    expected_nsplits = ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))

    # Plain append.
    left = from_pandas(df1, chunk_size=3)
    right = from_pandas(df2, chunk_size=3)
    appended = left.append(right)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.Int64Index)

    tiled = tile(appended)
    assert tiled.nsplits == expected_nsplits
    assert tiled.chunk_shape == (8, 2)
    # Chunks are expected in row-major order over an 8 x 2 grid.
    for position, chunk in enumerate(tiled.chunks):
        assert chunk.index == divmod(position, 2)

    # Append with ignore_index=True.
    left = from_pandas(df1, chunk_size=3)
    right = from_pandas(df2, chunk_size=3)
    appended = left.append(right, ignore_index=True)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(appended.index_value.to_pandas(),
                                  pd.RangeIndex(20))

    tiled = tile(appended)
    assert tiled.nsplits == expected_nsplits
    assert tiled.chunk_shape == (8, 2)
    assert isinstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
def test_head_tail_optimize():
    """Enumerate cases where ``_need_tile_head_tail`` must report ``False``."""
    need_tile = HeadTailOptimizedOperandMixin._need_tile_head_tail
    raw = pd.DataFrame(np.random.rand(4, 3))

    df = md.DataFrame(raw, chunk_size=2)

    # no nan chunk shape
    assert need_tile(tile(df).head(2).op) is False

    filtered = tile(df[df[0] < 0.5])
    # chunk shape on axis 1 greater than 1
    assert need_tile(filtered.head(2).op) is False

    df = md.DataFrame(raw, chunk_size=(2, 3))
    filtered = tile(df[df[0] < 0.5])
    # not slice
    assert need_tile(filtered.iloc[2].op) is False
    # step not None
    assert need_tile(filtered.iloc[:2:2].op) is False
    # not head or tail
    assert need_tile(filtered.iloc[1:3].op) is False
    # slice 1 is not slice(None)
    assert need_tile(filtered.iloc[:3, :2].op) is False
def test_drop_duplicates():
    """Check argument validation and automatic method selection of ``drop_duplicates``."""

    def chosen_method(tileable):
        # Method recorded on the first chunk's operand after tiling.
        return tile(tileable).chunks[0].op.method

    rs = np.random.RandomState(0)
    column_names = ['c' + str(i + 1) for i in range(7)]
    raw = pd.DataFrame(rs.randint(1000, size=(20, 7)), columns=column_names)
    raw['c7'] = [f's{j}' for j in range(20)]

    df = from_pandas_df(raw, chunk_size=10)
    with pytest.raises(ValueError):
        df.drop_duplicates(method='unknown')
    with pytest.raises(KeyError):
        df.drop_duplicates(subset='c8')

    # test auto method selection
    assert chosen_method(df.drop_duplicates()) == 'tree'
    # subset size less than chunk_store_limit
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'subset_tree'
    with option_context({'chunk_store_limit': 5}):
        # subset size greater than chunk_store_limit
        assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'tree'
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c7'])) == 'tree'
    assert chosen_method(df['c7'].drop_duplicates()) == 'tree'

    s = df['c7']
    with pytest.raises(ValueError):
        s.drop_duplicates(method='unknown')
def test_beta_inc():
    """Check ``betainc`` metadata and chunk layout, both with and without ``out=``.

    The result's shape/dtype are compared with ``scipy_betainc`` evaluated on
    the raw arrays, and every tiled chunk must carry a ``TensorBetaInc``
    operand aligned with its first input chunk.
    """
    raw1 = np.random.rand(4, 3, 2)
    raw2 = np.random.rand(4, 3, 2)
    raw3 = np.random.rand(4, 3, 2)
    a = tensor(raw1, chunk_size=3)
    b = tensor(raw2, chunk_size=3)
    c = tensor(raw3, chunk_size=3)

    r = betainc(a, b, c)
    expect = scipy_betainc(raw1, raw2, raw3)

    assert r.shape == raw1.shape
    assert r.dtype == expect.dtype

    tiled_a, r = tile(a, r)

    assert r.nsplits == tiled_a.nsplits
    for chunk in r.chunks:
        assert isinstance(chunk.op, TensorBetaInc)
        assert chunk.index == chunk.inputs[0].index
        assert chunk.shape == chunk.inputs[0].shape

    # Same computation, but written into ``a`` through ``out=``.
    betainc(a, b, c, out=a)
    expect = scipy_betainc(raw1, raw2, raw3)

    assert a.shape == raw1.shape
    assert a.dtype == expect.dtype

    b, tiled_a = tile(b, a)

    assert tiled_a.nsplits == b.nsplits
    # Inspect the chunks of the ``out=`` result.  The previous version looped
    # over ``r.chunks`` again, so the ``out=`` path was never checked, and its
    # loop variable shadowed the tensor ``c``.
    for chunk in tiled_a.chunks:
        assert isinstance(chunk.op, TensorBetaInc)
        assert chunk.index == chunk.inputs[0].index
        assert chunk.shape == chunk.inputs[0].shape
def test_imread():
    """Check ``imread`` metadata and tiling for one file and for a glob of files.

    Ten random 50x50 RGB PNG images are written to a temporary directory,
    then read back either individually or through a ``random_*.png`` pattern.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        for i in range(10):
            # Named ``pixels`` so it does not shadow the ``array`` callable
            # used elsewhere in this file; the unused ``raws`` accumulator
            # of the previous version has been dropped.
            pixels = np.random.randint(
                0, 256, 2500 * 3, dtype=np.uint8).reshape((50, 50, 3))
            im = Image.fromarray(pixels)
            im.save(os.path.join(tempdir, f'random_{i}.png'))

        # Single image.
        t = imread(os.path.join(tempdir, 'random_0.png'))
        assert t.shape == (50, 50, 3)
        assert t.dtype == np.dtype('uint8')

        tiled = tile(t)
        assert len(tiled.chunks) == 1
        assert tiled.chunks[0].shape == (50, 50, 3)
        assert tiled.chunks[0].dtype == np.dtype('uint8')

        # All ten images through a glob pattern, three images per chunk.
        t = imread(os.path.join(tempdir, 'random_*.png'), chunk_size=3)
        assert t.shape == (10, 50, 50, 3)

        tiled = tile(t)
        assert len(tiled.chunks) == 4
        assert tiled.nsplits == ((3, 3, 3, 1), (50, ), (50, ), (3, ))
        assert tiled.chunks[0].dtype == np.dtype('uint8')
        # The chunk count is asserted above, so ``zip`` cannot silently
        # skip a chunk here.
        for i, (chunk, n_images) in enumerate(zip(tiled.chunks, (3, 3, 3, 1))):
            assert chunk.index == (i, 0, 0, 0)
            assert chunk.shape == (n_images, 50, 50, 3)
def test_groupby():
    """Check ``groupby`` on a DataFrame (by column) and on a Series (by callable)."""
    raw_df = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
    })
    mdf = md.DataFrame(raw_df, chunk_size=2)

    # Unknown keys are rejected, alone or mixed with a valid key.
    for bad_key in ('c2', ['b', 'c2']):
        with pytest.raises(KeyError):
            mdf.groupby(bad_key)

    grouped = mdf.groupby('b')
    assert isinstance(grouped, DataFrameGroupBy)
    assert isinstance(grouped.op, DataFrameGroupByOperand)
    assert list(grouped.key_dtypes.index) == ['b']

    grouped = tile(grouped)
    assert len(grouped.chunks) == 5
    for chunk in grouped.chunks:
        assert isinstance(chunk.op, DataFrameGroupByOperand)

    raw_series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
    ms = md.Series(raw_series, chunk_size=3)
    grouped = ms.groupby(lambda x: x + 1)

    assert isinstance(grouped, SeriesGroupBy)
    assert isinstance(grouped.op, DataFrameGroupByOperand)

    grouped = tile(grouped)
    assert len(grouped.chunks) == 3
    for chunk in grouped.chunks:
        assert isinstance(chunk.op, DataFrameGroupByOperand)

    # ``as_index=False`` is not accepted for a Series groupby.
    with pytest.raises(TypeError):
        ms.groupby(lambda x: x + 1, as_index=False)
def test_divide():
    """Check which operand combinations keep a division result sparse."""

    def check(result, expected_type, sparse):
        assert result.issparse() is sparse
        assert type(result) is expected_type
        tiled = tile(result)
        assert tiled.chunks[0].op.sparse is sparse

    t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()

    # sparse / scalar
    check(t1 / 10, SparseTensor, True)

    # sparse / sparse
    t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse()
    check(t1 / t2, Tensor, False)

    # sparse / dense and dense / sparse
    t3 = tensor([1, 1, 1], chunk_size=2)
    check(t1 / t3, Tensor, False)
    check(t3 / t1, Tensor, False)
def test_multiply():
    """Check that multiplying a sparse tensor keeps the result sparse."""

    def check_sparse(result):
        assert result.issparse() is True
        assert type(result) is SparseTensor
        tiled = tile(result)
        assert tiled.chunks[0].op.sparse is True

    t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()

    # sparse * scalar
    check_sparse(t1 * 10)

    # sparse * sparse
    t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse()
    check_sparse(t1 * t2)

    # sparse * dense
    t3 = tensor([1, 1, 1], chunk_size=2)
    check_sparse(t1 * t3)
def test_isin():
    """Check ``isin`` metadata and tiling for list and tensor ``test_elements``."""
    # test_elements given as a plain list
    element = 2 * arange(4, chunk_size=1).reshape(2, 2)
    mask = isin(element, [1, 2, 4, 8])
    assert mask.shape == (2, 2)
    assert mask.dtype == np.bool_

    mask, element = tile(mask, element)

    assert len(mask.chunks) == len(element.chunks)
    assert len(mask.op.test_elements.chunks) == 1
    assert mask.chunks[0].inputs[0] is element.chunks[0].data

    # test_elements given as a multi-chunk tensor, with invert=True
    element = 2 * arange(4, chunk_size=1).reshape(2, 2)
    candidates = tensor([1, 2, 4, 8], chunk_size=2)
    mask = isin(element, candidates, invert=True)
    assert mask.shape == (2, 2)
    assert mask.dtype == np.bool_

    mask, element = tile(mask, element)

    assert len(mask.chunks) == len(element.chunks)
    assert len(mask.op.test_elements.chunks) == 1
    assert mask.chunks[0].inputs[0] is element.chunks[0].data
    assert mask.chunks[0].op.invert is True
def test_hermitian_fft():
    """Check ``hfft``/``ihfft`` output shapes against NumPy, before and after tiling."""

    def check(result, reference):
        # Shape must match NumPy's and agree with the tiled chunk splits.
        assert result.shape == reference.shape
        tiled = tile(result)
        assert tiled.shape == tuple(sum(ns) for ns in tiled.nsplits)

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(hfft(t), np.fft.hfft(np.ones(t.shape)))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(hfft(t, n=100), np.fft.hfft(np.ones(t.shape), n=100))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(ihfft(t), np.fft.ihfft(np.ones(t.shape)))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(ihfft(t, n=100), np.fft.ihfft(np.ones(t.shape), n=100))

    # Odd ``n`` on the same input tensor as the previous case.
    check(ihfft(t, n=101), np.fft.ihfft(np.ones(t.shape), n=101))
def test_histogram_bin_edges():
    """Check argument validation of ``histogram_bin_edges`` and one valid call."""
    a = array([0, 0, 0, 1, 2, 3, 3, 4, 5], chunk_size=3)

    invalid_calls = [
        (ValueError, dict(bins='unknown')),
        # bins is str, weights cannot be provided
        (TypeError, dict(bins='scott', weights=a)),
        (ValueError, dict(bins=-1)),
        # not asc
        (ValueError, dict(bins=[3, 2, 1])),
        # bins cannot be 2d
        (ValueError, dict(bins=np.random.rand(2, 3))),
        (ValueError, dict(range=(5, 0))),
        (ValueError, dict(range=(np.nan, np.nan))),
    ]
    for expected_error, kwargs in invalid_calls:
        with pytest.raises(expected_error):
            histogram_bin_edges(a, **kwargs)

    edges = histogram_bin_edges(a, bins=3, range=(0, 5))
    # if range specified, no error will occur
    tile(edges)
def test_random():
    """Check dtype, shape and chunk count of ``rand`` and ``beta`` tensors."""
    arr = rand(2, 3)
    assert arr.dtype is not None

    f8 = np.dtype('f8')

    # Scalar parameters.
    arr = tile(beta(1, 2, chunk_size=2))
    assert arr.shape == ()
    assert len(arr.chunks) == 1
    assert arr.chunks[0].shape == ()
    assert arr.chunks[0].op.dtype == f8

    # List parameters.
    arr = tile(beta([1, 2], [3, 4], chunk_size=2))
    assert arr.shape == (2, )
    assert len(arr.chunks) == 1
    assert arr.chunks[0].shape == (2, )
    assert arr.chunks[0].op.dtype == f8

    # Mixed list / tensor parameters with an explicit output size.
    second_param = from_ndarray([[4, 6], [5, 2]], chunk_size=2)
    arr = tile(beta([[2, 3]], second_param, chunk_size=1, size=(3, 2, 2)))
    assert arr.shape == (3, 2, 2)
    assert len(arr.chunks) == 12
    assert arr.chunks[0].op.dtype == f8
def test_reshape():
    """Check ``reshape`` (method and ``shape`` setter), including unknown shapes.

    After tiling, the chunk splits along each axis must sum to the requested
    shape.  For an input whose length is unknown, a ``-1`` dimension stays NaN.
    """
    a = ones((10, 20, 30), chunk_size=5)
    b = a.reshape(10, 600)
    b = tile(b)

    assert tuple(sum(s) for s in b.nsplits) == (10, 600)

    a = ones((10, 600), chunk_size=5)
    b = a.reshape(10, 30, 20)
    b = tile(b)

    assert tuple(sum(s) for s in b.nsplits) == (10, 30, 20)

    # Reshape in place through the ``shape`` attribute.
    a = ones((10, 600), chunk_size=5)
    a.shape = [10, 30, 20]
    a = tile(a)

    assert tuple(sum(s) for s in a.nsplits) == (10, 30, 20)

    # test reshape unknown shape
    c = a[a > 0]
    d = c.reshape(10, 600)
    assert d.shape == (10, 600)
    d = c.reshape(-1, 10)
    assert len(d.shape) == 2
    assert np.isnan(d.shape[0])
    # Previously ``assert d.shape[1]``, which passes for any non-zero value.
    assert d.shape[1] == 10

    with pytest.raises(TypeError):
        a.reshape((10, 30, 20), other_argument=True)
def test_nunique():
    """Check ``DataFrame.nunique`` metadata and tiling along both axes."""
    data = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                        columns=['c' + str(i) for i in range(10)])

    # Default axis: one value per column.
    df = from_pandas_df(data, chunk_size=3)
    result = df.nunique()

    assert result.shape == (10,)
    assert result.op.output_types[0] == OutputType.series
    assert isinstance(result.op, DataFrameNunique)

    tiled = tile(result)
    assert tiled.shape == (10,)
    assert len(tiled.chunks) == 4
    assert tiled.nsplits == ((3, 3, 3, 1),)
    first_op = tiled.chunks[0].op
    assert first_op.stage == OperandStage.agg
    assert isinstance(first_op, DataFrameAggregate)

    # axis=1: one value per row.
    data2 = data.copy()
    df2 = from_pandas_df(data2, chunk_size=3)
    result2 = df2.nunique(axis=1)

    assert result2.shape == (20,)
    assert result2.op.output_types[0] == OutputType.series
    assert isinstance(result2.op, DataFrameNunique)

    tiled = tile(result2)
    assert tiled.shape == (20,)
    assert len(tiled.chunks) == 7
    assert tiled.nsplits == ((3, 3, 3, 3, 3, 3, 2),)
    first_op = tiled.chunks[0].op
    assert first_op.stage == OperandStage.agg
    assert isinstance(first_op, DataFrameAggregate)
def test_series_reduction(func_name, op, func_opts: FunctionOptions):
    """Check that a Series reduction yields a scalar tensor tiled into one aggregate chunk.

    ``func_name`` is the reduction method to call, ``op`` the operand class
    expected on the untiled result, and ``func_opts.has_skipna`` tells whether
    the method accepts ``axis``/``skipna`` keywords.
    """

    def check_tiled(reduced, n_concat_inputs):
        tiled = tile(reduced)
        assert len(tiled.chunks) == 1
        only_chunk = tiled.chunks[0]
        assert isinstance(only_chunk.op, DataFrameAggregate)
        assert isinstance(only_chunk.inputs[0].op, DataFrameConcat)
        assert len(only_chunk.inputs[0].inputs) == n_concat_inputs

    data = pd.Series(range(20), index=[str(i) for i in range(20)])
    series = getattr(from_pandas_series(data, chunk_size=3), func_name)()

    assert isinstance(series, Tensor)
    assert isinstance(series.op, op)
    assert series.shape == ()

    check_tiled(series, 2)

    data = pd.Series(np.random.rand(25), name='a')
    kwargs = dict(axis='index', skipna=False) if func_opts.has_skipna else dict()
    series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs)

    assert isinstance(series, Tensor)
    assert series.shape == ()

    check_tiled(series, 4)
def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and map/combine chunk structure of cumulative DataFrame reductions.

    ``func_name`` is the cumulative method to call and ``op`` the operand
    class expected on the tiled chunks.
    """

    def check_last_chunk(tiled):
        # The last chunk combines seven inputs; its last input is a map-stage op.
        last_inputs = tiled.chunks[-1].inputs
        assert isinstance(last_inputs[-1].op, op)
        assert last_inputs[-1].op.stage == OperandStage.map
        assert len(last_inputs) == 7

    data = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                        index=[str(i) for i in range(20)])
    reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

    assert isinstance(reduction_df, DataFrame)
    assert isinstance(reduction_df.index_value._index_value, IndexValue.Index)
    assert reduction_df.shape == (20, 2)

    reduction_df = tile(reduction_df)
    assert len(reduction_df.chunks) == 7
    assert isinstance(reduction_df.chunks[0].op, op)
    assert reduction_df.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(reduction_df)

    data = pd.DataFrame(np.random.rand(20, 10))
    reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

    assert isinstance(reduction_df, DataFrame)
    assert isinstance(reduction_df.index_value._index_value, IndexValue.RangeIndex)
    assert reduction_df.shape == (20, 10)

    reduction_df = tile(reduction_df)
    assert len(reduction_df.chunks) == 28
    assert reduction_df.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1))
    assert reduction_df.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(reduction_df)
def test_cum_series_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and map/combine chunk structure of cumulative Series reductions.

    ``func_name`` is the cumulative method to call, ``op`` the operand class
    expected on the tiled chunks, and ``func_opts.has_skipna`` tells whether
    the method accepts ``axis``/``skipna`` keywords.
    """
    # Build the series from the values directly.  The previous
    # ``pd.Series({'a': [...]}, index=[...])`` aligned the dict key 'a'
    # against the string index, giving a series made only of NaN.
    data = pd.Series(range(20), index=[str(i) for i in range(20)])
    series = getattr(from_pandas_series(data, chunk_size=3), func_name)()

    assert isinstance(series, Series)
    assert series.shape == (20,)

    series = tile(series)
    assert len(series.chunks) == 7
    assert isinstance(series.chunks[0].op, op)
    assert series.chunks[0].op.stage == OperandStage.combine
    assert isinstance(series.chunks[-1].inputs[-1].op, op)
    assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map
    assert len(series.chunks[-1].inputs) == 7

    data = pd.Series(np.random.rand(25), name='a')
    kwargs = dict(axis='index', skipna=False) if func_opts.has_skipna else dict()
    series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs)

    assert isinstance(series, Series)
    assert series.shape == (25,)

    series = tile(series)
    assert len(series.chunks) == 4
    assert isinstance(series.chunks[0].op, op)
    assert series.chunks[0].op.stage == OperandStage.combine
    assert isinstance(series.chunks[-1].inputs[-1].op, op)
    assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map
    assert len(series.chunks[-1].inputs) == 4
def test_broadcast_to():
    """Check shapes and chunking of ``broadcast_to`` and its rejection of bad targets."""
    # Prepend a new leading axis.
    arr = ones((10, 5), chunk_size=2)
    arr2 = broadcast_to(arr, (20, 10, 5))
    arr, arr2 = tile(arr, arr2)

    assert arr2.shape == (20, 10, 5)
    assert len(arr2.chunks) == len(arr.chunks)
    assert arr2.chunks[0].shape == (20, 2, 2)

    # Prepend an axis and expand a trailing length-1 axis.
    arr = ones((10, 5, 1), chunk_size=2)
    arr3 = broadcast_to(arr, (5, 10, 5, 6))
    arr, arr3 = tile(arr, arr3)

    assert arr3.shape == (5, 10, 5, 6)
    assert len(arr3.chunks) == len(arr.chunks)
    assert arr3.nsplits == ((5,), (2, 2, 2, 2, 2), (2, 2, 1), (6,))
    assert arr3.chunks[0].shape == (5, 2, 2, 6)

    arr = ones((10, 1), chunk_size=2)
    arr4 = broadcast_to(arr, (20, 10, 5))
    arr, arr4 = tile(arr, arr4)

    assert arr4.shape == (20, 10, 5)
    assert len(arr4.chunks) == len(arr.chunks)
    assert arr4.chunks[0].shape == (20, 2, 5)

    # Target shapes the (10, 1) tensor cannot be broadcast to.
    for bad_shape in ((10,), (5, 1)):
        with pytest.raises(ValueError):
            broadcast_to(arr, bad_shape)

    # Boolean-indexed source tensor.
    arr = ones((4, 5), chunk_size=2)
    with pytest.raises(ValueError):
        broadcast_to(arr[arr < 2], (3, 20))
def test_setitem_structured():
    """Check that values are properly broadcast for ``setitem`` on record-dtype arrays."""
    nested_type = np.dtype([('a', np.int16), ('b', np.int64)])
    rec_type = np.dtype([('a', np.int32),
                         ('b', np.double),
                         ('c', nested_type)])

    t = ones((4, 5), dtype=rec_type, chunk_size=3)

    # assign tuple to record
    t[1:4, 1] = (3, 4., (5, 6))
    tt = tile(t)
    assert tt.cix[0, 0].op.value == (3, 4., (5, 6))

    # assign scalar to record
    t[1:4, 2] = 8
    tt = tile(t)
    assert tt.cix[0, 0].op.value == 8

    # assign scalar array to record array with broadcast
    t[1:3] = np.arange(5)
    tt = tile(t)
    slices_op = tt.cix[0, 0].op.value.op
    assert slices_op.slices == [slice(None, None, None), slice(None, 3, None)]
    broadcast_op = slices_op.inputs[0].op.inputs[0].op
    assert isinstance(broadcast_op, TensorBroadcastTo)
    assert broadcast_op.shape == (2, 5)
    np.testing.assert_array_equal(broadcast_op.inputs[0].op.data, np.arange(5))

    # assign scalar array to record array of same shape, no broadcast
    same_shape_value = np.arange(10).reshape(2, 5)
    t[2:4] = same_shape_value
    tt = tile(t)
    slices_op = tt.cix[0, 0].op.value.op
    assert slices_op.slices == [slice(None, 1, None), slice(None, 3, None)]
    np.testing.assert_array_equal(slices_op.inputs[0].op.inputs[0].op.data,
                                  np.arange(10).reshape(2, 5))
def test_index_tricks():
    """Check that dense and sparse ``nd_grid`` results can be tiled."""
    dense_grid = nd_grid()
    g = dense_grid[0:5, 0:5]
    tile(g)  # tileable means no loop exists

    sparse_grid = nd_grid(sparse=True)
    o = sparse_grid[0:5, 0:5]
    tile(*o)  # tileable means no loop exists
def test_squareform_execution(setup):
    """Execute ``distance.squareform`` both ways and compare with SciPy."""
    from scipy.spatial.distance import (
        pdist as sp_pdist,
        squareform as sp_squareform,
    )

    raw_a = np.random.rand(80, 10)
    raw_pdist = sp_pdist(raw_a)
    raw_square = sp_squareform(raw_pdist)

    # tomatrix, test 1 chunk
    vec = tensor(raw_pdist, chunk_size=raw_pdist.shape[0])
    mat = distance.squareform(vec, chunk_size=100)
    result = mat.execute().fetch()
    np.testing.assert_array_equal(result, raw_square)

    # tomatrix, test more than 1 chunk
    vec = tensor(raw_pdist, chunk_size=33)
    assert len(tile(vec).chunks) > 1
    mat = distance.squareform(vec, chunk_size=34)
    result = mat.execute().fetch()
    np.testing.assert_array_equal(result, raw_square)

    # tovec, test 1 chunk
    mat = tensor(raw_square)
    vec = distance.squareform(mat, chunk_size=raw_pdist.shape[0])
    assert len(tile(mat).chunks) == 1
    assert len(tile(vec).chunks) == 1
    result = vec.execute().fetch()
    np.testing.assert_array_equal(result, raw_pdist)

    # tovec, test more than 1 chunk
    mat = tensor(raw_square, chunk_size=31)
    vec = distance.squareform(mat, chunk_size=40)
    assert len(tile(vec).chunks) > 1
    result = vec.execute().fetch()
    np.testing.assert_array_equal(result, raw_pdist)

    # test checks
    # generate non-symmetric matrix
    non_sym_arr = np.random.RandomState(0).rand(10, 10)

    # 1 chunk
    mat = tensor(non_sym_arr)
    vec = distance.squareform(mat, checks=True, chunk_size=100)
    with pytest.raises(ValueError):
        _ = vec.execute().fetch()
    # force checks=False
    vec = distance.squareform(mat, checks=False, chunk_size=100)
    _ = vec.execute().fetch()

    # more than 1 chunk
    mat = tensor(non_sym_arr, chunk_size=6)
    vec = distance.squareform(mat, checks=True, chunk_size=8)
    assert len(tile(vec).chunks) > 1
    with pytest.raises(ValueError):
        _ = vec.execute().fetch()
    # force checks=False
    vec = distance.squareform(mat, checks=False, chunk_size=100)
    _ = vec.execute().fetch()
def test_sort():
    """Check operand types produced by tiling ``sort`` and its argument validation."""
    # Chunks span the whole sort axis.
    a = tensor(np.random.rand(10, 10), chunk_size=(5, 10))

    sa = sort(a)
    assert type(sa.op).__name__ == 'TensorSort'

    sa = tile(sa)

    assert len(sa.chunks) == 2
    for c in sa.chunks:
        assert type(c.op).__name__ == 'TensorSort'
        assert type(c.inputs[0].op).__name__ == 'ArrayDataSource'

    # Several chunks along the sort axis.
    a = tensor(np.random.rand(100), chunk_size=10)

    sa = sort(a)
    assert type(sa.op).__name__ == 'TensorSort'

    sa = tile(sa)

    for c in sa.chunks:
        assert type(c.op).__name__ == 'PSRSShuffle'
        assert c.op.stage == OperandStage.reduce
        # NOTE(review): tuple equality short-circuits on identity, so this
        # presumably relies on the shape holding the ``np.nan`` object itself.
        assert c.shape == (np.nan, )

    # Structured dtype: the default order is the list of field names.
    record_dtype = [('id', np.int32), ('size', np.int64)]
    a = tensor(np.empty((10, 10), dtype=record_dtype), chunk_size=(10, 5))
    sa = sort(a)
    assert sa.op.order == ['id', 'size']

    with pytest.raises(ValueError):
        sort(a, order=['unknown_field'])

    # Invalid arguments on a plain 1-d ndarray.
    invalid_calls = [
        (np.AxisError, dict(axis=1)),
        (ValueError, dict(kind='non_valid_kind')),
        (ValueError, dict(parallel_kind='non_valid_parallel_kind')),
        (TypeError, dict(psrs_kinds='non_valid_psrs_kinds')),
        (ValueError, dict(psrs_kinds=['quicksort'] * 2)),
        (ValueError, dict(psrs_kinds=['non_valid_kind'] * 3)),
        (ValueError, dict(psrs_kinds=[None, None, None])),
        (ValueError, dict(psrs_kinds=['quicksort', 'mergesort', None])),
    ]
    for expected_error, kwargs in invalid_calls:
        with pytest.raises(expected_error):
            sort(np.random.rand(100), **kwargs)
def test_shuffle_expr():
    """Check the expression graph built by ``shuffle`` for tensors and DataFrames."""
    # Shuffle a tensor and a DataFrame together along the default axis.
    a = mt.random.rand(10, 3, chunk_size=2)
    b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2)

    shuffled_a, shuffled_b = shuffle(a, b, random_state=0)

    assert shuffled_a.op is shuffled_b.op
    assert isinstance(shuffled_a.op, LearnShuffle)
    assert shuffled_a.shape == a.shape
    assert shuffled_b.shape == b.shape
    assert b.index_value.key != shuffled_b.index_value.key

    shuffled_a, shuffled_b = tile(shuffled_a, shuffled_b)

    assert len(shuffled_a.chunks) == 10
    assert np.isnan(shuffled_a.chunks[0].shape[0])
    assert len(shuffled_b.chunks) == 15
    assert np.isnan(shuffled_b.chunks[0].shape[0])
    assert (shuffled_b.chunks[0].index_value.key
            != shuffled_b.chunks[1].index_value.key)
    assert shuffled_a.chunks[0].op.seeds == shuffled_b.chunks[0].op.seeds

    # Shuffle along two axes.
    c = mt.random.rand(10, 5, 3, chunk_size=2)
    d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5))

    shuffled_c, shuffled_d = shuffle(c, d, axes=(0, 1), random_state=0)

    assert shuffled_c.op is shuffled_d.op
    assert isinstance(shuffled_c.op, LearnShuffle)
    assert shuffled_c.shape == c.shape
    assert shuffled_d.shape == d.shape
    assert d.index_value.key != shuffled_d.index_value.key
    # Column labels are no longer in ascending order, but the dtypes match
    # the original once sorted back.
    column_labels = shuffled_d.dtypes.index
    assert not np.all(column_labels[:-1] < column_labels[1:])
    pd.testing.assert_series_equal(d.dtypes, shuffled_d.dtypes.sort_index())

    shuffled_c, shuffled_d = tile(shuffled_c, shuffled_d)

    assert len(shuffled_c.chunks) == 5 * 1 * 2
    assert np.isnan(shuffled_c.chunks[0].shape[0])
    assert len(shuffled_d.chunks) == 5
    assert np.isnan(shuffled_d.chunks[0].shape[0])
    assert shuffled_d.chunks[0].shape[1] == 5
    assert (shuffled_d.chunks[0].index_value.key
            != shuffled_d.chunks[1].index_value.key)
    pd.testing.assert_series_equal(shuffled_d.chunks[0].dtypes.sort_index(),
                                   d.dtypes)
    assert shuffled_c.chunks[0].op.seeds == shuffled_d.chunks[0].op.seeds
    assert len(shuffled_c.chunks[0].op.seeds) == 1
    assert shuffled_c.chunks[0].op.reduce_sizes == (5, )

    with pytest.raises(ValueError):
        a = mt.random.rand(10, 5)
        b = mt.random.rand(10, 4, 3)
        shuffle(a, b, axes=1)

    # Unknown keyword arguments are rejected.
    with pytest.raises(TypeError):
        shuffle(a, b, unknown_param=True)

    assert isinstance(shuffle(mt.random.rand(10, 5)), mt.Tensor)
def test_series_isin():
    """Check chunk layout of ``Series.isin`` for several chunking combinations."""
    cases = [
        # one chunk in multiple chunks
        (10, 2),
        # multiple chunk in one chunks
        (2, 4),
        # multiple chunk in multiple chunks
        (2, 2),
    ]
    for a_chunk_size, b_chunk_size in cases:
        a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                               chunk_size=a_chunk_size)
        b = from_pandas_series(pd.Series([2, 1, 9, 3]),
                               chunk_size=b_chunk_size)

        r = tile(a.isin(b))
        for i, c in enumerate(r.chunks):
            assert c.index == (i, )
            assert c.dtype == np.dtype('bool')
            assert c.shape == (a_chunk_size, )
            assert len(c.op.inputs) == 2
            assert c.op.output_types[0] == OutputType.series
            assert c.op.inputs[0].index == (i, )
            assert c.op.inputs[0].shape == (a_chunk_size, )
            # The second input is one 4-element chunk in every case; when
            # ``b`` had several chunks it has been rechunked.
            assert c.op.inputs[1].index == (0, )
            assert c.op.inputs[1].shape == (4, )

    # ``a`` is still the series of the last case here.
    with pytest.raises(TypeError):
        _ = a.isin('sth')

    with pytest.raises(TypeError):
        _ = a.to_frame().isin('sth')
def test_replace():
    """Check chunk structure of ``replace`` on DataFrames and Series."""

    def check_ffill(tiled, n_chunks, first_shape, last_input_shape):
        # Fill-method replace: combine stage on the output chunks, map stage
        # on the last input of the last chunk.
        assert len(tiled.chunks) == n_chunks
        first = tiled.chunks[0]
        assert first.shape == first_shape
        assert first.op.stage == OperandStage.combine
        assert first.op.method == 'ffill'
        assert first.op.limit is None
        last_input = tiled.chunks[-1].inputs[-1]
        assert last_input.shape == last_input_shape
        assert last_input.op.stage == OperandStage.map
        assert last_input.op.method == 'ffill'
        assert last_input.op.limit is None

    def check_value(tiled, n_chunks, first_shape):
        # Value replace: no stage set on the chunk operand.
        assert len(tiled.chunks) == n_chunks
        first = tiled.chunks[0]
        assert first.shape == first_shape
        assert first.op.stage is None
        assert first.op.limit is None

    # dataframe cases
    df_raw = pd.DataFrame(-1, index=range(20), columns=list('ABCDEFGHIJ'))
    for _ in range(30):
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)
    for _ in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for col in range(10):
            df_raw.iloc[row, col] = random.randint(0, 99)

    # not supporting fill with limit
    df = md.DataFrame(df_raw, chunk_size=4)
    with pytest.raises(NotImplementedError):
        df.replace(-1, method='ffill', limit=5)

    check_ffill(tile(df.replace(-1, method='ffill')), 15, (4, 4), (1, 2))
    check_value(tile(df.replace(-1, 99)), 15, (4, 4))

    # series cases
    series_raw = pd.Series(-1, index=range(20))
    for _ in range(10):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
    series = md.Series(series_raw, chunk_size=4)

    check_ffill(tile(series.replace(-1, method='ffill')), 5, (4, ), (1, ))
    check_value(tile(series.replace(-1, 99)), 5, (4, ))
def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and aggregate chunk structure of DataFrame reductions.

    ``func_name`` is the reduction method to call and ``op`` the operand
    class expected on the untiled result.
    """

    def check_first_chunk(tiled):
        # First chunk aggregates a concat of two inputs.
        first = tiled.chunks[0]
        assert isinstance(first.op, DataFrameAggregate)
        assert isinstance(first.inputs[0].op, DataFrameConcat)
        assert len(first.inputs[0].inputs) == 2

    # Two columns with a string index.
    data = pd.DataFrame({
        'a': list(range(20)),
        'b': list(range(20, 0, -1)),
    }, index=[str(i) for i in range(20)])
    reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

    assert isinstance(reduction_df, Series)
    assert isinstance(reduction_df.op, op)
    assert isinstance(reduction_df.index_value._index_value, IndexValue.Index)
    assert reduction_df.shape == (2, )

    reduction_df = tile(reduction_df)
    assert len(reduction_df.chunks) == 1
    check_first_chunk(reduction_df)

    # Ten columns with default labels.
    data = pd.DataFrame(np.random.rand(20, 10))
    reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()

    assert isinstance(reduction_df, Series)
    assert isinstance(reduction_df.index_value._index_value,
                      (IndexValue.RangeIndex, IndexValue.Int64Index))
    assert reduction_df.shape == (10, )

    reduction_df = tile(reduction_df)
    assert len(reduction_df.chunks) == 4
    assert reduction_df.nsplits == ((3, 3, 3, 1), )
    check_first_chunk(reduction_df)

    # Reduce along the columns axis.
    data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
    reduction_df = getattr(from_pandas_df(data, chunk_size=4), func_name)(axis='columns')

    assert reduction_df.shape == (20, )

    reduction_df = tile(reduction_df)
    assert len(reduction_df.chunks) == 5
    assert reduction_df.nsplits == ((4, ) * 5, )
    check_first_chunk(reduction_df)

    # ``level`` is not supported.
    with pytest.raises(NotImplementedError):
        getattr(from_pandas_df(data, chunk_size=3), func_name)(level=0, axis=1)
def test_fft_freq():
    """Check ``fftfreq``/``rfftfreq`` shapes against NumPy, before and after tiling."""
    pairs = (
        (fftfreq, np.fft.fftfreq),
        (rfftfreq, np.fft.rfftfreq),
    )
    for tensor_func, numpy_func in pairs:
        t = tensor_func(10, .1, chunk_size=3)
        assert t.shape == numpy_func(10, .1).shape

        t = tile(t)
        assert t.shape == tuple(sum(ns) for ns in t.nsplits)
def test_map_chunk():
    """Check dtype, shape and chunk count of ``map_chunk`` with and without ``elementwise``."""
    raw = np.random.rand(20)
    a = tensor(raw, chunk_size=10)

    # Without ``elementwise`` the output length is unknown (NaN).
    mapped = tile(a.map_chunk(lambda x: x * 0.5))
    assert np.issubdtype(mapped.dtype, np.floating) is True
    # NOTE(review): tuple equality short-circuits on identity, so this
    # presumably relies on the shape holding the ``np.nan`` object itself.
    assert mapped.shape == (np.nan, )
    assert len(mapped.chunks) == 2

    # With ``elementwise=True`` the input shape is kept.
    mapped = tile(a.map_chunk(lambda x: x * 0.5, elementwise=True))
    assert np.issubdtype(mapped.dtype, np.floating) is True
    assert mapped.shape == (20, )
    assert len(mapped.chunks) == 2