예제 #1
0
def test_nonzero():
    """Check that ``nonzero`` of a 2-d tensor gives two results and that one tiles."""
    source = tensor([[1, 0, 0], [0, 2, 0], [1, 1, 0]], chunk_size=2)
    indices = nonzero(source)

    # the 2-d input is expected to produce exactly two results
    assert len(indices) == 2

    # tiling the first result must complete without raising
    tile(indices[0])
예제 #2
0
def test_cum_reduction():
    """Check shapes of tiled ``cumsum``/``cumprod`` results and axis validation."""

    def cumsum(x, *args, **kwargs):
        # build the cumulative sum and tile it right away
        return tile(x.cumsum(*args, **kwargs))

    def cumprod(x, *args, **kwargs):
        # build the cumulative product and tile it right away
        return tile(x.cumprod(*args, **kwargs))

    # 2-d input: shape is preserved and a dtype is set
    summed = cumsum(ones((10, 8), chunk_size=3), axis=0)
    multiplied = cumprod(ones((10, 8), chunk_size=3), axis=0)
    assert summed.shape == (10, 8)
    assert summed.dtype is not None
    assert multiplied.shape == (10, 8)
    assert multiplied.dtype is not None

    # 3-d input, once with a positive and once with a negative axis
    for axis in (1, -2):
        summed = cumsum(ones((10, 8, 8), chunk_size=3), axis=axis)
        multiplied = cumprod(ones((10, 8, 8), chunk_size=3), axis=axis)
        assert summed.shape == (10, 8, 8)
        assert multiplied.shape == (10, 8, 8)

    # axes outside the 2-d range must be rejected
    for bad_axis in (2, -3):
        with pytest.raises(np.AxisError):
            cumsum(ones((10, 8), chunk_size=3), axis=bad_axis)
예제 #3
0
파일: test_merge.py 프로젝트: qinxuye/mars
def test_append():
    """Check shape, index type and chunk layout of ``DataFrame.append``.

    Covers both the default call and ``ignore_index=True``.
    """
    left_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    right_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # default append
    left = from_pandas(left_raw, chunk_size=3)
    right = from_pandas(right_raw, chunk_size=3)
    appended = left.append(right)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.Int64Index)

    tiled = tile(appended)
    assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))
    assert tiled.chunk_shape == (8, 2)
    # chunks are expected in row-major order over the (8, 2) chunk grid
    for position, chunk in enumerate(tiled.chunks):
        assert chunk.index == divmod(position, 2)

    # append with ignore_index=True
    left = from_pandas(left_raw, chunk_size=3)
    right = from_pandas(right_raw, chunk_size=3)
    appended = left.append(right, ignore_index=True)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(appended.index_value.to_pandas(),
                                  pd.RangeIndex(20))

    tiled = tile(appended)
    assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))
    assert tiled.chunk_shape == (8, 2)
    assert isinstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
예제 #4
0
def test_head_tail_optimize():
    """Exercise inputs for which ``_need_tile_head_tail`` must return ``False``."""
    raw = pd.DataFrame(np.random.rand(4, 3))
    need_head_tail = HeadTailOptimizedOperandMixin._need_tile_head_tail

    frame = md.DataFrame(raw, chunk_size=2)

    # no nan chunk shape
    assert need_head_tail(tile(frame).head(2).op) is False

    filtered = tile(frame[frame[0] < 0.5])
    # chunk shape on axis 1 greater than 1
    assert need_head_tail(filtered.head(2).op) is False

    frame = md.DataFrame(raw, chunk_size=(2, 3))
    filtered = tile(frame[frame[0] < 0.5])
    # not slice
    assert need_head_tail(filtered.iloc[2].op) is False
    # step not None
    assert need_head_tail(filtered.iloc[:2:2].op) is False
    # not head or tail
    assert need_head_tail(filtered.iloc[1:3].op) is False
    # slice 1 is not slice(None)
    assert need_head_tail(filtered.iloc[:3, :2].op) is False
예제 #5
0
def test_drop_duplicates():
    """Check argument validation and automatic method choice of ``drop_duplicates``."""
    rng = np.random.RandomState(0)
    raw = pd.DataFrame(rng.randint(1000, size=(20, 7)),
                       columns=[f'c{i + 1}' for i in range(7)])
    # overwrite the last column with string values
    raw['c7'] = [f's{j}' for j in range(20)]

    def chosen_method(deduplicated):
        # method recorded on the first chunk's operand after tiling
        return tile(deduplicated).chunks[0].op.method

    df = from_pandas_df(raw, chunk_size=10)
    with pytest.raises(ValueError):
        df.drop_duplicates(method='unknown')
    with pytest.raises(KeyError):
        df.drop_duplicates(subset='c8')

    # test auto method selection
    assert chosen_method(df.drop_duplicates()) == 'tree'
    # subset size less than chunk_store_limit
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'subset_tree'
    with option_context({'chunk_store_limit': 5}):
        # subset size greater than chunk_store_limit
        assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'tree'
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c7'])) == 'tree'
    assert chosen_method(df['c7'].drop_duplicates()) == 'tree'

    # the series variant validates ``method`` as well
    column = df['c7']
    with pytest.raises(ValueError):
        column.drop_duplicates(method='unknown')
예제 #6
0
def test_beta_inc():
    """Check ``betainc`` metadata and chunk layout, with and without ``out=``.

    The second half writes the result into ``a`` via ``out=a`` and verifies
    the chunks of the tiled ``a``. Previously that loop iterated the chunks of
    the first result ``r`` again, leaving the ``out=`` path unchecked, and
    reused the name ``c`` of the third input tensor as its loop variable.
    """
    raw1 = np.random.rand(4, 3, 2)
    raw2 = np.random.rand(4, 3, 2)
    raw3 = np.random.rand(4, 3, 2)
    a = tensor(raw1, chunk_size=3)
    b = tensor(raw2, chunk_size=3)
    c = tensor(raw3, chunk_size=3)

    r = betainc(a, b, c)
    expect = scipy_betainc(raw1, raw2, raw3)

    assert r.shape == raw1.shape
    assert r.dtype == expect.dtype

    tiled_a, r = tile(a, r)

    assert r.nsplits == tiled_a.nsplits
    for chunk in r.chunks:
        assert isinstance(chunk.op, TensorBetaInc)
        assert chunk.index == chunk.inputs[0].index
        assert chunk.shape == chunk.inputs[0].shape

    # same computation, but storing the result into ``a``
    betainc(a, b, c, out=a)
    expect = scipy_betainc(raw1, raw2, raw3)

    assert a.shape == raw1.shape
    assert a.dtype == expect.dtype

    b, tiled_a = tile(b, a)

    assert tiled_a.nsplits == b.nsplits
    # verify the chunks produced through ``out=a``, not the earlier result
    for chunk in tiled_a.chunks:
        assert isinstance(chunk.op, TensorBetaInc)
        assert chunk.index == chunk.inputs[0].index
        assert chunk.shape == chunk.inputs[0].shape
예제 #7
0
def test_imread():
    """Check ``imread`` metadata and tiling for one file and for a glob of files."""
    with tempfile.TemporaryDirectory() as tempdir:
        # write ten random 50x50 RGB images named random_0.png .. random_9.png
        raws = []
        for i in range(10):
            pixels = np.random.randint(0, 256, 2500 * 3,
                                       dtype=np.uint8).reshape((50, 50, 3))
            raws.append(pixels)
            Image.fromarray(pixels).save(
                os.path.join(tempdir, f'random_{i}.png'))

        # single image
        single = imread(os.path.join(tempdir, 'random_0.png'))
        assert single.shape == (50, 50, 3)
        assert single.dtype == np.dtype('uint8')

        tiled = tile(single)
        assert len(tiled.chunks) == 1
        assert tiled.chunks[0].shape == (50, 50, 3)
        assert tiled.chunks[0].dtype == np.dtype('uint8')

        # all ten images through a glob pattern, three per chunk
        stacked = imread(os.path.join(tempdir, 'random_*.png'), chunk_size=3)
        assert stacked.shape == (10, 50, 50, 3)

        tiled = tile(stacked)
        assert len(tiled.chunks) == 4
        assert tiled.nsplits == ((3, 3, 3, 1), (50, ), (50, ), (3, ))
        assert tiled.chunks[0].dtype == np.dtype('uint8')
        expected_shapes = [(3, 50, 50, 3)] * 3 + [(1, 50, 50, 3)]
        for position, expected_shape in enumerate(expected_shapes):
            chunk = tiled.chunks[position]
            assert chunk.index == (position, 0, 0, 0)
            assert chunk.shape == expected_shape
예제 #8
0
def test_groupby():
    """Check ``groupby`` on a DataFrame and a Series: types, chunk counts, errors."""
    raw_df = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4]
    })
    mdf = md.DataFrame(raw_df, chunk_size=2)
    # grouping by a column that does not exist must fail
    for missing_key in ('c2', ['b', 'c2']):
        with pytest.raises(KeyError):
            mdf.groupby(missing_key)

    by_column = mdf.groupby('b')
    assert isinstance(by_column, DataFrameGroupBy)
    assert isinstance(by_column.op, DataFrameGroupByOperand)
    assert list(by_column.key_dtypes.index) == ['b']

    by_column = tile(by_column)
    assert len(by_column.chunks) == 5
    for chunk in by_column.chunks:
        assert isinstance(chunk.op, DataFrameGroupByOperand)

    # series grouped by a callable
    raw_series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
    ms = md.Series(raw_series, chunk_size=3)
    by_func = ms.groupby(lambda x: x + 1)

    assert isinstance(by_func, SeriesGroupBy)
    assert isinstance(by_func.op, DataFrameGroupByOperand)

    by_func = tile(by_func)
    assert len(by_func.chunks) == 3
    for chunk in by_func.chunks:
        assert isinstance(chunk.op, DataFrameGroupByOperand)

    # as_index=False is rejected for a series
    with pytest.raises(TypeError):
        ms.groupby(lambda x: x + 1, as_index=False)
예제 #9
0
def test_divide():
    """Check which divisions involving a sparse tensor stay sparse."""

    def check(result, sparse, expected_type):
        # the tileable and its first tiled chunk must agree on sparsity
        assert result.issparse() is sparse
        assert type(result) is expected_type
        assert tile(result).chunks[0].op.sparse is sparse

    t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()

    # sparse / scalar stays sparse
    check(t1 / 10, True, SparseTensor)

    # sparse / sparse becomes dense
    t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse()
    check(t1 / t2, False, Tensor)

    # sparse / dense and dense / sparse are both dense
    t3 = tensor([1, 1, 1], chunk_size=2)
    check(t1 / t3, False, Tensor)
    check(t3 / t1, False, Tensor)
예제 #10
0
def test_multiply():
    """Check that products with a sparse left operand stay sparse."""

    def check_sparse(result):
        # the tileable and its first tiled chunk must both be sparse
        assert result.issparse() is True
        assert type(result) is SparseTensor
        assert tile(result).chunks[0].op.sparse is True

    t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()

    # sparse * scalar
    check_sparse(t1 * 10)

    # sparse * sparse
    t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse()
    check_sparse(t1 * t2)

    # sparse * dense
    t3 = tensor([1, 1, 1], chunk_size=2)
    check_sparse(t1 * t3)
예제 #11
0
파일: test_base.py 프로젝트: haijohn/mars
def test_isin():
    """Check ``isin`` with a list and with a tensor of test elements."""
    # test elements given as a plain list
    element = 2 * arange(4, chunk_size=1).reshape(2, 2)
    candidates = [1, 2, 4, 8]

    mask = isin(element, candidates)
    assert mask.shape == (2, 2)
    assert mask.dtype == np.bool_

    mask, element = tile(mask, element)

    assert len(mask.chunks) == len(element.chunks)
    assert len(mask.op.test_elements.chunks) == 1
    assert mask.chunks[0].inputs[0] is element.chunks[0].data

    # test elements given as a two-chunk tensor, with invert=True
    element = 2 * arange(4, chunk_size=1).reshape(2, 2)
    candidates = tensor([1, 2, 4, 8], chunk_size=2)

    mask = isin(element, candidates, invert=True)
    assert mask.shape == (2, 2)
    assert mask.dtype == np.bool_

    mask, element = tile(mask, element)

    assert len(mask.chunks) == len(element.chunks)
    # after tiling, the test elements are held in a single chunk
    assert len(mask.op.test_elements.chunks) == 1
    assert mask.chunks[0].inputs[0] is element.chunks[0].data
    assert mask.chunks[0].op.invert is True
예제 #12
0
파일: test_fft.py 프로젝트: fyrestone/mars
def test_hermitian_fft():
    """Compare ``hfft``/``ihfft`` result shapes with NumPy and check nsplits."""

    def check(result, reference):
        # shape must match NumPy's, and the tiled nsplits must sum to the shape
        assert result.shape == reference.shape
        tiled = tile(result)
        assert tiled.shape == tuple(sum(ns) for ns in tiled.nsplits)

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(hfft(t), np.fft.hfft(np.ones(t.shape)))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(hfft(t, n=100), np.fft.hfft(np.ones(t.shape), n=100))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(ihfft(t), np.fft.ihfft(np.ones(t.shape)))

    t = ones((10, 20, 30), chunk_size=(3, 20, 30))
    check(ihfft(t, n=100), np.fft.ihfft(np.ones(t.shape), n=100))

    # odd n, reusing the previous input tensor
    check(ihfft(t, n=101), np.fft.ihfft(np.ones(t.shape), n=101))
예제 #13
0
def test_histogram_bin_edges():
    """Check argument validation of ``histogram_bin_edges`` and one valid tiling."""
    a = array([0, 0, 0, 1, 2, 3, 3, 4, 5], chunk_size=3)

    # (expected exception, keyword arguments that must trigger it)
    invalid_calls = [
        (ValueError, dict(bins='unknown')),
        # bins is str, weights cannot be provided
        (TypeError, dict(bins='scott', weights=a)),
        (ValueError, dict(bins=-1)),
        # not asc
        (ValueError, dict(bins=[3, 2, 1])),
        # bins cannot be 2d
        (ValueError, dict(bins=np.random.rand(2, 3))),
        (ValueError, dict(range=(5, 0))),
        (ValueError, dict(range=(np.nan, np.nan))),
    ]
    for expected_error, bad_kwargs in invalid_calls:
        with pytest.raises(expected_error):
            histogram_bin_edges(a, **bad_kwargs)

    edges = histogram_bin_edges(a, bins=3, range=(0, 5))
    # if range specified, no error will occur
    tile(edges)
예제 #14
0
파일: test_random.py 프로젝트: qinxuye/mars
def test_random():
    """Check dtype, shape and chunk count of ``rand`` and tiled ``beta`` results."""
    uniform = rand(2, 3)

    assert uniform.dtype is not None

    # scalar parameters
    sample = tile(beta(1, 2, chunk_size=2))

    assert sample.shape == ()
    assert len(sample.chunks) == 1
    first = sample.chunks[0]
    assert first.shape == ()
    assert first.op.dtype == np.dtype('f8')

    # list parameters
    sample = tile(beta([1, 2], [3, 4], chunk_size=2))

    assert sample.shape == (2, )
    assert len(sample.chunks) == 1
    first = sample.chunks[0]
    assert first.shape == (2, )
    assert first.op.dtype == np.dtype('f8')

    # nested list plus a tensor parameter, with an explicit output size
    sample = tile(
        beta([[2, 3]],
             from_ndarray([[4, 6], [5, 2]], chunk_size=2),
             chunk_size=1,
             size=(3, 2, 2)))

    assert sample.shape == (3, 2, 2)
    assert len(sample.chunks) == 12
    assert sample.chunks[0].op.dtype == np.dtype('f8')
예제 #15
0
def test_reshape():
    """Check ``reshape`` tiling, shape assignment and unknown-shape handling.

    The final unknown-shape case now asserts that the explicitly requested
    second dimension is exactly 10; it used to check only that the value was
    truthy, which any non-zero dimension satisfies.
    """
    a = ones((10, 20, 30), chunk_size=5)
    b = a.reshape(10, 600)

    b = tile(b)

    assert tuple(sum(s) for s in b.nsplits) == (10, 600)

    a = ones((10, 600), chunk_size=5)
    b = a.reshape(10, 30, 20)

    b = tile(b)

    assert tuple(sum(s) for s in b.nsplits) == (10, 30, 20)

    # reshape through assignment to the ``shape`` attribute
    a = ones((10, 600), chunk_size=5)
    a.shape = [10, 30, 20]

    a = tile(a)

    assert tuple(sum(s) for s in a.nsplits) == (10, 30, 20)

    # test reshape unknown shape
    c = a[a > 0]
    d = c.reshape(10, 600)
    assert d.shape == (10, 600)
    d = c.reshape(-1, 10)
    assert len(d.shape) == 2
    # the inferred dimension cannot be known for a boolean-indexed input
    assert np.isnan(d.shape[0])
    # the explicitly requested dimension must be kept as given
    assert d.shape[1] == 10

    # unknown keyword arguments are rejected
    with pytest.raises(TypeError):
        a.reshape((10, 30, 20), other_argument=True)
예제 #16
0
def test_nunique():
    """Check ``DataFrame.nunique`` along both axes: metadata and tiled layout."""
    raw = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                       columns=[f'c{i}' for i in range(10)])

    # default axis: one value per column
    per_column = from_pandas_df(raw, chunk_size=3).nunique()

    assert per_column.shape == (10,)
    assert per_column.op.output_types[0] == OutputType.series
    assert isinstance(per_column.op, DataFrameNunique)

    tiled = tile(per_column)
    assert tiled.shape == (10,)
    assert len(tiled.chunks) == 4
    assert tiled.nsplits == ((3, 3, 3, 1,),)
    head_op = tiled.chunks[0].op
    assert head_op.stage == OperandStage.agg
    assert isinstance(head_op, DataFrameAggregate)

    # axis=1: one value per row, computed on a copy of the data
    raw_copy = raw.copy()
    per_row = from_pandas_df(raw_copy, chunk_size=3).nunique(axis=1)

    assert per_row.shape == (20,)
    assert per_row.op.output_types[0] == OutputType.series
    assert isinstance(per_row.op, DataFrameNunique)

    tiled = tile(per_row)
    assert tiled.shape == (20,)
    assert len(tiled.chunks) == 7
    assert tiled.nsplits == ((3, 3, 3, 3, 3, 3, 2,),)
    head_op = tiled.chunks[0].op
    assert head_op.stage == OperandStage.agg
    assert isinstance(head_op, DataFrameAggregate)
예제 #17
0
def test_series_reduction(func_name, op, func_opts: FunctionOptions):
    """Check that a series reduction yields a scalar tensor and the expected graph.

    Args:
        func_name: Name of the reduction method, looked up with ``getattr``.
        op: Operand class the untiled result of the first case must carry.
        func_opts: Options object; its ``has_skipna`` flag decides whether
            ``axis``/``skipna`` keyword arguments are passed in the second case.
    """
    # string-indexed series, reduction called without arguments
    raw = pd.Series(range(20), index=[str(i) for i in range(20)])
    reduced = getattr(from_pandas_series(raw, chunk_size=3), func_name)()

    assert isinstance(reduced, Tensor)
    assert isinstance(reduced.op, op)
    assert reduced.shape == ()

    reduced = tile(reduced)

    assert len(reduced.chunks) == 1
    final_chunk = reduced.chunks[0]
    assert isinstance(final_chunk.op, DataFrameAggregate)
    assert isinstance(final_chunk.inputs[0].op, DataFrameConcat)
    assert len(final_chunk.inputs[0].inputs) == 2

    # named float series, optionally with axis/skipna arguments
    raw = pd.Series(np.random.rand(25), name='a')
    kwargs = dict(axis='index', skipna=False) if func_opts.has_skipna else dict()
    reduced = getattr(from_pandas_series(raw, chunk_size=7), func_name)(**kwargs)

    assert isinstance(reduced, Tensor)
    assert reduced.shape == ()

    reduced = tile(reduced)

    assert len(reduced.chunks) == 1
    final_chunk = reduced.chunks[0]
    assert isinstance(final_chunk.op, DataFrameAggregate)
    assert isinstance(final_chunk.inputs[0].op, DataFrameConcat)
    assert len(final_chunk.inputs[0].inputs) == 4
예제 #18
0
def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and tiled graph of a cumulative DataFrame reduction.

    Args:
        func_name: Name of the cumulative method, looked up with ``getattr``.
        op: Operand class expected on the tiled chunks and their inputs.
        func_opts: Accepted as part of the signature; not read in this test.
    """
    # two columns with a string index
    raw = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                       index=[str(i) for i in range(20)])
    result = getattr(from_pandas_df(raw, chunk_size=3), func_name)()

    assert isinstance(result, DataFrame)
    assert isinstance(result.index_value._index_value, IndexValue.Index)
    assert result.shape == (20, 2)

    result = tile(result)

    assert len(result.chunks) == 7
    assert isinstance(result.chunks[0].op, op)
    assert result.chunks[0].op.stage == OperandStage.combine
    tail_inputs = result.chunks[-1].inputs
    assert isinstance(tail_inputs[-1].op, op)
    assert tail_inputs[-1].op.stage == OperandStage.map
    assert len(tail_inputs) == 7

    # ten float columns with the default index
    raw = pd.DataFrame(np.random.rand(20, 10))
    result = getattr(from_pandas_df(raw, chunk_size=3), func_name)()

    assert isinstance(result, DataFrame)
    assert isinstance(result.index_value._index_value, IndexValue.RangeIndex)
    assert result.shape == (20, 10)

    result = tile(result)

    assert len(result.chunks) == 28
    assert result.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1))
    assert result.chunks[0].op.stage == OperandStage.combine
    tail_inputs = result.chunks[-1].inputs
    assert isinstance(tail_inputs[-1].op, op)
    assert tail_inputs[-1].op.stage == OperandStage.map
    assert len(tail_inputs) == 7
예제 #19
0
def test_cum_series_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and tiled graph of a cumulative Series reduction.

    Args:
        func_name: Name of the cumulative method, looked up with ``getattr``.
        op: Operand class expected on the tiled chunks and their inputs.
        func_opts: Options object; its ``has_skipna`` flag decides whether
            ``axis``/``skipna`` keyword arguments are passed in the second case.
    """
    # NOTE(review): the data is a one-key dict paired with an unrelated
    # 20-label index; confirm this is intended rather than plain range(20).
    raw = pd.Series({'a': list(range(20))}, index=[str(i) for i in range(20)])
    result = getattr(from_pandas_series(raw, chunk_size=3), func_name)()

    assert isinstance(result, Series)
    assert result.shape == (20,)

    result = tile(result)

    assert len(result.chunks) == 7
    assert isinstance(result.chunks[0].op, op)
    assert result.chunks[0].op.stage == OperandStage.combine
    tail_inputs = result.chunks[-1].inputs
    assert isinstance(tail_inputs[-1].op, op)
    assert tail_inputs[-1].op.stage == OperandStage.map
    assert len(tail_inputs) == 7

    # named float series, optionally with axis/skipna arguments
    raw = pd.Series(np.random.rand(25), name='a')
    kwargs = dict(axis='index', skipna=False) if func_opts.has_skipna else dict()
    result = getattr(from_pandas_series(raw, chunk_size=7), func_name)(**kwargs)

    assert isinstance(result, Series)
    assert result.shape == (25,)

    result = tile(result)

    assert len(result.chunks) == 4
    assert isinstance(result.chunks[0].op, op)
    assert result.chunks[0].op.stage == OperandStage.combine
    tail_inputs = result.chunks[-1].inputs
    assert isinstance(tail_inputs[-1].op, op)
    assert tail_inputs[-1].op.stage == OperandStage.map
    assert len(tail_inputs) == 4
예제 #20
0
파일: test_base.py 프로젝트: haijohn/mars
def test_broadcast_to():
    """Check shapes and chunk layout of ``broadcast_to`` and its rejections."""
    # prepend one new leading dimension
    source = ones((10, 5), chunk_size=2)
    widened = broadcast_to(source, (20, 10, 5))
    source, widened = tile(source, widened)

    assert widened.shape == (20, 10, 5)
    assert len(widened.chunks) == len(source.chunks)
    assert widened.chunks[0].shape == (20, 2, 2)

    # prepend a dimension and stretch the trailing length-1 dimension
    source = ones((10, 5, 1), chunk_size=2)
    widened = broadcast_to(source, (5, 10, 5, 6))
    source, widened = tile(source, widened)

    assert widened.shape == (5, 10, 5, 6)
    assert len(widened.chunks) == len(source.chunks)
    assert widened.nsplits == ((5,), (2, 2, 2, 2, 2), (2, 2, 1), (6,))
    assert widened.chunks[0].shape == (5, 2, 2, 6)

    # stretch the last dimension of a (10, 1) tensor
    arr = ones((10, 1), chunk_size=2)
    widened = broadcast_to(arr, (20, 10, 5))
    arr, widened = tile(arr, widened)

    assert widened.shape == (20, 10, 5)
    assert len(widened.chunks) == len(arr.chunks)
    assert widened.chunks[0].shape == (20, 2, 5)

    # target shapes that the (10, 1) tensor cannot be broadcast to
    for bad_shape in ((10,), (5, 1)):
        with pytest.raises(ValueError):
            broadcast_to(arr, bad_shape)

    # a boolean-indexed input is rejected as well
    arr = ones((4, 5), chunk_size=2)
    with pytest.raises(ValueError):
        broadcast_to(arr[arr < 2], (3, 20))
예제 #21
0
def test_setitem_structured():
    """Check value handling of ``setitem`` on a nested record-dtype tensor."""
    # Check to value is properly broadcast for `setitem` on complex record dtype arrays.
    nested_type = np.dtype([('a', np.int16), ('b', np.int64)])
    rec_type = np.dtype([('a', np.int32), ('b', np.double), ('c', nested_type)])

    t = ones((4, 5), dtype=rec_type, chunk_size=3)

    # assign tuple to record
    t[1:4, 1] = (3, 4., (5, 6))
    tiled = tile(t)
    assert tiled.cix[0, 0].op.value == (3, 4., (5, 6))

    # assign scalar to record
    t[1:4, 2] = 8
    tiled = tile(t)
    assert tiled.cix[0, 0].op.value == 8

    # assign scalar array to record array with broadcast
    t[1:3] = np.arange(5)
    tiled = tile(t)
    value_op = tiled.cix[0, 0].op.value.op
    assert value_op.slices == [slice(None, None, None), slice(None, 3, None)]
    broadcast_op = value_op.inputs[0].op.inputs[0].op
    assert isinstance(broadcast_op, TensorBroadcastTo)
    assert broadcast_op.shape == (2, 5)
    np.testing.assert_array_equal(broadcast_op.inputs[0].op.data, np.arange(5))

    # assign scalar array to record array of same shape, no broadcast
    t[2:4] = np.arange(10).reshape(2, 5)
    tiled = tile(t)
    value_op = tiled.cix[0, 0].op.value.op
    assert value_op.slices == [slice(None, 1, None), slice(None, 3, None)]
    source_data = value_op.inputs[0].op.inputs[0].op.data
    np.testing.assert_array_equal(source_data, np.arange(10).reshape(2, 5))
예제 #22
0
def test_index_tricks():
    """Check that dense and sparse ``nd_grid`` results can be tiled."""
    dense_grid = nd_grid()
    dense = dense_grid[0:5, 0:5]
    tile(dense)  # being tileable means no loop exists

    sparse_grid = nd_grid(sparse=True)
    sparse_parts = sparse_grid[0:5, 0:5]
    tile(*sparse_parts)  # being tileable means no loop exists
예제 #23
0
def test_squareform_execution(setup):
    """Compare executed ``distance.squareform`` results with SciPy.

    Covers vector-to-matrix and matrix-to-vector conversion for single-chunk
    and multi-chunk inputs, plus the ``checks`` flag on a non-symmetric
    matrix. ``setup`` is not referenced in the body; presumably a pytest
    fixture that prepares the execution session (verify against conftest).
    """
    from scipy.spatial.distance import pdist as sp_pdist
    from scipy.spatial.distance import squareform as sp_squareform

    raw_a = np.random.rand(80, 10)
    raw_pdist = sp_pdist(raw_a)
    raw_square = sp_squareform(raw_pdist)

    # tomatrix, test 1 chunk
    vec = tensor(raw_pdist, chunk_size=raw_pdist.shape[0])
    mat = distance.squareform(vec, chunk_size=100)
    np.testing.assert_array_equal(mat.execute().fetch(), raw_square)

    # tomatrix, test more than 1 chunk
    vec = tensor(raw_pdist, chunk_size=33)
    assert len(tile(vec).chunks) > 1
    mat = distance.squareform(vec, chunk_size=34)
    np.testing.assert_array_equal(mat.execute().fetch(), raw_square)

    # tovec, test 1 chunk
    mat = tensor(raw_square)
    vec = distance.squareform(mat, chunk_size=raw_pdist.shape[0])
    assert len(tile(mat).chunks) == 1
    assert len(tile(vec).chunks) == 1
    np.testing.assert_array_equal(vec.execute().fetch(), raw_pdist)

    # tovec, test more than 1 chunk
    mat = tensor(raw_square, chunk_size=31)
    vec = distance.squareform(mat, chunk_size=40)
    assert len(tile(vec).chunks) > 1
    np.testing.assert_array_equal(vec.execute().fetch(), raw_pdist)

    # test checks
    # generate non-symmetric matrix
    non_sym_arr = np.random.RandomState(0).rand(10, 10)

    # 1 chunk
    mat = tensor(non_sym_arr)
    vec = distance.squareform(mat, checks=True, chunk_size=100)
    with pytest.raises(ValueError):
        vec.execute().fetch()
    # force checks=False
    vec = distance.squareform(mat, checks=False, chunk_size=100)
    vec.execute().fetch()

    # more than 1 chunk
    mat = tensor(non_sym_arr, chunk_size=6)
    vec = distance.squareform(mat, checks=True, chunk_size=8)
    assert len(tile(vec).chunks) > 1
    with pytest.raises(ValueError):
        vec.execute().fetch()
    # force checks=False
    vec = distance.squareform(mat, checks=False, chunk_size=100)
    vec.execute().fetch()
예제 #24
0
파일: test_base.py 프로젝트: qinxuye/mars
def test_sort():
    """Check operand types produced by ``sort`` and its argument validation."""
    # 2-d input chunked only along axis 0
    a = tensor(np.random.rand(10, 10), chunk_size=(5, 10))

    sorted_a = sort(a)
    assert type(sorted_a.op).__name__ == 'TensorSort'

    sorted_a = tile(sorted_a)

    assert len(sorted_a.chunks) == 2
    for chunk in sorted_a.chunks:
        assert type(chunk.op).__name__ == 'TensorSort'
        assert type(chunk.inputs[0].op).__name__ == 'ArrayDataSource'

    # 1-d input split into several chunks
    a = tensor(np.random.rand(100), chunk_size=10)

    sorted_a = sort(a)
    assert type(sorted_a.op).__name__ == 'TensorSort'

    sorted_a = tile(sorted_a)

    for chunk in sorted_a.chunks:
        assert type(chunk.op).__name__ == 'PSRSShuffle'
        assert chunk.op.stage == OperandStage.reduce
        assert chunk.shape == (np.nan, )

    # structured dtype: order lists all field names
    a = tensor(np.empty((10, 10), dtype=[('id', np.int32),
                                         ('size', np.int64)]),
               chunk_size=(10, 5))
    sorted_a = sort(a)
    assert sorted_a.op.order == ['id', 'size']

    with pytest.raises(ValueError):
        sort(a, order=['unknown_field'])

    # (expected exception, keyword arguments that must trigger it)
    invalid_calls = [
        (np.AxisError, dict(axis=1)),
        (ValueError, dict(kind='non_valid_kind')),
        (ValueError, dict(parallel_kind='non_valid_parallel_kind')),
        (TypeError, dict(psrs_kinds='non_valid_psrs_kinds')),
        (ValueError, dict(psrs_kinds=['quicksort'] * 2)),
        (ValueError, dict(psrs_kinds=['non_valid_kind'] * 3)),
        (ValueError, dict(psrs_kinds=[None, None, None])),
        (ValueError, dict(psrs_kinds=['quicksort', 'mergesort', None])),
    ]
    for expected_error, bad_kwargs in invalid_calls:
        with pytest.raises(expected_error):
            sort(np.random.rand(100), **bad_kwargs)
예제 #25
0
def test_shuffle_expr():
    """Check metadata and tiled layout of ``shuffle`` on tensors and DataFrames.

    Covers shuffling along the default axis and along ``axes=(0, 1)``, then
    the argument validation. The inputs for the validation case are built
    outside the ``pytest.raises`` block so that only the ``shuffle`` call
    itself can satisfy the expected ``ValueError``.
    """
    a = mt.random.rand(10, 3, chunk_size=2)
    b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2)

    new_a, new_b = shuffle(a, b, random_state=0)

    # both outputs come from one shared operand
    assert new_a.op is new_b.op
    assert isinstance(new_a.op, LearnShuffle)
    assert new_a.shape == a.shape
    assert new_b.shape == b.shape
    assert b.index_value.key != new_b.index_value.key

    new_a, new_b = tile(new_a, new_b)

    assert len(new_a.chunks) == 10
    assert np.isnan(new_a.chunks[0].shape[0])
    assert len(new_b.chunks) == 15
    assert np.isnan(new_b.chunks[0].shape[0])
    assert new_b.chunks[0].index_value.key != new_b.chunks[1].index_value.key
    assert new_a.chunks[0].op.seeds == new_b.chunks[0].op.seeds

    # shuffle along two axes
    c = mt.random.rand(10, 5, 3, chunk_size=2)
    d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5))

    new_c, new_d = shuffle(c, d, axes=(0, 1), random_state=0)

    assert new_c.op is new_d.op
    assert isinstance(new_c.op, LearnShuffle)
    assert new_c.shape == c.shape
    assert new_d.shape == d.shape
    assert d.index_value.key != new_d.index_value.key
    # column labels are no longer in ascending order, but sort back to the original
    assert not np.all(new_d.dtypes.index[:-1] < new_d.dtypes.index[1:])
    pd.testing.assert_series_equal(d.dtypes, new_d.dtypes.sort_index())

    new_c, new_d = tile(new_c, new_d)

    assert len(new_c.chunks) == 5 * 1 * 2
    assert np.isnan(new_c.chunks[0].shape[0])
    assert len(new_d.chunks) == 5
    assert np.isnan(new_d.chunks[0].shape[0])
    assert new_d.chunks[0].shape[1] == 5
    assert new_d.chunks[0].index_value.key != new_d.chunks[1].index_value.key
    pd.testing.assert_series_equal(new_d.chunks[0].dtypes.sort_index(),
                                   d.dtypes)
    assert new_c.chunks[0].op.seeds == new_d.chunks[0].op.seeds
    assert len(new_c.chunks[0].op.seeds) == 1
    assert new_c.chunks[0].op.reduce_sizes == (5, )

    # inputs whose sizes differ along axis 1 cannot be shuffled on that axis
    a = mt.random.rand(10, 5)
    b = mt.random.rand(10, 4, 3)
    with pytest.raises(ValueError):
        shuffle(a, b, axes=1)

    # unknown keyword arguments are rejected
    with pytest.raises(TypeError):
        shuffle(a, b, unknown_param=True)

    assert isinstance(shuffle(mt.random.rand(10, 5)), mt.Tensor)
예제 #26
0
def test_series_isin():
    """Check the chunk layout of ``Series.isin`` for several chunk combinations.

    In every case each output chunk must have two inputs: the matching chunk
    of the left series and a single four-element chunk of the test values.
    """
    # (chunk_size of left series, chunk_size of test values, output chunk shape)
    cases = [
        # one chunk in multiple chunks (test values have been rechunked)
        (10, 2, (10, )),
        # multiple chunk in one chunks
        (2, 4, (2, )),
        # multiple chunk in multiple chunks (test values have been rechunked)
        (2, 2, (2, )),
    ]
    for left_chunk_size, values_chunk_size, chunk_shape in cases:
        a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                               chunk_size=left_chunk_size)
        b = from_pandas_series(pd.Series([2, 1, 9, 3]),
                               chunk_size=values_chunk_size)

        r = tile(a.isin(b))
        for position, chunk in enumerate(r.chunks):
            assert chunk.index == (position, )
            assert chunk.dtype == np.dtype('bool')
            assert chunk.shape == chunk_shape
            assert len(chunk.op.inputs) == 2
            assert chunk.op.output_types[0] == OutputType.series
            left_input, values_input = chunk.op.inputs[0], chunk.op.inputs[1]
            assert left_input.index == (position, )
            assert left_input.shape == chunk_shape
            assert values_input.index == (0, )
            assert values_input.shape == (4, )

    # a plain string is not accepted as test values
    with pytest.raises(TypeError):
        _ = a.isin('sth')

    with pytest.raises(TypeError):
        _ = a.to_frame().isin('sth')
예제 #27
0
def test_replace():
    """Check the tiled graph of ``replace`` on DataFrames and Series.

    Covers the ``method='ffill'`` variant and the plain value replacement.
    """
    # dataframe cases
    # start from all -1 and overwrite random cells and a few whole rows
    df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(30):
        df_raw.iloc[random.randint(0, 19),
                    random.randint(0, 9)] = random.randint(0, 99)
    for _ in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for col in range(0, 10):
            df_raw.iloc[row, col] = random.randint(0, 99)

    # not supporting fill with limit
    df = md.DataFrame(df_raw, chunk_size=4)
    with pytest.raises(NotImplementedError):
        df.replace(-1, method='ffill', limit=5)

    filled = tile(df.replace(-1, method='ffill'))
    assert len(filled.chunks) == 15
    head = filled.chunks[0]
    assert head.shape == (4, 4)
    assert head.op.stage == OperandStage.combine
    assert head.op.method == 'ffill'
    assert head.op.limit is None
    tail_input = filled.chunks[-1].inputs[-1]
    assert tail_input.shape == (1, 2)
    assert tail_input.op.stage == OperandStage.map
    assert tail_input.op.method == 'ffill'
    assert tail_input.op.limit is None

    replaced = tile(df.replace(-1, 99))
    assert len(replaced.chunks) == 15
    head = replaced.chunks[0]
    assert head.shape == (4, 4)
    assert head.op.stage is None
    assert head.op.limit is None

    # series cases
    series_raw = pd.Series(-1, index=range(20))
    for _ in range(10):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
    series = md.Series(series_raw, chunk_size=4)

    filled = tile(series.replace(-1, method='ffill'))
    assert len(filled.chunks) == 5
    head = filled.chunks[0]
    assert head.shape == (4, )
    assert head.op.stage == OperandStage.combine
    assert head.op.method == 'ffill'
    assert head.op.limit is None
    tail_input = filled.chunks[-1].inputs[-1]
    assert tail_input.shape == (1, )
    assert tail_input.op.stage == OperandStage.map
    assert tail_input.op.method == 'ffill'
    assert tail_input.op.limit is None

    replaced = tile(series.replace(-1, 99))
    assert len(replaced.chunks) == 5
    head = replaced.chunks[0]
    assert head.shape == (4, )
    assert head.op.stage is None
    assert head.op.limit is None
예제 #28
0
def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and tiled graph of a DataFrame reduction.

    Args:
        func_name: Name of the reduction method, looked up with ``getattr``.
        op: Operand class the untiled result of the first case must carry.
        func_opts: Accepted as part of the signature; not read in this test.
    """
    # two columns with a string index
    raw = pd.DataFrame(
        {
            'a': list(range(20)),
            'b': list(range(20, 0, -1))
        },
        index=[str(i) for i in range(20)])
    reduced = getattr(from_pandas_df(raw, chunk_size=3), func_name)()

    assert isinstance(reduced, Series)
    assert isinstance(reduced.op, op)
    assert isinstance(reduced.index_value._index_value, IndexValue.Index)
    assert reduced.shape == (2, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 1
    head = reduced.chunks[0]
    assert isinstance(head.op, DataFrameAggregate)
    assert isinstance(head.inputs[0].op, DataFrameConcat)
    assert len(head.inputs[0].inputs) == 2

    # ten float columns with the default index
    raw = pd.DataFrame(np.random.rand(20, 10))
    reduced = getattr(from_pandas_df(raw, chunk_size=3), func_name)()

    assert isinstance(reduced, Series)
    assert isinstance(reduced.index_value._index_value,
                      (IndexValue.RangeIndex, IndexValue.Int64Index))
    assert reduced.shape == (10, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 4
    assert reduced.nsplits == ((3, 3, 3, 1), )
    head = reduced.chunks[0]
    assert isinstance(head.op, DataFrameAggregate)
    assert isinstance(head.inputs[0].op, DataFrameConcat)
    assert len(head.inputs[0].inputs) == 2

    # reduce along the columns axis
    raw = pd.DataFrame(np.random.rand(20, 20),
                       index=[str(i) for i in range(20)])
    reduced = getattr(from_pandas_df(raw, chunk_size=4),
                      func_name)(axis='columns')

    assert reduced.shape == (20, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 5
    assert reduced.nsplits == ((4, ) * 5, )
    head = reduced.chunks[0]
    assert isinstance(head.op, DataFrameAggregate)
    assert isinstance(head.inputs[0].op, DataFrameConcat)
    assert len(head.inputs[0].inputs) == 2

    # the ``level`` argument is not implemented
    with pytest.raises(NotImplementedError):
        getattr(from_pandas_df(raw, chunk_size=3), func_name)(level=0, axis=1)
예제 #29
0
파일: test_fft.py 프로젝트: fyrestone/mars
def test_fft_freq():
    """Compare ``fftfreq``/``rfftfreq`` shapes with NumPy and check nsplits."""
    pairs = ((fftfreq, np.fft.fftfreq), (rfftfreq, np.fft.rfftfreq))
    for mars_func, numpy_func in pairs:
        freq = mars_func(10, .1, chunk_size=3)

        assert freq.shape == numpy_func(10, .1).shape
        freq = tile(freq)
        # the tiled chunk sizes must add up to the overall shape
        assert freq.shape == tuple(sum(ns) for ns in freq.nsplits)
예제 #30
0
파일: test_base.py 프로젝트: qinxuye/mars
def test_map_chunk():
    """Check dtype, shape and chunk count of ``map_chunk`` results.

    Without ``elementwise`` the result shape is unknown (nan); with
    ``elementwise=True`` the input shape is kept.
    """
    raw = np.random.rand(20)
    a = tensor(raw, chunk_size=10)

    # (extra keyword arguments, expected shape after tiling)
    cases = (
        (dict(), (np.nan, )),
        (dict(elementwise=True), (20, )),
    )
    for extra_kwargs, expected_shape in cases:
        mapped = tile(a.map_chunk(lambda x: x * 0.5, **extra_kwargs))
        assert np.issubdtype(mapped.dtype, np.floating) is True
        assert mapped.shape == expected_shape
        assert len(mapped.chunks) == 2