Пример #1
0
    def testFromDataFrameExecution(self):
        """Convert DataFrame, Series and Index objects to tensors and compare
        the executed results with the expected arrays."""
        run_tensor = self.executor.execute_tensor

        # integer columns with a string index
        frame = md.DataFrame(
            {'angle': [0, 3, 4], 'degree': [360, 180, 360]},
            index=['circle', 'triangle', 'rectangle'])
        actual = run_tensor(from_dataframe(frame))
        wanted = run_tensor(mt.tensor([[0, 360], [3, 180], [4, 360]]))
        np.testing.assert_equal(actual, wanted)

        # test up-casting: a float column next to an int column gives float64
        mixed = md.DataFrame({'a': [0.1, 0.2, 0.3], 'b': [1, 2, 3]})
        actual = run_tensor(from_dataframe(mixed))
        np.testing.assert_equal(actual[0].dtype, np.dtype('float64'))
        wanted = run_tensor(mt.tensor([[0.1, 1.0], [0.2, 2.0], [0.3, 3.0]]))
        np.testing.assert_equal(actual, wanted)

        # chunked frame: the concatenated result must be Fortran-contiguous only
        rows = [[0.1, 0.2, 0.4], [0.4, 0.7, 0.3]]
        chunked = md.DataFrame(rows, columns=list('abc'), chunk_size=2)
        merged = run_tensor(from_dataframe(chunked), concat=True)[0]
        np.testing.assert_array_equal(merged, np.asarray(rows))
        layout = merged.flags
        self.assertTrue(layout['F_CONTIGUOUS'])
        self.assertFalse(layout['C_CONTIGUOUS'])

        # test from series
        from_series = md.Series([1, 2, 3]).to_tensor().execute()
        np.testing.assert_array_equal(from_series, np.array([1, 2, 3]))

        from_series = md.Series(range(10), chunk_size=3).to_tensor().execute()
        np.testing.assert_array_equal(from_series, np.arange(10))

        # test from index
        pairs = [(0, 1), (2, 3), (4, 5)]

        multi = md.Index(pd.MultiIndex.from_tuples(pairs))
        extracted = multi.to_tensor(extract_multi_index=True).execute()
        np.testing.assert_array_equal(extracted,
                                      np.arange(6).reshape((3, 2)))

        multi = md.Index(pd.MultiIndex.from_tuples(pairs))
        packed = multi.to_tensor(extract_multi_index=False).execute()
        np.testing.assert_array_equal(
            packed, pd.MultiIndex.from_tuples(pairs).to_series())
Пример #2
0
    def testSeriesGetitem(self):
        """Exercise scalar and list-based ``__getitem__`` on Mars Series and
        compare every result with pandas."""
        check_series = pd.testing.assert_series_equal

        # single-chunk series, scalar lookup
        raw = pd.Series(np.random.rand(10))
        whole = md.Series(raw)
        self.assertEqual(whole[1].execute(), raw[1])

        # chunked series with the default integer index
        raw = pd.Series(np.random.rand(10), name='a')
        chunked = md.Series(raw, chunk_size=4)

        for pos in range(10):
            self.assertEqual(chunked[pos].execute(), raw[pos])

        # ascending, descending and repeated integer labels
        for labels in ([0, 1, 2, 3, 4], [4, 3, 2, 1, 0], [1, 2, 3, 2, 1, 0]):
            check_series(chunked[labels].execute(), raw[labels])

        # chunked series with a string index
        names = ['i' + str(i) for i in range(20)]
        raw = pd.Series(np.random.rand(20), index=names, name='a')
        chunked = md.Series(raw, chunk_size=3)

        for label in names:
            self.assertEqual(chunked[label].execute(), raw[label])

        # ordered, shuffled, repeated and single-element label lists
        label_sets = (
            ['i1', 'i2', 'i3', 'i4', 'i5'],
            ['i4', 'i7', 'i0', 'i1', 'i5'],
            ['i0', 'i1', 'i5', 'i4', 'i0', 'i1'],
            ['i0'],
        )
        for labels in label_sets:
            check_series(chunked[labels].execute(), raw[labels])
Пример #3
0
def test_series_groupby_agg(setup):
    """Compare Mars ``Series.groupby`` aggregations against pandas.

    Runs single and list aggregations plus a group-by-series sum under both
    the ``tree`` and ``shuffle`` methods, then checks ``kurtosis``, the
    per-aggregation methods, cumulative aggregations, a custom reduction and
    keyword (named) aggregations.
    """
    rs = np.random.RandomState(0)
    series1 = pd.Series(rs.rand(10))
    ms1 = md.Series(series1, chunk_size=3)

    agg_funs = [
        'std', 'mean', 'var', 'max', 'count', 'size', 'all', 'any', 'skew',
        'kurt', 'sem'
    ]

    for method in ['tree', 'shuffle']:
        # each aggregation on its own yields a series
        for agg_fun in agg_funs:
            r = ms1.groupby(lambda x: x % 2).agg(agg_fun, method=method)
            pd.testing.assert_series_equal(
                r.execute().fetch(),
                series1.groupby(lambda x: x % 2).agg(agg_fun))

        # a list of aggregations yields a frame
        r = ms1.groupby(lambda x: x % 2).agg(agg_funs, method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch(),
            series1.groupby(lambda x: x % 2).agg(agg_funs))

        # test groupby series; results are sorted before comparing
        r = ms1.groupby(ms1).sum(method=method)
        pd.testing.assert_series_equal(
            r.execute().fetch().sort_index(),
            series1.groupby(series1).sum().sort_index())

    # test inserted kurt method
    r = ms1.groupby(ms1).kurtosis()
    pd.testing.assert_series_equal(r.execute().fetch(),
                                   series1.groupby(series1).kurtosis())

    # aggregations called as groupby methods rather than through agg()
    for agg_fun in agg_funs:
        r = getattr(ms1.groupby(lambda x: x % 2), agg_fun)(method='tree')
        pd.testing.assert_series_equal(
            r.execute().fetch(),
            getattr(series1.groupby(lambda x: x % 2), agg_fun)())

    # cumulative aggregations
    r = ms1.groupby(lambda x: x % 2).agg(['cumsum', 'cumcount'], method='tree')
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        series1.groupby(lambda x: x % 2).agg(['cumsum',
                                              'cumcount']).sort_index())

    # custom reduction object
    r = ms1.groupby(lambda x: x % 2).agg(MockReduction2(name='custom_r'))
    pd.testing.assert_series_equal(
        r.execute().fetch(),
        series1.groupby(lambda x: x % 2).agg(MockReduction2(name='custom_r')))

    # keyword (named) aggregations
    r = ms1.groupby(lambda x: x % 2).agg(col_var='var', col_skew='skew')
    pd.testing.assert_frame_equal(
        r.execute().fetch(),
        series1.groupby(lambda x: x % 2).agg(col_var='var', col_skew='skew'))
Пример #4
0
def test_series_fill_na_execution(setup):
    """Compare Mars ``Series.fillna`` with pandas for scalar, series and
    method-based fills, on single-chunk and chunked inputs."""
    check = pd.testing.assert_series_equal

    # all-NaN series with three random assignments of random values
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(3):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
    value_series_raw = pd.Series(
        np.random.randint(0, 100, (10,)).astype(np.float32))

    # test single chunk
    whole = md.Series(series_raw)

    filled = whole.fillna(1)
    check(filled.execute().fetch(), series_raw.fillna(1))

    # test single chunk with value as single chunk
    whole_values = md.Series(value_series_raw)
    filled = whole.fillna(whole_values)
    check(filled.execute().fetch(), series_raw.fillna(value_series_raw))

    chunked = md.Series(series_raw, chunk_size=3)

    # chunked input: numeric fill, then forward and backward fill in axis=0
    # without limit
    fill_cases = (
        ((1,), {}),
        ((), {'method': 'pad'}),
        ((), {'method': 'backfill'}),
    )
    for args, kwargs in fill_cases:
        filled = chunked.fillna(*args, **kwargs)
        check(filled.execute().fetch(), series_raw.fillna(*args, **kwargs))

    # test fill with series
    chunked_values = md.Series(value_series_raw, chunk_size=4)
    filled = chunked.fillna(chunked_values)
    check(filled.execute().fetch(), series_raw.fillna(value_series_raw))

    # test inplace tile
    chunked.fillna(1, inplace=True)
    check(chunked.execute().fetch(), series_raw.fillna(1))
0
    def testSeriesAggregate(self):
        """Compare ``Series.agg`` on single-chunk and chunked Mars series with
        pandas for string, list, dict and keyword aggregations."""
        def run(tileable):
            # execute and return the concatenated result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        all_aggs = [
            'sum', 'prod', 'min', 'max', 'count', 'size', 'mean', 'var', 'std',
            'sem', 'skew', 'kurt'
        ]
        labels = [str(i) for i in range(20)]
        data = pd.Series(np.random.rand(20), index=labels, name='a')

        # single chunk
        whole = md.Series(data)

        pd.testing.assert_series_equal(run(whole.agg(all_aggs)),
                                       data.agg(all_aggs))

        for func in all_aggs:
            self.assertAlmostEqual(run(whole.agg(func)), data.agg(func))

        # several chunks
        chunked = md.Series(data, chunk_size=3)

        for func in all_aggs:
            self.assertAlmostEqual(run(chunked.agg(func)), data.agg(func))

        pd.testing.assert_series_equal(run(chunked.agg(all_aggs)),
                                       data.agg(all_aggs))

        # dict of output name -> aggregation
        by_dict = chunked.agg({'col_sum': 'sum', 'col_count': 'count'})
        pd.testing.assert_series_equal(
            run(by_dict),
            data.agg({'col_sum': 'sum', 'col_count': 'count'}))

        # keyword (named) aggregations
        by_keyword = chunked.agg(col_var='var', col_skew='skew')
        pd.testing.assert_series_equal(
            run(by_keyword),
            data.agg(col_var='var', col_skew='skew'))
Пример #6
0
    def testSeriesExpandingAgg(self):
        """Compare ``Series.expanding(...).agg`` with pandas on a series that
        contains NaN runs, for single-chunk and chunked inputs."""
        def run(tileable):
            # execute and return the concatenated result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(np.random.rand(10), name='a')
        raw[:3] = np.nan
        raw[5:7] = np.nan

        # single chunk: list input gives a frame, string input gives a series
        whole = md.Series(raw, chunk_size=10)

        pd.testing.assert_frame_equal(run(whole.expanding().agg(['sum'])),
                                      raw.expanding().agg(['sum']))
        pd.testing.assert_series_equal(run(whole.expanding().agg('sum')),
                                       raw.expanding().agg('sum'))

        # several chunks
        chunked = md.Series(raw, chunk_size=3)
        aggs = ['sum', 'count', 'min', 'max', 'mean', 'var', 'std']

        for fun_name in aggs:
            pd.testing.assert_series_equal(
                run(chunked.expanding().agg(fun_name)),
                raw.expanding().agg(fun_name))

        for spec in (['sum'], aggs):
            pd.testing.assert_frame_equal(run(chunked.expanding().agg(spec)),
                                          raw.expanding().agg(spec))

        # explicit first positional argument to expanding()
        for first_arg in (2, 0):
            pd.testing.assert_frame_equal(
                run(chunked.expanding(first_arg).agg(aggs)),
                raw.expanding(first_arg).agg(aggs))
Пример #7
0
    def testFetchDataFrameSlices(self, *_):
        """Fetch ``iloc`` slices of an executed DataFrame through the cluster
        session and through a web session, and compare them with the same
        slices of the full result; also compare reprs with pandas."""
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:
            session = cluster.session
            df = md.DataFrame(mt.random.rand(10, 10, chunk_size=3))

            # (iloc key, comparison function) in the order they are checked
            frame_eq = pd.testing.assert_frame_equal
            series_eq = pd.testing.assert_series_equal
            slice_cases = (
                (np.s_[:2], frame_eq),
                (np.s_[2:8, 2:8], frame_eq),
                (np.s_[:, 2:], frame_eq),
                (np.s_[:, -5:], frame_eq),
                (np.s_[4], series_eq),
                (np.s_[6:9], frame_eq),
            )

            def check_slices(sess):
                # run the whole frame once, then fetch each slice separately
                full = sess.run(df)
                for key, compare in slice_cases:
                    fetched = sess.fetch(df.iloc[key])
                    compare(full.iloc[key], fetched)

            check_slices(session)

            # test repr
            pdf = pd.DataFrame(np.random.randint(1000, size=(80, 10)))
            df2 = md.DataFrame(pdf, chunk_size=41)
            self.assertEqual(repr(df2.execute(session=session)), repr(pdf))

            ps = pdf[0]
            s = md.Series(ps, chunk_size=41)
            self.assertEqual(repr(s.execute(session=session)), repr(ps))

            # same slice checks through the web endpoint
            web_session = new_session('http://' + cluster._web_endpoint)
            check_slices(web_session)
Пример #8
0
    def testILocGetItem(self):
        """Compare ``iloc`` lookups on a chunked Mars DataFrame and Series
        with the same lookups in pandas."""
        frame_eq = pd.testing.assert_frame_equal
        series_eq = pd.testing.assert_series_equal

        raw_df = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                              index=['a1', 'a2', 'a3'],
                              columns=['x', 'y', 'z'])
        mars_df = md.DataFrame(raw_df, chunk_size=2)

        # plain index: a single row comes back as a series
        series_eq(raw_df.iloc[1], mars_df.iloc[1].execute())

        # slice index, plain fancy index, fancy index: all return frames
        frame_keys = (
            np.s_[:, 2:4],
            np.s_[[0], [0, 1, 2]],
            np.s_[[1, 2], [0, 1, 2]],
        )
        for key in frame_keys:
            frame_eq(raw_df.iloc[key], mars_df.iloc[key].execute())

        # plain index: row and column give a scalar
        self.assertEqual(raw_df.iloc[1, 2], mars_df.iloc[1, 2].execute())

        # test Series
        data = pd.Series(np.arange(10))

        head = md.Series(data, chunk_size=3).iloc[:3]
        series_eq(head.execute(), data.iloc[:3])

        single = md.Series(data, chunk_size=3).iloc[4]
        self.assertEqual(single.execute(), data.iloc[4])

        picked = md.Series(data, chunk_size=3).iloc[[2, 3, 4, 9]]
        series_eq(picked.execute(), data.iloc[[2, 3, 4, 9]])

        tail = md.Series(data).iloc[5:]
        series_eq(tail.execute(), data.iloc[5:])
Пример #9
0
def test_custom_series_aggregate(setup, check_ref_counts):
    """Compare ``Series.agg`` with custom reduction objects against pandas.

    The single-chunk results are compared exactly; the chunked result is
    compared with ``pytest.approx``.
    """
    data = pd.Series(np.random.rand(20))

    # single chunk
    s = md.Series(data)
    result = s.agg(MockReduction1())
    assert result.execute().fetch() == data.agg(MockReduction1())

    result = s.agg(MockReduction2())
    assert result.execute().fetch() == data.agg(MockReduction2())

    # several chunks
    # NOTE(review): only MockReduction2 is exercised here; confirm whether
    # MockReduction1 was meant to be covered for chunked input as well.
    s = md.Series(data, chunk_size=5)
    result = s.agg(MockReduction2())
    assert pytest.approx(result.execute().fetch()) == data.agg(
        MockReduction2())
Пример #10
0
def test_groupby_transform(setup):
    """Compare Mars ``groupby(...).transform`` with pandas on a DataFrame and
    a Series; calls made with ``_call_agg=True`` are checked against pandas
    ``agg``. Both sides are sorted by index before comparison."""
    frame_eq = pd.testing.assert_frame_equal
    series_eq = pd.testing.assert_series_equal

    def fetch_sorted(tileable):
        return tileable.execute().fetch().sort_index()

    def transform_series(s, truncate=True):
        # sort by index; optionally drop the last row and renumber the rest
        s = s.sort_index()
        if truncate and len(s.index) > 1:
            s = s.iloc[:-1].reset_index(drop=True)
        return s

    df1 = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
        'c': list('aabaaddce'),
        'd': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'e': [1, 3, 4, 5, 6, 5, 4, 4, 4],
        'f': list('aabaaddce'),
    })
    mdf = md.DataFrame(df1, chunk_size=3)

    # user function with an extra keyword argument
    r = mdf.groupby('b').transform(transform_series, truncate=False)
    frame_eq(
        fetch_sorted(r),
        df1.groupby('b').transform(transform_series,
                                   truncate=False).sort_index())

    # cumulative aggregations are skipped on pandas 1.1.0
    if pd.__version__ != '1.1.0':
        cumulative_specs = (
            ['cummax', 'cumsum'],
            ['cummax', 'cumsum'],
            OrderedDict([('d', 'cummax'), ('b', 'cumsum')]),
        )
        for spec in cumulative_specs:
            r = mdf.groupby('b').transform(spec, _call_agg=True)
            frame_eq(fetch_sorted(r), df1.groupby('b').agg(spec).sort_index())

    # mix of a named aggregation and a lambda
    agg_list = ['sum', lambda s: s.sum()]
    r = mdf.groupby('b').transform(agg_list, _call_agg=True)
    frame_eq(fetch_sorted(r), df1.groupby('b').agg(agg_list).sort_index())

    # series cases
    series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
    ms1 = md.Series(series1, chunk_size=3)

    r = ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1)
    series_eq(
        fetch_sorted(r),
        series1.groupby(lambda x: x % 3).transform(
            lambda x: x + 1).sort_index())

    r = ms1.groupby(lambda x: x % 3).transform('cummax', _call_agg=True)
    series_eq(
        fetch_sorted(r),
        series1.groupby(lambda x: x % 3).agg('cummax').sort_index())

    agg_list = ['cummax', 'cumcount']
    r = ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True)
    frame_eq(
        fetch_sorted(r),
        series1.groupby(lambda x: x % 3).agg(agg_list).sort_index())
Пример #11
0
    def testReplace(self):
        """Check the chunk layout and operand attributes that tiling
        ``replace`` produces for a DataFrame and a Series."""
        def check_ffill(tiled, n_chunks, head_shape, tail_input_shape):
            # method='ffill': combine-stage chunks fed by map-stage inputs
            self.assertEqual(len(tiled.chunks), n_chunks)
            head = tiled.chunks[0]
            self.assertEqual(head.shape, head_shape)
            self.assertEqual(head.op.stage, OperandStage.combine)
            self.assertEqual(head.op.method, 'ffill')
            self.assertIsNone(head.op.limit)
            tail_input = tiled.chunks[-1].inputs[-1]
            self.assertEqual(tail_input.shape, tail_input_shape)
            self.assertEqual(tail_input.op.stage, OperandStage.map)
            self.assertEqual(tail_input.op.method, 'ffill')
            self.assertIsNone(tail_input.op.limit)

        def check_plain(tiled, n_chunks, head_shape):
            # value replacement: no stage and no limit on the operand
            self.assertEqual(len(tiled.chunks), n_chunks)
            head = tiled.chunks[0]
            self.assertEqual(head.shape, head_shape)
            self.assertIsNone(head.op.stage)
            self.assertIsNone(head.op.limit)

        # dataframe cases
        df_raw = pd.DataFrame(-1,
                              index=range(20),
                              columns=list('ABCDEFGHIJ'))
        # scatter random values over random cells
        for _ in range(30):
            df_raw.iloc[random.randint(0, 19),
                        random.randint(0, 9)] = random.randint(0, 99)
        # fill a few random rows completely
        for _ in range(random.randint(1, 5)):
            row = random.randint(0, 19)
            for col in range(10):
                df_raw.iloc[row, col] = random.randint(0, 99)

        # not supporting fill with limit
        df = md.DataFrame(df_raw, chunk_size=4)
        with self.assertRaises(NotImplementedError):
            df.replace(-1, method='ffill', limit=5)

        check_ffill(df.replace(-1, method='ffill').tiles(), 15, (4, 4), (1, 2))
        check_plain(df.replace(-1, 99).tiles(), 15, (4, 4))

        # series cases
        series_raw = pd.Series(-1, index=range(20))
        for _ in range(10):
            series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
        series = md.Series(series_raw, chunk_size=4)

        check_ffill(series.replace(-1, method='ffill').tiles(),
                    5, (4, ), (1, ))
        check_plain(series.replace(-1, 99).tiles(), 5, (4, ))
Пример #12
0
def test_series_expanding_agg(setup):
    """Compare Mars ``Series.ewm(...).agg`` with pandas on a series that
    contains NaN values.

    NOTE(review): despite its name, this test only exercises ``ewm``.
    """
    frame_eq = pd.testing.assert_frame_equal
    series_eq = pd.testing.assert_series_equal

    def fetch(tileable):
        return tileable.execute().fetch()

    raw = pd.Series(np.random.rand(10), name='a')
    raw[:3] = np.nan
    raw[5:10:2] = np.nan

    # single chunk: list input gives a frame, string input gives a series
    whole = md.Series(raw, chunk_size=10)

    frame_eq(fetch(whole.ewm(alpha=0.3).agg(['mean'])),
             raw.ewm(alpha=0.3).agg(['mean']))
    series_eq(fetch(whole.ewm(alpha=0.3).agg('mean')),
              raw.ewm(alpha=0.3).agg('mean'))

    # several chunks
    chunked = md.Series(raw, chunk_size=3)
    aggs = ['mean', 'var', 'std']

    for fun_name in aggs:
        series_eq(fetch(chunked.ewm(alpha=0.3).agg(fun_name)),
                  raw.ewm(alpha=0.3).agg(fun_name))
        series_eq(fetch(chunked.ewm(alpha=0.3, ignore_na=True).agg(fun_name)),
                  raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name))

    for spec in (['mean'], aggs):
        frame_eq(fetch(chunked.ewm(alpha=0.3).agg(spec)),
                 raw.ewm(alpha=0.3).agg(spec))

    for periods in (0, 2):
        frame_eq(fetch(chunked.ewm(alpha=0.3, min_periods=periods).agg(aggs)),
                 raw.ewm(alpha=0.3, min_periods=periods).agg(aggs))
Пример #13
0
    def testRepr(self):
        """Compare the repr of executed tensors, DataFrames and Series with
        numpy/pandas, and check type names in the Index, groupby and
        Categorical reprs."""
        # test tensor repr
        with np.printoptions(threshold=100):
            arr = np.random.randint(1000, size=(11, 4, 13))
            t = mt.tensor(arr, chunk_size=3)
            self.assertEqual(repr(t.execute()), repr(arr))

        for size in (5, 58, 60, 62, 64):
            half = size // 2
            pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10)))

            # test DataFrame repr
            df = md.DataFrame(pdf, chunk_size=half)
            self.assertEqual(repr(df.execute()), repr(pdf),
                             f'failed repr for DataFrame when size = {size}')

            # test DataFrame _repr_html_
            self.assertEqual(
                df.execute()._repr_html_(), pdf._repr_html_(),
                f'failed repr html for DataFrame when size = {size}')

            # test Series repr
            ps = pdf[0]
            s = md.Series(ps, chunk_size=half)
            self.assertEqual(repr(s.execute()), repr(ps),
                             f'failed repr for Series when size = {size}')

        # test Index repr
        dates = pd.date_range('2020-1-1', periods=10)
        ind = md.Index(dates, chunk_size=5)
        self.assertIn('DatetimeIndex', repr(ind.execute()))

        # test groupby repr
        raw_groups = pd.DataFrame(np.random.rand(100, 3), columns=list('abc'))
        grouped = md.DataFrame(raw_groups).groupby(['a', 'b']).execute()
        self.assertIn('DataFrameGroupBy', repr(grouped))

        # test Categorical repr: before execution, then against pandas
        c = md.qcut(range(5), 3)
        for text in (repr(c), str(c)):
            self.assertIn('Categorical', text)
        self.assertEqual(repr(c.execute()), repr(pd.qcut(range(5), 3)))
Пример #14
0
    def testGPUExecution(self):
        """Run reductions, aggregations and ``unique`` on objects moved to the
        GPU with ``to_gpu`` and compare the results with pandas."""
        # DataFrame reductions and aggregation
        df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

        r = df.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

        r = df.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

        r = df.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(res.to_pandas(),
                                      df_raw.agg(['sum', 'var']))

        # Series reductions and aggregation
        s_raw = pd.Series(np.random.rand(30))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.sum())

        r = s.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.kurt())

        r = s.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(),
                                       s_raw.agg(['sum', 'var']))

        # Series.unique on integer data with repeated values
        s_raw = pd.Series(
            np.random.randint(0, 3, size=(30, )) *
            np.random.randint(0, 5, size=(30, )))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.unique()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        # ndarray.sort() sorts in place and returns None, so comparing its
        # return values would always pass; np.sort returns sorted copies.
        np.testing.assert_array_equal(
            np.sort(cp.asnumpy(res)),
            np.sort(s_raw.unique()))
Пример #15
0
    def testGroupBy(self):
        """Execute Mars DataFrame and Series groupby objects and compare each
        resulting group with the matching pandas group."""
        def run(grouped):
            # execute and return the concatenated result
            return self.executor.execute_dataframe(grouped, concat=True)[0]

        def make_columns():
            return {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            }

        def check_frame(raw_df):
            # group a chunked frame by column 'b'
            mars_groups = run(md.DataFrame(raw_df, chunk_size=3).groupby('b'))
            pandas_groups = raw_df.groupby('b')
            for key, group in mars_groups:
                pd.testing.assert_frame_equal(group,
                                              pandas_groups.get_group(key))

        def check_series(raw_series, key_func):
            # group a chunked series by a function of its index labels
            mars_series = md.Series(raw_series, chunk_size=3)
            mars_groups = run(mars_series.groupby(key_func))
            pandas_groups = raw_series.groupby(key_func)
            for key, group in mars_groups:
                pd.testing.assert_series_equal(group,
                                               pandas_groups.get_group(key))

        values = [3, 4, 5, 3, 5, 4, 1, 2, 3]
        labels = ['i' + str(i) for i in range(9)]

        # default integer index, then string index
        check_frame(pd.DataFrame(make_columns()))
        check_frame(pd.DataFrame(make_columns(), index=labels))

        check_series(pd.Series(values), lambda x: x % 3)
        check_series(pd.Series(values, index=labels),
                     lambda x: int(x[1:]) % 3)
Пример #16
0
 def setUp(self):
     """Create random ``X``, ``y`` and ``weight`` tensors from a RandomState
     seeded with 0, plus DataFrame/Series wrappers of ``X`` and ``y``."""
     n_rows, n_columns, chunk_size = 1000, 10, 20
     rs = mt.random.RandomState(0)

     # the tensors are drawn from rs in this order: X, y, weight
     self.X = rs.rand(n_rows, n_columns, chunk_size=chunk_size)
     self.y = rs.rand(n_rows, chunk_size=chunk_size)

     self.X_df = md.DataFrame(self.X)
     self.y_series = md.Series(self.y)

     self.weight = rs.rand(n_rows, chunk_size=chunk_size)
Пример #17
0
def test_series_level_reduction(setup, func_name, func_opts: FunctionOptions):
    """Compare the level-wise reduction named ``func_name`` on a chunked Mars
    Series with pandas, for integer data and for data containing NaN."""
    def compute(data, **kwargs):
        return getattr(data, func_name)(**kwargs)

    def check(raw, **kwargs):
        # the Mars side always uses method='tree'; both sides are sorted
        mars_result = compute(md.Series(raw, chunk_size=13),
                              **kwargs, method='tree')
        pd.testing.assert_series_equal(
            compute(raw, **kwargs).sort_index(),
            mars_result.execute().fetch().sort_index())

    rs = np.random.RandomState(0)
    idx = pd.MultiIndex.from_arrays([[str(i) for i in range(100)],
                                     rs.choice(['A', 'B'], size=(100, ))],
                                    names=['a', 'b'])
    data = pd.Series(rs.randint(0, 8, size=(100, )), index=idx)

    check(data, level=1)

    # test null
    data = pd.Series(rs.rand(100), name='a', index=idx)
    idx_df = idx.to_frame()
    data[data > 0.5] = np.nan
    # presumably guarantees a non-null entry for each value of level 'b'
    for letter in ('A', 'B'):
        data[int(idx_df[idx_df.b == letter].iloc[0, 0])] = 0.1

    check(data, level=1)
    check(data, level=1, skipna=False)

    if func_opts.has_min_count:
        check(data, min_count=1, level=1)
Пример #18
0
def test_series_getitem():
    """Check shape, nsplits and per-chunk labels of tiled Series ``__getitem__``.

    Covers a scalar key and a list key, first on a default integer index and
    then on a string index.
    """
    # default integer index
    raw = pd.Series(np.random.rand(10, ), name='a')
    mseries = md.Series(raw, chunk_size=3)

    scalar_pick = mseries[2]
    assert scalar_pick.shape == ()

    scalar_pick = tile(scalar_pick)
    assert scalar_pick.nsplits == ()
    assert len(scalar_pick.chunks) == 1
    only_chunk = scalar_pick.chunks[0]
    assert isinstance(only_chunk, TENSOR_CHUNK_TYPE)
    assert only_chunk.shape == ()
    assert only_chunk.dtype == raw.dtype

    list_pick = mseries[[4, 5, 1, 2, 3]]
    assert list_pick.shape == (5, )

    list_pick = tile(list_pick)
    assert list_pick.nsplits == ((2, 2, 1), )
    assert len(list_pick.chunks) == 3
    for position, expected_labels in enumerate(([4, 5], [1, 2], [3])):
        assert list_pick.chunks[position].op.labels == expected_labels

    # string index
    raw = pd.Series(np.random.rand(10),
                    index=['i' + str(i) for i in range(10)])
    mseries = md.Series(raw, chunk_size=3)

    scalar_pick = mseries['i2']
    assert scalar_pick.shape == ()

    scalar_pick = tile(scalar_pick)
    assert scalar_pick.nsplits == ()
    assert scalar_pick.chunks[0].dtype == raw.dtype
    assert scalar_pick.chunks[0].op.labels == 'i2'

    list_pick = mseries[['i2', 'i4']]
    assert list_pick.shape == (2, )

    list_pick = tile(list_pick)
    assert list_pick.nsplits == ((2, ), )
    assert list_pick.chunks[0].dtype == raw.dtype
    assert list_pick.chunks[0].op.labels == ['i2', 'i4']
Пример #19
0
    def testInitializerExecution(self):
        """Round-trip a DataFrame and a Series with a two-array index through the executor."""
        def run(obj):
            return self.executor.execute_dataframe(obj, concat=True)[0]

        def two_level_index():
            # fresh arrays for every constructor call
            return [np.arange(20), np.arange(20, 0, -1)]

        pdf = pd.DataFrame(np.random.rand(20, 30), index=two_level_index())
        pd.testing.assert_frame_equal(
            pdf, run(md.DataFrame(pdf, chunk_size=(15, 10))))

        ps = pd.Series(np.random.rand(20), index=two_level_index(), name='a')
        pd.testing.assert_series_equal(
            ps, run(md.Series(ps, chunk_size=7)))
Пример #20
0
    def testSeriesGetitem(self):
        """Check shape, nsplits and per-chunk labels of tiled Series ``__getitem__``.

        Covers a scalar key and a list key, first on a default integer index
        and then on a string index.
        """
        data = pd.Series(np.random.rand(10, ), name='a')
        series = md.Series(data, chunk_size=3)

        # scalar key on the default integer index
        result1 = series[2]
        self.assertEqual(result1.shape, ())

        result1.tiles()
        self.assertEqual(result1.nsplits, ())
        self.assertEqual(len(result1.chunks), 1)
        self.assertIsInstance(result1.chunks[0], TENSOR_CHUNK_TYPE)
        self.assertEqual(result1.chunks[0].shape, ())
        self.assertEqual(result1.chunks[0].dtype, data.dtype)

        # list key on the default integer index
        result2 = series[[4, 5, 1, 2, 3]]
        self.assertEqual(result2.shape, (5, ))

        result2.tiles()
        self.assertEqual(result2.nsplits, ((2, 2, 1), ))
        self.assertEqual(len(result2.chunks), 3)
        self.assertEqual(result2.chunks[0].op.labels, [4, 5])
        self.assertEqual(result2.chunks[1].op.labels, [1, 2])
        self.assertEqual(result2.chunks[2].op.labels, [3])

        data = pd.Series(np.random.rand(10),
                         index=['i' + str(i) for i in range(10)])
        series = md.Series(data, chunk_size=3)

        # scalar key on a string index
        result1 = series['i2']
        self.assertEqual(result1.shape, ())

        result1.tiles()
        self.assertEqual(result1.nsplits, ())
        self.assertEqual(result1.chunks[0].dtype, data.dtype)
        # assertEqual, not assertTrue: assertTrue would take the expected
        # value as the failure message and pass for any truthy labels.
        self.assertEqual(result1.chunks[0].op.labels, 'i2')

        # list key on a string index
        result2 = series[['i2', 'i4']]
        self.assertEqual(result2.shape, (2, ))

        result2.tiles()
        self.assertEqual(result2.nsplits, ((2, ), ))
        self.assertEqual(result2.chunks[0].dtype, data.dtype)
        self.assertEqual(result2.chunks[0].op.labels, ['i2', 'i4'])
Пример #21
0
def test_replace():
    """Check the tiled chunk layout and operand attributes of ``replace``.

    Covers DataFrame and Series inputs, with ``method='ffill'`` and with a
    plain value replacement, and verifies that ``limit`` together with
    ``method`` raises ``NotImplementedError``.
    """
    def assert_ffill_layout(tiled, n_chunks, first_shape, last_input_shape):
        assert len(tiled.chunks) == n_chunks
        head_chunk = tiled.chunks[0]
        assert head_chunk.shape == first_shape
        assert head_chunk.op.stage == OperandStage.combine
        assert head_chunk.op.method == 'ffill'
        assert head_chunk.op.limit is None
        tail_input = tiled.chunks[-1].inputs[-1]
        assert tail_input.shape == last_input_shape
        assert tail_input.op.stage == OperandStage.map
        assert tail_input.op.method == 'ffill'
        assert tail_input.op.limit is None

    def assert_value_layout(tiled, n_chunks, first_shape):
        assert len(tiled.chunks) == n_chunks
        head_chunk = tiled.chunks[0]
        assert head_chunk.shape == first_shape
        assert head_chunk.op.stage is None
        assert head_chunk.op.limit is None

    # dataframe cases
    df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(30):
        # draw the value first, then the cell, as an assignment evaluates them
        value = random.randint(0, 99)
        row = random.randint(0, 19)
        col = random.randint(0, 9)
        df_raw.iloc[row, col] = value
    # overwrite every cell of a few randomly chosen rows
    for _ in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for col in range(10):
            df_raw.iloc[row, col] = random.randint(0, 99)

    # not supporting fill with limit
    df = md.DataFrame(df_raw, chunk_size=4)
    with pytest.raises(NotImplementedError):
        df.replace(-1, method='ffill', limit=5)

    assert_ffill_layout(tile(df.replace(-1, method='ffill')),
                        15, (4, 4), (1, 2))
    assert_value_layout(tile(df.replace(-1, 99)), 15, (4, 4))

    # series cases
    series_raw = pd.Series(-1, index=range(20))
    for _ in range(10):
        value = random.randint(0, 99)
        series_raw.iloc[random.randint(0, 19)] = value
    series = md.Series(series_raw, chunk_size=4)

    assert_ffill_layout(tile(series.replace(-1, method='ffill')),
                        5, (4, ), (1, ))
    assert_value_layout(tile(series.replace(-1, 99)), 5, (4, ))
Пример #22
0
def test_series_reduction(setup, check_ref_counts, func_name,
                          func_opts: FunctionOptions):
    """Compare scalar ``md.Series`` reductions with pandas.

    The reduction named by ``func_name`` is run on integer data under
    several chunkings (and once with ``axis='index'``), then on float data
    containing NaN with the default ``skipna``, with ``skipna=False`` and,
    when ``func_opts.has_min_count`` is set, with several ``min_count``
    values.
    """
    def compute(data, **kwargs):
        return getattr(data, func_name)(**kwargs)

    rs = np.random.RandomState(0)
    data = pd.Series(rs.randint(0, 8, (10, )),
                     index=[str(i) for i in range(10)],
                     name='a')

    # (md.Series construction kwargs, reduction kwargs); the axis='index'
    # case is listed once only, as running it twice adds no coverage.
    cases = (
        ({}, {}),
        ({'chunk_size': 6}, {}),
        ({'chunk_size': 3}, {}),
        ({'chunk_size': 4}, {'axis': 'index'}),
    )
    for series_kwargs, reduce_kwargs in cases:
        r = compute(md.Series(data, **series_kwargs), **reduce_kwargs)
        assert pytest.approx(compute(data, **reduce_kwargs)) == \
            r.execute().fetch()

    data = pd.Series(rs.rand(20), name='a')
    data[0] = 0.1  # make sure not all elements are NAN
    data[data > 0.5] = np.nan
    r = compute(md.Series(data, chunk_size=3))
    assert pytest.approx(compute(data)) == r.execute().fetch()

    r = compute(md.Series(data, chunk_size=3), skipna=False)
    assert np.isnan(r.execute().fetch())

    if func_opts.has_min_count:
        r = compute(md.Series(data, chunk_size=3), skipna=False, min_count=2)
        assert np.isnan(r.execute().fetch())

        r = compute(md.Series(data, chunk_size=3), min_count=1)
        assert pytest.approx(compute(data, min_count=1)) == r.execute().fetch()

        # min_count larger than the series length (20)
        r = compute(md.Series(data, chunk_size=3), min_count=21)
        assert np.isnan(r.execute().fetch())
Пример #23
0
    def testIndexOnly(self):
        """Execute DataFrame/Series objects built from an index only and compare with pandas."""
        def run(obj):
            return self.executor.execute_dataframe(obj, concat=True)[0]

        # index given as a plain list
        actual = run(md.DataFrame(index=[1, 2, 3]))
        pd.testing.assert_frame_equal(actual, pd.DataFrame(index=[1, 2, 3]))

        actual = run(md.Series(index=[1, 2, 3]))
        pd.testing.assert_series_equal(actual, pd.Series(index=[1, 2, 3]))

        # index given as an md.Index
        actual = run(md.DataFrame(index=md.Index([1, 2, 3])))
        pd.testing.assert_frame_equal(actual, pd.DataFrame(index=[1, 2, 3]))

        actual = run(md.Series(index=md.Index([1, 2, 3]), dtype=object))
        pd.testing.assert_series_equal(
            actual, pd.Series(index=[1, 2, 3], dtype=object))
Пример #24
0
    def testMainDataFrameWithoutEtcd(self):
        """Run DataFrame addition, ``sort_values`` and a plain Series through a session.

        Processes are started with ``etcd=False``; each result is fetched
        from the session and compared with the pandas equivalent.
        """
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        def run(obj):
            executed = obj.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        # default indexes, different chunk sizes
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        cols1 = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        cols2 = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

        # explicit, differing indexes and columns; chunked along columns only
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=list(cols1))
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(11, 1, -1),
                            columns=list(cols2))
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        # explicit, differing indexes and columns; chunked on both axes
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=list(cols1))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=list(cols2))
        df2 = md.DataFrame(raw2, chunk_size=6)
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        # sort by a column converted to strings
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        df1 = md.DataFrame(raw1, chunk_size=5)
        pd.testing.assert_frame_equal(run(df1.sort_values(0)),
                                      raw1.sort_values(0))

        # plain series round trip
        s1 = pd.Series(np.random.rand(10),
                       index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        pd.testing.assert_series_equal(run(series1), s1)
Пример #25
0
def test_gpu_execution(setup, check_ref_counts):
    """Run reductions, aggregations and ``unique`` on ``to_gpu`` data.

    Results are converted back to host objects (``to_pandas`` /
    ``cp.asnumpy``) and compared with the pandas results on the raw data.
    """
    df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
    df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

    r = df.sum()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

    r = df.kurt()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

    r = df.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(['sum', 'var']))

    s_raw = pd.Series(np.random.rand(30))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.sum()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.sum()

    r = s.kurt()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.kurt()

    r = s.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(['sum', 'var']))

    s_raw = pd.Series(
        np.random.randint(0, 3, size=(30, )) *
        np.random.randint(0, 5, size=(30, )))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.unique()
    res = r.execute().fetch()
    # np.sort returns a sorted copy; ndarray.sort() sorts in place and
    # returns None, which would make this comparison None vs None.
    np.testing.assert_array_equal(
        np.sort(cp.asnumpy(res)),
        np.sort(s_raw.unique()))
Пример #26
0
    def testInitializerExecution(self):
        """Execute DataFrame, Series and Index objects built from various inputs.

        Inputs are pandas objects with a two-array index, raw arrays or
        dicts combined with an ``md.date_range`` index, and a pandas
        ``IntervalIndex``; each executed result is compared with the pandas
        equivalent.
        """
        def run(obj):
            return self.executor.execute_dataframe(obj, concat=True)[0]

        arr = np.random.rand(20, 30)

        # pandas frame with a two-array index
        pdf = pd.DataFrame(arr, index=[np.arange(20), np.arange(20, 0, -1)])
        actual = run(md.DataFrame(pdf, chunk_size=(15, 10)))
        pd.testing.assert_frame_equal(pdf, actual)

        # raw array with an md.date_range index
        actual = run(
            md.DataFrame(arr, index=md.date_range('2020-1-1', periods=20)))
        expected = pd.DataFrame(
            arr, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_frame_equal(actual, expected)

        # dict data with an md.date_range index
        actual = run(
            md.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},
                         index=md.date_range('1/1/2010', periods=6, freq='D')))
        expected = pd.DataFrame(
            {"prices": [100, 101, np.nan, 100, 89, 88]},
            index=pd.date_range('1/1/2010', periods=6, freq='D'))
        pd.testing.assert_frame_equal(actual, expected)

        s = np.random.rand(20)

        # pandas series with a two-array index
        ps = pd.Series(s,
                       index=[np.arange(20), np.arange(20, 0, -1)],
                       name='a')
        actual = run(md.Series(ps, chunk_size=7))
        pd.testing.assert_series_equal(ps, actual)

        # raw array with an md.date_range index
        actual = run(
            md.Series(s, index=md.date_range('2020-1-1', periods=20)))
        expected = pd.Series(s, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_series_equal(actual, expected)

        # md.Index wrapped around another md.Index
        pi = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
        actual = run(md.Index(md.Index(pi)))
        pd.testing.assert_index_equal(pi, actual)
Пример #27
0
    def testGroupByApply(self):
        """Check metadata of tiled ``groupby(...).apply(...)`` results.

        Verifies dtype(s), shape, operand type, output type and chunk count
        for DataFrame and Series groupby-apply with functions that return a
        DataFrame, a Series or a scalar.
        """
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })

        def apply_df(df):
            return df.sort_index()

        def apply_series(s):
            return s.sort_index()

        def assert_series_output(tiled, expected_dtype):
            # Checks shared by every case that yields a series.
            self.assertEqual(tiled.dtype, expected_dtype)
            self.assertEqual(tiled.shape, (np.nan, ))
            self.assertEqual(tiled.op._op_type_, opcodes.APPLY)
            self.assertEqual(tiled.op.output_types[0], OutputType.series)
            self.assertEqual(len(tiled.chunks), 3)
            first_chunk = tiled.chunks[0]
            self.assertEqual(first_chunk.shape, (np.nan, ))
            self.assertEqual(first_chunk.dtype, expected_dtype)

        mdf = md.DataFrame(df1, chunk_size=3)

        # function returning a DataFrame
        applied = mdf.groupby('b').apply(apply_df).tiles()
        pd.testing.assert_series_equal(applied.dtypes, df1.dtypes)
        self.assertEqual(applied.shape, (np.nan, 3))
        self.assertEqual(applied.op._op_type_, opcodes.APPLY)
        self.assertEqual(applied.op.output_types[0], OutputType.dataframe)
        self.assertEqual(len(applied.chunks), 3)
        self.assertEqual(applied.chunks[0].shape, (np.nan, 3))
        pd.testing.assert_series_equal(applied.chunks[0].dtypes, df1.dtypes)

        # function returning a Series
        applied = mdf.groupby('b').apply(lambda df: df.a).tiles()
        assert_series_output(applied, df1.a.dtype)

        # function returning a scalar
        applied = mdf.groupby('b').apply(lambda df: df.a.sum()).tiles()
        assert_series_output(applied, df1.a.dtype)

        # Series grouped by a function of the index
        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)
        applied = ms1.groupby(lambda x: x % 3).apply(apply_series).tiles()
        assert_series_output(applied, series1.dtype)
Пример #28
0
def test_series_count(setup, check_ref_counts):
    """Compare ``md.Series.count`` with pandas on data containing NaN.

    Runs with the default chunking and with ``chunk_size`` 1 and 3.
    """
    values = np.random.rand(10)
    values[[2, 7, 9]] = np.nan
    data = pd.Series(values)

    # default chunking
    counted = md.Series(data).count().execute().fetch()
    assert counted == data.count()

    # explicit chunk sizes
    for chunk_size in (1, 3):
        chunked = md.Series(data, chunk_size=chunk_size)
        counted = chunked.count().execute().fetch()
        assert counted == data.count()
Пример #29
0
def test_series_bool_reduction(setup, check_ref_counts, func_name):
    """Compare the boolean reduction named by ``func_name`` with pandas.

    Runs on boolean data under several chunkings (and once with
    ``axis='index'``), then on float data containing NaN with the default
    ``skipna`` and with ``skipna=False``.
    """
    def reduce_with(obj, **options):
        return getattr(obj, func_name)(**options)

    rs = np.random.RandomState(0)
    labels = [str(i) for i in range(10)]
    data = pd.Series(rs.rand(10) > 0.5, index=labels, name='a')

    # default chunking: plain equality
    reduced = reduce_with(md.Series(data))
    assert reduce_with(data) == reduced.execute().fetch()

    # explicit chunk sizes: approximate equality
    for chunk_size in (6, 3):
        reduced = reduce_with(md.Series(data, chunk_size=chunk_size))
        assert pytest.approx(reduce_with(data)) == reduced.execute().fetch()

    reduced = reduce_with(md.Series(data, chunk_size=4), axis='index')
    assert pytest.approx(reduce_with(data, axis='index')) == \
        reduced.execute().fetch()

    # test null
    data = pd.Series(rs.rand(20), name='a')
    data[0] = 0.1  # make sure not all elements are NAN
    data[data > 0.5] = np.nan
    reduced = reduce_with(md.Series(data, chunk_size=3))
    assert reduce_with(data) == reduced.execute().fetch()

    reduced = reduce_with(md.Series(data, chunk_size=3), skipna=False)
    assert reduced.execute().fetch() is True
Пример #30
0
def test_series_cum_reduction(setup, check_ref_counts, func_name):
    """Compare the cumulative reduction named by ``func_name`` with pandas.

    Runs on random data under several chunkings (and once with
    ``axis='index'``), then on data containing NaN with the default
    ``skipna`` and with ``skipna=False``.
    """
    def reduce_with(obj, **options):
        return getattr(obj, func_name)(**options)

    def check(raw, series_kwargs, **options):
        # Build the md.Series reduction first, then the pandas reference.
        reduced = reduce_with(md.Series(raw, **series_kwargs), **options)
        pd.testing.assert_series_equal(reduce_with(raw, **options),
                                       reduced.execute().fetch())

    data = pd.Series(np.random.rand(20),
                     index=[str(i) for i in range(20)],
                     name='a')
    check(data, {})
    check(data, {'chunk_size': 6})
    check(data, {'chunk_size': 3})
    check(data, {'chunk_size': 4}, axis='index')

    data = pd.Series(np.random.rand(20), name='a')
    data[0] = 0.1  # make sure not all elements are NAN
    data[data > 0.5] = np.nan
    check(data, {'chunk_size': 3})
    check(data, {'chunk_size': 3}, skipna=False)