示例#1
0
    def testRechunkExecution(self):
        """Rechunked DataFrame, Series and Index tileables must execute back to the source data.

        Every case builds a pandas object, wraps it in a tileable, calls
        ``rechunk`` and compares the concatenated execution result with the
        original pandas object.
        """

        def run(tileable):
            # Execute with concat=True and take the first (only) result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame with default labels, rechunked with a 2-tuple.
        frame = pd.DataFrame(np.random.rand(8, 10))
        mars_frame = from_pandas_df(pd.DataFrame(frame), chunk_size=3)
        pd.testing.assert_frame_equal(frame, run(mars_frame.rechunk((3, 4))))

        # DataFrame with random integer index labels and random bytes column
        # labels, rechunked with a single int.
        frame = pd.DataFrame(
            np.random.rand(10, 10),
            index=np.random.randint(-100, 100, size=(10,)),
            columns=[np.random.bytes(10) for _ in range(10)],
        )
        mars_frame = from_pandas_df(frame)
        pd.testing.assert_frame_equal(frame, run(mars_frame.rechunk(5)))

        # test Series rechunk execution.
        values = pd.Series(np.random.rand(10,))
        mars_series = from_pandas_series(values)
        for size in (3, 1):
            pd.testing.assert_series_equal(values, run(mars_series.rechunk(size)))

        # test index rechunk execution
        labels = pd.Index(np.random.rand(10,))
        mars_index = from_pandas_index(labels)
        for size in (3, 1):
            pd.testing.assert_index_equal(labels, run(mars_index.rechunk(size)))
示例#2
0
    def testDrop(self):
        """Check ``drop`` on DataFrame, Series and Index tileables.

        Verifies that dropping an unknown column raises ``KeyError``, that
        passing a tileable as ``columns`` raises ``NotImplementedError``, and
        that a successful ``drop`` returns an object of the matching
        tileable type.
        """
        # test dataframe drop
        rs = np.random.RandomState(0)
        raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                           columns=['c' + str(i + 1) for i in range(8)])

        df = from_pandas_df(raw, chunk_size=3)

        # 'c9' is not among the generated columns c1..c8.
        with self.assertRaises(KeyError):
            df.drop(columns=['c9'])
        with self.assertRaises(NotImplementedError):
            df.drop(columns=from_pandas_series(pd.Series(['c9'])))

        columns = ['c2', 'c4', 'c5', 'c6']
        index = [3, 6, 7]
        r = df.drop(columns=columns, index=index)
        self.assertIsInstance(r, DATAFRAME_TYPE)

        # test series drop
        raw = pd.Series(rs.randint(1000, size=(20, )))
        series = from_pandas_series(raw, chunk_size=3)

        r = series.drop(index=index)
        self.assertIsInstance(r, SERIES_TYPE)

        # test index drop
        # Use ``permutation`` to get the shuffled labels 0..19: calling
        # ``RandomState.shuffle`` on a pandas Series is not a supported
        # in-place shuffle (NumPy warns that the result is not guaranteed).
        raw = pd.Index(rs.permutation(20))

        idx = from_pandas_index(raw)

        r = idx.drop(index)
        self.assertIsInstance(r, INDEX_TYPE)
示例#3
0
    def testFromPandasIndex(self):
        """A tileable built from a named ``DatetimeIndex`` keeps its metadata before and after tiling."""
        chunk_size = 4
        source = pd.date_range('2020-1-1', periods=10, name='date')
        index = from_pandas_index(source, chunk_size=chunk_size)

        # Metadata on the untiled object.
        self.assertIsInstance(index, DatetimeIndex)
        self.assertEqual(index.name, source.name)
        self.assertEqual(index.dtype, source.dtype)
        self.assertIsInstance(index.index_value.value, IndexValue.DatetimeIndex)

        tiled = index.tiles()

        # Each chunk is compared with the matching consecutive slice of the source.
        for pos, chunk in enumerate(tiled.chunks):
            begin = pos * chunk_size
            self.assertEqual(chunk.name, source.name)
            pd.testing.assert_index_equal(chunk.op.data, source[begin: begin + chunk_size])
            self.assertEqual(chunk.dtype, source.dtype)
            self.assertIsInstance(chunk.index_value.value, IndexValue.DatetimeIndex)
示例#4
0
def test_from_pandas_index():
    """A tileable built from a named ``DatetimeIndex`` keeps its metadata before and after tiling."""
    chunk_size = 4
    source = pd.date_range('2020-1-1', periods=10, name='date')
    index = from_pandas_index(source, chunk_size=chunk_size)

    # Metadata on the untiled object.
    assert isinstance(index, DatetimeIndex)
    assert index.name == source.name
    assert index.dtype == source.dtype
    assert isinstance(index.index_value.value, IndexValue.DatetimeIndex)

    tiled = tile(index)

    # Each chunk is compared with the matching consecutive slice of the source.
    for pos, chunk in enumerate(tiled.chunks):
        begin = pos * chunk_size
        assert chunk.name == source.name
        pd.testing.assert_index_equal(chunk.op.data, source[begin: begin + chunk_size])
        assert chunk.dtype == source.dtype
        assert isinstance(chunk.index_value.value, IndexValue.DatetimeIndex)
示例#5
0
def test_drop():
    """Check ``drop`` on DataFrame, Series and Index tileables.

    Verifies the error cases (unknown column -> ``KeyError``; tileable passed
    as ``columns`` -> ``NotImplementedError``), that dropping a column keeps
    the row index value both on the tileable and on its chunks, and that a
    successful ``drop`` returns an object of the matching tileable type.
    """
    # test dataframe drop
    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                       columns=['c' + str(i + 1) for i in range(8)])

    df = from_pandas_df(raw, chunk_size=8)

    # 'c9' is not among the generated columns c1..c8.
    with pytest.raises(KeyError):
        df.drop(columns=['c9'])
    with pytest.raises(NotImplementedError):
        df.drop(columns=from_pandas_series(pd.Series(['c9'])))

    # Dropping only a column must leave the row index untouched.
    r = df.drop(columns=['c1'])
    pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index)

    tiled = tile(r)
    # Walk the chunks in order, comparing each chunk's index value with the
    # next ``c.shape[0]`` labels of the raw index.
    start = 0
    for c in tiled.chunks:
        raw_index = raw.index[start:start + c.shape[0]]
        start += c.shape[0]
        pd.testing.assert_index_equal(raw_index, c.index_value.to_pandas())

    df = from_pandas_df(raw, chunk_size=3)

    columns = ['c2', 'c4', 'c5', 'c6']
    index = [3, 6, 7]
    r = df.drop(columns=columns, index=index)
    assert isinstance(r, DATAFRAME_TYPE)

    # test series drop
    raw = pd.Series(rs.randint(1000, size=(20, )))
    series = from_pandas_series(raw, chunk_size=3)

    r = series.drop(index=index)
    assert isinstance(r, SERIES_TYPE)

    # test index drop
    # Use ``permutation`` to get the shuffled labels 0..19: calling
    # ``RandomState.shuffle`` on a pandas Series is not a supported in-place
    # shuffle (NumPy warns that the result is not guaranteed).
    raw = pd.Index(rs.permutation(20))

    idx = from_pandas_index(raw)

    r = idx.drop(index)
    assert isinstance(r, INDEX_TYPE)
示例#6
0
    def testFromPandasIndexExecution(self):
        """Executing a chunked tileable built from a ``TimedeltaIndex`` returns the original index."""
        expected = pd.timedelta_range('1 days', periods=10)
        mars_index = from_pandas_index(expected, chunk_size=7)

        # concat=True; the first element of the returned sequence is the result.
        executed = self.executor.execute_dataframe(mars_index, concat=True)
        pd.testing.assert_index_equal(expected, executed[0])
示例#7
0
    def testCutExecution(self):
        """Compare ``cut`` execution results with ``pd.cut`` for a range of inputs.

        Covers Series and tensor inputs, list / tensor / interval-index bins,
        ``labels`` given as a list, a tensor or ``False``, ``retbins``,
        ``duplicates='drop'``, integer bins, constant input and input that
        contains infinity (expected to raise ``ValueError``).
        """
        rs = np.random.RandomState(0)
        raw = rs.random(15) * 1000
        s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
        bins = [10, 100, 500]
        ii = pd.interval_range(10, 500, 3)
        labels = ['a', 'b']

        t = tensor(raw, chunk_size=4)
        series = from_pandas_series(s, chunk_size=4)
        iii = from_pandas_index(ii, chunk_size=2)

        # cut on Series
        r = cut(series, bins)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, pd.cut(s, bins))

        r, b = cut(series, bins, retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, bins, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # cut on tensor
        r = cut(t, bins)
        # result and expected are categorical arrays, so compare element-wise
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        # one chunk
        r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, pd.cut(s, bins, right=False, include_lowest=True))

        # test labels
        r = cut(t, bins, labels=labels)
        # result and expected are categorical arrays, so compare element-wise
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        r = cut(t, bins, labels=False)
        # with labels=False pd.cut returns integer bin indicators, not
        # categories, so the result is executed and compared as a tensor
        result = self.executor.execute_tensor(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=False)
        np.testing.assert_array_equal(result, expected)

        # test labels which is tensor
        labels_t = tensor(['a', 'b'], chunk_size=1)
        r = cut(raw, bins, labels=labels_t, include_lowest=True)
        # result and expected are categorical arrays, so compare element-wise
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        # test labels=False
        r, b = cut(raw, ii, labels=False, retbins=True)
        # labels=False: bin indicators compared element-wise; the returned
        # bins are compared as an index
        r_result = self.executor.execute_tileable(r, concat=True)[0]
        b_result = self.executor.execute_tileable(b, concat=True)[0]
        r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
        for actual, wanted in zip(r_result, r_expected):
            np.testing.assert_equal(actual, wanted)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test bins which is md.IntervalIndex
        r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_dataframe(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test duplicates
        bins2 = [0, 2, 4, 6, 10, 10]
        r, b = cut(s, bins2, labels=False, retbins=True,
                   right=False, duplicates='drop')
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                        right=False, duplicates='drop')
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        ctx, executor = self._create_test_context(self.executor)
        with ctx:
            # test integer bins
            r = cut(series, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s, 3))

            r, b = cut(series, 3, right=False, retbins=True)
            r_result, b_result = executor.execute_dataframes([r, b])
            r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
            pd.testing.assert_series_equal(r_result, r_expected)
            np.testing.assert_array_equal(b_result, b_expected)

            # test min max same
            s2 = pd.Series([1.1] * 15)
            r = cut(s2, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s2, 3))

            # test inf exist
            s3 = s2.copy()
            # Positional assignment: on the default integer index ``s3[-1]``
            # is a label lookup and would append a new ``-1`` entry instead of
            # replacing the last element.
            s3.iloc[-1] = np.inf
            with self.assertRaises(ValueError):
                executor.execute_dataframes([cut(s3, 3)])
示例#8
0
def test_from_pandas_index_execution(setup):
    """Executing a chunked tileable built from a ``TimedeltaIndex`` returns the original index.

    ``setup`` is not referenced in the body; presumably it is a pytest
    fixture that prepares the execution session -- confirm against conftest.
    """
    expected = pd.timedelta_range('1 days', periods=10)
    mars_index = from_pandas_index(expected, chunk_size=7)

    fetched = mars_index.execute().fetch()
    pd.testing.assert_index_equal(expected, fetched)