def testRechunkExecution(self):
    # Rechunking must not change the data: every rechunked tileable is
    # executed with concat=True and compared with the pandas object it was
    # built from.

    # DataFrame with default labels, built with chunk_size=3 and rechunked
    # to (3, 4).
    frame_raw = pd.DataFrame(np.random.rand(8, 10))
    frame = from_pandas_df(pd.DataFrame(frame_raw), chunk_size=3)
    rechunked = frame.rechunk((3, 4))
    fetched = self.executor.execute_dataframe(rechunked, concat=True)[0]
    pd.testing.assert_frame_equal(frame_raw, fetched)

    # DataFrame with random integer row labels and random bytes column labels.
    values = np.random.rand(10, 10)
    row_labels = np.random.randint(-100, 100, size=(10,))
    column_labels = [np.random.bytes(10) for _ in range(10)]
    frame_raw = pd.DataFrame(values, index=row_labels, columns=column_labels)
    frame = from_pandas_df(frame_raw)
    rechunked = frame.rechunk(5)
    fetched = self.executor.execute_dataframe(rechunked, concat=True)[0]
    pd.testing.assert_frame_equal(frame_raw, fetched)

    # test Series rechunk execution.
    series_raw = pd.Series(np.random.rand(10,))
    series = from_pandas_series(series_raw)
    for new_chunk_size in (3, 1):
        rechunked = series.rechunk(new_chunk_size)
        fetched = self.executor.execute_dataframe(rechunked, concat=True)[0]
        pd.testing.assert_series_equal(series_raw, fetched)

    # test index rechunk execution
    index_raw = pd.Index(np.random.rand(10,))
    index = from_pandas_index(index_raw)
    for new_chunk_size in (3, 1):
        rechunked = index.rechunk(new_chunk_size)
        fetched = self.executor.execute_dataframe(rechunked, concat=True)[0]
        pd.testing.assert_index_equal(index_raw, fetched)
def testDrop(self):
    # Checks that drop() validates its arguments and returns the matching
    # tileable type for DataFrame, Series and Index inputs.

    # test dataframe drop
    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                       columns=['c' + str(i + 1) for i in range(8)])
    df = from_pandas_df(raw, chunk_size=3)

    # 'c9' is not one of the columns c1..c8.
    with self.assertRaises(KeyError):
        df.drop(columns=['c9'])
    # Labels passed as a (non-pandas) series tileable are rejected.
    with self.assertRaises(NotImplementedError):
        df.drop(columns=from_pandas_series(pd.Series(['c9'])))

    columns = ['c2', 'c4', 'c5', 'c6']
    index = [3, 6, 7]
    r = df.drop(columns=columns, index=index)
    self.assertIsInstance(r, DATAFRAME_TYPE)

    # test series drop
    raw = pd.Series(rs.randint(1000, size=(20, )))
    series = from_pandas_series(raw, chunk_size=3)
    r = series.drop(index=index)
    self.assertIsInstance(r, SERIES_TYPE)

    # test index drop
    # Build the shuffled labels 0..19 with permutation() rather than
    # shuffling a pandas Series in place: RandomState.shuffle is only
    # specified for ndarrays and mutable sequences and warns otherwise.
    raw = pd.Index(rs.permutation(20))
    idx = from_pandas_index(raw)
    r = idx.drop(index)
    self.assertIsInstance(r, INDEX_TYPE)
def testFromPandasIndex(self):
    # A named pandas date range must keep its name, dtype and index-value
    # kind on the tileable itself and on every chunk produced by tiling.
    chunk_size = 4
    pd_index = pd.date_range('2020-1-1', periods=10, name='date')
    tileable = from_pandas_index(pd_index, chunk_size=chunk_size)

    self.assertIsInstance(tileable, DatetimeIndex)
    self.assertEqual(tileable.name, pd_index.name)
    self.assertEqual(tileable.dtype, pd_index.dtype)
    self.assertIsInstance(tileable.index_value.value,
                          IndexValue.DatetimeIndex)

    tiled = tileable.tiles()
    for position, chunk in enumerate(tiled.chunks):
        # The chunk at `position` is compared with the matching
        # chunk_size-long slice of the source index.
        begin = position * chunk_size
        expected_slice = pd_index[begin:begin + chunk_size]

        self.assertEqual(chunk.name, pd_index.name)
        pd.testing.assert_index_equal(chunk.op.data, expected_slice)
        self.assertEqual(chunk.dtype, pd_index.dtype)
        self.assertIsInstance(chunk.index_value.value,
                              IndexValue.DatetimeIndex)
def test_from_pandas_index():
    # A named pandas date range must keep its name, dtype and index-value
    # kind on the tileable itself and on every chunk produced by tile().
    chunk_size = 4
    pd_index = pd.date_range('2020-1-1', periods=10, name='date')
    tileable = from_pandas_index(pd_index, chunk_size=chunk_size)

    assert isinstance(tileable, DatetimeIndex)
    assert tileable.name == pd_index.name
    assert tileable.dtype == pd_index.dtype
    assert isinstance(tileable.index_value.value, IndexValue.DatetimeIndex)

    tiled = tile(tileable)
    for position, chunk in enumerate(tiled.chunks):
        # The chunk at `position` is compared with the matching
        # chunk_size-long slice of the source index.
        begin = position * chunk_size
        expected_slice = pd_index[begin:begin + chunk_size]

        assert chunk.name == pd_index.name
        pd.testing.assert_index_equal(chunk.op.data, expected_slice)
        assert chunk.dtype == pd_index.dtype
        assert isinstance(chunk.index_value.value, IndexValue.DatetimeIndex)
def test_drop():
    # Checks that drop() validates its arguments, keeps index metadata
    # consistent after tiling, and returns the matching tileable type for
    # DataFrame, Series and Index inputs.

    # test dataframe drop
    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                       columns=['c' + str(i + 1) for i in range(8)])

    df = from_pandas_df(raw, chunk_size=8)

    # 'c9' is not one of the columns c1..c8.
    with pytest.raises(KeyError):
        df.drop(columns=['c9'])
    # Labels passed as a (non-pandas) series tileable are rejected.
    with pytest.raises(NotImplementedError):
        df.drop(columns=from_pandas_series(pd.Series(['c9'])))

    # Dropping a column must leave the row index untouched, both on the
    # tileable and on each chunk after tiling.
    r = df.drop(columns=['c1'])
    pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index)

    tiled = tile(r)
    start = 0
    for c in tiled.chunks:
        # Walk the raw index in steps of each chunk's row count.
        raw_index = raw.index[start:start + c.shape[0]]
        start += c.shape[0]
        pd.testing.assert_index_equal(raw_index, c.index_value.to_pandas())

    df = from_pandas_df(raw, chunk_size=3)

    columns = ['c2', 'c4', 'c5', 'c6']
    index = [3, 6, 7]
    r = df.drop(columns=columns, index=index)
    assert isinstance(r, DATAFRAME_TYPE)

    # test series drop
    raw = pd.Series(rs.randint(1000, size=(20, )))
    series = from_pandas_series(raw, chunk_size=3)
    r = series.drop(index=index)
    assert isinstance(r, SERIES_TYPE)

    # test index drop
    # Build the shuffled labels 0..19 with permutation() rather than
    # shuffling a pandas Series in place: RandomState.shuffle is only
    # specified for ndarrays and mutable sequences and warns otherwise.
    raw = pd.Index(rs.permutation(20))
    idx = from_pandas_index(raw)
    r = idx.drop(index)
    assert isinstance(r, INDEX_TYPE)
def testFromPandasIndexExecution(self):
    # Round trip: a 10-entry timedelta index wrapped with chunk_size=7 must
    # come back unchanged once executed and concatenated.
    expected = pd.timedelta_range('1 days', periods=10)
    tileable = from_pandas_index(expected, chunk_size=7)
    executed = self.executor.execute_dataframe(tileable, concat=True)
    fetched = executed[0]
    pd.testing.assert_index_equal(expected, fetched)
def testCutExecution(self):
    # Executes cut() for Series, tensor and raw ndarray inputs with list,
    # tensor, IntervalIndex and integer bins, and compares every result
    # with what pandas.cut returns for the same arguments.
    rs = np.random.RandomState(0)
    raw = rs.random(15) * 1000
    s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
    bins = [10, 100, 500]
    ii = pd.interval_range(10, 500, 3)
    labels = ['a', 'b']

    t = tensor(raw, chunk_size=4)
    series = from_pandas_series(s, chunk_size=4)
    iii = from_pandas_index(ii, chunk_size=2)

    # cut on Series
    r = cut(series, bins)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, pd.cut(s, bins))

    r, b = cut(series, bins, retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # cut on tensor
    r = cut(t, bins)
    # result and expected are arrays whose dtype is CategoricalDtype,
    # so they are compared element by element
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins)
    self.assertEqual(len(result), len(expected))
    for actual_item, expected_item in zip(result, expected):
        np.testing.assert_equal(actual_item, expected_item)

    # one chunk
    r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(
        result, pd.cut(s, bins, right=False, include_lowest=True))

    # test labels
    r = cut(t, bins, labels=labels)
    # result and expected are arrays whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels)
    self.assertEqual(len(result), len(expected))
    for actual_item, expected_item in zip(result, expected):
        np.testing.assert_equal(actual_item, expected_item)

    r = cut(t, bins, labels=False)
    # with labels=False pandas returns integer bin indicators, so the
    # result is executed as a tensor and compared as a plain array
    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=False)
    np.testing.assert_array_equal(result, expected)

    # test labels which is tensor
    labels_t = tensor(['a', 'b'], chunk_size=1)
    r = cut(raw, bins, labels=labels_t, include_lowest=True)
    # result and expected are arrays whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
    self.assertEqual(len(result), len(expected))
    for actual_item, expected_item in zip(result, expected):
        np.testing.assert_equal(actual_item, expected_item)

    # test labels=False
    r, b = cut(raw, ii, labels=False, retbins=True)
    # integer bin indicators again; the returned bins are an index here
    # because the bins were given as an IntervalIndex
    r_result = self.executor.execute_tileable(r, concat=True)[0]
    b_result = self.executor.execute_tileable(b, concat=True)[0]
    r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
    for actual_item, expected_item in zip(r_result, r_expected):
        np.testing.assert_equal(actual_item, expected_item)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test bins which is md.IntervalIndex
    r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_dataframe(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test duplicates
    bins2 = [0, 2, 4, 6, 10, 10]
    r, b = cut(s, bins2, labels=False, retbins=True,
               right=False, duplicates='drop')
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                    right=False, duplicates='drop')
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    ctx, executor = self._create_test_context(self.executor)
    with ctx:
        # test integer bins
        r = cut(series, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s, 3))

        r, b = cut(series, 3, right=False, retbins=True)
        r_result, b_result = executor.execute_dataframes([r, b])
        r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # test min max same
        s2 = pd.Series([1.1] * 15)
        r = cut(s2, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s2, 3))

        # test inf exist
        s3 = s2.copy()
        # Positional assignment: on this default integer-labelled Series,
        # `s3[-1] = ...` would add a new label -1 instead of replacing the
        # last element.
        s3.iloc[-1] = np.inf
        with self.assertRaises(ValueError):
            executor.execute_dataframes([cut(s3, 3)])
def test_from_pandas_index_execution(setup):
    # Round trip: a 10-entry timedelta index wrapped with chunk_size=7 must
    # come back unchanged after execute().fetch().
    expected = pd.timedelta_range('1 days', periods=10)
    tileable = from_pandas_index(expected, chunk_size=7)
    executed = tileable.execute()
    fetched = executed.fetch()
    pd.testing.assert_index_equal(expected, fetched)