def test_comp(setup):
    """Exercise the six rich-comparison operators on DataFrame objects.

    Covers frame-vs-frame comparison (the result index must equal the left
    operand's index), the ``ValueError`` raised when the other frame has a
    different index or different columns, and comparison of a datetime
    frame against a ``datetime`` scalar.
    """
    comparisons = (operator.eq, operator.ne, operator.lt,
                   operator.gt, operator.le, operator.ge)

    left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
    right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # Under build mode the data of two distinct frames must compare unequal,
    # while data must compare equal to itself.
    with enter_mode(build=True):
        assert not left.data == right.data
        assert left.data == left.data

    for compare in comparisons:
        compared = compare(left, right)
        actual_index = compared.index_value.to_pandas()
        expected_index = left.index_value.to_pandas()
        pd.testing.assert_index_equal(actual_index, expected_index)

        # A frame whose index differs from ``left`` must be rejected.
        other_index = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
        with pytest.raises(ValueError):
            compare(left, other_index)

        # A frame whose columns differ from ``left`` must be rejected.
        other_columns = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
        with pytest.raises(ValueError):
            compare(left, other_columns)

    # Comparing a datetime frame with a scalar keeps the frame's index.
    dates = DataFrame(pd.DataFrame(pd.date_range('20130101', periods=6)))
    for compare in comparisons:
        compared = compare(dates, datetime(2013, 1, 2))
        actual_index = compared.index_value.to_pandas()
        expected_index = dates.index_value.to_pandas()
        pd.testing.assert_index_equal(actual_index, expected_index)
def testComp(self):
    # Exercises the six rich-comparison operators on DataFrame objects:
    # the result index must equal the left operand's index, and operands
    # with a different index or different columns must raise ValueError.
    comparisons = (operator.eq, operator.ne, operator.lt,
                   operator.gt, operator.le, operator.ge)

    left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
    right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # Under build mode the data of two distinct frames must compare
    # unequal, while data must compare equal to itself.
    with build_mode():
        self.assertFalse(left.data == right.data)
        self.assertTrue(left.data == left.data)

    for compare in comparisons:
        compared = compare(left, right)
        actual_index = compared.index_value.to_pandas()
        expected_index = left.index_value.to_pandas()
        pd.testing.assert_index_equal(actual_index, expected_index)

        # A frame whose index differs from ``left`` must be rejected.
        other_index = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
        with self.assertRaises(ValueError):
            compare(left, other_index)

        # A frame whose columns differ from ``left`` must be rejected.
        other_columns = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
        with self.assertRaises(ValueError):
            compare(left, other_columns)
def testSortValues(self):
    # Checks the shape, operand type and tiling result of
    # ``dataframe_sort_values`` for an unchunked frame and for frames built
    # with chunk_size 6 and chunk_size 3.
    n_rows = 10
    raw = pd.DataFrame(
        {
            'a': np.random.rand(n_rows),
            'b': np.random.randint(1000, size=n_rows),
            'c': np.random.rand(n_rows),
            'd': [np.random.bytes(10) for _ in range(n_rows)],
            'e': [pd.Timestamp(f'201{i}') for i in range(n_rows)],
            'f': [pd.Timedelta(f'{i} days') for i in range(n_rows)]
        },
    )

    def sort_and_tile(by, **frame_kwargs):
        # Shared checks on the untiled result, then return the tiled one.
        result = dataframe_sort_values(DataFrame(raw, **frame_kwargs), by=by)
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortValues)
        return result.tiles()

    # No chunk_size given: a single chunk carrying the sort operand itself.
    tiled = sort_and_tile('c')
    self.assertEqual(len(tiled.chunks), 1)
    self.assertIsInstance(tiled.chunks[0].op, DataFrameSortValues)

    # chunk_size=6: two chunks, the first produced by a reduce-stage operand.
    tiled = sort_and_tile('c', chunk_size=6)
    self.assertEqual(len(tiled.chunks), 2)
    self.assertEqual(tiled.chunks[0].op.stage, OperandStage.reduce)

    # chunk_size=3 with two sort keys: three reduce-stage chunks whose
    # indexes are the consecutive ranges [0, 3), [3, 6) and [6, 10).
    tiled = sort_and_tile(['a', 'c'], chunk_size=3)
    self.assertEqual(len(tiled.chunks), 3)
    expected_indexes = (
        pd.RangeIndex(3),
        pd.RangeIndex(3, 6),
        pd.RangeIndex(6, 10),
    )
    for position, expected_index in enumerate(expected_indexes):
        chunk = tiled.chunks[position]
        self.assertEqual(chunk.op.stage, OperandStage.reduce)
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                      expected_index)
def testSortIndex(self):
    # Checks the shape, operand type and tiling result of ``sort_index``
    # along the default axis for several chunk sizes, and along axis 1.
    raw = pd.DataFrame(np.random.rand(10, 10),
                       columns=np.random.rand(10),
                       index=np.random.rand(10))

    def sort_and_tile(frame, **sort_kwargs):
        # Shared checks on the untiled result, then return the tiled one.
        result = sort_index(frame, **sort_kwargs)
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortIndex)
        return result.tiles()

    # No chunk_size given: a single chunk carrying the sort operand itself.
    tiled = sort_and_tile(DataFrame(raw))
    self.assertEqual(len(tiled.chunks), 1)
    self.assertIsInstance(tiled.chunks[0].op, DataFrameSortIndex)

    # chunk_size=6: two chunks, the first produced by a reduce-stage operand.
    tiled = sort_and_tile(DataFrame(raw, chunk_size=6))
    self.assertEqual(len(tiled.chunks), 2)
    self.assertEqual(tiled.chunks[0].op.stage, OperandStage.reduce)

    # chunk_size=3: three chunks, each produced by a reduce-stage operand.
    tiled = sort_and_tile(DataFrame(raw, chunk_size=3))
    self.assertEqual(len(tiled.chunks), 3)
    for position in range(3):
        self.assertEqual(tiled.chunks[position].op.stage,
                         OperandStage.reduce)

    # support on axis 1: every resulting chunk is a DataFrameIndex operand.
    tiled = sort_and_tile(DataFrame(raw, chunk_size=4), axis=1)
    self.assertTrue(
        all(isinstance(chunk.op, DataFrameIndex) for chunk in tiled.chunks))
def test_sort_index():
    """Check shape, operand type and tiling of ``sort_index``.

    The frame is sorted along the default axis without chunking and with
    ``chunk_size`` 6 and 3, then along ``axis=1`` with ``chunk_size`` 4.
    """
    raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10),
                       index=np.random.rand(10))

    # No chunk_size given: a single chunk carrying the sort operand itself.
    df = DataFrame(raw)
    sorted_df = sort_index(df)

    assert sorted_df.shape == raw.shape
    assert isinstance(sorted_df.op, DataFrameSortIndex)

    tiled = tile(sorted_df)

    assert len(tiled.chunks) == 1
    assert isinstance(tiled.chunks[0].op, DataFrameSortIndex)

    # chunk_size=6: two chunks, the first produced by a reduce-stage operand.
    df = DataFrame(raw, chunk_size=6)
    sorted_df = sort_index(df)

    assert sorted_df.shape == raw.shape
    assert isinstance(sorted_df.op, DataFrameSortIndex)

    tiled = tile(sorted_df)

    assert len(tiled.chunks) == 2
    assert tiled.chunks[0].op.stage == OperandStage.reduce

    # chunk_size=3: three chunks, each produced by a reduce-stage operand.
    df = DataFrame(raw, chunk_size=3)
    sorted_df = sort_index(df)

    assert sorted_df.shape == raw.shape
    assert isinstance(sorted_df.op, DataFrameSortIndex)

    tiled = tile(sorted_df)

    assert len(tiled.chunks) == 3
    assert tiled.chunks[0].op.stage == OperandStage.reduce
    assert tiled.chunks[1].op.stage == OperandStage.reduce
    assert tiled.chunks[2].op.stage == OperandStage.reduce

    # support on axis 1
    df = DataFrame(raw, chunk_size=4)
    sorted_df = sort_index(df, axis=1)

    assert sorted_df.shape == raw.shape
    assert isinstance(sorted_df.op, DataFrameSortIndex)

    tiled = tile(sorted_df)

    # ``all`` already returns a bool, so it is asserted directly rather
    # than compared against ``True`` by identity.
    assert all(isinstance(c.op, DataFrameIndex) for c in tiled.chunks)
def testFetchDataFrameCornerData(self):
    # Verifies that the corner data fetched for an executed DataFrame
    # renders (via ``to_string``) exactly like the full pandas frame, for
    # row counts below, at and just above pandas' ``display.max_rows``.
    max_rows = pd.get_option('display.max_rows')
    try:
        min_rows = pd.get_option('display.min_rows')
    except KeyError:  # pragma: no cover
        # Option not available in this pandas: fall back to max_rows.
        min_rows = max_rows
    sess = new_session()

    for row in (5, max_rows - 2, max_rows - 1, max_rows,
                max_rows + 1, max_rows + 2, max_rows + 3):
        pdf = pd.DataFrame(np.random.rand(row, 5))
        df = DataFrame(pdf, chunk_size=max_rows // 2)
        # Execute without fetching; only the corner is pulled afterwards.
        sess.run(df, fetch=False)

        corner = fetch_corner_data(df, session=sess)
        self.assertLessEqual(corner.shape[0], max_rows + 2)

        # When the frame exceeds max_rows, the corner is rendered with one
        # row less than it holds so that it is truncated as well.
        corner_max_rows = max_rows if row <= max_rows else corner.shape[0] - 1
        self.assertEqual(corner.to_string(max_rows=corner_max_rows, min_rows=min_rows),
                         pdf.to_string(max_rows=max_rows, min_rows=min_rows),
                         f'failed when row == {row}')
def test_sort_values():
    """Check shape, operand type and tiling of ``dataframe_sort_values``.

    The frame is sorted without chunking and with ``chunk_size`` 6 and 3;
    for the three-chunk case every chunk must keep the dtypes of the raw
    frame.
    """
    n_rows = 10
    raw = pd.DataFrame(
        {
            'a': np.random.rand(n_rows),
            'b': np.random.randint(1000, size=n_rows),
            'c': np.random.rand(n_rows),
            'd': [np.random.bytes(10) for _ in range(n_rows)],
            'e': [pd.Timestamp(f'201{i}') for i in range(n_rows)],
            'f': [pd.Timedelta(f'{i} days') for i in range(n_rows)]
        },
    )

    def sort_and_tile(by, **frame_kwargs):
        # Shared checks on the untiled result, then return the tiled one.
        result = dataframe_sort_values(DataFrame(raw, **frame_kwargs), by=by)
        assert result.shape == raw.shape
        assert isinstance(result.op, DataFrameSortValues)
        return tile(result)

    # No chunk_size given: a single chunk carrying the sort operand itself.
    tiled = sort_and_tile('c')
    assert len(tiled.chunks) == 1
    assert isinstance(tiled.chunks[0].op, DataFrameSortValues)

    # chunk_size=6: two chunks, the first produced by a reduce-stage operand.
    tiled = sort_and_tile('c', chunk_size=6)
    assert len(tiled.chunks) == 2
    assert tiled.chunks[0].op.stage == OperandStage.reduce

    # chunk_size=3 with two sort keys: three reduce-stage chunks, each
    # with the same dtypes as the raw frame.
    tiled = sort_and_tile(['a', 'c'], chunk_size=3)
    assert len(tiled.chunks) == 3
    for position in range(3):
        chunk = tiled.chunks[position]
        assert chunk.op.stage == OperandStage.reduce
        pd.testing.assert_series_equal(chunk.dtypes, raw.dtypes)
def test_build_concatenated_rows_frame(setup, columns):
    """Check ``build_concatenated_rows_frame`` for two chunk sizes.

    A 16x8 frame labelled with ``columns`` is tiled, concatenated, and the
    executed result is compared against the original pandas frame.
    """
    pdf = pd.DataFrame(np.random.rand(16, 8), columns=columns)

    # chunk_size=8: the concatenated frame is expected to have two chunks.
    tiled = tile(DataFrame(pdf, chunk_size=8))
    merged = build_concatenated_rows_frame(tiled)

    assert len(merged.chunks) == 2
    pd.testing.assert_frame_equal(merged.execute().fetch(), pdf)

    # chunk_size=5: four chunks are expected, and each one must carry the
    # full set of columns of the original frame.
    tiled = tile(DataFrame(pdf, chunk_size=5))
    merged = build_concatenated_rows_frame(tiled)

    assert len(merged.chunks) == 4
    for chunk in merged.chunks:
        chunk_columns = chunk.columns_value.to_pandas()
        pd.testing.assert_index_equal(chunk_columns, pdf.columns)
    pd.testing.assert_frame_equal(merged.execute().fetch(), pdf)
def test_comp(setup):
    """Exercise the six rich-comparison operators on DataFrame and Series.

    Covers frame-vs-frame comparison (the result index must equal the left
    operand's index), the ``ValueError`` raised when the other frame has a
    different index or different columns, comparison of a datetime frame
    against a ``datetime`` scalar, and ``>=`` on a period-typed Series.
    """
    comparisons = (operator.eq, operator.ne, operator.lt,
                   operator.gt, operator.le, operator.ge)

    left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
    right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # Under build mode the data of two distinct frames must compare unequal,
    # while data must compare equal to itself.
    with enter_mode(build=True):
        assert not left.data == right.data
        assert left.data == left.data

    for compare in comparisons:
        compared = compare(left, right)
        actual_index = compared.index_value.to_pandas()
        expected_index = left.index_value.to_pandas()
        pd.testing.assert_index_equal(actual_index, expected_index)

        # A frame whose index differs from ``left`` must be rejected.
        other_index = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
        with pytest.raises(ValueError):
            compare(left, other_index)

        # A frame whose columns differ from ``left`` must be rejected.
        other_columns = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
        with pytest.raises(ValueError):
            compare(left, other_columns)

    # Comparing a datetime frame with a scalar keeps the frame's index.
    dates = DataFrame(pd.DataFrame(pd.date_range('20130101', periods=6)))
    for compare in comparisons:
        compared = compare(dates, datetime(2013, 1, 2))
        actual_index = compared.index_value.to_pandas()
        expected_index = dates.index_value.to_pandas()
        pd.testing.assert_index_equal(actual_index, expected_index)

    # Period data: ``>=`` against one element must match pandas' result.
    periods = pd.period_range("2000-01-01", periods=10, freq="D")
    pandas_series = pd.Series(periods)
    chunked_series = Series(periods, chunk_size=5)
    result = chunked_series >= chunked_series[1]
    pd.testing.assert_series_equal(result.to_pandas(),
                                   pandas_series >= pandas_series[1])
def test_dataframe_params():
    """Check that chunk params can be refreshed from concrete pandas data.

    A filtered frame starts with NaN entries in its chunk's shape and index
    bounds; after assigning params derived from the real filtered data the
    NaNs must be gone, and the params can be pushed back to the tileable.
    """
    raw = pd.DataFrame({'a': [1, 2, 3]})

    frame = DataFrame(raw)
    filtered = tile(frame[frame['a'] < 2])
    chunk = filtered.chunks[0]

    # Before the update: shape and index bounds contain NaN.
    assert any(np.isnan(dim) for dim in chunk.params['shape'])
    assert np.isnan(chunk.params['index_value'].min_val)

    chunk.params = chunk.get_params_from_data(raw[raw['a'] < 2])

    # After the update: shape and index_value no longer contain NaN.
    assert all(not np.isnan(dim) for dim in chunk.params['shape'])
    assert not np.isnan(chunk.params['index_value'].min_val)

    # Propagate the chunk params, minus the 'index' entry, to the tileable.
    tileable_params = chunk.params.copy()
    tileable_params.pop('index', None)
    filtered.params = tileable_params

    assert np.prod(filtered.shape) > 0
    filtered.refresh_params()
def test_fetch_dataframe_corner_data(setup):
    """Compare rendered corner data with the rendering of the full frame.

    Frames with row counts below, at and just above pandas'
    ``display.max_rows`` are executed; the ``to_string`` output of their
    fetched corner must equal that of the original pandas frame.
    """
    max_rows = pd.get_option('display.max_rows')
    try:
        min_rows = pd.get_option('display.min_rows')
    except KeyError:  # pragma: no cover
        # Option not available in this pandas: fall back to max_rows.
        min_rows = max_rows

    # 5 rows, then max_rows - 2 up to and including max_rows + 3.
    row_counts = [5] + [max_rows + delta for delta in range(-2, 4)]

    for row in row_counts:
        pdf = pd.DataFrame(np.random.rand(row, 5))
        df = DataFrame(pdf, chunk_size=max_rows // 2)
        df.execute()

        corner = fetch_corner_data(df)
        assert corner.shape[0] <= max_rows + 2

        # When the frame exceeds max_rows, the corner is rendered with one
        # row less than it holds so that it is truncated as well.
        if row <= max_rows:
            corner_max_rows = max_rows
        else:
            corner_max_rows = corner.shape[0] - 1

        rendered_corner = corner.to_string(max_rows=corner_max_rows,
                                           min_rows=min_rows)
        rendered_full = pdf.to_string(max_rows=max_rows, min_rows=min_rows)
        assert rendered_corner == rendered_full
def test_assign(setup):
    """Check ``DataFrame.assign`` against pandas for several value kinds.

    Covers a Series expression, a callable, a plain list, and several new
    columns assigned in one call followed by an item assignment.
    """
    rng = np.random.RandomState(0)
    raw = pd.DataFrame({"A": rng.rand(10), "B": rng.rand(10)})
    df = DataFrame(raw, chunk_size=5)

    # Series expression as the new column.
    actual = df.assign(C=df.B / df.A).execute().fetch()
    pd.testing.assert_frame_equal(actual, raw.assign(C=raw.B / raw.A))

    # lambda syntax
    actual = df.assign(C=lambda x: x.B / x.A).execute().fetch()
    pd.testing.assert_frame_equal(actual,
                                  raw.assign(C=lambda x: x.B / x.A))

    # Non-Series array-like
    values = rng.rand(10).tolist()
    actual = df.assign(C=values).execute().fetch()
    pd.testing.assert_frame_equal(actual, raw.assign(C=values))

    # multiple: several columns at once, then cast 'C' on both sides.
    values = rng.rand(10).tolist()
    assigned = df.assign(C=values, D=df.A, E=lambda x: x.B)
    assigned['C'] = assigned['C'].astype('int64')
    reference = raw.assign(C=values, D=raw.A, E=lambda x: x.B)
    reference['C'] = reference['C'].astype('int64')
    pd.testing.assert_frame_equal(assigned.execute().fetch(), reference)
def test_groupby_params():
    """Check that groupby chunk params can be set from wrapped pandas data.

    Runs the same round trip for a DataFrame grouped by a column and for a
    Series grouped by index level, then refreshes the Series groupby params.
    """

    def push_chunk_params(tiled_groupby, pandas_groupby):
        # Derive the first chunk's params from real data, then copy them
        # (without the 'index' entry) onto the tileable itself.
        chunk = tiled_groupby.chunks[0]
        chunk.params = chunk.get_params_from_data(pandas_groupby)
        tileable_params = chunk.params.copy()
        tileable_params.pop('index', None)
        tiled_groupby.params = tileable_params

    # DataFrame grouped by column 'a'.
    raw_frame = pd.DataFrame({'a': [1, 2, 3]})
    frame_groupby = tile(DataFrame(raw_frame).groupby('a'))
    push_chunk_params(frame_groupby, wrapped_groupby(raw_frame, by='a'))

    # Series grouped by index level 0.
    raw_series = pd.Series([1, 2, 3], name='a')
    series_groupby = tile(Series(raw_series).groupby(level=0))
    push_chunk_params(series_groupby, wrapped_groupby(raw_series, level=0))

    series_groupby.refresh_params()
def testKeyValue(self):
    # Verifies that ``values`` and ``keys()`` of DataFrame, Series and
    # Index objects execute to the same results as their pandas sources.
    def execute(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # DataFrame
    raw_frame = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw_frame)

    np.testing.assert_array_equal(execute(frame.values), raw_frame.values)
    pd.testing.assert_index_equal(execute(frame.keys()), raw_frame.keys())

    # Series
    raw_series = pd.Series(np.random.rand(10))
    series = Series(raw_series)

    np.testing.assert_array_equal(execute(series.values), raw_series.values)
    pd.testing.assert_index_equal(execute(series.keys()), raw_series.keys())

    # Index (only ``values`` is checked)
    raw_index = pd.Index(np.random.rand(10))
    index = Index(raw_index)

    np.testing.assert_array_equal(execute(index.values), raw_index.values)
def test_key_value(setup):
    """Check ``values`` and ``keys()`` of DataFrame, Series and Index.

    Each executed result must equal the corresponding attribute of the
    pandas object the tileable was built from.
    """
    # DataFrame
    raw_frame = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw_frame)

    fetched = frame.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_frame.values)
    fetched = frame.keys().execute().fetch()
    pd.testing.assert_index_equal(fetched, raw_frame.keys())

    # Series
    raw_series = pd.Series(np.random.rand(10))
    series = Series(raw_series)

    fetched = series.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_series.values)
    fetched = series.keys().execute().fetch()
    pd.testing.assert_index_equal(fetched, raw_series.keys())

    # Index (only ``values`` is checked)
    raw_index = pd.Index(np.random.rand(10))
    index = Index(raw_index)

    fetched = index.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_index.values)
def testValidateAxis(self):
    # Verifies that ``validate_axis`` maps accepted axis spellings to 0 or 1
    # and raises ValueError for everything else.
    df = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # (axis argument, expected normalized axis)
    accepted = ((0, 0), ('index', 0), (1, 1), ('columns', 1))
    for axis, normalized in accepted:
        self.assertEqual(validate_axis(axis, df), normalized)

    # Unknown name, non-axis object, and out-of-range integers.
    rejected = ('unknown index', object(), -1, 2)
    for axis in rejected:
        with self.assertRaises(ValueError):
            validate_axis(axis, df)
def test_validate_axis():
    """Check ``validate_axis`` for accepted and rejected axis arguments.

    Accepted spellings map to 0 or 1, anything else raises ``ValueError``,
    and a filtered frame is accepted as the second argument as well.
    """
    df = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # (axis argument, expected normalized axis)
    accepted = ((0, 0), ('index', 0), (1, 1), ('columns', 1))
    for axis, normalized in accepted:
        assert validate_axis(axis, df) == normalized

    # Unknown name, non-axis object, and out-of-range integers.
    rejected = ('unknown index', object(), -1, 2)
    for axis in rejected:
        with pytest.raises(ValueError):
            validate_axis(axis, df)

    filtered = df[df[0] < 0.5]  # create unknown shape
    assert validate_axis(0, filtered) == 0
def test_dataframe_dir():
    """Check that every column label of a DataFrame is listed by ``dir``."""
    raw = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw)

    listed_names = set(dir(frame))
    assert all(label in listed_names for label in frame.dtypes.index)
def testDataFrameDir(self):
    # Verifies that every column label of a DataFrame is listed by ``dir``.
    raw = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw)

    listed_names = set(dir(frame))
    for label in frame.dtypes.index:
        self.assertIn(label, listed_names)