def testConcat(self):
    """Verify shape, dtypes and chunk layout of ``concat`` without executing it.

    Three scenarios are covered: stacking along the index, joining along the
    columns with differently chunked inputs, and an inner join of two frames
    whose column sets differ.
    """
    raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # --- stack along the index, both inputs built with chunk_size=4 ---
    lazy_a = from_pandas(raw_a, chunk_size=4)
    lazy_b = from_pandas(raw_b, chunk_size=4)
    stacked = concat([lazy_a, lazy_b], axis='index')

    self.assertEqual(stacked.shape, (20, 4))
    pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)

    stacked_tiled = stacked.tiles()
    self.assertEqual(stacked_tiled.nsplits, ((4, 4, 2, 4, 4, 2), (4, )))
    # There is a single column split, so chunk k is expected at (k, 0).
    for position, chunk in enumerate(stacked_tiled.chunks):
        self.assertEqual(chunk.index, (position, 0))

    # --- join along the columns, inputs built with chunk_size 3 and 4 ---
    lazy_a = from_pandas(raw_a, chunk_size=3)
    lazy_b = from_pandas(raw_b, chunk_size=4)
    widened = concat([lazy_a, lazy_b], axis='columns')

    self.assertEqual(widened.shape, (10, 8))
    pandas_dtypes = pd.concat([raw_a, raw_b], axis='columns').dtypes
    pd.testing.assert_series_equal(widened.dtypes, pandas_dtypes)

    widened_tiled = widened.tiles()
    self.assertEqual(widened_tiled.nsplits, ((3, 3, 3, 1), (3, 1, 4)))
    # Three column splits: the flat position maps to (position // 3, position % 3).
    for position, chunk in enumerate(widened_tiled.chunks):
        self.assertEqual(chunk.index, divmod(position, 3))

    # --- inner join: only columns A, B and C are common to both frames ---
    four_cols = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    three_cols = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    lazy_four = from_pandas(four_cols, chunk_size=3)
    lazy_three = from_pandas(three_cols, chunk_size=3)
    joined = concat([lazy_four, lazy_three], join='inner')

    self.assertEqual(joined.shape, (20, 3))
    joined_tiled = joined.tiles()
    self.assertEqual(joined_tiled.nsplits, ((3, 3, 3, 1, 3, 3, 3, 1), (3, )))
def test_concat(setup):
    """Execute ``concat`` on frames, series and mixtures of both.

    Every scenario builds the same operation with pandas and compares the
    fetched result against it.
    """

    def materialize(lazy, **execute_kwargs):
        # Shorthand for the execute-then-fetch chain every check below uses.
        return lazy.execute(**execute_kwargs).fetch()

    frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # default arguments, identical chunk sizes
    lazy_a = from_pandas(frame_a, chunk_size=3)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b])
    wanted = pd.concat([frame_a, frame_b])
    pd.testing.assert_frame_equal(wanted, materialize(outcome))

    # different chunk sizes together with ignore_index=True
    # (executed with check_index_value switched off in extra_config)
    lazy_a = from_pandas(frame_a, chunk_size=2)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b], ignore_index=True)
    wanted = pd.concat([frame_a, frame_b], ignore_index=True)
    got = materialize(outcome, extra_config={'check_index_value': False})
    pd.testing.assert_frame_equal(wanted, got)

    # axis=1
    lazy_a = from_pandas(frame_a, chunk_size=2)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b], axis=1)
    wanted = pd.concat([frame_a, frame_b], axis=1)
    pd.testing.assert_frame_equal(wanted, materialize(outcome))

    # multiple dataframes (reuses the chunk_size 2 / 3 inputs from above)
    outcome = concat([lazy_a, lazy_b, lazy_a])
    wanted = pd.concat([frame_a, frame_b, frame_a])
    pd.testing.assert_frame_equal(wanted, materialize(outcome))

    # join='inner' on frames with columns ABCD and ABC
    frame_wide = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_narrow = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    lazy_wide = from_pandas(frame_wide, chunk_size=3)
    lazy_narrow = from_pandas(frame_narrow, chunk_size=3)
    outcome = concat([lazy_wide, lazy_narrow], join='inner')
    wanted = pd.concat([frame_wide, frame_narrow], join='inner')
    pd.testing.assert_frame_equal(wanted, materialize(outcome))

    # two series
    ser_a = pd.Series(np.random.rand(10))
    ser_b = pd.Series(np.random.rand(10))
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=3)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b])
    wanted = pd.concat([ser_a, ser_b])
    pd.testing.assert_series_equal(materialize(outcome), wanted)

    # series with different chunk sizes and ignore_index=True
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=4)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b], ignore_index=True)
    wanted = pd.concat([ser_a, ser_b], ignore_index=True)
    got = materialize(outcome, extra_config={'check_index_value': False})
    pd.testing.assert_series_equal(got, wanted)

    # series along axis=1 (executed with check_shape switched off)
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=3)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b], axis=1)
    wanted = pd.concat([ser_a, ser_b], axis=1)
    got = materialize(outcome, extra_config={'check_shape': False})
    pd.testing.assert_frame_equal(got, wanted)

    # dataframe followed by series, ignore_index=True
    outcome = concat([lazy_wide, lazy_ser_b], ignore_index=True)
    wanted = pd.concat([frame_wide, ser_b], ignore_index=True)
    got = materialize(outcome, extra_config={'check_index_value': False})
    pd.testing.assert_frame_equal(got, wanted)

    # series followed by dataframe, ignore_index=True
    outcome = concat([lazy_ser_a, lazy_narrow], ignore_index=True)
    wanted = pd.concat([ser_a, frame_narrow], ignore_index=True)
    got = materialize(outcome, extra_config={'check_index_value': False})
    pd.testing.assert_frame_equal(got, wanted)

    # dataframe followed by series, axis=1
    outcome = concat([lazy_wide, lazy_ser_b], axis=1)
    wanted = pd.concat([frame_wide, ser_b], axis=1)
    pd.testing.assert_frame_equal(materialize(outcome), wanted)

    # series followed by dataframe, axis=1
    outcome = concat([lazy_ser_a, lazy_narrow], axis=1)
    wanted = pd.concat([ser_a, frame_narrow], axis=1)
    pd.testing.assert_frame_equal(materialize(outcome), wanted)
def test_concat():
    """Verify shape, dtypes, index metadata and chunk layout of ``concat``.

    Nothing is executed here: the checks look at the metadata of the lazy
    result and at the splits produced by ``tile``.
    """
    raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # --- stack along the index, both inputs built with chunk_size=4 ---
    lazy_a = from_pandas(raw_a, chunk_size=4)
    lazy_b = from_pandas(raw_b, chunk_size=4)
    stacked = concat([lazy_a, lazy_b], axis='index')
    assert stacked.shape == (20, 4)
    pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)

    stacked_tiled = tile(stacked)
    assert stacked_tiled.nsplits == ((4, 4, 2, 4, 4, 2), (4,))
    # There is a single column split, so chunk k is expected at (k, 0).
    for position, chunk in enumerate(stacked_tiled.chunks):
        assert chunk.index == (position, 0)

    # --- second frame indexed 10..19: index_value should convert to RangeIndex(20) ---
    raw_shifted = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                               index=pd.RangeIndex(10, 20))
    lazy_shifted = from_pandas(raw_shifted, chunk_size=4)
    stacked = concat([lazy_a, lazy_shifted], axis='index')
    assert stacked.shape == (20, 4)
    pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)
    pd.testing.assert_index_equal(stacked.index_value.to_pandas(), pd.RangeIndex(20))

    # --- a frame with a permuted integer index: in every ordering below the
    # result's index_value is expected to convert to an empty int64 index ---
    raw_shuffled = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                                index=np.random.permutation(np.arange(10)))
    lazy_shuffled = from_pandas(raw_shuffled, chunk_size=4)
    for operands in ([lazy_a, lazy_shuffled],
                     [lazy_shuffled, lazy_a],
                     [lazy_shuffled, lazy_shuffled]):
        stacked = concat(operands, axis='index')
        assert stacked.shape == (20, 4)
        pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)
        pd.testing.assert_index_equal(stacked.index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))

    # --- join along the columns, inputs built with chunk_size 3 and 4 ---
    lazy_a = from_pandas(raw_a, chunk_size=3)
    lazy_b = from_pandas(raw_b, chunk_size=4)
    widened = concat([lazy_a, lazy_b], axis='columns')
    assert widened.shape == (10, 8)
    pandas_dtypes = pd.concat([raw_a, raw_b], axis='columns').dtypes
    pd.testing.assert_series_equal(widened.dtypes, pandas_dtypes)

    widened_tiled = tile(widened)
    assert widened_tiled.nsplits == ((3, 3, 3, 1), (3, 1, 4))
    # Three column splits: the flat position maps to (position // 3, position % 3).
    for position, chunk in enumerate(widened_tiled.chunks):
        assert chunk.index == divmod(position, 3)

    # --- inner join: only columns A, B and C are common to both frames ---
    four_cols = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    three_cols = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    lazy_four = from_pandas(four_cols, chunk_size=3)
    lazy_three = from_pandas(three_cols, chunk_size=3)
    joined = concat([lazy_four, lazy_three], join='inner')
    assert joined.shape == (20, 3)
    joined_tiled = tile(joined)
    assert joined_tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, ))
def testConcat(self):
    """Execute ``concat`` on frames, series and mixtures of both.

    Every scenario builds the same operation with pandas and compares the
    executed result against it.  Two executors are involved: ``self.executor``
    (presumably created by the test fixture) and a local one whose storage is
    the context of a freshly created session.
    """
    # NOTE(review): which executor runs which check is kept exactly as it
    # was; confirm that the mix of the two is intentional.
    session_executor = ExecutorForTest(storage=new_session().context)

    def run_on_fixture(lazy):
        # First element of the concatenated result from the fixture executor.
        return self.executor.execute_dataframe(lazy, concat=True)[0]

    def run_on_session(lazy):
        # First element of the concatenated result from the session-backed executor.
        return session_executor.execute_dataframe(lazy, concat=True)[0]

    frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # default arguments, identical chunk sizes
    lazy_a = from_pandas(frame_a, chunk_size=3)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b])
    wanted = pd.concat([frame_a, frame_b])
    pd.testing.assert_frame_equal(wanted, run_on_fixture(outcome))

    # different chunk sizes together with ignore_index=True
    lazy_a = from_pandas(frame_a, chunk_size=2)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b], ignore_index=True)
    wanted = pd.concat([frame_a, frame_b], ignore_index=True)
    pd.testing.assert_frame_equal(wanted, run_on_session(outcome))

    # axis=1
    lazy_a = from_pandas(frame_a, chunk_size=2)
    lazy_b = from_pandas(frame_b, chunk_size=3)
    outcome = concat([lazy_a, lazy_b], axis=1)
    wanted = pd.concat([frame_a, frame_b], axis=1)
    pd.testing.assert_frame_equal(wanted, run_on_fixture(outcome))

    # multiple dataframes (reuses the chunk_size 2 / 3 inputs from above)
    outcome = concat([lazy_a, lazy_b, lazy_a])
    wanted = pd.concat([frame_a, frame_b, frame_a])
    pd.testing.assert_frame_equal(wanted, run_on_fixture(outcome))

    # join='inner' on frames with columns ABCD and ABC
    frame_wide = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_narrow = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    lazy_wide = from_pandas(frame_wide, chunk_size=3)
    lazy_narrow = from_pandas(frame_narrow, chunk_size=3)
    outcome = concat([lazy_wide, lazy_narrow], join='inner')
    wanted = pd.concat([frame_wide, frame_narrow], join='inner')
    pd.testing.assert_frame_equal(wanted, run_on_fixture(outcome))

    # two series
    ser_a = pd.Series(np.random.rand(10))
    ser_b = pd.Series(np.random.rand(10))
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=3)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b])
    wanted = pd.concat([ser_a, ser_b])
    pd.testing.assert_series_equal(run_on_fixture(outcome), wanted)

    # series with different chunk sizes and ignore_index=True
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=4)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b], ignore_index=True)
    wanted = pd.concat([ser_a, ser_b], ignore_index=True)
    pd.testing.assert_series_equal(run_on_session(outcome), wanted)

    # series along axis=1
    lazy_ser_a = series_from_pandas(ser_a, chunk_size=3)
    lazy_ser_b = series_from_pandas(ser_b, chunk_size=3)
    outcome = concat([lazy_ser_a, lazy_ser_b], axis=1)
    wanted = pd.concat([ser_a, ser_b], axis=1)
    pd.testing.assert_frame_equal(run_on_fixture(outcome), wanted)

    # dataframe followed by series, ignore_index=True
    outcome = concat([lazy_wide, lazy_ser_b], ignore_index=True)
    wanted = pd.concat([frame_wide, ser_b], ignore_index=True)
    pd.testing.assert_frame_equal(run_on_session(outcome), wanted)

    # series followed by dataframe, ignore_index=True
    outcome = concat([lazy_ser_a, lazy_narrow], ignore_index=True)
    wanted = pd.concat([ser_a, frame_narrow], ignore_index=True)
    pd.testing.assert_frame_equal(run_on_session(outcome), wanted)

    # dataframe followed by series, axis=1
    outcome = concat([lazy_wide, lazy_ser_b], axis=1)
    wanted = pd.concat([frame_wide, ser_b], axis=1)
    pd.testing.assert_frame_equal(run_on_session(outcome), wanted)

    # series followed by dataframe, axis=1
    outcome = concat([lazy_ser_a, lazy_narrow], axis=1)
    wanted = pd.concat([ser_a, frame_narrow], axis=1)
    pd.testing.assert_frame_equal(run_on_session(outcome), wanted)