def testResetIndex(self):
    """Verify shape, dtypes and per-chunk index values of tiled reset_index results.

    Covers a DataFrame split into row chunks, a DataFrame split into
    single-cell chunks with ``drop=True``, and a named Series.
    """
    raw = pd.DataFrame(
        [('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)],
        index=['falcon', 'parrot', 'lion', 'monkey'],
        columns=('class', 'max_speed'))

    # Two row chunks: the former index is kept, so the result has 3 columns.
    frame = df_reset_index(from_pandas_df(raw, chunk_size=2))
    expected = raw.reset_index()

    self.assertEqual(frame.shape, (4, 3))
    pd.testing.assert_series_equal(frame.dtypes, expected.dtypes)

    tiled = frame.tiles()

    self.assertEqual(len(tiled.chunks), 2)
    # Each chunk must own a contiguous slice of the new RangeIndex.
    for pos, expected_index in enumerate((pd.RangeIndex(2), pd.RangeIndex(2, 4))):
        chunk = tiled.chunks[pos]
        self.assertEqual(chunk.shape, (2, 3))
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(), expected_index)
        pd.testing.assert_series_equal(chunk.dtypes, expected.dtypes)

    # Single-cell chunks with drop=True: the former index is discarded.
    frame = df_reset_index(from_pandas_df(raw, chunk_size=1), drop=True)
    expected = raw.reset_index(drop=True)

    self.assertEqual(frame.shape, (4, 2))
    pd.testing.assert_series_equal(frame.dtypes, expected.dtypes)

    tiled = frame.tiles()

    self.assertEqual(len(tiled.chunks), 8)
    for chunk in tiled.chunks:
        self.assertEqual(chunk.shape, (1, 1))
        # The chunk's row position determines its one-element index range.
        row_pos = chunk.index[0]
        pd.testing.assert_index_equal(
            chunk.index_value.to_pandas(),
            pd.RangeIndex(row_pos, row_pos + 1))
        # The chunk's column position selects the matching dtype entry.
        col_pos = chunk.index[1]
        pd.testing.assert_series_equal(
            chunk.dtypes, expected.dtypes[col_pos:col_pos + 1])

    # test Series
    raw_series = pd.Series(
        [1, 2, 3, 4], name='foo',
        index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
    reset = series_reset_index(from_pandas_series(raw_series, chunk_size=2))
    expected = raw_series.reset_index()

    self.assertEqual(reset.shape, (4, 2))
    pd.testing.assert_series_equal(reset.dtypes, expected.dtypes)

    tiled_series = reset.tiles()
    self.assertEqual(len(tiled_series.chunks), 2)
    for pos, expected_index in enumerate((pd.RangeIndex(2), pd.RangeIndex(2, 4))):
        chunk = tiled_series.chunks[pos]
        self.assertEqual(chunk.shape, (2, 2))
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(), expected_index)
def testResetIndexExecution(self):
    """Execute reset_index on DataFrames and Series and compare with pandas.

    Covers several chunk sizes, ``drop``, ``level``, ``col_level`` /
    ``col_fill`` on MultiIndex data, Series with ``name`` / ``drop``, and
    inputs produced by adding two differently indexed operands.
    """

    def run(tileable):
        # Execute through the test executor and take the concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    rows = [('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)]
    raw = pd.DataFrame(rows,
                       index=['falcon', 'parrot', 'lion', 'monkey'],
                       columns=('class', 'max_speed'))

    # No explicit chunk_size.
    actual = run(df_reset_index(from_pandas_df(raw)))
    wanted = raw.reset_index()
    pd.testing.assert_frame_equal(actual, wanted)

    # Two rows per chunk.
    actual = run(df_reset_index(from_pandas_df(raw, chunk_size=2)))
    wanted = raw.reset_index()
    pd.testing.assert_frame_equal(actual, wanted)

    # One cell per chunk, dropping the former index.
    actual = run(df_reset_index(from_pandas_df(raw, chunk_size=1), drop=True))
    wanted = raw.reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, wanted)

    # MultiIndex rows: reset only the 'class' level.
    multi_index = pd.MultiIndex.from_tuples(
        [('bird', 'falcon'), ('bird', 'parrot'),
         ('mammal', 'lion'), ('mammal', 'monkey')],
        names=['class', 'name'])
    raw = pd.DataFrame(rows, index=multi_index, columns=('type', 'max_speed'))
    actual = run(df_reset_index(from_pandas_df(raw, chunk_size=1), level='class'))
    wanted = raw.reset_index(level='class')
    pd.testing.assert_frame_equal(actual, wanted)

    # MultiIndex columns as well: exercise col_level and col_fill.
    multi_columns = pd.MultiIndex.from_tuples([('speed', 'max'), ('species', 'type')])
    raw.columns = multi_columns
    actual = run(df_reset_index(from_pandas_df(raw, chunk_size=2),
                                level='class', col_level=1, col_fill='species'))
    wanted = raw.reset_index(level='class', col_level=1, col_fill='species')
    pd.testing.assert_frame_equal(actual, wanted)

    # Test Series
    raw_series = pd.Series([1, 2, 3, 4], name='foo',
                           index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
    actual = run(series_reset_index(from_pandas_series(raw_series), name='bar'))
    wanted = raw_series.reset_index(name='bar')
    pd.testing.assert_frame_equal(actual, wanted)

    actual = run(series_reset_index(from_pandas_series(raw_series, chunk_size=2),
                                    drop=True))
    wanted = raw_series.reset_index(drop=True)
    pd.testing.assert_series_equal(actual, wanted)

    # Test Unknown shape
    sess = new_session()

    def check_unknown_shape(tileable, left, right):
        # Run in the session and confirm the fresh index covers all 12 labels.
        fetched = sess.run(tileable)
        pd.testing.assert_index_equal(fetched.index, pd.RangeIndex(12))
        # Inconsistent with Pandas when the input's shape is unknown, so
        # sort by the first column before comparing raw values.
        fetched = fetched.sort_values(by=fetched.columns[0])
        reference = (left + right).reset_index()
        np.testing.assert_array_equal(fetched.to_numpy(), reference.to_numpy())

    left_frame = pd.DataFrame(np.random.rand(10, 3),
                              index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
    left_tiled = from_pandas_df(left_frame, chunk_size=5)
    right_frame = pd.DataFrame(np.random.rand(10, 3),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    right_tiled = from_pandas_df(right_frame, chunk_size=6)
    check_unknown_shape((left_tiled + right_tiled).reset_index(),
                        left_frame, right_frame)

    left_series = pd.Series(np.random.rand(10,),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
    left_tiled = from_pandas_series(left_series, chunk_size=3)
    right_series = pd.Series(np.random.rand(10,),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    right_tiled = from_pandas_series(right_series, chunk_size=3)
    check_unknown_shape((left_tiled + right_tiled).reset_index(),
                        left_series, right_series)