def test_table(self):
    # Round-trip a DataFrame through a partitioned Spark table and verify the
    # data survives unchanged, for both a single and a double partition column.
    #
    # NOTE(review): a second `test_table` definition appears later in this file
    # and shadows this one at class-body execution time — confirm whether this
    # older variant should be removed.
    with self.table('test_table'):
        kdf = ks.DataFrame(self.test_pdf)

        def check_roundtrip(partition_cols):
            # Write out partitioned, read back, and compare.  Column order is
            # reset on read, as once the data is written out Spark rearranges
            # partition columns to appear first.
            kdf.to_table('test_table', mode='overwrite', partition_cols=partition_cols)
            read_back = ks.read_table('test_table')[self.test_column_order]
            self.assert_eq(
                read_back.sort_values(by='f').to_spark().toPandas(),
                kdf.sort_values(by='f').to_spark().toPandas())

        # Partitioned by one column, then by two columns.
        check_roundtrip('i32')
        check_roundtrip(['i32', 'bhello'])
def test_table(self):
    """Round-trip a DataFrame through a partitioned Spark table.

    Covers writes partitioned by one and by two columns (checking that Spark
    reorders partition columns to the front on read), and reads with
    ``index_col`` given as a string, a one-element list, and a two-element
    list.
    """
    with self.table("test_table"):
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.spark.to_table("test_table", mode="overwrite", partition_cols="i32")
        # Reset column order, as once the data is written out, Spark rearranges
        # partition columns to appear first.
        actual = ks.read_table("test_table")
        self.assertFalse((actual.columns == self.test_column_order).all())
        actual = actual[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # Write out partitioned by two columns.  Use the same `spark.to_table`
        # accessor as the first write above; the plain `to_table` spelling is
        # the pre-accessor alias of the same operation and was left behind by
        # an incomplete migration.
        expected.spark.to_table(
            "test_table", mode="overwrite", partition_cols=["i32", "bhello"]
        )
        # Reset column order, as once the data is written out, Spark rearranges
        # partition columns to appear first.
        actual = ks.read_table("test_table")
        self.assertFalse((actual.columns == self.test_column_order).all())
        actual = actual[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # When index columns are known
        expected_idx = expected.set_index("bhello")[["f", "i32", "i64"]]
        actual_idx = ks.read_table("test_table", index_col="bhello")[["f", "i32", "i64"]]
        self.assert_eq(
            actual_idx.sort_values(by="f").to_spark().toPandas(),
            expected_idx.sort_values(by="f").to_spark().toPandas(),
        )

        expected_idx = expected.set_index(["bhello"])[["f", "i32", "i64"]]
        actual_idx = ks.read_table("test_table", index_col=["bhello"])[["f", "i32", "i64"]]
        self.assert_eq(
            actual_idx.sort_values(by="f").to_spark().toPandas(),
            expected_idx.sort_values(by="f").to_spark().toPandas(),
        )

        expected_idx = expected.set_index(["i32", "bhello"])[["f", "i64"]]
        actual_idx = ks.read_table("test_table", index_col=["i32", "bhello"])[["f", "i64"]]
        self.assert_eq(
            actual_idx.sort_values(by="f").to_spark().toPandas(),
            expected_idx.sort_values(by="f").to_spark().toPandas(),
        )