def test_spark_io(self): with self.temp_dir() as tmp: pdf = self.test_pdf expected = ks.DataFrame(pdf) # Write out partitioned by one column expected.to_spark_io(tmp, format="json", mode="overwrite", partition_cols="i32") # Reset column order, as once the data is written out, Spark rearranges partition # columns to appear first. actual = ks.read_spark_io(tmp, format="json") self.assertFalse((actual.columns == self.test_column_order).all()) actual = actual[self.test_column_order] self.assert_eq( actual.sort_values(by="f").to_spark().toPandas(), expected.sort_values(by="f").to_spark().toPandas(), ) # Write out partitioned by two columns expected.to_spark_io(tmp, format="json", mode="overwrite", partition_cols=["i32", "bhello"]) # Reset column order, as once the data is written out, Spark rearranges partition # columns to appear first. actual = ks.read_spark_io(path=tmp, format="json") self.assertFalse((actual.columns == self.test_column_order).all()) actual = actual[self.test_column_order] self.assert_eq( actual.sort_values(by="f").to_spark().toPandas(), expected.sort_values(by="f").to_spark().toPandas(), ) # When index columns are known pdf = self.test_pdf expected = ks.DataFrame(pdf) col_order = ["f", "i32", "i64"] expected_idx = expected.set_index("bhello")[col_order] actual_idx = ks.read_spark_io(tmp, format="json", index_col="bhello")[col_order] self.assert_eq( actual_idx.sort_values(by="f").to_spark().toPandas(), expected_idx.sort_values(by="f").to_spark().toPandas(), )