Example #1

    def test_parquet_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ks.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

    def test_parquet_read_with_pandas_metadata(self):
        with self.temp_dir() as tmp:
            expected1 = self.test_pdf

            path1 = "{}/file1.parquet".format(tmp)
            expected1.to_parquet(path1)

            self.assert_eq(ks.read_parquet(path1, pandas_metadata=True),
                           expected1)

            expected2 = expected1.reset_index()

            path2 = "{}/file2.parquet".format(tmp)
            expected2.to_parquet(path2)

            self.assert_eq(ks.read_parquet(path2, pandas_metadata=True),
                           expected2)

            expected3 = expected2.set_index("index", append=True)

            path3 = "{}/file3.parquet".format(tmp)
            expected3.to_parquet(path3)

            self.assert_eq(ks.read_parquet(path3, pandas_metadata=True),
                           expected3)

    def test_parquet_read(self):
        with self.temp_dir() as tmp:
            data = self.test_pdf
            self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
                .coalesce(1).write.parquet(tmp, mode='overwrite')

            def check(columns, expected):
                if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                    expected = pd.read_parquet(tmp, columns=columns)
                actual = ks.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.toPandas())

            check(None, data)
            check(['i32', 'i64'], data[['i32', 'i64']])
            check(['i64', 'i32'], data[['i64', 'i32']])
            check(('i32', 'i64'), data[['i32', 'i64']])
            check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
            check([], pd.DataFrame([]))
            check(['a'], pd.DataFrame([]))
            check('i32', pd.DataFrame([]))
            check('float', data[['f']])

            # check with pyspark patch.
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp)
            else:
                expected = data
            actual = ks.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.toPandas())
Example #4

    def test_local(self):
        with self.temp_dir() as tmp:
            data = pd.DataFrame({
                'i32': np.arange(1000, dtype=np.int32),
                'i64': np.arange(1000, dtype=np.int64),
                'f': np.arange(1000, dtype=np.float64),
                'bhello': np.random.choice(['hello', 'yo', 'people'], size=1000).astype("O")})
            data = data[['i32', 'i64', 'f', 'bhello']]
            self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
                .coalesce(1).write.parquet(tmp, mode='overwrite')

            def check(columns, expected):
                if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                    expected = pd.read_parquet(tmp, columns=columns)
                actual = koalas.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.toPandas())

            check(None, data)
            check(['i32', 'i64'], data[['i32', 'i64']])
            check(['i64', 'i32'], data[['i64', 'i32']])
            check(('i32', 'i64'), data[['i32', 'i64']])
            check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
            check([], pd.DataFrame([]))
            check(['a'], pd.DataFrame([]))
            check('i32', pd.DataFrame([]))
            check('float', data[['f']])

            # check with pyspark patch.
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp)
            else:
                expected = data
            actual = koalas.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.toPandas())

    def test_parquet_read(self):
        with self.temp_dir() as tmp:
            data = self.test_pdf
            self.spark.createDataFrame(data, "i32 int, i64 long, f double, bhello string") \
                .coalesce(1).write.parquet(tmp, mode="overwrite")

            def check(columns, expected):
                if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                    expected = pd.read_parquet(tmp, columns=columns)
                actual = ks.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.to_pandas())

            check(None, data)
            check(["i32", "i64"], data[["i32", "i64"]])
            check(["i64", "i32"], data[["i64", "i32"]])

            if LooseVersion(pa.__version__) < LooseVersion("1.0.0"):
                # TODO: `pd.read_parquet()` changed the behavior due to PyArrow 1.0.0.
                #       We might want to adjust the behavior. Let's see how pandas handles it.
                check(("i32", "i64"), data[["i32", "i64"]])
                check(["a", "b", "i32", "i64"], data[["i32", "i64"]])
                check([], pd.DataFrame([]))
                check(["a"], pd.DataFrame([]))
                check("i32", pd.DataFrame([]))
                check("float", data[["f"]])

            # check with pyspark patch.
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp)
            else:
                expected = data
            actual = ks.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.to_pandas())

            # When index columns are known
            pdf = self.test_pdf
            expected = ks.DataFrame(pdf)

            expected_idx = expected.set_index("bhello")[["f", "i32", "i64"]]
            actual_idx = ks.read_parquet(
                tmp, index_col="bhello")[["f", "i32", "i64"]]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )

    def test_parquet_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ks.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_parquet(tmp, mode='overwrite', partition_cols='i32')
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(actual.sort_values(by='f'), expected.sort_values(by='f'))

            # Write out partitioned by two columns
            expected.to_parquet(tmp, mode='overwrite', partition_cols=['i32', 'bhello'])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(actual.sort_values(by='f'), expected.sort_values(by='f'))
Example #7

    def test_local(self):
        with self.temp_dir() as tmp:
            data = pd.DataFrame({
                'i32': np.arange(1000, dtype=np.int32),
                'i64': np.arange(1000, dtype=np.int64),
                'f': np.arange(1000, dtype=np.float64),
                'bhello': np.random.choice(['hello', 'yo', 'people'], size=1000).astype("O")})
            data = data[['i32', 'i64', 'f', 'bhello']]
            self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
                .coalesce(1).write.parquet(tmp, mode='overwrite')

            def check(columns, expected):
                if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                    expected = pd.read_parquet(tmp, columns=columns)
                actual = koalas.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.toPandas())

            check(None, data)
            check(['i32', 'i64'], data[['i32', 'i64']])
            check(['i64', 'i32'], data[['i64', 'i32']])
            check(('i32', 'i64'), data[['i32', 'i64']])
            check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
            check([], pd.DataFrame([]))
            check(['a'], pd.DataFrame([]))
            check('i32', pd.DataFrame([]))
            check('float', data[['f']])

            # check with pyspark patch.
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp)
            else:
                expected = data
            actual = koalas.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.toPandas())
Example #8

def check(columns, expected):
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        expected = pd.read_parquet(tmp, columns=columns)
    actual = ks.read_parquet(tmp, columns=columns)
    self.assertPandasEqual(expected, actual.toPandas())
Example #9

def check(columns, expected):
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        expected = pd.read_parquet(tmp, columns=columns)
    actual = koalas.read_parquet(tmp, columns=columns)
    self.assertPandasEqual(expected, actual.toPandas())
Example #10

# COMMAND ----------

# Pandas
import pandas as pd

pdDF = pd.read_parquet(
    "/dbfs/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"
)
pdDF.head()

# COMMAND ----------

# Koalas
import databricks.koalas as ks

kdf = ks.read_parquet(
    "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet")
kdf.head()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Converting a Koalas DataFrame to/from a Spark DataFrame

# COMMAND ----------

# Creating a Koalas DataFrame from a PySpark DataFrame
kdf = ks.DataFrame(df)

# COMMAND ----------

# Alternative way of creating a Koalas DataFrame from a PySpark DataFrame
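# (The snippet is truncated here. Importing databricks.koalas also patches
# PySpark DataFrames with a to_koalas() method, so the alternative most
# likely looked like this.)
kdf = df.to_koalas()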