def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
    # When the timestamp column is converted to Pandas, it gets cast to ns
    # resolution, but Delta Lake schemas only support us resolution, so drop it.
    sample_pandas = sample_data.to_pandas().drop(["timestamp"], axis=1)
    write_deltalake(str(tmp_path), sample_pandas)
    delta_table = DeltaTable(str(tmp_path))
    df = delta_table.to_pandas()
    assert_frame_equal(df, sample_pandas)
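# Hedged sketch (not part of the test above): instead of dropping the timestamp
# column, the ns-resolution timestamps could be cast down to us resolution before
# writing, since Delta Lake schemas only support us resolution. This relies only
# on standard pyarrow APIs; the helper name is illustrative, not from the source.
import pyarrow as pa
import pyarrow.compute as pc


def downcast_timestamps_to_us(table: pa.Table) -> pa.Table:
    """Cast any ns-resolution timestamp columns in `table` to us resolution."""
    columns = []
    for field, column in zip(table.schema, table.itercolumns()):
        if pa.types.is_timestamp(field.type) and field.type.unit == "ns":
            column = pc.cast(column, pa.timestamp("us", tz=field.type.tz))
        columns.append(column)
    return pa.table(columns, names=table.column_names)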
class DeltaReaderAppendTest(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.path = f"tests/{str(uuid.uuid4())}/table1"
        cls.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake")
            .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config(
                "spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            )
            .getOrCreate()
        )

        # 1000 rows with a random "number" column and a binary "number2" partition column
        df = (
            cls.spark.range(0, 1000)
            .withColumn("number", rand())
            .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
        )

        # append the same 1000 rows 12 times, giving versions 0-11 and a
        # checkpoint at version 10
        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode("append").save(cls.path)

        cls.table = DeltaTable(cls.path)

    @classmethod
    def tearDownClass(cls):
        # remove folder when we are done with the test
        shutil.rmtree(cls.path)

    def test_paths(self):
        assert self.table.path == self.path
        assert self.table.log_path == f"{self.path}/_delta_log"

    def test_versions(self):
        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):
        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.table.path).toPandas()

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (
            self.spark.read.format("delta")
            .option("versionAsOf", 5)
            .load(self.table.path)
            .toPandas()
        )

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # partition pruning should halve the number of rows
        assert self.table.to_table(filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # "number" is uniform random in [0, 1), so this filter should return
        # fewer than the full 12000 rows
        assert self.table.to_table(filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
def test_delta_table_with_filesystem():
    table_path = "../rust/tests/data/simple_table"
    dt = DeltaTable(table_path)
    filesystem = LocalFileSystem()
    assert dt.to_pandas(filesystem=filesystem).equals(pd.DataFrame({"id": [5, 7, 9]}))
def test_delta_table_to_pandas():
    table_path = "../rust/tests/data/simple_table"
    dt = DeltaTable(table_path)
    assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]}))
    .getOrCreate()
)

# benchmark tables with row counts log-spaced between 1e3 and 1e8
for n in np.logspace(3, 8):
    path = f"tests/data/{str(uuid.uuid4())}/table1"
    df = (
        spark.range(0, int(n))  # spark.range expects integer bounds
        .withColumn("number", rand())
        .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
    )
    df.write.format("delta").mode("append").save(path)

    table = DeltaTable(path)

    # time reading the table with delta-lake-reader
    t = time()
    df_pandas = table.to_pandas()
    t_dt = time() - t

    # time reading the same table with Spark
    t = time()
    df_spark = spark.read.format("delta").load(table.path).toPandas()
    t_spark = time() - t

    print(f"{n},t_df,{t_dt}\n{n},t_spark,{t_spark}")
    with open("performance_tests/results.txt", "a") as f:
        print(f"{n},delta-lake-reader,{t_dt}", file=f)
        print(f"{n},spark,{t_spark}", file=f)

    shutil.rmtree(path)

plt.style.use("fivethirtyeight")
df = pd.read_csv("performance_tests/results.txt")
def test_read_simple_table_from_remote(s3_localstack):
    table_path = "s3://deltars/simple"
    dt = DeltaTable(table_path)
    assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]}))