def test_fails_wrong_partitioning(existing_table: DeltaTable, sample_data: pa.Table):
    """Appending with ``partition_by="int32"`` must raise ``AssertionError``.

    The ``existing_table`` helper in this file writes its table without any
    ``partition_by`` argument, so the partitioning requested here differs
    from the one the table was created with.
    """
    append_kwargs = {"mode": "append", "partition_by": "int32"}

    with pytest.raises(AssertionError):
        write_deltalake(existing_table, sample_data, **append_kwargs)
def test_writer_with_max_rows(
    tmp_path: pathlib.Path, row_count: int, rows_per_file: int, expected_files: int
):
    """Check that ``max_rows_per_file`` controls how many files a write creates.

    Args:
        tmp_path: Empty directory the table is written into.
        row_count: Number of rows in the generated input table.
        rows_per_file: Value passed as both ``max_rows_per_file`` and
            ``max_rows_per_group``.
        expected_files: Number of entries expected in the table directory,
            not counting ``_delta_log``.
    """

    def get_multifile_stats(table: DeltaTable) -> Iterable[Dict]:
        """Yield the decoded ``stats`` of every ``add`` action in the log."""
        log_path = get_log_path(table)

        # Read the log inside a context manager so the handle is closed
        # before the first item is yielded, even if the generator is never
        # exhausted.
        with open(log_path, "r") as log_file:
            lines = log_file.readlines()

        # Unlike a single-file write, the log may hold several "add" entries.
        for line in lines:
            log_entry = json.loads(line)
            if "add" in log_entry:
                yield json.loads(log_entry["add"]["stats"])

    data = pa.table(
        {
            "colA": pa.array(range(0, row_count), pa.int32()),
            "colB": pa.array(
                [i * random.random() for i in range(0, row_count)], pa.float64()
            ),
        }
    )
    path = str(tmp_path)
    write_deltalake(
        path,
        data,
        file_options=ParquetFileFormat().make_write_options(),
        max_rows_per_file=rows_per_file,
        max_rows_per_group=rows_per_file,
    )

    table = DeltaTable(path)
    stats = get_multifile_stats(table)
    files_written = [f for f in os.listdir(path) if f != "_delta_log"]

    # The per-file record counts must add up to the number of rows written.
    assert sum(stat_entry["numRecords"] for stat_entry in stats) == row_count
    assert len(files_written) == expected_files
def test_write_recordbatchreader(
    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
):
    """Write from a ``RecordBatchReader`` and expect ``sample_data`` back."""
    target_uri = str(tmp_path)

    # Wrap the existing table's batches in a reader carrying the sample schema.
    source_batches = existing_table.to_pyarrow_dataset().to_batches()
    reader = RecordBatchReader.from_batches(sample_data.schema, source_batches)

    write_deltalake(target_uri, reader, mode="overwrite")

    round_tripped = DeltaTable(target_uri).to_pyarrow_table()
    assert round_tripped == sample_data
def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Round-trip a pandas DataFrame through ``write_deltalake``."""
    table_uri = str(tmp_path)

    # Converting a timestamp to pandas casts it to ns resolution, whereas
    # Delta Lake schemas only support us resolution, so the column is dropped.
    expected_frame = sample_data.to_pandas().drop(["timestamp"], axis=1)

    write_deltalake(table_uri, expected_frame)

    actual_frame = DeltaTable(table_uri).to_pandas()
    assert_frame_equal(actual_frame, expected_frame)
def test_handle_existing(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Writing into a non-empty directory that is not a Delta table must fail.

    A plain text file is placed in ``tmp_path`` first; the overwrite is then
    expected to raise ``OSError`` whose message contains
    ``"directory is not empty"``.
    """
    # If the URI points to a non-empty directory that isn't a Delta table, error.
    p = tmp_path / "hello.txt"
    p.write_text("hello")

    with pytest.raises(OSError) as exception:
        write_deltalake(str(tmp_path), sample_data, mode="overwrite")

    # Inspect the raised exception itself rather than the ExceptionInfo
    # wrapper, whose string form is a pytest implementation detail.
    assert "directory is not empty" in str(exception.value)
def test_writer_partitioning(tmp_path: pathlib.Path):
    """Round-trip a table whose string column holds special characters.

    NOTE(review): despite the test name, no ``partition_by`` argument is
    passed to ``write_deltalake`` here -- confirm whether partitioning on
    ``p`` was intended.
    """
    table_uri = str(tmp_path)
    tricky_values = ["a=b", "hello world", "hello%20world"]

    columns = {
        "p": pa.array(tricky_values),
        "x": pa.array(range(len(tricky_values))),
    }
    data = pa.table(columns)

    write_deltalake(table_uri, data)

    round_tripped = DeltaTable(table_uri).to_pyarrow_table()
    assert round_tripped == data
def test_roundtrip_multi_partitioned(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Write partitioned by two columns and read the same data back."""
    table_uri = str(tmp_path)
    partition_columns = ["int32", "bool"]

    write_deltalake(table_uri, sample_data, partition_by=partition_columns)

    delta_table = DeltaTable(table_uri)
    assert delta_table.pyarrow_schema() == sample_data.schema

    # Reorder the rows by "int64" before comparing -- presumably this restores
    # the row order of sample_data after the partitioned read.
    loaded = delta_table.to_pyarrow_table()
    row_order = pc.sort_indices(loaded["int64"])
    assert loaded.take(row_order) == sample_data
def test_roundtrip_basic(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Write a table, check the first log file exists, and read it back."""
    table_uri = str(tmp_path)
    first_commit_file = "0" * 20 + ".json"

    write_deltalake(table_uri, sample_data)

    log_entries = os.listdir(tmp_path / "_delta_log")
    assert first_commit_file in log_entries

    delta_table = DeltaTable(table_uri)
    assert delta_table.pyarrow_schema() == sample_data.schema

    round_tripped = delta_table.to_pyarrow_table()
    assert round_tripped == sample_data
def test_write_iterator(
    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
):
    """Writing from a batch iterable requires an explicit ``schema``.

    Without ``schema`` the write must raise ``ValueError``; with it, the
    written table must equal ``sample_data``.
    """
    target_uri = str(tmp_path)
    batches = existing_table.to_pyarrow_dataset().to_batches()

    # No schema supplied: the writer is expected to refuse the input.
    with pytest.raises(ValueError):
        write_deltalake(target_uri, batches, mode="overwrite")

    # Same batches object, this time accompanied by the schema.
    write_deltalake(target_uri, batches, schema=sample_data.schema, mode="overwrite")

    round_tripped = DeltaTable(target_uri).to_pyarrow_table()
    assert round_tripped == sample_data
def test_writer_null_stats(tmp_path: pathlib.Path):
    """The ``nullCount`` stats must match the nulls in each written column."""
    table_uri = str(tmp_path)

    # Columns with two, three and four nulls respectively.
    int_column = pa.array([1, None, 2, None], pa.int32())
    float_column = pa.array([1.0, None, None, None], pa.float64())
    str_column = pa.array([None] * 4, pa.string())
    data = pa.table(
        {
            "int32": int_column,
            "float64": float_column,
            "str": str_column,
        }
    )

    write_deltalake(table_uri, data)

    stats = get_stats(DeltaTable(table_uri))
    assert stats["nullCount"] == {"int32": 2, "float64": 3, "str": 4}
def test_writer_with_options(tmp_path: pathlib.Path):
    """Write with custom Parquet write options and read the same data back.

    The data is a microsecond-resolution timestamp column holding the first
    of January for the years 9000 through 9009. It is written with GZIP
    compression and ``coerce_timestamps="us"``, then read back with
    ``coerce_int96_timestamp_unit="us"``.
    """
    column_values = [datetime(year_, 1, 1, 0, 0, 0) for year_ in range(9000, 9010)]
    data = pa.table({"colA": pa.array(column_values, pa.timestamp("us"))})
    path = str(tmp_path)

    # Keep a reference to the options object and configure it in place.
    # Binding ``opts`` to the result of ``update(...)`` relies on that call
    # returning the options object; if it returns ``None`` the writer would
    # silently receive no options at all.
    opts = ParquetFileFormat().make_write_options()
    opts.update(compression="GZIP", coerce_timestamps="us")

    write_deltalake(path, data, file_options=opts)

    read_options = ParquetReadOptions(coerce_int96_timestamp_unit="us")
    dataset = DeltaTable(path).to_pyarrow_dataset(parquet_read_options=read_options)
    table = dataset.to_table()

    assert table == data
def test_roundtrip_metadata(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Name, description and configuration passed on write must be readable."""
    table_uri = str(tmp_path)
    expected_name = "test_name"
    expected_description = "test_desc"
    expected_configuration = {"configTest": "foobar"}

    write_deltalake(
        table_uri,
        sample_data,
        name=expected_name,
        description=expected_description,
        configuration={"configTest": "foobar"},
    )

    metadata = DeltaTable(table_uri).metadata()

    assert metadata.name == expected_name
    assert metadata.description == expected_description
    assert metadata.configuration == expected_configuration
def test_write_modes(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Exercise the ``error``, ``ignore``, ``append`` and ``overwrite`` modes."""
    table_uri = str(tmp_path)
    log_dir = tmp_path / "_delta_log"
    second_commit_file = "0" * 19 + "1.json"

    def read_back():
        # Open the table afresh each time so the latest state is read.
        return DeltaTable(table_uri).to_pyarrow_table()

    # Initial write with the default mode.
    write_deltalake(table_uri, sample_data)
    assert read_back() == sample_data

    # "error": writing to an existing table must raise.
    with pytest.raises(AssertionError):
        write_deltalake(table_uri, sample_data, mode="error")

    # "ignore": no second commit file may appear in the log directory.
    write_deltalake(table_uri, sample_data, mode="ignore")
    assert second_commit_file not in os.listdir(log_dir)

    # "append": the table now holds the sample data twice.
    write_deltalake(table_uri, sample_data, mode="append")
    doubled = pa.concat_tables([sample_data, sample_data])
    assert read_back() == doubled

    # "overwrite": the table is back to a single copy.
    write_deltalake(table_uri, sample_data, mode="overwrite")
    assert read_back() == sample_data
def existing_table(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Write ``sample_data`` under ``tmp_path`` and return the opened table.

    Several tests in this file take an ``existing_table`` argument;
    presumably this function is registered as the pytest fixture that
    supplies it (the decorator is not visible here).
    """
    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_data)
    table = DeltaTable(table_uri)
    return table
def test_writer_fails_on_protocol(existing_table: DeltaTable, sample_data: pa.Table):
    """A table reporting ``ProtocolVersions(1, 2)`` must be rejected on write."""
    # Replace the protocol lookup with a stub returning the mocked versions.
    mocked_versions = ProtocolVersions(1, 2)
    existing_table.protocol = Mock(return_value=mocked_versions)

    with pytest.raises(DeltaTableProtocolError):
        write_deltalake(existing_table, sample_data, mode="overwrite")
def test_writer_with_table(existing_table: DeltaTable, sample_data: pa.Table):
    """Overwrite using a ``DeltaTable`` object as the write target."""
    write_deltalake(existing_table, sample_data, mode="overwrite")

    # Refresh the already-open table object before reading from it.
    existing_table.update_incremental()

    refreshed = existing_table.to_pyarrow_table()
    assert refreshed == sample_data