def test_read_simple_table_update_incremental():
    """Load version 0 of the simple table, then check its contents after `update_incremental()`."""
    dt = DeltaTable("../rust/tests/data/simple_table", version=0)

    # Contents as pinned at version 0.
    rows_at_v0 = dt.to_pyarrow_dataset().to_table().to_pydict()
    assert rows_at_v0 == {"id": [0, 1, 2, 3, 4]}

    # The same handle must expose different contents once it has been updated.
    dt.update_incremental()
    rows_after_update = dt.to_pyarrow_dataset().to_table().to_pydict()
    assert rows_after_update == {"id": [5, 7, 9]}
def test_write_recordbatchreader(
    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
):
    """Write a `RecordBatchReader` in overwrite mode and read back a table equal to `sample_data`."""
    source_batches = existing_table.to_pyarrow_dataset().to_batches()
    batch_reader = RecordBatchReader.from_batches(sample_data.schema, source_batches)

    destination = str(tmp_path)
    write_deltalake(destination, batch_reader, mode="overwrite")

    round_tripped = DeltaTable(destination).to_pyarrow_table()
    assert round_tripped == sample_data
def test_read_table_with_column_subset():
    """Request only `value` and `day` from the partitioned table and check exactly those columns come back."""
    dt = DeltaTable("../rust/tests/data/delta-0.8.0-partitioned")
    selected_columns = ["value", "day"]

    projected = dt.to_pyarrow_dataset().to_table(columns=selected_columns)

    assert projected.to_pydict() == {
        "value": ["1", "2", "3", "6", "7", "5", "4"],
        "day": ["1", "3", "5", "20", "20", "4", "5"],
    }
def test_read_partitioned_table_to_dict():
    """Read the whole partitioned table and compare all four columns against the expected values."""
    dt = DeltaTable("../rust/tests/data/delta-0.8.0-partitioned")

    actual = dt.to_pyarrow_dataset().to_table().to_pydict()

    assert actual == {
        "value": ["1", "2", "3", "6", "7", "5", "4"],
        "year": ["2020", "2020", "2020", "2021", "2021", "2021", "2021"],
        "month": ["1", "2", "2", "12", "12", "12", "4"],
        "day": ["1", "3", "5", "20", "20", "4", "5"],
    }
def test_read_table_with_edge_timestamps():
    """Read a table holding year-9999 dates with int96 timestamps coerced to milliseconds."""
    dt = DeltaTable("../rust/tests/data/table_with_edge_timestamps")

    # The dataset is opened with int96 timestamps coerced to millisecond units.
    read_options = ParquetReadOptions(coerce_int96_timestamp_unit="ms")
    dataset = dt.to_pyarrow_dataset(parquet_read_options=read_options)

    expected = {
        "BIG_DATE": [datetime(9999, 12, 31), datetime(9999, 12, 30)],
        "NORMAL_DATE": [datetime(2022, 1, 1), datetime(2022, 2, 1)],
        "SOME_VALUE": [1, 2],
    }
    assert dataset.to_table().to_pydict() == expected
def test_write_iterator(
    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
):
    """Write an iterator of batches: rejected without a schema, accepted with one."""
    destination = str(tmp_path)
    batch_iter = existing_table.to_pyarrow_dataset().to_batches()

    # Without `schema=` the write must raise ValueError.
    # NOTE(review): presumably because a bare iterator carries no schema — confirm.
    with pytest.raises(ValueError):
        write_deltalake(destination, batch_iter, mode="overwrite")

    # The same iterator object is reused for the successful write.
    write_deltalake(
        destination, batch_iter, schema=sample_data.schema, mode="overwrite"
    )

    round_tripped = DeltaTable(destination).to_pyarrow_table()
    assert round_tripped == sample_data
def test_read_table_with_filter():
    """Filter the partitioned table on year and month; check the fragment count and the rows."""
    dt = DeltaTable("../rust/tests/data/delta-0.8.0-partitioned")
    dataset = dt.to_pyarrow_dataset()

    is_2021 = ds.field("year") == "2021"
    is_december = ds.field("month") == "12"
    filter_expr = is_2021 & is_december

    # Exactly two fragments must survive the filter.
    matching_fragments = list(dataset.get_fragments(filter=filter_expr))
    assert len(matching_fragments) == 2

    filtered_rows = dataset.to_table(filter=filter_expr).to_pydict()
    assert filtered_rows == {
        "value": ["6", "7", "5"],
        "year": ["2021", "2021", "2021"],
        "month": ["12", "12", "12"],
        "day": ["20", "20", "4"],
    }
def test_read_table_with_stats():
    """Check fragment selection and row counts for two filters over the COVID-19_NYT table."""
    dt = DeltaTable("../rust/tests/data/COVID-19_NYT")
    dataset = dt.to_pyarrow_dataset()

    # Filter on `date`: two fragments are expected to match.
    recent_dates = ds.field("date") > "2021-02-20"
    recent_fragments = list(dataset.get_fragments(filter=recent_dates))
    assert len(recent_fragments) == 2
    recent_rows = dataset.to_table(filter=recent_dates)
    # NOTE(review): the two addends look like per-file row counts — confirm.
    assert recent_rows.num_rows < 147181 + 47559

    # Filter on `cases`: no fragment and no row is expected to match.
    negative_cases = ds.field("cases") < 0
    negative_fragments = list(dataset.get_fragments(filter=negative_cases))
    assert len(negative_fragments) == 0
    negative_rows = dataset.to_table(filter=negative_cases)
    assert negative_rows.num_rows == 0
def test_read_simple_table_to_dict():
    """Open the simple table without a version and check its `id` column."""
    dt = DeltaTable("../rust/tests/data/simple_table")

    actual = dt.to_pyarrow_dataset().to_table().to_pydict()

    assert actual == {"id": [5, 7, 9]}
def test_read_simple_table_by_version_to_dict():
    """Open the delta-0.2.0 table at version 2 and check its `value` column."""
    dt = DeltaTable("../rust/tests/data/delta-0.2.0", version=2)

    actual = dt.to_pyarrow_dataset().to_table().to_pydict()

    assert actual == {"value": [1, 2, 3]}
def test_read_simple_table_using_options_to_dict():
    """Open the delta-0.2.0 table at version 2 with empty `storage_options` and check `value`."""
    dt = DeltaTable(
        "../rust/tests/data/delta-0.2.0",
        version=2,
        storage_options={},
    )

    actual = dt.to_pyarrow_dataset().to_table().to_pydict()

    assert actual == {"value": [1, 2, 3]}
def test_read_empty_delta_table_after_delete():
    """Read the delta-0.8-empty table and check it yields one column with no rows."""
    dt = DeltaTable("../rust/tests/data/delta-0.8-empty")

    actual = dt.to_pyarrow_dataset().to_table().to_pydict()

    assert actual == {"column": []}