def main(data_file: Path, output_file: Path):
    """Upper-case the labels of a dataset and report NER stats around the change.

    The "train" dataset is loaded from ``data_file``, its NER stats are
    printed, the ``recon.v1.upcase_labels`` operation is applied in place,
    the stats are printed again, and the dataset is written to
    ``output_file``.

    Args:
        data_file: Path the dataset is loaded from.
        output_file: Path the modified dataset is written to.
    """
    loader = Dataset("train")
    dataset = loader.from_disk(data_file)

    # Snapshot of the stats as loaded, before any operation runs.
    print("STATS BEFORE")
    print("============")
    stats_before = dataset.apply(get_ner_stats, serialize=True)
    print(stats_before)

    dataset.apply_("recon.v1.upcase_labels")

    # Same report again so the effect of the operation can be compared.
    print("STATS AFTER")
    print("===========")
    stats_after = dataset.apply(get_ner_stats, serialize=True)
    print(stats_after)

    # NOTE(review): force=True presumably lets to_disk create/overwrite the
    # output location -- confirm against Dataset.to_disk.
    dataset.to_disk(output_file, force=True)
def test_dataset_to_from_disk(example_data, tmp_path):
    """Round-trip a Dataset through disk, before and after an operation.

    Checks that:
    * saving without ``force=True`` raises ``FileNotFoundError``;
    * a dataset saved with no operations reloads with no operations and the
      same commit hash;
    * after applying ``recon.v1.upcase_labels`` and saving again, the
      reloaded dataset carries that single completed operation, matches the
      in-memory commit hash, and differs from the first reload's hash.
    """
    target = tmp_path / "train.jsonl"

    original = Dataset("train", example_data["train"])
    # The stats themselves are not asserted on; the call is kept as in the
    # original test.
    stats_before: NERStats = cast(NERStats, original.apply(get_ner_stats))
    assert len(original.operations) == 0

    # NOTE(review): presumably fails because nothing has been created under
    # tmp_path yet -- confirm against Dataset.to_disk.
    with pytest.raises(FileNotFoundError):
        original.to_disk(target)

    # First round trip: no operations recorded yet.
    original.to_disk(target, force=True)
    reloaded_clean = Dataset("train").from_disk(target)
    assert len(reloaded_clean.operations) == 0
    assert reloaded_clean.commit_hash == original.commit_hash

    # Second round trip: one operation applied before saving.
    original.apply_("recon.v1.upcase_labels")
    original.to_disk(target, force=True)
    reloaded_after_op = Dataset("train").from_disk(target)

    assert len(reloaded_after_op.operations) == 1
    assert reloaded_after_op.commit_hash == original.commit_hash
    assert reloaded_after_op.commit_hash != reloaded_clean.commit_hash

    operation = reloaded_after_op.operations[0]
    assert operation.name == "recon.v1.upcase_labels"
    assert operation.status == OperationStatus.COMPLETED
    assert len(operation.transformations) == 3
    for transformation in operation.transformations:
        assert transformation.type == TransformationType.EXAMPLE_CHANGED