예제 #1
0
def main(data_file: Path, output_file: Path):
    """Upcase the labels of a dataset and report NER stats around the change.

    Loads the "train" dataset from ``data_file``, prints the result of
    ``get_ner_stats`` before and after applying the
    ``recon.v1.upcase_labels`` operation in place, then writes the dataset
    to ``output_file``.

    Args:
        data_file: Location the dataset is read from.
        output_file: Location the modified dataset is written to.
    """

    def report(heading: str, rule: str) -> None:
        # Heading and rule are printed first; the stats are computed only
        # afterwards, exactly at the moment they are printed.
        print(heading)
        print(rule)
        print(dataset.apply(get_ner_stats, serialize=True))

    dataset = Dataset("train").from_disk(data_file)

    report("STATS BEFORE", "============")

    # Trailing-underscore variant: called for its effect on the dataset,
    # its return value is not used.
    dataset.apply_("recon.v1.upcase_labels")

    report("STATS AFTER", "===========")

    # NOTE(review): force=True presumably lets to_disk create/overwrite the
    # target location -- confirm against Dataset.to_disk.
    dataset.to_disk(output_file, force=True)
예제 #2
0
def test_dataset_to_from_disk(example_data, tmp_path):
    """Check that a Dataset survives a save/load round trip.

    Covers three things, in order:

    * ``to_disk`` without ``force=True`` is expected to raise
      ``FileNotFoundError`` for the target under ``tmp_path``.
    * A dataset saved before any operation reloads with no operations and
      the same commit hash.
    * After ``recon.v1.upcase_labels`` is applied and the dataset is saved
      again, the reloaded copy carries exactly one completed operation with
      three ``EXAMPLE_CHANGED`` transformations, and its commit hash matches
      the in-memory dataset but differs from the first reload.
    """
    jsonl_path = tmp_path / "train.jsonl"

    source_ds = Dataset("train", example_data["train"])
    # Stats on the untouched data; nothing below compares against this value.
    stats_before: NERStats = cast(NERStats, source_ds.apply(get_ner_stats))

    assert len(source_ds.operations) == 0

    # Saving without force must fail.
    with pytest.raises(FileNotFoundError):
        source_ds.to_disk(jsonl_path)

    # First round trip: no operations recorded yet.
    source_ds.to_disk(jsonl_path, force=True)
    first_reload = Dataset("train").from_disk(jsonl_path)
    assert len(first_reload.operations) == 0
    assert first_reload.commit_hash == source_ds.commit_hash

    # Second round trip: one operation applied in place before saving.
    source_ds.apply_("recon.v1.upcase_labels")
    source_ds.to_disk(jsonl_path, force=True)
    second_reload = Dataset("train").from_disk(jsonl_path)

    assert len(second_reload.operations) == 1
    assert second_reload.commit_hash == source_ds.commit_hash
    assert second_reload.commit_hash != first_reload.commit_hash

    recorded_op = second_reload.operations[0]

    assert recorded_op.name == "recon.v1.upcase_labels"
    assert recorded_op.status == OperationStatus.COMPLETED
    assert len(recorded_op.transformations) == 3

    for transformation in recorded_op.transformations:
        assert transformation.type == TransformationType.EXAMPLE_CHANGED