def test_dataset_to_from_disk(example_data, tmp_path):
    train_dataset = Dataset("train", example_data["train"])
    ner_stats_pre: NERStats = cast(NERStats, train_dataset.apply(get_ner_stats))
    assert len(train_dataset.operations) == 0

    with pytest.raises(FileNotFoundError):
        train_dataset.to_disk(tmp_path / "train.jsonl")

    train_dataset.to_disk(tmp_path / "train.jsonl", force=True)
    train_dataset_loaded = Dataset("train").from_disk(tmp_path / "train.jsonl")
    assert len(train_dataset_loaded.operations) == 0
    assert train_dataset_loaded.commit_hash == train_dataset.commit_hash

    train_dataset.apply_("recon.v1.upcase_labels")
    train_dataset.to_disk(tmp_path / "train.jsonl", force=True)

    train_dataset_loaded_2 = Dataset("train").from_disk(tmp_path / "train.jsonl")
    assert len(train_dataset_loaded_2.operations) == 1
    assert train_dataset_loaded_2.commit_hash == train_dataset.commit_hash
    assert train_dataset_loaded_2.commit_hash != train_dataset_loaded.commit_hash

    op = train_dataset_loaded_2.operations[0]
    assert op.name == "recon.v1.upcase_labels"
    assert op.status == OperationStatus.COMPLETED
    assert len(op.transformations) == 3
    for t in op.transformations:
        assert t.type == TransformationType.EXAMPLE_CHANGED
def test_dataset_commit_hash(example_data):
    train_dataset = Dataset("train", example_data["train"][:-1])
    dev_dataset = Dataset("train", example_data["dev"])
    assert train_dataset.commit_hash != dev_dataset.commit_hash

    train_commit = train_dataset.commit_hash
    train_dataset.data.append(example_data["train"][-1])
    assert train_dataset.commit_hash != train_commit
    assert hash(train_dataset) == 1186038092183970443
def test_apply_(example_data):
    train_dataset = Dataset("train", example_data["train"])
    ner_stats_pre: NERStats = cast(NERStats, train_dataset.apply(get_ner_stats))
    assert len(train_dataset.operations) == 0

    train_dataset.apply_("recon.v1.upcase_labels")
    ner_stats_post: NERStats = cast(NERStats, train_dataset.apply(get_ner_stats))

    pre_keys = sorted(ner_stats_pre.n_annotations_per_type.keys())
    post_keys = sorted(ner_stats_post.n_annotations_per_type.keys())
    assert pre_keys != post_keys
    assert pre_keys == ["JOB_ROLE", "PRODUCT", "SKILL", "product", "skill"]
    assert post_keys == ["JOB_ROLE", "PRODUCT", "SKILL"]

    assert len(train_dataset.operations) == 1
    op = train_dataset.operations[0]
    assert op.name == "recon.v1.upcase_labels"
    assert op.status == OperationStatus.COMPLETED
    assert len(op.transformations) == 3
    for t in op.transformations:
        assert t.type == TransformationType.EXAMPLE_CHANGED
def test_dataset_initialize(example_data):
    dataset = Dataset("train")
    assert dataset.name == "train"
    assert dataset.data == []
    assert dataset.example_store._map == {}
    assert dataset.commit_hash == "94efdd6f628eda9c1ae893467c9652808443ef3e"
    assert dataset.operations == []

    store = ExampleStore()
    dataset2 = Dataset("dev", example_data["dev"], [], store)
    assert dataset2.name == "dev"
    assert dataset2.data == example_data["dev"]
    assert dataset2.example_store == store
    assert dataset2.commit_hash == "dd05e54668c166d075bc4406bfee590e4c89a292"
    assert dataset2.operations == []
def ner_merge(
    dataset: str,
    recon_dataset: str,
    source: Union[str, Dataset],
    output_dir: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Merge the annotations accepted in the Prodigy dataset `dataset` back into a
    recon Dataset (loaded from `source`) using the recon.v1.prodigy.merge_examples
    operation, optionally saving the merged Dataset to `output_dir`.
    """
    log("RECIPE: Starting recipe recon.ner_merge", locals())

    # Load the recon Dataset from disk if a path was passed, otherwise use it directly.
    # Kept in a separate variable so the Prodigy dataset name isn't shadowed below.
    if isinstance(source, str):
        recon_ds = Dataset(recon_dataset).from_disk(source)
    else:
        recon_ds = source

    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)

    prodigy_raw_examples = DB.get_dataset(dataset)
    prodigy_examples = [Example(**eg) for eg in prodigy_raw_examples if eg["answer"] == "accept"]
    prodigy_texts_to_examples = {e.text: e for e in prodigy_examples}

    prev_len = len(recon_ds)
    recon_ds.apply_("recon.v1.prodigy.merge_examples", prodigy_texts_to_examples)
    assert len(recon_ds) == prev_len

    if output_dir:
        log(f"RECIPE: Fixing {len(prodigy_examples)} examples in data")
        recon_ds.to_disk(output_dir)
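# A minimal usage sketch (an assumption, not from the source): once annotators have
# accepted corrections in a Prodigy dataset, call the recipe function directly to
# merge them back into the recon Dataset on disk. The dataset names and paths below
# are hypothetical placeholders.
ner_merge(
    dataset="ner_corrections",    # hypothetical Prodigy dataset holding accepted answers
    recon_dataset="train",        # hypothetical recon Dataset name
    source="./data/train.jsonl",  # hypothetical path to the serialized Dataset
    output_dir="./data/merged",   # where the merged Dataset gets written
)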
def test_apply(example_data):
    train_dataset = Dataset("train", example_data["train"])
    ner_stats: NERStats = cast(NERStats, train_dataset.apply(get_ner_stats))
    ner_stats_apply: NERStats = cast(NERStats, get_ner_stats(train_dataset.data))

    assert ner_stats.n_examples == ner_stats_apply.n_examples
    assert ner_stats.n_examples_no_entities == ner_stats_apply.n_examples_no_entities
    assert ner_stats.n_annotations == ner_stats_apply.n_annotations
    assert ner_stats.n_annotations_per_type == ner_stats_apply.n_annotations_per_type
def main(data_file: Path, output_file: Path):
    ds = Dataset("train").from_disk(data_file)

    print("STATS BEFORE")
    print("============")
    print(ds.apply(get_ner_stats, serialize=True))

    ds.apply_("recon.v1.upcase_labels")

    print("STATS AFTER")
    print("===========")
    print(ds.apply(get_ner_stats, serialize=True))
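# A minimal sketch (not part of the snippet above) of how main could be exposed as a
# CLI. It assumes the typer package is available (typer.run maps the function
# signature onto command-line arguments) and that the currently unused output_file
# parameter is intended for saving the corrected data, e.g. with ds.to_disk(output_file).
# The script and file names in the comment are hypothetical.
if __name__ == "__main__":
    import typer

    typer.run(main)  # e.g. python upcase_labels.py ./data/train.jsonl ./data/train_fixed.jsonl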
def ds():
    ds = Dataset(
        name="test",
        data=[
            Example(
                text="this is a test example with something else",
                spans=[Span(text="something", start=28, end=37, label="TEST_ENTITY")],
            )
        ],
    )
    ds.apply_("recon.v1.add_tokens")
    return ds
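# A hedged sketch of a test that consumes the ds fixture above. It only relies on
# attributes already exercised in this file (len, operations, op.name); the test name
# is hypothetical and the assertions are assumptions about the fixture, not taken
# from the source.
def test_ds_fixture_records_add_tokens(ds):
    assert len(ds) == 1
    assert len(ds.operations) == 1
    assert ds.operations[0].name == "recon.v1.add_tokens"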
def test_len(example_data):
    train_dataset = Dataset("train", example_data["train"])
    assert len(train_dataset) == len(example_data["train"])