示例#1
0
def test_to_spacy_all_entities():
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))

    spacy_ver = InputSample.create_spacy_dataset(input_samples)

    assert len(spacy_ver) == len(input_samples)
def test_to_spacy_all_entities_specific_entities(small_dataset):
    spacy_ver = InputSample.create_spacy_dataset(small_dataset,
                                                 entities=["PERSON"])

    spacy_ver_with_labels = [
        sample for sample in spacy_ver if len(sample[1]["entities"])
    ]

    assert len(spacy_ver_with_labels) < len(small_dataset)
    assert len(spacy_ver_with_labels) > 0
def test_to_spacy_file_and_back(small_dataset):
    spacy_pipeline = spacy.load("en_core_web_sm")
    InputSample.create_spacy_dataset(
        small_dataset,
        output_path="dataset.spacy",
        translate_tags=False,
        spacy_pipeline=spacy_pipeline,
        alignment_mode="strict",
    )

    db = DocBin()
    db.from_disk("dataset.spacy")
    docs = db.get_docs(vocab=spacy_pipeline.vocab)
    for doc, input_sample in zip(docs, small_dataset):
        input_ents = sorted(input_sample.spans, key=lambda x: x.start_position)
        spacy_ents = sorted(doc.ents, key=lambda x: x.start_char)
        for spacy_ent, input_span in zip(spacy_ents, input_ents):
            assert spacy_ent.start_char == input_span.start_position
            assert spacy_ent.end_char == input_span.end_position
示例#4
0
def test_to_spacy_all_entities_specific_entities():
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))

    spacy_ver = InputSample.create_spacy_dataset(input_samples, entities=['PERSON'])

    spacy_ver_with_labels = [sample for sample in spacy_ver if len(sample[1]['entities'])]

    assert len(spacy_ver_with_labels) < len(input_samples)
    assert len(spacy_ver_with_labels) > 0
def test_to_spacy_all_entities(small_dataset):
    spacy_ver = InputSample.create_spacy_dataset(small_dataset)

    assert len(spacy_ver) == len(small_dataset)