Example #1
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    import time

    import dacy
    from spacy.tokens import Doc

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    ner = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for sent in sentences_tokens:
        # Build a pre-tokenized Doc so DaCy's own tokenizer is bypassed,
        # then run only the transformer and NER components.
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = ner(doc)
        # Convert the token-level predictions to BIO tags.
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                ents.append(t.ent_iob_ + "-" + t.ent_type_)
        predictions.append(ents)

    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
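
These benchmark functions rely on module-level state from the surrounding benchmark script. A minimal sketch of that harness, using toy data in place of the DaNE test split; print_speed_performance here is a hypothetical stand-in, and the evaluation helpers (f1_report, remove_miscs, accuracy_report, dependency_report) come from the original script and are not reproduced.

import time

# Hypothetical stand-ins for the globals the benchmark functions expect.
# In the original script these come from the DaNE test split.
sentences_tokens = [["Peter", "bor", "i", "København", "."]]
sentences_entities = [["B-PER", "O", "O", "B-LOC", "O"]]
num_sentences = len(sentences_tokens)
num_tokens = sum(len(s) for s in sentences_tokens)

def print_speed_performance(start, num_sentences, num_tokens):
    # Hypothetical helper: report throughput since `start`.
    elapsed = time.time() - start
    print("{:.1f} sentences/s, {:.1f} tokens/s".format(
        num_sentences / elapsed, num_tokens / elapsed))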
Example #2
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    import time

    import dacy
    from spacy.tokens import Doc

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    tagger = nlp.get_pipe('tagger')

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        # Run only the transformer and tagger on a pre-tokenized Doc.
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = tagger(doc)

        tags_pred.append([tok.tag_ for tok in doc])

    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
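
Example #2 additionally assumes gold tags tags_true aligned with sentences_tokens, plus an accuracy_report helper from the same script. A minimal sketch with a hypothetical token-level accuracy implementation:

tags_true = [["PROPN", "VERB", "ADP", "PROPN", "PUNCT"]]  # gold tags for the toy sentence above

def accuracy_report(tags_true, tags_pred):
    # Hypothetical stand-in: flat token-level accuracy.
    gold = [t for sent in tags_true for t in sent]
    pred = [t for sent in tags_pred for t in sent]
    correct = sum(g == p for g, p in zip(gold, pred))
    return "Accuracy: {:.4f}".format(correct / len(gold))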
Example #3
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    import time

    import dacy
    from spacy.tokens import Doc

    def normalize_spacy_head(i, hd):
        # spaCy: the root token heads itself (0-based indices).
        # CoNLL: the root's head is 0; all other heads are 1-based.
        return 0 if i == hd else hd + 1

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    parser = nlp.get_pipe('parser')

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        # Run only the transformer and parser on a pre-tokenized Doc.
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = parser(doc)

        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append(list(zip(deprels, depheads)))

    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum(len(s) for s in deps_pred) == num_tokens

    print(dependency_report(deps_true, deps_pred))
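
The normalize_spacy_head helper bridges two head conventions: spaCy marks the root token by letting it head itself (with 0-based indices), while the CoNLL format uses 0 for the root and 1-based indices everywhere else. A small self-contained illustration:

def normalize_spacy_head(i, hd):
    return 0 if i == hd else hd + 1

# "Peter bor her": "bor" (index 1) is the root; "Peter" and "her" attach to it.
spacy_heads = [1, 1, 1]  # 0-based head indices as spaCy reports them
print([normalize_spacy_head(i, hd) for i, hd in enumerate(spacy_heads)])  # [2, 0, 2]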
Example #4
import dacy
from spacy.tokens import Doc


def test_LIX():
    # `LIX_getter` is defined elsewhere in the test module.
    Doc.set_extension("LIX", getter=LIX_getter)

    nlp = dacy.load("da_dacy_medium_tft-0.0.0")
    doc = nlp("Dette er en test tekst")
    doc._.LIX  # smoke test: accessing the extension should not raise
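
LIX_getter itself is not shown above. LIX (læsbarhedsindeks) is the standard Danish readability score: average sentence length plus the percentage of long words (more than six letters). A minimal sketch assuming that formula; the project's actual getter may differ:

def LIX_getter(doc):
    # LIX = words / sentences + long_words * 100 / words,
    # where a long word has more than six characters.
    words = [t for t in doc if not t.is_punct]
    if not words:
        return 0.0
    n_sents = max(1, len(list(doc.sents)))  # doc.sents needs sentence boundaries (e.g. from the parser)
    long_words = [t for t in words if len(t.text) > 6]
    return len(words) / n_sents + len(long_words) * 100 / len(words)
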
import multiprocessing
import os
from sys import platform
from typing import List

import dacy
import spacy
import torch

spacy.prefer_gpu()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    # Use "fork" on Linux/macOS so the already-loaded model is shared with workers.
    if platform == "linux" or platform == "linux2" or platform == "darwin":
        multiprocessing.set_start_method("fork")
    # elif platform == "win32":
    #     multiprocessing.set_start_method("spawn")
except RuntimeError:
    # The start method can only be set once per process.
    pass

######### DaCy multiprocessing hack START #########
# Hack to make DaCy usable with multiprocessing under both spawn and fork
# (spaCy 3.0 models cannot be pickled).
torch.set_num_threads(1)  # keep each worker single-threaded to avoid oversubscription
num_cpus: int = int(os.cpu_count())  # type: ignore
ner_model = dacy.load("da_dacy_large_tft-0.0.0")


def worker(text: List[str]):  # type: ignore
    # Each worker pipes its whole chunk through the shared, pre-loaded model.
    return list(ner_model.pipe(text, batch_size=len(text)))


######### DaCy multiprocessing hack END #########
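
With the model loaded once at import time, worker can be mapped over chunks of a corpus with a standard multiprocessing.Pool. A hedged usage sketch; the chunking helper and the texts are illustrative only:

def chunks(texts: List[str], n: int) -> List[List[str]]:
    # Split the corpus into batches of size n, one batch per worker call.
    return [texts[i:i + n] for i in range(0, len(texts), n)]

if __name__ == "__main__":
    texts = ["Peter bor i København.", "Han arbejder hos Netto."]
    with multiprocessing.Pool(processes=num_cpus) as pool:
        docs = [doc for batch in pool.map(worker, chunks(texts, 10)) for doc in batch]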


class TextAnonymizer(object):
    """
    Wraps a text corpus and applies masking functions for anonymization.

    Args:
        corpus: The corpus, given as a list of strings