Example #1
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    Morfessor==2.0.6
    PyICU==2.4.2
    pycld2==0.41
    polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    # Build the chunker once; constructing it per sentence would skew the timing.
    ne_chunker = NEChunker(lang='da')

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))

        predictions.append([entity for word, entity in word_ent_tuples])
    print('polyglot:')
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(predictions) == len(sentences_entities)

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
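All of the benchmarks in this listing rely on the same module-level fixtures: `sentences_tokens`, `sentences_entities`, `num_sentences`, `num_tokens`, and helpers such as `print_speed_performance`, `f1_report`, and `remove_miscs`. A minimal sketch of their assumed shapes follows; the data and the helper body are hypothetical, inferred only from how the fixtures are used above.

import time

# Hypothetical fixtures: one token list and one gold BIO tag list per sentence.
sentences_tokens = [["Jens", "bor", "i", "København", "."]]
sentences_entities = [["B-PER", "O", "O", "B-LOC", "O"]]
num_sentences = len(sentences_tokens)
num_tokens = sum(len(s) for s in sentences_tokens)

def print_speed_performance(start, n_sentences, n_tokens=None):
    # Hypothetical helper: report sentences/s (and tokens/s when given).
    elapsed = time.time() - start
    msg = f"{n_sentences / elapsed:.1f} sentences/s"
    if n_tokens is not None:
        msg += f", {n_tokens / elapsed:.1f} tokens/s"
    print(msg)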
Example #2
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    from spacy.tokens import Doc
    import dacy
    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    ner = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for tokens in sentences_tokens:
        doc = Doc(nlp.vocab, words=tokens)  # pre-tokenized Doc preserves the gold tokenization
        doc = trf(doc)
        doc = ner(doc)
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                ents.append(t.ent_iob_ + "-" + t.ent_type_)

        predictions.append(ents)
    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
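A possible driver for comparing DaCy model sizes. The small/medium names below mirror the default argument above and are an assumption; availability depends on the installed DaCy release.

for size in ("small", "medium", "large"):
    # Hypothetical model names, patterned after the default argument above.
    benchmark_dacy_mdl(dacy_model=f"da_dacy_{size}_tft-0.0.0")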
Example #3
def benchmark_daluke_mdl():
    from daluke import AutoNERDaLUKE, predict_ner

    model = AutoNERDaLUKE()
    sents = [" ".join(s) for s in sentences_tokens]

    start = time.time()
    predictions = predict_ner(sents, model)
    print('DaLUKE:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #4
def benchmark_bert_mdl():
    bert = load_bert_ner_model()

    start = time.time()

    predictions = []
    for sentence in sentences_tokens:
        _, pred_ents = bert.predict(sentence)
        predictions.append(pred_ents)
    print('BERT:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #5
def benchmark_xlmr_mdl():
    """Benchmark XLM-R on named entity disambiguation (DaNED). Note that
    `sentences`, `kgs` and `gold_tags` are module-level fixtures separate
    from the NER fixtures used in the other examples."""
    xlmr = load_xlmr_ned_model()

    start = time.time()

    predictions = []
    for sent, kg in zip(sentences, kgs):
        pred = xlmr.predict(sent, kg)
        predictions.append(pred)
    print('XLM-R:')
    print_speed_performance(start, num_sentences)

    assert len(predictions) == num_sentences

    print(f1_report(gold_tags, predictions, 'XLM-R', 'DaNED'))
Example #6
def benchmark_nerda_electra_mdl():

    from NERDA.precooked import DA_ELECTRA_DA  # assumed import (NERDA package)

    nerda = DA_ELECTRA_DA()
    nerda.download_network()
    nerda.load_network()

    start = time.time()

    predictions = nerda.predict(sentences_tokens)

    print('NERDA DA electra:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #7
def benchmark_nerda_multi_mdl():

    from NERDA.precooked import DA_BERT_ML  # assumed import (NERDA package)

    nerda = DA_BERT_ML()
    nerda.download_network()
    nerda.load_network()

    start = time.time()

    predictions = nerda.predict(sentences_tokens)

    print('NERDA multilingual:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
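The two NERDA benchmarks above differ only in the precooked class and the printed name, so they can be collapsed into one parametrised helper. A sketch, assuming the same module-level fixtures as the rest of this listing:

def benchmark_nerda(model_cls, name):
    # model_cls is a NERDA precooked class, e.g. DA_BERT_ML or DA_ELECTRA_DA.
    nerda = model_cls()
    nerda.download_network()
    nerda.load_network()

    start = time.time()
    predictions = nerda.predict(sentences_tokens)

    print(f'NERDA {name}:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))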
Example #8
def benchmark_spacy_mdl():
    """Benchmark the spaCy-based chunker; `chunker` and the gold BIO chunk
    tags `chks_true` are module-level fixtures. Unlike the other examples,
    this measures chunking rather than NER."""

    start = time.time()

    chks_pred = []
    for sent in sentences_tokens:
        bio_chunks = chunker.predict(sent)
        chks_pred.append(bio_chunks)

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(chks_pred) == num_sentences
    assert sum(len(s) for s in chks_pred) == num_tokens

    print(f1_report(chks_true, chks_pred, bio=True))
Example #9
def benchmark_scandiner_mdl():

    from transformers import AutoModelForTokenClassification, AutoTokenizer
    import torch

    model_name = "saattrupdan/nbailab-base-ner-scandi"
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    id2labels = model.config.id2label

    tokenized_sents = []
    for sentence in sentences_tokens:
        # Mark the first subword of each word with 1; the mask is used below
        # to map subword-level predictions back to word level.
        tok_mask = []
        for tok in sentence:
            t = tokenizer.tokenize(tok)
            tok_mask += [1] + [0] * (len(t) - 1)
        inputs = tokenizer.encode(sentence,
                                  return_tensors="pt",
                                  is_split_into_words=True)
        assert len(inputs[0]) == len(tok_mask) + 2  # +2 for the special tokens
        tokenized_sents.append((inputs, tok_mask))

    start = time.time()
    predictions = []
    for inputs, mask in tokenized_sents:  # renamed to avoid shadowing the builtin `input`
        with torch.no_grad():  # inference only; no gradients needed
            preds = model(inputs)[0]
        preds = torch.argmax(preds, dim=2)
        preds = preds[0].tolist()[1:-1]  # drop the special-token positions
        assert len(preds) == len(mask)
        pred_ents = [id2labels[p] for p, m in zip(preds, mask) if m]
        assert len(pred_ents) == sum(mask)
        predictions.append(pred_ents)
    print('ScandiNER:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
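With a fast tokenizer the manual subword mask can be replaced by `word_ids()`, which maps each subword position back to its word index. A sketch of the same first-subword strategy, assuming `AutoTokenizer` returns a fast tokenizer for this model:

enc = tokenizer(["Jens", "bor", "i", "København"],
                is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits
pred_ids = logits.argmax(dim=-1)[0].tolist()

labels, seen = [], set()
for idx, word_id in enumerate(enc.word_ids()):
    if word_id is not None and word_id not in seen:  # first subword of a word
        seen.add(word_id)
        labels.append(model.config.id2label[pred_ids[idx]])
print(labels)  # one label per input word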
Example #10
def benchmark_flair_mdl():
    from flair.data import Sentence, Token  # assumed import, matching the usage below

    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for sentence in sentences_tokens:
        flair_sentence = Sentence()

        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.get_tag('ner').value for tok in fs]
                   for fs in flair_sentences]
    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
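Recent flair releases can build a pre-tokenized `Sentence` directly from a list of strings, which shortens the construction loop above. A sketch; the exact constructor and label APIs vary across flair versions:

from flair.data import Sentence

fs = Sentence(["Jens", "bor", "i", "København"])  # pre-tokenized input
tagger.predict(fs)
print([tok.get_tag('ner').value for tok in fs])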
Example #11
def benchmark_spacy_mdl():
    nlp = load_spacy_model()
    ner = nlp.entity  # spaCy v2 shortcut for the NER pipe (use nlp.get_pipe('ner') in v3)

    predictions = []
    start = time.time()
    for tokens in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(tokens)  # spaCy v2 API, removed in v3
        ner(doc)
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                ents.append(t.ent_iob_ + "-" + t.ent_type_)

        predictions.append(ents)
    print('spaCy:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
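Under spaCy v3 the same pre-tokenized flow replaces the deprecated calls above with a hand-built `Doc`, as in Example #2. A sketch, assuming `load_spacy_model` can return a v3 pipeline:

from spacy.tokens import Doc

doc = Doc(nlp.vocab, words=["Jens", "bor", "i", "København"])
doc = nlp(doc)  # spaCy v3 accepts a pre-built Doc, preserving the tokenization
print([(t.text, t.ent_iob_, t.ent_type_) for t in doc])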