Example #1
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    # Construct the chunker once, outside the timed loop, matching the other
    # benchmarks, which load their model before timing starts.
    ne_chunker = NEChunker(lang='da')

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))

        predictions.append([entity for word, entity in word_ent_tuples])
    print('polyglot:')
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(predictions) == len(sentences_entities)

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
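All of the benchmark functions in this section assume a shared module-level harness. Below is a minimal sketch of the assumed globals and helpers; the names are taken from the snippets themselves, but the sample data and helper bodies are hypothetical stand-ins (in DaNLP the gold data would come from the DaNE/DDT test split), and f1_report, accuracy_report and dependency_report are likewise assumed scoring helpers not sketched here.

import time

# Hypothetical stand-ins for the gold test data referenced by every benchmark.
sentences_tokens = [["Jens", "bor", "i", "København", "."]]
sentences_entities = [["B-PER", "O", "O", "B-LOC", "O"]]
num_sentences = len(sentences_tokens)
num_tokens = sum(len(s) for s in sentences_tokens)

def print_speed_performance(start, n_sentences, n_tokens=None):
    # Assumed helper: report throughput since `start`.
    elapsed = time.time() - start
    msg = 'Made predictions on {} sentences'.format(n_sentences)
    if n_tokens is not None:
        msg += ' and {} tokens'.format(n_tokens)
    print(msg + ' in {:.2f}s'.format(elapsed))

def remove_miscs(tag_lists):
    # Assumed helper: map MISC entities to 'O' before scoring.
    return [['O' if t.endswith('MISC') else t for t in tags]
            for tags in tag_lists]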
Example #2
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    from spacy.tokens import Doc
    import dacy
    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    ner = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for tokens in sentences_tokens:
        # Build a pre-tokenized Doc and run only the transformer + NER pipes.
        doc = Doc(nlp.vocab, words=tokens)
        doc = trf(doc)
        doc = ner(doc)
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                ents.append(t.ent_iob_ + "-" + t.ent_type_)

        predictions.append(ents)
    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
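Both here and in the spaCy NER benchmark further down, BIO labels are rebuilt from spaCy's split representation: `ent_iob_` holds "B"/"I"/"O" and `ent_type_` holds the entity class. A minimal illustration of that join (`to_bio` is just an illustrative helper, not part of the benchmark):

# spaCy stores "B-PER" as ent_iob_ == "B" plus ent_type_ == "PER";
# "O" tokens carry no entity type.
def to_bio(iob, ent_type):
    return iob if iob == "O" else iob + "-" + ent_type

assert to_bio("B", "PER") == "B-PER"
assert to_bio("I", "LOC") == "I-LOC"
assert to_bio("O", "") == "O"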
Example #3
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    import dacy
    from spacy.tokens import Doc
    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    tagger = nlp.get_pipe('tagger')

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = tagger(doc)

        tags = [tok.tag_ for tok in doc]
        tags_pred.append(tags)
    print('**DaCy model ({})**'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
Example #4
def benchmark_polyglot_mdl(corrected_output=False):
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot

    """
    def udify_tag(tag, word):
        if tag == "CONJ":
            return "CCONJ"
        if tag == "VERB" and word in auxiliary_verbs:
            return "AUX"
        return tag

    from polyglot.tag import POSTagger
    from polyglot.text import WordList

    # Construct the tagger once, outside the timed loop.
    tagger = POSTagger(lang='da')

    start = time.time()

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        word_tag_tuples = list(tagger.annotate(word_list))
        tags_pred.append([
            udify_tag(tag, word) if corrected_output else tag
            for word, tag in word_tag_tuples
        ])
    print('**Polyglot model' +
          (' (corrected output) ' if corrected_output else '') + '**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
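For reference, a standalone sketch of what the corrected-output mapping above does. `auxiliary_verbs` is an assumed module-level set of Danish auxiliary forms; the example set below is illustrative only:

auxiliary_verbs = {"er", "var", "har", "havde", "kan", "skal", "vil"}  # assumed

def udify_tag(tag, word):
    # polyglot emits the legacy "CONJ" tag; UD replaces it with CCONJ/SCONJ,
    # and the correction maps it to CCONJ here.
    if tag == "CONJ":
        return "CCONJ"
    # UD distinguishes auxiliaries (AUX) from main verbs (VERB).
    if tag == "VERB" and word in auxiliary_verbs:
        return "AUX"
    return tag

assert udify_tag("CONJ", "og") == "CCONJ"
assert udify_tag("VERB", "har") == "AUX"
assert udify_tag("NOUN", "hund") == "NOUN"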
Example #5
def benchmark_spacy_mdl():
    def normalize_spacy_head(i, hd):
        return 0 if i == hd else hd + 1

    nlp = load_spacy_model()
    # Note: nlp.parser and tokens_from_list (below) are spaCy v2 API; both
    # were removed in spaCy v3.
    parser = nlp.parser

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(sent)
        doc = parser(doc)

        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))
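The head normalization converts spaCy's convention (0-based indices, with the root token being its own head) to the CoNLL-U convention (1-based heads, with 0 marking the root). A small worked example:

def normalize_spacy_head(i, hd):
    return 0 if i == hd else hd + 1

# For ["Han", "sover", "nu"] with "sover" as root, spaCy's 0-based heads
# are [1, 1, 1] ("sover" heads itself); CoNLL-U expects [2, 0, 2].
assert [normalize_spacy_head(i, h) for i, h in enumerate([1, 1, 1])] == [2, 0, 2]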
Example #6
def benchmark_stanza_mdl():
    import stanza

    # tokenize_pretokenized=True makes stanza treat whitespace as the token
    # boundary, so " ".join(sent) preserves the gold tokenization.
    nlp = stanza.Pipeline('da',
                          processors='tokenize,pos,lemma,depparse',
                          tokenize_pretokenized=True)

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        doc = nlp(" ".join(sent))

        deprels = []
        depheads = []
        for tok in doc.iter_tokens():
            deprels.append(tok.words[0].deprel)
            depheads.append(tok.words[0].head)
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**Stanza model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))
Example #7
def benchmark_daluke_mdl():
    from daluke import AutoNERDaLUKE, predict_ner

    model = AutoNERDaLUKE()
    sents = [" ".join(s) for s in sentences_tokens]

    start = time.time()
    predictions = predict_ner(sents, model)
    print('DaLUKE:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #8
def benchmark_flair_mdl():
    tagger = load_flair_pos_model()
    
    start = time.time()
    tagger.predict(corpus_flair.test)
    tags_pred = [[tok.tags['upos'].value for tok in fs] for fs in corpus_flair.test]
    
    print('**Flair model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens
    
    print(accuracy_report(tags_true, tags_pred), end="\n\n")
Example #9
def benchmark_bert_mdl():
    bert = load_bert_ner_model()

    start = time.time()

    predictions = []
    for sentence in sentences_tokens:
        _, pred_ents = bert.predict(sentence)
        predictions.append(pred_ents)
    print('BERT:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #10
def benchmark_xlmr_mdl():
    xlmr = load_xlmr_ned_model()

    start = time.time()

    predictions = []
    for sent, kg in zip(sentences, kgs):
        # Each sentence is scored together with its knowledge-graph context.
        pred = xlmr.predict(sent, kg)
        predictions.append(pred)
    print('XLMR:')
    print_speed_performance(start, num_sentences)

    assert len(predictions) == num_sentences

    print(f1_report(gold_tags, predictions, 'XLM-R', 'DaNED'))
Example #11
def benchmark_spacy_mdl():

    start = time.time()

    chks_pred = []
    for sent in sentences_tokens:
        # `chunker` is assumed to be loaded at module level.
        bio_chunks = chunker.predict(sent)
        chks_pred.append(bio_chunks)

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(chks_pred) == num_sentences
    assert sum([len(s) for s in chks_pred]) == num_tokens

    print(f1_report(chks_true, chks_pred, bio=True))
Example #12
def benchmark_nerda_electra_mdl():
    from NERDA.precooked import DA_ELECTRA_DA

    nerda = DA_ELECTRA_DA()
    nerda.download_network()
    nerda.load_network()

    start = time.time()

    predictions = nerda.predict(sentences_tokens)

    print('NERDA DA electra:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #13
def benchmark_nerda_multi_mdl():
    from NERDA.precooked import DA_BERT_ML

    nerda = DA_BERT_ML()
    nerda.download_network()
    nerda.load_network()

    start = time.time()

    predictions = nerda.predict(sentences_tokens)

    print('NERDA multilingual:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #14
def benchmark_xlmr_mdl():

    from allennlp.data.data_loaders import SimpleDataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
Example #15
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy model benchmark, compatible with spaCy v3.

    Running this requires:
    spacy >= 3.0.0
    spacy-transformers
    """
    def normalize_spacy_head(i, hd):
        return 0 if i == hd else hd + 1

    from spacy.tokens import Doc
    import dacy
    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    parser = nlp.get_pipe('parser')

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = parser(doc)

        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**DaCy model ({})**'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))
Example #16
def benchmark_scandiner_mdl():

    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer
    import torch

    model_name = "saattrupdan/nbailab-base-ner-scandi"
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    id2labels = model.config.id2label

    tokenized_sents = []
    for sentence in sentences_tokens:
        # Mark the first sub-token of each word with 1, continuations with 0.
        tok_mask = []
        for tok in sentence:
            t = tokenizer.tokenize(tok)
            tok_mask += [1] + [0] * (len(t) - 1)
        inputs = tokenizer.encode(sentence,
                                  return_tensors="pt",
                                  is_split_into_words=True)
        # +2 accounts for the special start/end tokens added by encode().
        assert (len(inputs[0]) == len(tok_mask) + 2)
        tokenized_sents.append((inputs, tok_mask))

    start = time.time()
    predictions = []
    for inputs, mask in tokenized_sents:
        # Forward pass; logits have shape (1, seq_len, num_labels).
        with torch.no_grad():
            logits = model(inputs)[0]
        preds = torch.argmax(logits, dim=2)
        # Drop the two special-token positions at the start and end.
        preds = preds[0].tolist()[1:-1]
        assert (len(preds) == len(mask))
        # Keep only the prediction for the first sub-token of each word.
        pred_ents = [id2labels[p] for p, m in zip(preds, mask) if m]
        assert (len(pred_ents) == sum(mask))
        predictions.append(pred_ents)
    print('ScandiNER:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
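The tok_mask bookkeeping above keeps exactly one prediction per gold token: a 1 marks the first sub-token of each word and a 0 marks continuations. A self-contained sketch with a hypothetical sub-token split (no tokenizer download needed):

def first_subword_mask(subword_lists):
    # Keep a 1 for the first sub-token of each word, 0 for the rest.
    mask = []
    for pieces in subword_lists:
        mask += [1] + [0] * (len(pieces) - 1)
    return mask

# e.g. "København" -> ["Køben", "##havn"], "sover" -> ["sover"]  (assumed split)
assert first_subword_mask([["Køben", "##havn"], ["sover"]]) == [1, 0, 1]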
Example #17
def benchmark_flair_mdl():
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for sentence in sentences_tokens:
        # Rebuild flair Sentences token by token to keep the gold tokenization.
        flair_sentence = Sentence()
        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.get_tag('ner').value for tok in fs]
                   for fs in flair_sentences]
    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #18
def benchmark_spacy_mdl():
    nlp = load_spacy_model()
    # Note: nlp.entity and tokens_from_list (below) are spaCy v2 API; both
    # were removed in spaCy v3.
    ner = nlp.entity

    predictions = []
    start = time.time()
    for tokens in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(tokens)
        ner(doc)
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                ents.append(t.ent_iob_ + "-" + t.ent_type_)

        predictions.append(ents)
    print('spaCy:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #19
def benchmark_stanza_mdl():
    import stanza

    # Pretokenized mode: whitespace in " ".join(sent) marks token boundaries.
    nlp = stanza.Pipeline('da',
                          processors='tokenize,pos',
                          tokenize_pretokenized=True)

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        doc = nlp(" ".join(sent))

        tags = []
        for tok in doc.iter_tokens():
            tags.append(tok.words[0].upos)
        tags_pred.append(tags)

    print('**Stanza model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
Example #20
def benchmark_spacy_mdl():
    nlp = load_spacy_model()
    # Note: nlp.tagger and tokens_from_list (below) are spaCy v2 API; both
    # were removed in spaCy v3.
    tagger = nlp.tagger

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(sent)
        doc = tagger(doc)

        tags = [tok.pos_ for tok in doc]
        tags_pred.append(tags)
    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")