def benchmark_polyglot_mdl():
    """
    Benchmark the polyglot Danish named-entity chunker.

    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot

    Reads the module-level ``sentences_tokens`` / ``sentences_entities``,
    prints the speed figures and the F1 report, and returns nothing.
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    # The chunker does not depend on the sentence, so it is built once and
    # before the clock starts; constructing it per sentence only added
    # set-up overhead to the measured time.
    ne_chunker = NEChunker(lang='da')

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        # annotate yields (word, entity) pairs; only the entity tag is kept
        predictions.append(
            [entity for _, entity in ne_chunker.annotate(word_list)]
        )

    print('polyglot:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == len(sentences_entities)

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    Benchmark a DaCy model; an adaptation of the spaCy benchmark that is
    compatible with spaCy v. 3.

    Running this requires:
        spacy >= 3.0.0
        spacy-transformers

    ``dacy_model`` is the name passed to ``dacy.load`` and is echoed in the
    printed header. Prints speed figures and the F1 report; returns nothing.
    """
    from spacy.tokens import Doc
    import dacy

    def _bio_tag(tok):
        # 'O' stays bare; any other IOB code is joined with the entity type
        iob = tok.ent_iob_
        return iob if iob == 'O' else iob + "-" + tok.ent_type_

    nlp = dacy.load(dacy_model)
    transformer = nlp.get_pipe('transformer')
    ner_pipe = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for words in sentences_tokens:
        # Build the Doc from the given tokens, then run only the two pipes
        # needed for NER on it.
        doc = Doc(nlp.vocab, words=words)
        doc = transformer(doc)
        doc = ner_pipe(doc)
        predictions.append([_bio_tag(tok) for tok in doc])

    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_daluke_mdl():
    """
    Benchmark the DaLUKE NER model.

    Each token list in the module-level ``sentences_tokens`` is joined with
    single spaces before being handed to ``predict_ner``. Prints speed
    figures and the F1 report; returns nothing.
    """
    from daluke import AutoNERDaLUKE, predict_ner

    model = AutoNERDaLUKE()

    # Model creation and sentence joining both happen before the timer starts.
    joined_sentences = []
    for tokens in sentences_tokens:
        joined_sentences.append(" ".join(tokens))

    start = time.time()
    predictions = predict_ner(joined_sentences, model)

    print('DaLUKE:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_bert_mdl():
    """
    Benchmark the BERT NER model returned by ``load_bert_ner_model``.

    Predicts one sentence at a time over the module-level
    ``sentences_tokens``. Prints speed figures and the F1 report; returns
    nothing.
    """
    model = load_bert_ner_model()

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        # predict returns a pair; only the second element (the predicted
        # entity tags) is used for scoring.
        _, predicted_tags = model.predict(tokens)
        predictions.append(predicted_tags)

    print('BERT:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_xlmr_mdl():
    """
    Benchmark the XLM-R model returned by ``load_xlmr_ned_model``.

    Each entry of the module-level ``sentences`` is paired with the entry of
    ``kgs`` at the same position and passed to ``predict``. Prints speed
    figures and the F1 report against ``gold_tags``; returns nothing.
    """
    model = load_xlmr_ned_model()

    start = time.time()

    # zip stops at the shorter of the two sequences; the assert below checks
    # that every sentence received a prediction.
    predictions = [model.predict(sent, kg) for sent, kg in zip(sentences, kgs)]

    print('XLMR:')
    print_speed_performance(start, num_sentences)

    assert len(predictions) == num_sentences

    print(f1_report(gold_tags, predictions, 'XLM-R', 'DaNED'))
def benchmark_nerda_electra_mdl():
    """
    Benchmark the NERDA ``DA_ELECTRA_DA`` model.

    The network is downloaded and loaded before the timer starts, then all
    of the module-level ``sentences_tokens`` are predicted in one call.
    Prints speed figures and the F1 report; returns nothing.
    """
    model = DA_ELECTRA_DA()
    model.download_network()
    model.load_network()

    start = time.time()
    predictions = model.predict(sentences_tokens)

    print('NERDA DA electra:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_nerda_multi_mdl():
    """
    Benchmark the NERDA ``DA_BERT_ML`` model.

    The network is downloaded and loaded before the timer starts, then all
    of the module-level ``sentences_tokens`` are predicted in one call.
    Prints speed figures and the F1 report; returns nothing.
    """
    model = DA_BERT_ML()
    model.download_network()
    model.load_network()

    start = time.time()
    predictions = model.predict(sentences_tokens)

    print('NERDA multilingual:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_spacy_mdl():
    """
    Benchmark the module-level ``chunker`` on ``sentences_tokens``.

    Prints speed figures and the F1 report against ``chks_true``; returns
    nothing.
    """
    # NOTE(review): another function with this same name appears further
    # down in this view; if both live in one module the later definition
    # replaces this one - confirm which is intended.
    start = time.time()

    chks_pred = [chunker.predict(tokens) for tokens in sentences_tokens]

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    # One prediction list per sentence, and one tag per token overall.
    assert len(chks_pred) == num_sentences
    assert sum(map(len, chks_pred)) == num_tokens

    print(f1_report(chks_true, chks_pred, bio=True))
def benchmark_scandiner_mdl():
    """
    Benchmark the ``saattrupdan/nbailab-base-ner-scandi`` token classifier.

    Sentences from the module-level ``sentences_tokens`` are encoded before
    the timer starts; only the model forward pass and label decoding are
    timed. One label is kept per input word (the one predicted at the
    word's first sub-token). Prints speed figures and the F1 report;
    returns nothing.
    """
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer
    import torch

    model_name = "saattrupdan/nbailab-base-ner-scandi"
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    id2labels = model.config.id2label

    tokenized_sents = []
    for sentence in sentences_tokens:
        # 1 marks the first sub-token of a word, 0 marks its continuations
        tok_mask = []
        for tok in sentence:
            pieces = tokenizer.tokenize(tok)
            tok_mask += [1] + [0] * (len(pieces) - 1)
        inputs = tokenizer.encode(
            sentence, return_tensors="pt", is_split_into_words=True
        )
        # The encoded sequence must be exactly two positions longer than the
        # mask (presumably the leading/trailing special tokens); those two
        # positions are stripped again with [1:-1] below.
        assert len(inputs[0]) == len(tok_mask) + 2
        tokenized_sents.append((inputs, tok_mask))

    start = time.time()

    predictions = []
    # Pure inference: disable gradient tracking so no autograd state is
    # built during the timed forward passes.
    with torch.no_grad():
        for input_ids, mask in tokenized_sents:
            logits = model(input_ids)[0]
            label_ids = torch.argmax(logits, dim=2)
            label_ids = label_ids[0].tolist()[1:-1]
            assert len(label_ids) == len(mask)
            pred_ents = [id2labels[p] for p, m in zip(label_ids, mask) if m]
            assert len(pred_ents) == sum(mask)
            predictions.append(pred_ents)

    print('ScandiNER:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_flair_mdl():
    """
    Benchmark the Flair NER tagger returned by ``load_flair_ner_model``.

    The timed section covers building the Flair sentences from the
    module-level ``sentences_tokens`` as well as the prediction itself.
    Prints speed figures and the F1 report; returns nothing.
    """
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for tokens in sentences_tokens:
        # Add the tokens one by one to an initially empty Sentence
        current = Sentence()
        for text in tokens:
            current.add_token(Token(text))
        flair_sentences.append(current)

    # predict is called for its effect on the sentences; the tags are read
    # back from the tokens afterwards.
    tagger.predict(flair_sentences, verbose=True)

    predictions = []
    for tagged in flair_sentences:
        predictions.append([tok.get_tag('ner').value for tok in tagged])

    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_spacy_mdl():
    """
    Benchmark the NER component of the model returned by ``load_spacy_model``.

    Each token list in the module-level ``sentences_tokens`` is turned into
    a ``Doc`` directly (no re-tokenization) and passed through the NER
    pipe only. Prints speed figures and the F1 report; returns nothing.
    """
    from spacy.tokens import Doc

    nlp = load_spacy_model()
    # get_pipe('ner') replaces the `nlp.entity` shortcut, which is not
    # available in spaCy v3.
    ner = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for tokens in sentences_tokens:
        # Doc(vocab, words=...) is the documented replacement for the
        # deprecated Tokenizer.tokens_from_list.
        doc = Doc(nlp.vocab, words=tokens)
        ner(doc)
        ents = []
        for t in doc:
            if t.ent_iob_ == 'O':
                ents.append(t.ent_iob_)
            else:
                # Non-'O' tags are reported as "<iob>-<entity type>"
                ents.append(t.ent_iob_ + "-" + t.ent_type_)
        predictions.append(ents)

    print('spaCy:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))