def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    Benchmark a DaCy POS model on the test sentences.

    An adaption of benchmark_spacy_mdl which is compatible with spaCy v3.
    Running this requires:
        spacy >= 3.0.0
        spacy-transformers

    :param dacy_model: name of the DaCy model to load.
    """
    import dacy
    from spacy.tokens import Doc

    nlp = dacy.load(dacy_model)
    # Run the transformer and tagger components manually on Docs built from
    # the gold tokens, so tokenization differences cannot affect accuracy.
    trf = nlp.get_pipe('transformer')
    tagger = nlp.get_pipe('tagger')

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = tagger(doc)
        tags_pred.append([tok.tag_ for tok in doc])

    # BUG FIX: the original printed '**Spacy model**' here — a copy-paste
    # leftover from benchmark_spacy_mdl; label the output correctly.
    print('**DaCy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
def benchmark_polyglot_mdl(corrected_output=False):
    """
    Benchmark the polyglot POS tagger on the test sentences.

    Running polyglot requires these packages:
        # Morfessor==2.0.6
        # PyICU==2.4.2
        # pycld2==0.41
        # polyglot

    :param corrected_output: if True, map polyglot's output onto the gold
        tag set (CONJ -> CCONJ, auxiliary verbs -> AUX) before scoring.
    """
    def udify_tag(tag, word):
        # Polyglot emits the older "CONJ" tag and does not mark auxiliary
        # verbs; correct both so its output matches the gold annotation.
        if tag == "CONJ":
            return "CCONJ"
        if tag == "VERB" and word in auxiliary_verbs:
            return "AUX"
        return tag

    start = time.time()

    # PERF FIX: the tagger was constructed inside the per-sentence loop.
    # It is loop-invariant — build it once, which also keeps the measured
    # per-sentence tagging time honest.
    tagger = POSTagger(lang='da')

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        word_tag_tuples = list(tagger.annotate(word_list))
        tags_pred.append([
            udify_tag(tag, word) if corrected_output else tag
            for word, tag in word_tag_tuples
        ])

    print('**Polyglot model' + (' (corrected output) ' if corrected_output else '') + '**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
def benchmark_flair_mdl():
    """
    Benchmark the flair POS model on the pre-loaded flair test corpus.
    """
    tagger = load_flair_pos_model()

    start = time.time()

    # Predict in place over the whole test split, then collect the 'upos'
    # tag of every token, sentence by sentence.
    tagger.predict(corpus_flair.test)

    tags_pred = []
    for flair_sentence in corpus_flair.test:
        sentence_tags = [token.tags['upos'].value for token in flair_sentence]
        tags_pred.append(sentence_tags)

    print('**Flair model** ')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
def benchmark_spacy_mdl():
    """
    Benchmark the spaCy POS model on the pre-tokenized test sentences.
    """
    nlp = load_spacy_model()
    tagger = nlp.tagger

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        # Build the Doc from the gold tokens so tokenization differences
        # cannot affect the tagging accuracy.
        doc = nlp.tokenizer.tokens_from_list(sent)
        doc = tagger(doc)
        # IDIOM: comprehension instead of a manual append loop.
        tags_pred.append([tok.pos_ for tok in doc])

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
def benchmark_stanza_mdl():
    """
    Benchmark the Stanza POS pipeline on the pre-tokenized test sentences.
    """
    nlp = stanza.Pipeline('da', processors='tokenize,pos', tokenize_pretokenized=True)

    start = time.time()

    tags_pred = []
    for sent in sentences_tokens:
        # Feed the gold tokens joined with single spaces; assumes Stanza's
        # pretokenized mode splits on whitespace — TODO confirm.
        doc = nlp(" ".join(sent))
        sentence_tags = [token.words[0].upos for token in doc.iter_tokens()]
        tags_pred.append(sentence_tags)

    print('**Stanza model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")