def pretrained_tagger(request):
    """Provide a tagger loaded from the model given by ``--tagger``.

    The ``--tagger`` command-line option is read from the pytest config
    attached to ``request``. If the option is unset or empty, the
    requesting test is skipped; otherwise a ``pos.Tagger`` is constructed
    from that model file on the CPU device and returned.
    """
    model_path = request.config.getoption("--tagger")
    if model_path:
        return pos.Tagger(model_file=model_path, device="cpu")
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("No --tagger given")
#!/usr/bin/env python3 import sys import re import rnc import pos sentences = [] sentences.extend(rnc.Reader().read('tmp/media1.xml')) sentences.extend(rnc.Reader().read('tmp/media2.xml')) sentences.extend(rnc.Reader().read('tmp/media3.xml')) re_pos = re.compile('([\w-]+)(?:[^\w-]|$)'.format('|'.join(pos.tagset))) tagger = pos.Tagger() sentence_labels = [] sentence_words = [] for sentence in sentences: labels = [] words = [] for word in sentence: gr = word[1]['gr'] m = re_pos.match(gr) if not m: print(gr, file = sys.stderr) pos = m.group(1) if pos == 'ANUM': pos = 'A-NUM' label = tagger.get_label_id(pos) if not label: