def tagging(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="\t", **flags):
    """Print columnar output of [text UID,] token data and entity tags; one token per line.

    :param dictionaries: iterable of Dictionary instances added to the worker
    :param tokenizer: tokenizer handed to the TextAnalytics worker
    :param pos_tagger: PoS tagger handed to the TextAnalytics worker
    :param ner_tagger: NER tagger added to the worker
    :param input_streams: iterable of line-based streams; each line is
        zero or more UID columns followed by the text, joined by *sep*
    :param sep: column separator for both input parsing and printed output
    :param flags: extra keyword flags forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)
    for d in dictionaries:
        worker.addDictionary(d)
    # renamed loop variable from `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('tagging %s: "%s"', '-'.join(uid), text)
            try:
                _, ner_tokens, dict_tags = worker.analyze(text)
            except RuntimeError:
                # best-effort: log the failing UID and move on to the next line
                logging.exception('at UID %s', sep.join(uid))
                continue
            # one output row per token: UID columns, token columns, one
            # entity tag per extra NER result, then one tag per dictionary
            for idx, token in enumerate(ner_tokens[0]):
                tags = [t[idx].entity for t in ner_tokens[1:]]
                tags.extend(d[idx] for d in dict_tags)
                print("{}{}{}{}{}".format(
                    sep.join(uid), sep if uid else "",
                    sep.join(token), sep if tags else "",
                    sep.join(tags)))
            print("")
def normalize(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="\t", **flags):
    """Print only [text UIDs and] dictionary tags.

    :param dictionaries: iterable of Dictionary instances added to the worker
    :param tokenizer: tokenizer handed to the TextAnalytics worker
    :param pos_tagger: PoS tagger handed to the TextAnalytics worker
    :param ner_tagger: NER tagger added to the worker
    :param input_streams: iterable of line-based streams; each line is
        zero or more UID columns followed by the text, joined by *sep*
    :param sep: column separator for both input parsing and printed output
    :param flags: extra keyword flags forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)
    for d in dictionaries:
        worker.addDictionary(d)
    # renamed loop variable from `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('normalizing %s: "%s"', '-'.join(uid), text)
            try:
                _, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                # best-effort: log the failing UID and move on to the next line
                logging.exception('at UID %s', sep.join(uid))
                continue
            for tags in dict_tags:
                # drop the 2-char tag prefix (presumably IOB "B-"/"I-" —
                # TODO confirm against Dictionary) and de-duplicate per text
                for tag in {tag[2:] for tag in tags if tag != Dictionary.O}:
                    print("{}{}{}".format(sep.join(uid), sep if uid else "", tag))
def align(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="", **flags):
    """Print the aligned dictionary tags below the tokens.

    NOTE(review): an identical ``align`` is defined again immediately below;
    at import time the later definition shadows this one — confirm which
    copy should be kept and remove the other.
    """
    uid = []
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)
    for dictionary in dictionaries:
        worker.addDictionary(dictionary)
    for stream in input_streams:
        for text in stream:
            if sep:
                *uid, text = text.strip().split(sep)
            logging.debug('aligning %s "%s"', sep.join(uid), text)
            try:
                tokens, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                logging.exception('at UID %s', sep.join(uid))
                continue
            # each column is as wide as the longest of the token and its tags
            lens = [max(len(tok), max(len(t) for t in tags))
                    for tok, *tags in zip(tokens, *dict_tags)]
            if sep and uid:
                print(sep.join(uid))
            print(" ".join(("{:<%i}" % l).format(t) for l, t in zip(lens, tokens)))
            for tags in dict_tags:
                print(" ".join(("{:<%i}" % l).format(t) for l, t in zip(lens, tags)))
            print("--")
def align(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="", **flags):
    """Print the aligned dictionary tags below the tokens."""

    def row(widths, cells):
        # left-justify each cell to its column width, space-separated
        return " ".join(("{:<%i}" % w).format(c) for w, c in zip(widths, cells))

    uid = []
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)
    for entry in dictionaries:
        worker.addDictionary(entry)
    for source in input_streams:
        for text in source:
            if sep:
                # a separator means each line carries leading UID columns
                *uid, text = text.strip().split(sep)
            logging.debug('aligning %s "%s"', sep.join(uid), text)
            try:
                tokens, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                logging.exception('at UID %s', sep.join(uid))
                continue
            # column width = longest of the token and all of its tags
            widths = []
            for token, *tags in zip(tokens, *dict_tags):
                widths.append(max(len(token), max(len(tag) for tag in tags)))
            if sep and uid:
                print(sep.join(uid))
            print(row(widths, tokens))
            for tags in dict_tags:
                print(row(widths, tags))
            print("--")