Exemplo n.º 1
0
def ifd_tag(text):
    """ Tokenize the given text and tag it with the global singleton TnT tagger.

    The tagger is loaded from the pickled model file the first time this
    function is called and is then kept in the module-level _TAGGER.

    Returns a flat list with one entry per sentence, in document order;
    each entry is whatever _TAGGER.tag() returns for that sentence's list
    of token texts. Returns an empty list if no tagger model could be loaded.
    """
    global _TAGGER
    if _TAGGER is None:
        # First use: load the tagger from its pickle
        model_path = "config" + os.sep + "TnT-model.pickle"
        logging.info("Loading TnT model from {0}".format(model_path))
        _TAGGER = TnT.load(model_path)
        if _TAGGER is None:
            # No tagger model - unable to tag
            return []
    token_stream = raw_tokenize(text)

    def xlt(txt):
        """ Return the form of a token text that should be fed to the tagger """
        # An abbreviation enclosed in square brackets loses its brackets;
        # any other text is mapped through _XLT, defaulting to itself
        is_bracketed = txt[0] == '[' and txt[-1] == ']'
        return txt[1:-1] if is_bracketed else _XLT.get(txt, txt)

    # Sentences of all paragraphs end up in one flat list: tokens with
    # empty text are skipped, the rest are translated and tagged per sentence
    return [
        _TAGGER.tag([xlt(tok.txt) for tok in sent if tok.txt])
        for pg in paragraphs(token_stream)
        for _, sent in pg
    ]
Exemplo n.º 2
0
def ifd_tag(text):
    """ Tokenize the given text and tag each sentence with a shared TnT tagger.

    The tagger lives in the module-level _TAGGER and is loaded lazily from
    a pickle file on the first call.

    Returns a list containing the result of _TAGGER.tag() for every
    sentence found in the text. The list is flat: sentences from all
    paragraphs are appended one after another. If the tagger model
    cannot be loaded, an empty list is returned.
    """
    global _TAGGER
    if _TAGGER is None:
        # Lazy initialization: unpickle the model on first use
        model_file = os.path.join("config", "TnT-model.pickle")
        logging.info("Loading TnT model from {0}".format(model_file))
        _TAGGER = TnT.load(model_file)
    if _TAGGER is None:
        # Loading yielded no tagger - unable to tag
        return []

    def xlt(txt):
        """ Translate the token text as required before tagging it """
        if txt[0] != '[' or txt[-1] != ']':
            # Ordinary token: look up a replacement, falling back to the text
            return _XLT.get(txt, txt)
        # Abbreviation enclosed in square brackets: strip the brackets
        return txt[1:-1]

    tagged_sentences = []
    for paragraph in paragraphs(raw_tokenize(text)):
        for _, sentence in paragraph:
            # Collect the translated texts of all tokens that have any text
            words = []
            for token in sentence:
                if token.txt:
                    words.append(xlt(token.txt))
            tagged_sentences.append(_TAGGER.tag(words))

    # One entry per sentence, across all paragraphs
    return tagged_sentences