示例#1
0
文件: ptb.py 项目: fbuijs/educe
    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return the same document, with PTB tokens appended to it.
        """
        # find the PTB document matching this one, if any
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweak = _tweak_token(ptb_name)
        tweaked = (tweak(idx, tok)
                   for idx, tok in enumerate(tagged_tokens)
                   if not is_empty_category(tok[1]))
        # two passes over the same lazy stream: one to align spans,
        # one to build the final tokens
        stream_a, stream_b = itertools.tee(tweaked)
        spans = generic_token_spans(rst_text, stream_a,
                                    txtfn=lambda t: t.tweaked_word)
        tokens = [_mk_token(tweaked_tok, span)
                  for tweaked_tok, span in izip(stream_b, spans)]

        # store in doc
        doc.tkd_tokens.extend(tokens)

        return doc
示例#2
0
文件: ptb.py 项目: fbuijs/educe
    def _norm(toknum, tagged_token):
        "tweak a token to match RST_DT text"

        word, tag = tagged_token
        # hand-curated per-document substitutions take precedence
        subst_key = (ptb_name, toknum)
        if subst_key in _PTB_SUBSTS:
            prefix, tweak = _PTB_SUBSTS[subst_key]
            return TweakedToken(word, tag, tweak, prefix)
        if is_empty_category(tag) and is_nonword_token(word):
            # empty category / trace: has no surface realisation
            return TweakedToken(word, tag, "")

        # generic normalisations: PTB escape sequences, slashes, stars
        candidate = PTB_TO_TEXT.get(word, word)
        candidate = slash_re.sub('/', candidate)
        candidate = star_re.sub('*', candidate)
        if candidate == word:
            candidate = None
        return TweakedToken(word, tag, candidate)
示例#3
0
    def _norm(toknum, tagged_token):
        "tweak a token to match RST_DT text"

        word, tag = tagged_token
        # explicit (document, token index) override, when one exists;
        # _PTB_SUBSTS values are (prefix, tweak) pairs, never None
        override = _PTB_SUBSTS.get((ptb_name, toknum))
        if override is not None:
            prefix, tweak = override
            return TweakedToken(word, tag, tweak, prefix)
        if is_empty_category(tag) and is_nonword_token(word):
            # empty category with no surface form
            return TweakedToken(word, tag, "")

        # apply the standard PTB-to-text rewrites in one pass
        normed = star_re.sub('*', slash_re.sub('/',
                                               PTB_TO_TEXT.get(word, word)))
        # None signals "no tweak needed" to downstream consumers
        return TweakedToken(word, tag, normed if normed != word else None)
示例#4
0
    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            The same document, enriched with its tokenization.
        """
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            # no PTB counterpart for this document: nothing to do
            return doc

        # NOTE: we cheat and take the text from the RST-DT side
        # (was: rst_text = doc.orig_rsttree.text())
        rst_text = doc.text
        tweaker = _tweak_token(ptb_name)
        # keep only non-empty categories, tweaking each surviving token
        kept = ((idx, tok)
                for idx, tok in enumerate(self.reader.tagged_words(ptb_name))
                if not is_empty_category(tok[1]))
        for_spans, for_tokens = itertools.tee(
            tweaker(idx, tok) for idx, tok in kept)
        spans = generic_token_spans(rst_text,
                                    for_spans,
                                    txtfn=lambda tt: tt.tweaked_word)
        result = [_mk_token(tt, span)
                  for tt, span in izip(for_tokens, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc