def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Return a tokenized document.
    """
    # get tokens from PTB
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    tagged_tokens = self.reader.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
    tweaked1, tweaked2 = \
        itertools.tee(_tweak_token(ptb_name)(i, tok)
                      for i, tok in enumerate(tagged_tokens)
                      if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    # store in doc
    doc.tkd_tokens.extend(result)
    return doc
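# Why itertools.tee above: the tweaked tokens come from a one-shot
# generator but are needed twice -- once for generic_token_spans to
# compute character spans, once to pair each token with its span
# (izip is Python 2's lazy zip from itertools). A minimal,
# self-contained sketch of that consume-twice pattern; the demo
# function below is hypothetical, not part of this module:
def _demo_tee_pattern():
    import itertools
    gen = (word.upper() for word in ["a", "bb", "ccc"])  # one-shot generator
    first, second = itertools.tee(gen)
    lengths = [len(word) for word in first]   # first consumer
    paired = list(zip(second, lengths))       # second consumer
    assert paired == [('A', 1), ('BB', 2), ('CCC', 3)]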
def _tweak_token(ptb_name):
    """Return a function that normalises a PTB token to match the
    RST-DT text; `ptb_name` is captured by the inner function, which
    matches the call sites above (``_tweak_token(ptb_name)(i, tok)``).
    """
    def _norm(toknum, tagged_token):
        "tweak a token to match RST_DT text"
        word, tag = tagged_token
        if (ptb_name, toknum) in _PTB_SUBSTS:
            prefix, tweak = _PTB_SUBSTS[(ptb_name, toknum)]
            return TweakedToken(word, tag, tweak, prefix)
        elif is_empty_category(tag) and is_nonword_token(word):
            return TweakedToken(word, tag, "")

        tweak = PTB_TO_TEXT.get(word, word)
        tweak = slash_re.sub('/', tweak)
        tweak = star_re.sub('*', tweak)
        tweak = None if tweak == word else tweak
        return TweakedToken(word, tag, tweak)

    return _norm
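# The PTB_TO_TEXT lookup above undoes the PTB's token escapes so that
# tokens can be matched against the raw RST-DT text. An illustrative
# subset of such a mapping (the real table lives elsewhere in this
# module and may be larger; the demo function is hypothetical):
def _demo_ptb_to_text():
    ptb_to_text_demo = {
        "``": '"',     # PTB opening double quote
        "''": '"',     # PTB closing double quote
        "-LRB-": "(",  # PTB escape for a left round bracket
        "-RRB-": ")",  # PTB escape for a right round bracket
    }
    for word in ["``", "-LRB-", "cat"]:
        surface = ptb_to_text_demo.get(word, word)  # unknown words pass through
        print(word, "->", surface)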
def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Parameters
    ----------
    doc: DocumentPlus
        Rich representation of the document.

    Returns
    -------
    doc: DocumentPlus
        Rich representation of the document, with tokenization.
    """
    # get tokens from PTB
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    tagged_tokens = self.reader.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
    tweaked1, tweaked2 = \
        itertools.tee(_tweak_token(ptb_name)(i, tok)
                      for i, tok in enumerate(tagged_tokens)
                      if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    # store in doc
    doc.set_tokens(result)
    return doc
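# generic_token_spans matches each token's surface form (here the
# tweaked_word) against the raw text and yields character spans. A
# self-contained sketch of that left-to-right alignment idea -- an
# illustrative stand-in, not educe's implementation:
def _demo_align_spans(text, token_strings):
    """Yield (start, end) character offsets for each token string."""
    offset = 0
    for tok in token_strings:
        start = text.index(tok, offset)  # ValueError if token not in text
        end = start + len(tok)
        yield (start, end)
        offset = end

# e.g. list(_demo_align_spans("Mr. Smith smiled.",
#                             ["Mr.", "Smith", "smiled", "."]))
# -> [(0, 3), (4, 9), (10, 16), (16, 17)]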