Python Trie 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nltk.util

클래스/타입: Trie

hotexamples.com에서의 예제들: 4

Python Trie - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python nltk.util.Trie의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Trie(1)

insert(1)

예제 #1

파일 보기

    def __init__(self, mwes=None, separator='_'):
        """Build the tokenizer from an initial set of multi-word expressions.

        :param mwes: Multi-word expressions to be merged; each expression is
            a sequence of strings. ``None`` or any other falsy value starts
            the tokenizer with no expressions.
        :type mwes: list(list(str))
        :param separator: String inserted between the words of a multi-word
            expression token. (Default is '_')
        :type separator: str

        """
        # A falsy ``mwes`` (including the ``None`` default) is replaced by an
        # empty list before it is handed to the trie.
        self._mwes = Trie(mwes or [])
        self._separator = separator

예제 #2

파일 보기

파일: mwe.py 프로젝트: DrDub/nltk

    def __init__(self, mwes=None, separator='_'):
        """Set up the multi-word tokenizer.

        :param mwes: The multi-word expressions to merge, given as a sequence
            in which every expression is itself a sequence of strings. When
            omitted (or otherwise falsy) the lexicon starts out empty.
        :type mwes: list(list(str))
        :param separator: The string used to glue together the words of a
            multi-word expression token. (Default is '_')
        :type separator: str

        """
        # Normalise "nothing supplied" to an empty list so the trie always
        # receives a sequence.
        initial_expressions = mwes if mwes else []
        self._mwes = Trie(initial_expressions)
        self._separator = separator

예제 #3

파일 보기

class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    """
    def __init__(self, mwes=None, separator='_'):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. 
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes.as_dict() == expected
        True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """Merge known multi-word expressions in already-tokenized text.

        The text is scanned left to right. At each position the longest
        multi-word expression that starts there is merged into one token,
        joined with the separator; if none starts there, the token is copied
        through unchanged.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        A shorter expression is still merged when a longer one sharing its
        prefix fails to complete:

        >>> tokenizer = MWETokenizer([('a', 'b'), ('a', 'b', 'c', 'd')])
        >>> tokenizer.tokenize(['a', 'b', 'c', 'x'])
        ['a_b', 'c', 'x']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            # Walk the trie for as long as the upcoming tokens keep matching,
            # remembering where the longest *complete* expression ended. The
            # walk may overshoot into a longer expression that never
            # completes, so the stopping point alone is not enough.
            trie = self._mwes
            j = i
            last_match = -1
            while j < n and text[j] in trie:
                trie = trie[text[j]]
                j += 1
                if Trie.LEAF in trie:
                    last_match = j

            if last_match > -1:
                # success! merge up to the longest complete expression
                result.append(self._separator.join(text[i:last_match]))
                i = last_match
            else:
                # no expression starts here, so emit the token as-is
                result.append(text[i])
                i += 1

        return result

예제 #4

파일 보기

파일: mwe.py 프로젝트: DrDub/nltk

class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    """

    def __init__(self, mwes=None, separator='_'):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. 
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes.as_dict() == expected
        True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """Merge known multi-word expressions in already-tokenized text.

        The text is scanned left to right. At each position the longest
        multi-word expression that starts there is merged into one token,
        joined with the separator; if none starts there, the token is copied
        through unchanged.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        A shorter expression is still merged when a longer one sharing its
        prefix fails to complete:

        >>> tokenizer = MWETokenizer([('a', 'b'), ('a', 'b', 'c', 'd')])
        >>> tokenizer.tokenize(['a', 'b', 'c', 'x'])
        ['a_b', 'c', 'x']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            # Walk the trie for as long as the upcoming tokens keep matching,
            # remembering where the longest *complete* expression ended. The
            # walk may overshoot into a longer expression that never
            # completes, so the stopping point alone is not enough.
            trie = self._mwes
            j = i
            last_match = -1
            while j < n and text[j] in trie:
                trie = trie[text[j]]
                j += 1
                if Trie.LEAF in trie:
                    last_match = j

            if last_match > -1:
                # success! merge up to the longest complete expression
                result.append(self._separator.join(text[i:last_match]))
                i = last_match
            else:
                # no expression starts here, so emit the token as-is
                result.append(text[i])
                i += 1

        return result