Example #1
0
    def stem(self, corpus):
        """Find the stems of the tokens in the given text.

        The input is tokenized first; each alphabetic token is lowercased
        and stemmed, while every other token is only lowercased.

        Args:
            corpus (str or list of str): Text.

        Returns:
            list: Stems.

        Raises:
            TypeError: If input is not string or a list of strings.

        """
        try:
            tokens = word_tokenize(corpus)
        except TypeError:
            raise TypeError(
                'Stemmer input must be string or a list of strings.')
        return [
            self._word_stem(token.lower()) if token.isalpha()
            else token.lower()
            for token in tokens
        ]
Example #2
0
def ngrams(input_value, n=1):
    """Ngrams function.

    Args:
        input_value (str or list of str): Text or corpus.
        n (int): Number.

    Returns:
        list of tuple: Ngrams.

    Raises:
        TypeError: If input type is not valid.

    Example:
        >>> sent = 'Reason is easy to use'
        >>> from reason.util import ngrams
        >>> ngrams(sent, 4)
        [('Reason', 'is', 'easy', 'to'), ('is', 'easy', 'to', 'use')]

    """
    try:
        tokens = word_tokenize(input_value)
    except TypeError:
        raise TypeError('Input type must be string or list of strings.')

    # One n-gram starts at every position that still has n tokens ahead of
    # it; when n exceeds the token count the range is empty.
    window_starts = range(len(tokens) - n + 1)
    return [tuple(tokens[start:start + n]) for start in window_starts]
Example #3
0
    def __init__(self, data):
        """FreqDist Constructor.

        If the input already consists solely of tuples it is counted
        as-is; otherwise it is tokenized first. Frequencies are stored in
        an internal counter dictionary.

        Args:
            data (str or list of str): Text or corpus.

        Raises:
            TypeError: If data is not valid.

        """
        try:
            items = (
                list(data)
                if all(isinstance(entry, tuple) for entry in data)
                else word_tokenize(data)
            )
        except TypeError:
            raise TypeError(
                'FreqDist input must be string or list of string or tuple.')

        self._counter = Counter(items)
Example #4
0
    def tag(self, corpus):
        """Tagging method.

        Tokenizes string input (token lists are used as-is), then asks
        each tagger in ``self.taggers`` in order for a tag; the first
        non-None answer wins, otherwise the tag is an empty string.

        Args:
            corpus (str or list of str): String text or list of tokens.

        Returns:
            list: Tokens + tags.

        Raises:
            TypeError: If input is not string or a list of strings.

        """
        if isinstance(corpus, str):
            tokens = word_tokenize(corpus)
        elif (isinstance(corpus, list)
                and all(isinstance(token, str) for token in corpus)):
            tokens = corpus
        else:
            raise TypeError(
                'Tagger input must be string or a list of string tokens.')

        token_tags = []
        for token in tokens:
            # The generator is consumed lazily, so taggers after the first
            # successful one are never queried (same as the break loop).
            tag = next(
                (candidate for candidate in
                 (tagger._token_tag(token) for tagger in self.taggers)
                 if candidate is not None),
                '',
            )
            token_tags.append((token, tag))

        return token_tags
Example #5
0
    def _token_tag(self, token):
        # Non-alphanumeric tokens (punctuation, symbols) are returned
        # unchanged, i.e. they serve as their own tag; everything else is
        # delegated to the wrapped tagger.
        if not word_tokenize(token, 'alphanumeric'):
            return token

        return self._tagger.tag(token)[0][1]