def stem(self, corpus):
    """Stem finding method.

    Tokenizes the input text, then finds the stem of each token.

    Args:
        corpus (str or list of str): Text.

    Returns:
        list: Stems.

    Raises:
        TypeError: If input is not a string or a list of strings.

    """
    try:
        tokens = word_tokenize(corpus)
    except TypeError:
        raise TypeError(
            'Stemmer input must be string or a list of strings.')
    stems = list()
    for token in tokens:
        if token.isalpha():
            # Alphabetic tokens are lowercased and stemmed.
            stems.append(self._word_stem(token.lower()))
        else:
            # Non-alphabetic tokens (numbers, punctuation) are only lowercased.
            stems.append(token.lower())
    return stems
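
# A minimal usage sketch for the method above. The class name below is
# hypothetical (not taken from this file); it only assumes the stemmer is
# constructed without arguments:
#
#     >>> stemmer = PorterStemmer()  # hypothetical name
#     >>> stemmer.stem('Dogs barked loudly!')
#     ['dog', 'bark', 'loudli', '!']  # illustrative Porter-style output
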
def ngrams(input_value, n=1):
    """Ngrams function.

    Args:
        input_value (str or list of str): Text or corpus.
        n (int): Number of tokens per ngram.

    Returns:
        list of tuple: Ngrams.

    Raises:
        TypeError: If input type is not valid.

    Example:
        >>> sent = 'Reason is easy to use'
        >>> from reason.util import ngrams
        >>> ngrams(sent, 4)
        [('Reason', 'is', 'easy', 'to'), ('is', 'easy', 'to', 'use')]

    """
    try:
        data = word_tokenize(input_value)
    except TypeError:
        raise TypeError('Input type must be string or list of strings.')
    n_grams = list()
    # Slide a window of size n over the token list; each window is one ngram.
    for i in range(len(data) - (n - 1)):
        n_grams.append(tuple(data[i : i + n]))
    return n_grams
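
# Since `word_tokenize` is assumed to accept token lists as well as raw
# strings (the docstring above allows both), pre-tokenized input works too;
# with n=2 the sliding window yields bigrams:
#
#     >>> ngrams(['Reason', 'is', 'easy', 'to', 'use'], 2)
#     [('Reason', 'is'), ('is', 'easy'), ('easy', 'to'), ('to', 'use')]
#
# When n exceeds the number of tokens, the range is empty and an empty
# list is returned.
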
def __init__(self, data):
    """FreqDist constructor.

    Tokenizes the input data and creates a counter dictionary.

    Args:
        data (str or list of str): Text or corpus.

    Raises:
        TypeError: If data is not valid.

    """
    try:
        if all(isinstance(i, tuple) for i in data):
            # Already a sequence of tuples (e.g. ngrams): count them directly.
            items = list(data)
        else:
            # Otherwise tokenize the raw text or corpus first.
            items = word_tokenize(data)
    except TypeError:
        raise TypeError(
            'FreqDist input must be string or list of string or tuple.')
    self._counter = Counter(items)
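
# A short sketch of the two input paths, assuming the class is exposed as
# `FreqDist` and `Counter` is `collections.Counter`:
#
#     >>> FreqDist('hello world hello')._counter
#     Counter({'hello': 2, 'world': 1})
#     >>> FreqDist([('a', 'b'), ('a', 'b')])._counter  # tuples: no tokenizing
#     Counter({('a', 'b'): 2})
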
def tag(self, corpus):
    """Tagging method.

    First tokenizes the input text, then finds tags and returns
    (word, tag) tuples.

    Args:
        corpus (str or list of str): String text or list of tokens.

    Returns:
        list: (token, tag) tuples.

    Raises:
        TypeError: If input is not a string or a list of strings.

    """
    if isinstance(corpus, str):
        tokens = word_tokenize(corpus)
    elif (isinstance(corpus, list)
            and all(isinstance(token, str) for token in corpus)):
        tokens = corpus
    else:
        raise TypeError(
            'Tagger input must be string or a list of string tokens.')
    token_tags = list()
    for token in tokens:
        tag = ''
        # Ask each tagger in order; the first one that returns a tag wins.
        for tagger in self.taggers:
            token_tag = tagger._token_tag(token)
            if token_tag is not None:
                tag = token_tag
                break
        token_tags.append((token, tag))
    return token_tags
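
# Usage sketch; the class name and the exact tags are illustrative only
# (actual output depends on the taggers in `self.taggers`):
#
#     >>> tagger = POSTagger()  # hypothetical name
#     >>> tagger.tag('Reason is easy')
#     [('Reason', 'NN'), ('is', 'VBZ'), ('easy', 'JJ')]
#
# A token that no tagger can handle keeps the empty-string default tag.
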
def _token_tag(self, token):
    # Tag only alphanumeric tokens; anything else (e.g. punctuation)
    # is returned unchanged, so it serves as its own tag.
    if word_tokenize(token, 'alphanumeric'):
        return self._tagger.tag(token)[0][1]
    return token
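
# Design note: returning the raw token when it is not alphanumeric makes
# punctuation act as its own tag (e.g. ',' is tagged ','), while real words
# are delegated to the wrapped tagger. Illustrative behavior, where `t` is
# a hypothetical tagger-wrapper instance:
#
#     >>> t._token_tag('easy')  # alphanumeric -> wrapped tagger's tag
#     'JJ'
#     >>> t._token_tag('!')     # not alphanumeric -> token is its own tag
#     '!'
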