Example #1
    def _tokenize(self, text: str):
        """
        Returns a list of word tokens from the text input with common English
        stop words removed and leading/trailing punctuation stripped.
        """
        text_with_nonalpha_removed = self._non_alphanumeric_chars_regex.sub(repl="", string=text)
        # Lowercase before the stop-word membership test so capitalised stop words are removed too.
        return [
            token.lower().strip(string.punctuation)
            for token in nltk_word_tokenize(text_with_nonalpha_removed)
            if token.lower() not in self._stop_words
        ]
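The method above relies on two attributes that the snippet does not define. A minimal sketch of how they might be set up; the class name, the regex pattern, and the stop-word source are all assumptions, not part of the example:

import re
import string  # needed by _tokenize for string.punctuation

from nltk import word_tokenize as nltk_word_tokenize  # used inside _tokenize
from nltk.corpus import stopwords

class SimpleTokenizer:  # placeholder name; the real class is not shown
    # Assumed pattern: drop every character that is not alphanumeric or whitespace.
    _non_alphanumeric_chars_regex = re.compile(r"[^A-Za-z0-9\s]")
    # Assumed stop-word source: NLTK's English list.
    _stop_words = frozenset(stopwords.words("english"))
    # ... the _tokenize method from the example would sit here ...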
Example #2
def word_tokenize(x):
    """Tokenize with NLTK, restoring double-quote characters that NLTK rewrites."""
    x_tokens = nltk_word_tokenize(x)
    x_tokens_offsets = tokens_offsets(x, x_tokens)
    for i, off in enumerate(x_tokens_offsets):
        # NLTK replaces " with `` (opening) and '' (closing); when such a token
        # cannot be aligned back to the source text, map it back to ".
        if off is None and '"' in x and x_tokens[i] in ('``', "''"):
            x_tokens[i] = '"'
    return x_tokens
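The wrapper calls a tokens_offsets helper that is not shown in the example. A rough sketch of what such a helper could look like, purely for illustration; the real implementation may differ:

def tokens_offsets(text, tokens):
    """Return each token's start offset in `text`, or None when it cannot be found."""
    offsets, pos = [], 0
    for tok in tokens:
        idx = text.find(tok, pos)
        if idx == -1:
            # e.g. NLTK rewrites " as `` or '', so those tokens never match literally
            offsets.append(None)
        else:
            offsets.append(idx)
            pos = idx + len(tok)
    return offsets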
Example #3
    def __init__(self, lang):
        # This can be varied: lang == 2 selects English, otherwise Russian
        self.language = 'english' if lang == 2 else 'russian'
        self.removeStops = True  # set to False to keep stop words
        self.puncts = set('.,!?')
        self.default_encodings = ["utf-8", "cp1251"]

        # language dispatch
        self.sent_tokenize = lambda text: nltk_sent_tokenize(text, self.language)
        self.word_tokenize = lambda text: nltk_word_tokenize(text, self.language)
        self.stopwords = set(stopwords.words(self.language)) if self.removeStops else set()
        self.stemmer = RusStemmer() if self.language == 'russian' else EngStemmer()
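The __init__ above uses several module-level names that the snippet does not import. These are the aliases it most likely expects, mirroring Example #5 below; the exact aliasing is an assumption based on the names used:

from nltk import sent_tokenize as nltk_sent_tokenize
from nltk import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer as RusStemmer
from nltk.stem.snowball import EnglishStemmer as EngStemmer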
Example #4
def split_sentence_into_word_tokens(sentence):
    from nltk import word_tokenize as nltk_word_tokenize
    return nltk_word_tokenize(sentence)
Example #5
from typing import Iterator, Set

from nltk.corpus import stopwords
from nltk import sent_tokenize as nltk_sent_tokenize
from nltk import word_tokenize as nltk_word_tokenize
from nltk.util import trigrams  # see also skipgrams(seq, n, k): n = n-gram degree, k = skip distance

import re

# This can be varied
language = 'english'
#language = 'russian'
removeStops = True  # set to False to keep stop words (stopwords below then stays empty)
puncts = set('.,!?')
default_encodings = ["utf-8", "cp1251"]

# language dispatch
sent_tokenize = lambda text: nltk_sent_tokenize(text, language)
word_tokenize = lambda text: nltk_word_tokenize(text, language)
stopwords = set(stopwords.words(language)) if removeStops else set()  # rebinds the imported `stopwords` module name to a plain set
if language == 'russian':
    from nltk.stem.snowball import RussianStemmer as Stemmer
else:
    from nltk.stem.snowball import EnglishStemmer as Stemmer


# Remove unnecessary tokens
def remove_sth(seq: Iterator[str], sth: Set[str]) -> Iterator[str]:
    """ Generic function for removal """
    return filter(lambda x: x not in sth, seq)


def remove_puncts(seq: Iterator[str]) -> Iterator[str]:
    return remove_sth(seq, puncts)
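A short usage sketch for the module above; the sample sentence and variable names are illustrative only, and NLTK's punkt and stopwords data must already be downloaded:

stemmer = Stemmer()
text = "The cats are sitting on the mat, aren't they?"
tokens = (t.lower() for t in word_tokenize(text))  # tokenize and lowercase
tokens = remove_puncts(tokens)                     # drop '.', ',', '!', '?'
tokens = remove_sth(tokens, stopwords)             # drop stop words (empty set if removeStops is False)
stems = [stemmer.stem(t) for t in tokens]          # stem what is left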
Example #6
def word_tokenize(sentence):
    """Tokenize a Danish sentence into words."""
    # Pass the language explicitly so NLTK uses its Danish (punkt) sentence model
    # rather than the English default.
    return nltk_word_tokenize(sentence, language='danish')