def _tokenize(self, text: str):
    """
    Returns a list of word tokens from the text input with common English
    stop words removed and leading/trailing punctuation stripped.
    """
    text_with_nonalpha_removed = self._non_alphanumeric_chars_regex.sub(repl="", string=text)
    return [
        token.lower().strip(string.punctuation)
        for token in nltk_word_tokenize(text_with_nonalpha_removed)
        if token not in self._stop_words
    ]
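# A standalone sketch of the same pipeline, for reference. The regex and the
# stop-word set below stand in for the attributes the method reads off `self`;
# their exact values are assumptions, not taken from the original class.
import re
import string

from nltk import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords

NON_ALPHANUMERIC_RE = re.compile(r"[^A-Za-z0-9\s]")  # assumed pattern
STOP_WORDS = set(stopwords.words("english"))


def tokenize(text: str):
    cleaned = NON_ALPHANUMERIC_RE.sub(repl="", string=text)
    # Note: the stop-word check runs on the original casing, so "The" would survive.
    return [
        token.lower().strip(string.punctuation)
        for token in nltk_word_tokenize(cleaned)
        if token not in STOP_WORDS
    ]


print(tokenize("the quick, brown fox jumps over the lazy dog"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']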
def word_tokenize(x):
    """Tokenize `x`, restoring straight double quotes that NLTK rewrote as `` or ''."""
    x_tokens = nltk_word_tokenize(x)
    x_tokens_offsets = tokens_offsets(x, x_tokens)
    for i, off in enumerate(x_tokens_offsets):
        # A missing offset while the input contains a double quote means NLTK
        # replaced that quote with `` or ''; map the token back so it can be
        # aligned with character offsets in the original text.
        if off is None and '"' in x and x_tokens[i] in ('``', "''"):
            x_tokens[i] = '"'
    return x_tokens
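# `tokens_offsets` is an external helper that is not shown above. A minimal sketch
# of the behaviour the wrapper relies on (name and signature assumed from the call
# site): return each token's character offset in the text, or None when the token
# cannot be found verbatim, which is what happens when NLTK rewrites a quote.
from typing import List, Optional

from nltk import word_tokenize as nltk_word_tokenize


def tokens_offsets(text: str, tokens: List[str]) -> List[Optional[int]]:
    offsets, search_from = [], 0
    for token in tokens:
        idx = text.find(token, search_from)
        if idx == -1:
            offsets.append(None)           # token was rewritten by the tokenizer
        else:
            offsets.append(idx)
            search_from = idx + len(token)
    return offsets


# Usage: the wrapper above restores the quotes that nltk_word_tokenize rewrote.
text = 'He said "hi" to me'
print(nltk_word_tokenize(text))  # ['He', 'said', '``', 'hi', "''", 'to', 'me']
print(word_tokenize(text))       # ['He', 'said', '"', 'hi', '"', 'to', 'me']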
def __init__(self, lang):
    # Language selection: lang == 2 -> English, anything else -> Russian.
    self.language = 'english' if lang == 2 else 'russian'
    self.removeStops = True  # set to False to keep stop words
    self.puncts = set('.,!?')
    self.default_encodings = ["utf-8", "cp1251"]
    # language dispatch
    self.sent_tokenize = lambda text: nltk_sent_tokenize(text, self.language)
    self.word_tokenize = lambda text: nltk_word_tokenize(text, self.language)
    self.stopwords = set(stopwords.words(self.language)) if self.removeStops else set()
    self.stemmer = RusStemmer() if lang == 1 else EngStemmer()
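# The names this constructor relies on are imported elsewhere in its module; a
# likely import block is sketched below (the alias names come from the code, the
# module paths are standard NLTK). The enclosing class and its name are not shown,
# so the usage at the end is hypothetical.
from nltk import sent_tokenize as nltk_sent_tokenize
from nltk import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer as EngStemmer
from nltk.stem.snowball import RussianStemmer as RusStemmer

# Hypothetical usage, assuming the class is called TextPipeline:
# pipeline = TextPipeline(lang=2)                  # 2 -> English, anything else -> Russian
# pipeline.sent_tokenize("Dogs bark. Cats meow.")  # ['Dogs bark.', 'Cats meow.']
# pipeline.word_tokenize("Dogs bark.")             # ['Dogs', 'bark', '.']
# pipeline.stemmer.stem("barking")                 # 'bark'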
def split_sentence_into_word_tokens(sentence):
    from nltk import word_tokenize as nltk_word_tokenize
    return nltk_word_tokenize(sentence)
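# Usage: NLTK's default word tokenizer is Treebank-style, so punctuation becomes
# its own token and contractions are split.
print(split_sentence_into_word_tokens("Don't stop me now!"))
# ['Do', "n't", 'stop', 'me', 'now', '!']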
from typing import Iterator, Set

from nltk import sent_tokenize as nltk_sent_tokenize
from nltk import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
from nltk.util import trigrams  # skipgrams(_, n, k); n - deg, k - skip dist
import re

# This can be varied
language = 'english'
# language = 'russian'
removeStops = True  # set to False to keep stop words
puncts = set('.,!?')
default_encodings = ["utf-8", "cp1251"]

# language dispatch
sent_tokenize = lambda text: nltk_sent_tokenize(text, language)
word_tokenize = lambda text: nltk_word_tokenize(text, language)
stopwords = set(stopwords.words(language)) if removeStops else set()  # shadows the corpus import from here on
if language == 'russian':
    from nltk.stem.snowball import RussianStemmer as Stemmer
else:
    from nltk.stem.snowball import EnglishStemmer as Stemmer


# Remove unnecessary tokens
def remove_sth(seq: Iterator[str], sth: Set[str]) -> Iterator[str]:
    """Generic function for removal."""
    return filter(lambda x: x not in sth, seq)


def remove_puncts(seq: Iterator[str]) -> Iterator[str]:
    return remove_sth(seq, puncts)
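# Example run of the helpers above (assumes the NLTK 'punkt' and 'stopwords' data
# have been downloaded, e.g. via nltk.download('punkt'); the outputs shown in the
# comments are for the English setting).
tokens = word_tokenize("Cats chase mice, dogs chase cats!")
print(list(remove_puncts(tokens)))
# ['Cats', 'chase', 'mice', 'dogs', 'chase', 'cats']

stemmer = Stemmer()
print([stemmer.stem(t) for t in remove_puncts(tokens)])
# ['cat', 'chase', 'mice', 'dog', 'chase', 'cat']

print(list(trigrams(remove_puncts(tokens))))
# [('Cats', 'chase', 'mice'), ('chase', 'mice', 'dogs'),
#  ('mice', 'dogs', 'chase'), ('dogs', 'chase', 'cats')]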
def word_tokenize(sentence):
    """Tokenize a Danish sentence into words."""
    # Pass the language explicitly; nltk.word_tokenize defaults to English
    # for its sentence-splitting (Punkt) step.
    return nltk_word_tokenize(sentence, language="danish")
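# Usage note: the `language` argument only selects the Punkt sentence-splitting
# model; the word-level splitting itself is the same Treebank-style tokenizer.
# Requires the NLTK 'punkt' data (nltk.download('punkt')).
from nltk import word_tokenize as nltk_word_tokenize

print(word_tokenize("Hej, hvordan går det?"))
# ['Hej', ',', 'hvordan', 'går', 'det', '?']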