def sent_tokenize(text):
    language = 'english'
    length_limit = 10
    sents = nltk_sent_tokenize(text, language)
    sents_filtered = []
    for s in sents:
        # keep sentences that do not end in a colon and are longer than length_limit
        if s[-1] != ':' and len(s) > length_limit:
            sents_filtered.append(s)
    return sents_filtered
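# Hedged usage sketch for the variant above (assumes NLTK's Punkt model is available;
# the sample text is illustrative, not from the original source):
#
#   sent_tokenize("Short one. This sentence is long enough to pass the length filter.")
#   # -> ['This sentence is long enough to pass the length filter.']
#   # "Short one." is dropped because it is only 10 characters long (not > length_limit)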
def sent_tokenize(self, text): if self.preprocess_type == "nltk": sents = nltk_sent_tokenize(text, self.language) else: sents = gensim_sent_tokenize(text) sents_filtered = [] for s in sents: if s[-1] != ":" and len(s) > self.length_limit: sents_filtered.append(s) # else: # print("REMOVED!!!!" + s) return sents_filtered
def __init__(self, lang):
    # This can be varied; lang == 2 selects English, any other value selects Russian
    self.language = 'english' if lang == 2 else 'russian'
    self.removeStops = True  # if False, self.stopwords stays empty and nothing is removed
    self.puncts = set('.,!?')
    self.default_encodings = ["utf-8", "cp1251"]
    # language dispatch
    self.sent_tokenize = lambda text: nltk_sent_tokenize(text, self.language)
    self.word_tokenize = lambda text: nltk_word_tokenize(text, self.language)
    self.stopwords = set(stopwords.words(self.language)) if self.removeStops else set()
    self.stemmer = RusStemmer() if lang == 1 else EngStemmer()
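# Hedged usage sketch for the constructor above; the enclosing class name is not shown
# in the source, so `Preprocessor` is hypothetical:
#
#   p = Preprocessor(lang=2)            # 2 -> English, 1 -> Russian
#   p.language                          # -> 'english'
#   p.sent_tokenize("One. Two three.")  # -> ['One.', 'Two three.']
#   type(p.stemmer)                     # -> EngStemmer (RusStemmer when lang == 1)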
def sent_tokenize(self, text):
    if self.preprocess_type == 'nltk':
        sents = nltk_sent_tokenize(text, self.language)
    else:
        sents = gensim_sent_tokenize(text)
    sents_filtered = []  # to store the kept sentences
    for s in sents:
        # append the sentence only if it does not end in a colon
        # and meets the minimum character length
        if s[-1] != ':' and len(s) > self.length_limit:
            sents_filtered.append(s)
    return sents_filtered
def sent_tokenize(document, tokenizer='nltk'):
    """
    Tokenize the document into a list of sentences.

    Args:
        document (string): The input string
        tokenizer (string): The tokenizer to use. Default is the NLTK sentence tokenizer.

    Returns:
        List[string]: The sentences from the input string
    """
    if tokenizer == "nltk":
        return nltk_sent_tokenize(document)
    else:
        # re.split expects the pattern first, then the string to split
        return re.split(DOC_SPLIT_REGEX, document)
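# Hedged usage sketch for the variant above. DOC_SPLIT_REGEX is defined elsewhere in
# that project; the value below is a hypothetical blank-line splitter, not from the source:
#
#   DOC_SPLIT_REGEX = r"\n\s*\n"
#   sent_tokenize("Para one.\n\nPara two.", tokenizer="regex")  # -> ['Para one.', 'Para two.']
#   sent_tokenize("One. Two.")                                  # nltk path -> ['One.', 'Two.']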
def sentence_tokenize(text):
    """Tokenize text into sentences."""
    return nltk_sent_tokenize(text)
def sent_tokenize(text, lang="en"): lang = langcodes.Language(lang).language_name().lower() try: return nltk_sent_tokenize(text, language=lang) except (LookupError, KeyError): return nltk_sent_tokenize(text)
from typing import Iterator, Set

from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize as nltk_word_tokenize
from nltk.util import trigrams  # skipgrams(_, n, k); n - degree, k - skip distance
import re

# This can be varied
language = 'english'
#language = 'russian'
removeStops = True  # if False, `stopwords` stays empty and nothing is removed
puncts = set('.,!?')
default_encodings = ["utf-8", "cp1251"]

# language dispatch
sent_tokenize = lambda text: nltk_sent_tokenize(text, language)
word_tokenize = lambda text: nltk_word_tokenize(text, language)
stopwords = set(stopwords.words(language)) if removeStops else set()
if language == 'russian':
    from nltk.stem.snowball import RussianStemmer as Stemmer
else:
    from nltk.stem.snowball import EnglishStemmer as Stemmer

# Remove unnecessary tokens
def remove_sth(seq: Iterator[str], sth: Set[str]) -> Iterator[str]:
    """Generic removal: drop every token that appears in `sth`."""
    return filter(lambda x: x not in sth, seq)

def remove_puncts(seq: Iterator[str]) -> Iterator[str]:
    """Drop punctuation tokens (the body was truncated in the source; delegating to
    remove_sth with `puncts` is the assumed intent)."""
    return remove_sth(seq, puncts)
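# Hedged end-to-end sketch of the module-level pipeline above (English configuration;
# the sample sentence and the exact stems are illustrative):
#
#   stemmer = Stemmer()
#   tokens = word_tokenize("the cats are running quickly.")
#   tokens = remove_puncts(remove_sth(tokens, stopwords))
#   [stemmer.stem(t) for t in tokens]
#   # -> roughly ['cat', 'run', 'quick'] after stopword and punctuation removal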
def sent_tokenize(text, lang='en'):
    lang = langcodes.Language(lang).language_name().lower()
    try:
        return nltk_sent_tokenize(text, language=lang)
    except Exception:
        # avoid a bare except; fall back to the default tokenizer on any lookup failure
        return nltk_sent_tokenize(text)