Example #1
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize


def sent_tokenize(text):
    language = 'english'
    length_limit = 10
    sents = nltk_sent_tokenize(text, language)

    # keep sentences that do not end with a colon and are longer than length_limit
    sents_filtered = []
    for s in sents:
        if s[-1] != ':' and len(s) > length_limit:
            sents_filtered.append(s)
    return sents_filtered
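A minimal usage sketch (not part of the original example; it assumes NLTK and its punkt sentence model are installed) to show what the filter drops:

text = ("This is the first sentence. Too short. "
        "Here is another reasonably long sentence. Consider this:")
for sent in sent_tokenize(text):
    print(sent)
# The ten-character fragment "Too short." and the trailing chunk ending in ':'
# are expected to be filtered out, leaving only the two longer sentences.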
Example #2
def sent_tokenize(self, text):
    if self.preprocess_type == "nltk":
        sents = nltk_sent_tokenize(text, self.language)
    else:
        sents = gensim_sent_tokenize(text)
    sents_filtered = []
    for s in sents:
        if s[-1] != ":" and len(s) > self.length_limit:
            sents_filtered.append(s)
        # else:
        #   print("REMOVED!!!!" + s)
    return sents_filtered
Example #3
    def __init__(self, lang):
        # This can be varied
        self.language = 'english'.lower() if lang == 2 else 'russian'.lower()
        self.removeStops = True  # set to False to keep stopwords (self.stopwords is then empty)
        self.puncts = set('.,!?')
        self.default_encodings = ["utf-8", "cp1251"]

        # language dispatch
        self.sent_tokenize = lambda text: nltk_sent_tokenize(text, self.language)
        self.word_tokenize = lambda text: nltk_word_tokenize(text, self.language)
        self.stopwords = set(stopwords.words(self.language)) if self.removeStops else set()
        self.stemmer = RusStemmer() if lang == 1 else EngStemmer()
Example #4
def sent_tokenize(self, text):
    if self.preprocess_type == 'nltk':
        sents = nltk_sent_tokenize(text, self.language)
    else:
        sents = gensim_sent_tokenize(text)
    sents_filtered = []  # to store the sentences
    for s in sents:
        # keep the sentence if it does not end with a colon and has a minimum character length
        if s[-1] != ':' and len(s) > self.length_limit:
            sents_filtered.append(s)
        # else:
        #   print("REMOVED!!!!" + s)
    return sents_filtered
Example #5
import re

from nltk.tokenize import sent_tokenize as nltk_sent_tokenize


def sent_tokenize(document, tokenizer='nltk'):
    """
    Tokenize the document into a list of sentences.

    Args:
        document (string):      The input string
        tokenizer (string):     The tokenizer to use. Default is the nltk sentence tokenizer

    Returns:
        List[string]: The sentences from the input string
    """
    if tokenizer == "nltk":
        return nltk_sent_tokenize(document)
    else:
        # DOC_SPLIT_REGEX is a module-level pattern not shown in this snippet
        return re.split(DOC_SPLIT_REGEX, document)
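A hedged usage sketch for this variant; the regex branch relies on the module-level DOC_SPLIT_REGEX pattern that is not shown in the snippet:

doc = "First sentence here. A second sentence follows it."
print(sent_tokenize(doc))                       # nltk path (default)
# print(sent_tokenize(doc, tokenizer="regex"))  # regex path; only usable once DOC_SPLIT_REGEX is defined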
Example #6
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize


def sentence_tokenize(text):
    """Tokenize text into sentences."""
    return nltk_sent_tokenize(text)
Example #7
import langcodes
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize


def sent_tokenize(text, lang="en"):
    # resolve a language code such as "en" to the name NLTK expects ("english")
    lang = langcodes.Language(lang).language_name().lower()
    try:
        return nltk_sent_tokenize(text, language=lang)
    except (LookupError, KeyError):
        # no punkt model for that language: fall back to the default tokenizer
        return nltk_sent_tokenize(text)
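A brief usage sketch (assuming the langcodes package and the NLTK punkt data are installed): a code with a punkt model resolves to its NLTK language name, while a language without one is expected to fall back to the default tokenizer:

print(sent_tokenize("Erster Satz. Zweiter Satz.", lang="de"))  # "de" resolves to "german"
print(sent_tokenize("One sentence. Another one.", lang="ja"))  # no punkt model for Japanese -> fallback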
Example #8
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize as nltk_word_tokenize
from nltk.util import trigrams  # skipgrams(_, n, k); n - deg, k - skip dist

import re
from typing import Iterator, Set

# This can be varied
language = 'english'.lower()
#language = 'russian'.lower()
removeStops = True  # set to False to keep stopwords (the stopword set below is then empty)
puncts = set('.,!?')
default_encodings = ["utf-8", "cp1251"]

# language dispatch
sent_tokenize = lambda text: nltk_sent_tokenize(text, language)
word_tokenize = lambda text: nltk_word_tokenize(text, language)
stopwords = set(stopwords.words(language)) if removeStops else set()  # rebinds `stopwords` from the corpus module to a plain set
if language == 'russian':
    from nltk.stem.snowball import RussianStemmer as Stemmer
else:
    from nltk.stem.snowball import EnglishStemmer as Stemmer


# Remove unnecessary tokens
def remove_sth(seq: Iterator[str], sth: Set[str]) -> Iterator[str]:
    """ Generic function for removal """
    return filter(lambda x: x not in sth, seq)


def remove_puncts(seq: Iterator[str]) -> Iterator[str]:
    """ Remove punctuation tokens """
    return remove_sth(seq, puncts)
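A short end-to-end sketch (not from the original module; it assumes the NLTK data for the configured language is installed) chaining the pieces above:

stemmer = Stemmer()
for sent in sent_tokenize("The cats were sitting on the mats. They purred!"):
    tokens = remove_puncts(remove_sth(word_tokenize(sent), stopwords))
    print([stemmer.stem(tok) for tok in tokens])
# Roughly: ['the', 'cat', 'sit', 'mat'] then ['they', 'purr'] -- capitalized
# stopwords such as "The" slip through the case-sensitive filter before stemming.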
Example #9
def sent_tokenize(text, lang='en'):
    lang = langcodes.Language(lang).language_name().lower()
    try:
        return nltk_sent_tokenize(text, language=lang)
    except (LookupError, KeyError):
        return nltk_sent_tokenize(text)