def tokenize(self, text: str):
    """
    Tokenize text into a single flat list of word tokens.

    The text is first split into sentences by ``self.sent_tokenizer``;
    every sentence is then tokenized with ``TreebankWordTokenizer`` and
    the per-sentence results are concatenated in order.

    :rtype: list
    :param text: text to be tokenized
    :type text: str
    """
    sentences = self.sent_tokenizer.tokenize(text)
    word_tokenizer = TreebankWordTokenizer()

    # tokenize_sents yields one token sequence per sentence; flatten them.
    tokens = []
    for sentence_tokens in word_tokenizer.tokenize_sents(sentences):
        tokens.extend(sentence_tokens)
    return tokens
def tokenize(self, text: str):
    """
    Return the word tokens of ``text`` as one flat list.

    Sentence boundaries are found with ``self.sent_tokenizer``; each
    sentence is then word-tokenized by a ``TreebankWordTokenizer`` and the
    tokens of all sentences are collected in their original order.

    :rtype: list
    :param text: text to be tokenized
    :type text: str
    """
    sentence_list = self.sent_tokenizer.tokenize(text)
    treebank = TreebankWordTokenizer()
    tokenized_sentences = treebank.tokenize_sents(sentence_list)

    # Collapse the list-of-token-lists into a single list of tokens.
    flat_tokens = []
    for one_sentence in tokenized_sentences:
        for token in one_sentence:
            flat_tokens.append(token)
    return flat_tokens
def tokenize(self, text: str,
             split_enclitics=('ne', 'n', 'que', 've', 'ue', 'st'),
             split_words=()):
    """
    Tokenize text into a single flat list of word tokens.

    Processing order:

    1. If any patterns are in effect, ``text`` is rewritten with
       ``self._replace_patterns``.
    2. The text is split into sentences by ``self.sent_tokenizer``.
    3. If ``split_enclitics`` is truthy, the sentences are passed through
       ``self._split_enclitics``.
    4. Each sentence is tokenized with ``TreebankWordTokenizer`` and the
       per-sentence results are concatenated in order.

    :rtype: list
    :param text: text to be tokenized
    :type text: str
    :param split_enclitics: enclitics handed to ``self._split_enclitics``;
        any falsy value (``None``, ``[]``, ``()``) skips that step
    :type split_enclitics: list or tuple
    :param split_words: patterns handed to ``self._replace_patterns``;
        overridden by ``self._latin_replacements`` whenever that attribute
        is truthy
    :type split_words: list or tuple
    """
    # Defaults are immutable tuples rather than lists so that no state can
    # be shared (and accidentally mutated) between calls.

    # Instance-level replacements take precedence over the argument.
    if self._latin_replacements:
        split_words = self._latin_replacements

    if split_words:
        text = self._replace_patterns(text, split_words)

    sents = self.sent_tokenizer.tokenize(text)

    if split_enclitics:
        # Hand the helper its own list: it keeps receiving a list as before,
        # and neither the default nor the caller's object can be modified.
        sents = self._split_enclitics(sents, list(split_enclitics))

    tokenizer = TreebankWordTokenizer()
    return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]