def RemoveStopWords(self):
    """Clean the corpus in place: drop punctuation, then drop stop words.

    Delegates the actual work to the private helpers ``_remove_punct`` and
    ``_remove_stop_words``; logs the corpus before and after for debugging.
    """
    LOGGER.debug("text before StopWords processing: {}".format(self.corpus))
    # First pass: punctuation removal.
    without_punct = self._remove_punct()
    self.corpus = without_punct
    # Second pass: stop-word removal operates on the punctuation-free corpus.
    self.corpus = self._remove_stop_words()
    LOGGER.debug("text after StopWords processing: {}".format(self.corpus))
def Stem(self, text):
    """Tokenize *text* and store the Porter-stemmed tokens in ``self.corpus``.

    Note: unlike the other processing methods, this one takes explicit
    *text* rather than reading ``self.corpus`` — TODO confirm that is
    intentional for callers.

    :param text: raw input string to tokenize and stem.
    """
    LOGGER.debug("text before Stem processing: {}".format(text))
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    # vectorizing function to able to call on list of tokens.
    # otypes=[str] is required so np.vectorize can handle an empty token
    # list — without it, size-0 input raises ValueError.
    stem_words = np.vectorize(porter.stem, otypes=[str])
    self.corpus = stem_words(tokens)
    LOGGER.debug("text after Stem processing: {}".format(self.corpus))
def Lemmatize(self):
    """Lemmatize every token currently in ``self.corpus`` with WordNet.

    NOTE(review): a second ``Lemmatize`` (spacy-based) is defined later in
    this class with the same name; that later definition shadows this one,
    so this method is unreachable as written — confirm which is intended.
    """
    LOGGER.debug("text before Lemm processing: {}".format(self.corpus))
    wordnet_lemmatizer = WordNetLemmatizer()
    # vectorizing function to able to call on list of tokens.
    # otypes=[str] lets np.vectorize accept an empty corpus — without it,
    # size-0 input raises ValueError.
    lemmatize_words = np.vectorize(wordnet_lemmatizer.lemmatize, otypes=[str])
    self.corpus = lemmatize_words(self.corpus)
    LOGGER.debug("text after Lemm processing: {}".format(self.corpus))
def Lemmatize(self):
    """
    Spacy lemmatized much better than nltk, one of the examples
    risen -> rise, only spacy handled that.

    Bug fix: the previous version stored the raw spaCy ``Doc`` objects in
    ``self.corpus`` without ever reading ``token.lemma_``, so no
    lemmatization actually happened. Each corpus element is now replaced
    by the space-joined lemmas of its tokens (plain ``str``).
    """
    LOGGER.debug("text before Lemm processing: {}".format(self.corpus))
    # NOTE: loading the model on every call is slow; kept inside the
    # method to avoid changing module-level behavior — consider caching.
    nlp = spacy.load("en_core_web_sm")
    new_corpus = []
    for el in self.corpus:
        doc = nlp(str(el))
        # Extract the actual lemmas (e.g. "risen" -> "rise").
        new_corpus.append(" ".join(token.lemma_ for token in doc))
    self.corpus = new_corpus
    LOGGER.debug("text after Lemm processing: {}".format(self.corpus))