from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize


def _remove_stop_words(document):
    # Tokenize, drop stopwords, then stem each remaining token with RSLP.
    # Note: stopwords.words() with no language argument returns the
    # stopword lists for every language NLTK ships.
    text_tokens = word_tokenize(document)
    tokens_without_sw = [
        word for word in text_tokens if word not in stopwords.words()
    ]
    rslps = RSLPStemmer()
    radical = [rslps.stem(token) for token in tokens_without_sw]
    filtered_sentence = " ".join(radical)
    return filtered_sentence
from nltk.stem import RSLPStemmer


def _create_frequency_table(text_string) -> dict:
    # _cts_tokenize is defined elsewhere in the source module
    words = _cts_tokenize(text_string)
    rslps = RSLPStemmer()
    freqTable = dict()
    for word in words:
        word = rslps.stem(word)
        freqTable[word] = freqTable.get(word, 0) + 1
    return freqTable
from nltk.stem import (Cistem, ISRIStemmer, LancasterStemmer, PorterStemmer,
                       RegexpStemmer, RSLPStemmer, SnowballStemmer)


def stem(string, stemmer="porter", **kwargs):
    # Dispatch to the requested NLTK stemmer; unknown stemmer names
    # return the input string unchanged.
    if stemmer == "porter":
        impl = PorterStemmer()
    elif stemmer == "lancaster":
        impl = LancasterStemmer()
    elif stemmer == "regex":
        regexp = kwargs['regexp']
        # The original assigned `mins = 0` in the fallback branch,
        # leaving `min` undefined when the kwarg was absent.
        min = kwargs.get('min', 0)
        impl = RegexpStemmer(regexp=regexp, min=min)
    elif stemmer == "isri":
        impl = ISRIStemmer()
    elif stemmer == "snowball":
        language = kwargs.get('language', 'english')
        impl = SnowballStemmer(language=language)
    elif stemmer == "rslp":
        impl = RSLPStemmer()
    elif stemmer == "cistem":
        case_insensitive = kwargs.get('case_insensitive', False)
        impl = Cistem(case_insensitive=case_insensitive)
    else:
        return string
    return impl.stem(string)
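# Usage sketch for the dispatcher above (an illustration, not part of the
# original source); the RSLP branch needs nltk.download('rslp') first.
print(stem("running"))                                     # 'run' (Porter default)
print(stem("running", stemmer="lancaster"))                # 'run'
print(stem("flies", stemmer="regex", regexp="s$", min=3))  # 'flie'
print(stem("amoroso", stemmer="rslp"))                     # Portuguese RSLP stemmer
print(stem("amoroso", stemmer="unknown"))                  # falls through: 'amoroso'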
def __init__(self, lan='english', norm=None, load_v=None,
             clean_regex="([a-zA-Z]+)",  # was "([aA-zZ]+)", which also matches [\]^_`
             stemmer=RSLPStemmer(), custom_func=None):
    self.regex = clean_regex
    self.stops = nltk.corpus.stopwords.words(lan)
    if load_v is None:
        if lan == "english":
            self.nlp = spacy.load('en')  # the 'en' shortcut requires spaCy < 3
        elif lan == "portuguese":
            self.nlp = spacy.load('pt_core_news_sm')
        else:
            # The original raised a bare string, which is a TypeError in Python 3
            raise ValueError("invalid language")
    else:
        self.nlp = spacy.load(load_v)
    self.ps = stemmer
    if norm not in (STEM, LEM, None):
        raise ValueError("invalid normalization")
    self.norm = norm
    # Avoid a mutable default argument: fall back to an empty list here
    self.custom_func = custom_func if custom_func is not None else []
from collections import Counter

from nltk.stem import RSLPStemmer


def stemmize_text(texts):
    """Stemmize each token in the list of tokenized texts."""
    # Count the most common surface form of each root term
    root2frequent = {}
    stemmer = RSLPStemmer()
    texts_ = []
    for text in texts:
        text_ = []
        for w in text:
            stem = stemmer.stem(w)
            if stem not in root2frequent:
                root2frequent[stem] = Counter()
            root2frequent[stem].update({w: 1})
            text_.append(stem)
        texts_.append(text_)
    return texts_, root2frequent
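# Illustrative call (an assumption about typical input, not from the
# original source): `texts` is a list of token lists.
tokens = [["pedras", "pedra"], ["amoroso", "amorosa"]]
stems, root2frequent = stemmize_text(tokens)
# stems might look like [['pedr', 'pedr'], ['amor', 'amor']], with
# root2frequent mapping each stem to a Counter of its surface forms,
# e.g. root2frequent['pedr'] == Counter({'pedras': 1, 'pedra': 1})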
def initialize_bot():
    bot = PyBot(RSLPStemmer())
    bot.train_file('telegrambot/data/train.json')
    bot.register_action('saudacao', commands.saudacao)
    bot.register_action('frase', commands.frase_aleatoria)
    bot.register_action('vccurte', commands.curte)
    return bot
def treatTokens():
    # The global declaration must precede any use of _ALL_WORDS in this scope
    global _ALL_WORDS
    # print(RSLPStemmer().stem('Viajando'))
    print("Before: ", _ALL_WORDS)
    stemmer = RSLPStemmer()
    stopWords = set(stopwords.words(_LANGUAGE))
    allWords = list(_ALL_WORDS)
    for word in allWords:
        if word in stopWords or isPunct(word) or isDigits(word):
            # print("Removed > ", word)
            _ALL_WORDS.remove(word)
        else:
            # Replace the word with its stem, rebuilding the set
            l = list(_ALL_WORDS)
            l[l.index(word)] = stemmer.stem(word)
            _ALL_WORDS = set(l)
    print("Stemmers: ", _ALL_WORDS)
# Inside a loop over (pattern, tag) training pairs:
wrds = nltk.word_tokenize(pattern, language='portuguese')
palavras.extend(wrds)
sentencas.append(wrds)
saidas.append(tag)

# Filter out unwanted tokens (punctuation and Portuguese stopwords)
stopwords = list(string.punctuation) + \
    nltk.corpus.stopwords.words('portuguese')
filteredWords = []
for palavra in palavras:
    if palavra not in stopwords:
        filteredWords.append(palavra)

# Stemming (on the filtered words; the original stemmed `palavras`,
# which silently discarded the filtering step above)
stemer = RSLPStemmer()
stemmed_words = [stemer.stem(w.lower()) for w in filteredWords]
stemmed_words = sorted(list(set(stemmed_words)))

# Build the bag of words
training = []
output = []
outputEmpty = [0 for _ in range(len(intencoes))]
for x, frase in enumerate(sentencas):
    bag = []
    wds = [stemer.stem(k.lower()) for k in frase]
    for w in stemmed_words:
        if w in wds:
            bag.append(1)
        else:
            bag.append(0)
def __init__(self):
    NltkStemmer.__init__(self)
    _RSLPStemmer.__init__(self)
def applyStemmersByList(self, listWords):
    # The original aliased the list (`listWordsCopy = listWords`) and then
    # mutated it while iterating; iterate over a real copy and replace by
    # position instead of by listWords.index(word), which can hit the
    # wrong element once stems collide with later words.
    stemmer = RSLPStemmer()
    for i, word in enumerate(list(listWords)):
        listWords[i] = stemmer.stem(word)
    return listWords
def applyStemmer(self, text):
    tokens = self.applyTokenizer(text)
    # Replace each token with its stem by position; using
    # tokens.index(word) while mutating the list can hit the wrong
    # element when a stem matches a later token.
    stemmer = RSLPStemmer()
    for i, word in enumerate(list(tokens)):
        tokens[i] = stemmer.stem(word)
    return tokens
class BagOfWordsCorpus:

    def __init__(self, save_path, commands, verbose, force_training=False):
        self.verbose = verbose
        self.save_path = save_path
        # English
        # self.stemmer = LancasterStemmer()
        # Portuguese
        self.stemmer = RSLPStemmer()
        self.stopwords = set(nltk.corpus.stopwords.words('portuguese')
                             + list(punctuation))
        self.commands = commands
        if force_training:
            self.load_corpus()
        else:
            try:
                with open(save_path, "rb") as f:
                    self.words, self.labels, self.training, self.output = pickle.load(f)
            except Exception:
                self.load_corpus()

    def load_corpus(self):
        words = []
        labels = []
        docs_x = []
        docs_y = []
        # for intent in data["intents"]:
        for key, command in self.commands.items():
            for pattern in command.patterns:
                wrds = nltk.word_tokenize(pattern)
                wrds = [word for word in wrds if word not in self.stopwords]
                wrds = [self.stemmer.stem(w.lower()) for w in wrds]
                words.extend(wrds)
                docs_x.append(wrds)
                docs_y.append(command.tag)
            if command.tag not in labels:
                labels.append(command.tag)
        words = sorted(list(set(words)))
        labels = sorted(labels)

        training = []
        output = []
        out_empty = [0 for _ in range(len(labels))]
        for x, wrds in enumerate(docs_x):
            bag = []
            for w in words:
                if w in wrds:
                    bag.append(1)
                else:
                    bag.append(0)
            output_row = out_empty[:]
            output_row[labels.index(docs_y[x])] = 1
            training.append(bag)
            output.append(output_row)

        training = np.array(training)
        output = np.array(output)

        self.words = words
        self.labels = labels
        self.training = training
        self.output = output
        # Persist to self.save_path so __init__ can reload it (the
        # original wrote to a hard-coded "data/data.pickle")
        with open(self.save_path, "wb") as f:
            pickle.dump((words, labels, training, output), f)

    def encode(self, sentence):
        bag = [0 for _ in range(len(self.words))]
        wrds = nltk.word_tokenize(sentence)
        wrds = [word for word in wrds if word not in self.stopwords]
        wrds = [self.stemmer.stem(w.lower()) for w in wrds]
        corrected_input = wrds
        # Spell-correct the user input when it was typed, e.g.:
        # corrected_input = []
        # for userinput_word in s_words:
        #     # spell checking
        #     # userinput_word = reduce_lengthening(userinput_word)
        #     correct_word = spelling.correction(userinput_word)
        #     corrected_input.append(correct_word)
        if self.verbose:
            print("User message corrected to: {0}".format(corrected_input))
        for se in wrds:
            for i, w in enumerate(self.words):
                if w == se:
                    bag[i] = 1
        return np.array(bag)

    def reduce_lengthening(self, word):
        # Collapse runs of 3+ identical characters down to 2 ("soooo" -> "soo")
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", word)

    def add(self, sentence, tag):
        try:
            # Read the dataset (the original unpacked only three values
            # here, although four are pickled in load_corpus)
            with open(self.save_path, "rb") as f:
                self.words, self.labels, self.training, self.output = pickle.load(f)
            x = self.encode(sentence)  # encode expects a string, not a list
            # Skip phrases that are already in the dataset
            if any((row == x).all() for row in self.training):
                return
            y = [0 for _ in range(len(self.labels))]
            y[self.labels.index(tag)] = 1
            # training/output are numpy arrays, so stack rather than append
            self.training = np.vstack([self.training, x])
            self.output = np.vstack([self.output, y])
            # Add the current phrase to the dataset
            with open(self.save_path, "wb") as f:
                pickle.dump((self.words, self.labels, self.training, self.output), f)
        except Exception as e:
            print(e)
def test_stemmer(self):
    from nltk.stem.rslp import RSLPStemmer
    stemmer = RSLPStemmer()  # RSLP is a stemmer, not a lemmatizer
    text = 'policia'
    stem = stemmer.stem(text)
    assert stem == 'polic'
def _get_rslp_stemmer(self):
    return RSLPStemmer()
# -*- coding: utf-8 -*-
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
from matplotlib import cm as cm
from nltk.stem.rslp import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

path = "corpus/"
token_dict = dict()
list_file_name = []
stemmer = RSLPStemmer()

# Load the Portuguese stopword list, one word per line
stopwords = set()
with open("stopwords-pt.txt", 'r') as f:
    for s in f.readlines():
        stopwords.add(s.strip().lower())


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text, language='portuguese'),
                       stemmer)
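# A minimal sketch (my assumption; the original file is truncated here) of
# how `normalize` typically plugs into the rest of this pipeline:
# TfidfVectorizer accepts a callable tokenizer, and cosine_similarity
# compares the resulting document vectors.
vectorizer = TfidfVectorizer(tokenizer=normalize)
tfidf_matrix = vectorizer.fit_transform(token_dict.values())
similarity = cosine_similarity(tfidf_matrix)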
def lema(word):
    # Despite the name, RSLP performs stemming rather than lemmatization
    stemmer = RSLPStemmer()
    return "%s*" % stemmer.stem(word)