Example #1
def _remove_stop_words(document):
    text_tokens = word_tokenize(document)

    tokens_without_sw = [
        word for word in text_tokens if word not in stopwords.words()
    ]
    rslps = RSLPStemmer()
    radical = []
    for i in tokens_without_sw:
        radical.append(rslps.stem(i))

    filtered_sentence = (" ").join(radical)
    return filtered_sentence
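A minimal usage sketch for Example #1 (the imports, the one-time NLTK downloads, and the sample sentence below are assumptions; the original snippet does not show them):

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

# one-time data downloads: tokenizer models, stopword lists, and the RSLP rules
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('rslp')

# the result is a single space-joined string of RSLP stems with stopwords removed
print(_remove_stop_words("os meninos estavam jogando futebol no parque"))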
Example #2
def _create_frequency_table(text_string) -> dict:

    words = _cts_tokenize(text_string)
    rslps = RSLPStemmer()

    freqTable = dict()
    for word in words:
        word = rslps.stem(word)
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable
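A hedged usage sketch for Example #2; `_cts_tokenize` is not included in the snippet, so a trivial whitespace tokenizer stands in for it here:

from nltk.stem import RSLPStemmer

def _cts_tokenize(text_string):
    # hypothetical stand-in for the project's real tokenizer
    return text_string.lower().split()

# tokens that share the same RSLP radical are counted under a single key
print(_create_frequency_table("casa casas casinha"))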
Example #3
    def stem(string, stemmer="porter", **kwargs):

        if stemmer == "porter":
            impl = PorterStemmer()
        elif stemmer == "lancaster":
            impl = LancasterStemmer()
        elif stemmer == "regex":
            regexp = kwargs['regexp']
            if 'min' in kwargs:
                min = kwargs['min']
            else:
                mins = 0
            impl = RegexpStemmer(regexp=regexp, min=min)
        elif stemmer == "isri":
            impl = ISRIStemmer()
        elif stemmer == "snowball":
            if 'language' in kwargs:
                language = kwargs['language']
            else:
                language = 'english'
            impl = SnowballStemmer(language=language)
        elif stemmer == "rslp":
            impl = RSLPStemmer()
        elif stemmer == "cistem":
            if 'case_insensitive' in kwargs:
                case_insensitive = kwargs['case_insensitive']
            else:
                case_insensitive = False
            impl = Cistem(case_insensitive=case_insensitive)
        else:
            return string

        return impl.stem(string)
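A hedged usage sketch for Example #3; the snippet defines `stem` without `self`, so it is called below as a plain function (an assumption, since the enclosing class is not shown), and only the branches exercised here need their stemmer classes imported:

from nltk.stem import PorterStemmer, RegexpStemmer, SnowballStemmer, RSLPStemmer

print(stem("running"))                                            # Porter stemmer by default
print(stem("correndo", stemmer="rslp"))                           # Portuguese RSLP stemmer
print(stem("corriendo", stemmer="snowball", language="spanish"))  # Snowball with an explicit language
print(stem("walking", stemmer="regex", regexp="ing$"))            # RegexpStemmer stripping a suffix
print(stem("laufend", stemmer="unknown"))                         # unrecognised names return the input unchanged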
Example #4
    def __init__(self,
                 lan='english',
                 norm=None,
                 load_v=None,
                 clean_regex="([aA-zZ]+)",
                 stemmer=RSLPStemmer(),
                 custom_func=[]):
        self.regex = clean_regex
        self.stops = nltk.corpus.stopwords.words(lan)

        if load_v is None:
            if lan == "english":
                self.nlp = spacy.load('en')
            elif lan == "portuguese":
                self.nlp = spacy.load('pt_core_news_sm')
            else:
                raise ("idioma invalido")
        else:
            self.nlp = spacy.load(load_v)
        self.ps = stemmer

        if norm not in (STEM, LEM, None):
            raise ValueError("invalid normalization")
        self.norm = norm
        # default to a fresh list here rather than a shared mutable default argument
        self.custom_func = custom_func if custom_func is not None else []
Example #5
def stemmize_text(texts):
    """Stemmize each token in the list of tokens"""
    # To count the most common for of a root term
    root2frequent = {}
    # Stemming and correct spelling
    stemmer = RSLPStemmer()
    texts_ = []
    for text in texts:
        text_ = []
        for w in text:
            stem = stemmer.stem(w)
            try:
                root2frequent[stem].update({w: 1})
            except KeyError:
                root2frequent[stem] = Counter()
                root2frequent[stem].update({w: 1})
            text_.append(stem)
        texts_.append(text_)
    return texts_, root2frequent
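A hedged usage sketch for Example #5 (the two short token lists are hypothetical inputs):

from collections import Counter
from nltk.stem import RSLPStemmer

texts = [["meninos", "jogando"], ["menino", "joga"]]
stemmed_texts, root2frequent = stemmize_text(texts)
print(stemmed_texts)    # each token replaced by its RSLP stem
print(root2frequent)    # maps each stem to a Counter of the surface forms that produced it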
Example #6
def initialize_bot():

    bot = PyBot(RSLPStemmer())

    bot.train_file('telegrambot/data/train.json')

    bot.register_action('saudacao', commands.saudacao)
    bot.register_action('frase', commands.frase_aleatoria)
    bot.register_action('vccurte', commands.curte)

    return bot
Example #7
    def __init__(self, save_path, commands, verbose, force_training=False):
        self.verbose = verbose
        self.save_path = save_path

        # English
        # self.stemmer = LancasterStemmer()
        # Portuguese
        self.stemmer = RSLPStemmer()

        self.stopwords = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))

        self.commands = commands

        if force_training:
            self.load_corpus()
        else:
            try:
                with open(save_path, "rb") as f:
                    self.words, self.labels, self.training, self.output = pickle.load(f)
            except Exception:
                # fall back to retraining if the cached dataset cannot be loaded
                self.load_corpus()
Example #8
def treatTokens():
    # the global declaration must come before any use of _ALL_WORDS in this scope
    global _ALL_WORDS
    #print(RSLPStemmer().stem('Viajando'))

    print("Depois: ", _ALL_WORDS)
    stopWords = set(stopwords.words(_LANGUAGE))
    allWords = list(_ALL_WORDS)
    for word in allWords:
        if word in stopWords or isPunct(word) or isDigits(word):
            #print("Excluido > ", word)
            _ALL_WORDS.remove(word)
        else:
            l = list(_ALL_WORDS)
            l[l.index(word)] = RSLPStemmer().stem(word)
            _ALL_WORDS = set(l)
    print("Stemmers: ", _ALL_WORDS)
Example #9
        wrds = nltk.word_tokenize(pattern, language='portuguese')
        palavras.extend(wrds)
        sentencas.append(wrds)
        saidas.append(tag)

# Filter out the unwanted words (punctuation and Portuguese stopwords)
stopwords = list(string.punctuation) + \
    nltk.corpus.stopwords.words('portuguese')
filteredWords = []

for palavra in palavras:
    if palavra not in stopwords:
        filteredWords.append(palavra)

# Stemming
stemer = RSLPStemmer()

stemmed_words = [stemer.stem(w.lower()) for w in palavras]
stemmed_words = sorted(list(set(stemmed_words)))

# Build the bag of words
training = []
output = []

outputEmpty = [0 for _ in range(len(intencoes))]

for x, frase in enumerate(sentencas):
    bag = []
    wds = [stemer.stem(k.lower()) for k in frase]
    for w in stemmed_words:
        if w in wds:
            bag.append(1)
        else:
            bag.append(0)
Example #10
    def __init__(self):

        NltkStemmer.__init__(self)
        _RSLPStemmer.__init__(self)
Example #11
 def applyStemmersByList(self, listWords):
     # stem every word in place, reusing a single RSLPStemmer instance
     stemmer = RSLPStemmer()
     for i, word in enumerate(listWords):
         listWords[i] = stemmer.stem(word)
     return listWords
Example #12
 def applyStemmer(self, text):
     tokens = self.applyTokenizer(text)
     # stem every token in place, reusing a single RSLPStemmer instance
     stemmer = RSLPStemmer()
     for i, word in enumerate(tokens):
         tokens[i] = stemmer.stem(word)
     return tokens
Example #13
class BagOfWordsCorpus:

    def __init__(self, save_path, commands, verbose, force_training=False):
        self.verbose = verbose
        self.save_path = save_path

        # English
        # self.stemmer = LancasterStemmer()
        # Portuguese
        self.stemmer = RSLPStemmer()

        self.stopwords = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))

        self.commands = commands

        if force_training:
            self.load_corpus()
        else:
            try:
                with open(save_path, "rb") as f:
                    self.words, self.labels, self.training, self.output = pickle.load(f)
            except Exception:
                # fall back to retraining if the cached dataset cannot be loaded
                self.load_corpus()

    def load_corpus(self):

        words = []
        labels = []
        docs_x = []
        docs_y = []

        # for intent in data["intents"]:
        for key, command in self.commands.items():
            for pattern in command.patterns:

                wrds = nltk.word_tokenize(pattern)
                wrds = [word for word in wrds if word not in self.stopwords]
                wrds = [self.stemmer.stem(w.lower()) for w in wrds]

                words.extend(wrds)
                docs_x.append(wrds)
                docs_y.append(command.tag)

            if command.tag not in labels:
                labels.append(command.tag)

        words = sorted(list(set(words)))
        labels = sorted(labels)

        training = []
        output = []

        out_empty = [0 for _ in range(len(labels))]

        for x, wrds in enumerate(docs_x):
            bag = []

            for w in words:
                if w in wrds:
                    bag.append(1)
                else:
                    bag.append(0)

            output_row = out_empty[:]
            output_row[labels.index(docs_y[x])] = 1

            training.append(bag)
            output.append(output_row)

        training = np.array(training)
        output = np.array(output)

        self.words = words
        self.labels = labels
        self.training = training
        self.output = output

        with open("data/data.pickle", "wb") as f:
            pickle.dump((words, labels, training, output), f)

    def encode(self, sentence):
        bag = [0 for _ in range(len(self.words))]

        wrds = nltk.word_tokenize(sentence)
        wrds = [word for word in wrds if word not in self.stopwords]
        wrds = [self.stemmer.stem(w.lower()) for w in wrds]

        corrected_input = wrds

        # correct the user's input spelling when the input was typed
        # corrected_input = []
        # for userinput_word in s_words:
        #     # spell checking
        #     # userinput_word = reduce_lengthening(userinput_word)
        #     correct_word = spelling.correction(userinput_word)
        #     corrected_input.append(correct_word)

        if self.verbose:
            print("Mensagem do usuario corregida para: {0}".format(corrected_input))

        for se in wrds:
            for i, w in enumerate(self.words):
                if w == se:
                    bag[i] = 1

        return np.array(bag)

    def reduce_lengthening(self, word):
        # collapse runs of three or more repeated characters to exactly two,
        # e.g. "caaaarro" -> "caarro"
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", word)

    def add(self, sentence, tag):
        try:
            # read the dataset (the same four fields saved by load_corpus)
            with open(self.save_path, "rb") as f:
                self.words, self.labels, self.training, self.output = pickle.load(f)

            x = self.encode(sentence)

            # skip the phrase if it is already encoded in the dataset
            if any(np.array_equal(x, row) for row in self.training):
                return

            y = [0 for _ in range(len(self.labels))]
            y[self.labels.index(tag)] = 1

            # append the current phrase and save the dataset back
            self.training = np.vstack([self.training, x])
            self.output = np.vstack([self.output, y])
            with open(self.save_path, "wb") as f:
                pickle.dump((self.words, self.labels, self.training, self.output), f)
        except Exception as e:
            print(e)
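A hedged usage sketch for Example #13; the `commands` objects are not defined in the snippet, so a minimal stand-in exposing the two attributes the class reads (`patterns` and `tag`) is assumed, along with the usual imports and NLTK data downloads:

import pickle
import re
from collections import namedtuple
from string import punctuation

import nltk
import numpy as np
from nltk.stem import RSLPStemmer

# assumes nltk.download('punkt'), nltk.download('stopwords') and nltk.download('rslp') have been run
Command = namedtuple("Command", ["patterns", "tag"])  # hypothetical stand-in

commands = {
    "saudacao": Command(patterns=["oi", "bom dia", "tudo bem"], tag="saudacao"),
    "despedida": Command(patterns=["tchau", "ate logo"], tag="despedida"),
}

# force_training=True rebuilds the corpus instead of reading a cached pickle
corpus = BagOfWordsCorpus("corpus.pickle", commands, verbose=True, force_training=True)
print(corpus.words)              # sorted, stemmed vocabulary
print(corpus.encode("bom dia"))  # binary bag-of-words vector over that vocabulary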
Example #14
 def test_stemmer(self):
     from nltk.stem.rslp import RSLPStemmer
     lemmatizer = RSLPStemmer()  # RSLP is a stemmer (it returns radicals), despite the variable name
     text = 'policia'
     stem = lemmatizer.stem(text)
     assert stem == 'polic'
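For reference, the behaviour the test asserts can be reproduced directly (a standalone sketch; the RSLP rules must be downloaded once with nltk.download('rslp')):

from nltk.stem.rslp import RSLPStemmer

stemmer = RSLPStemmer()
print(stemmer.stem('policia'))  # 'polic', the radical the test expects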
Example #16
 def _get_rslp_stemmer(self):
     return RSLPStemmer()
Example #17
# -*- coding: utf-8 -*-
import os
import nltk
import numpy as np
import matplotlib.pyplot as plt

from nltk.stem.rslp import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import cm as cm

path = "corpus/"
token_dict = dict([])
list_file_name = []

stemmer = RSLPStemmer()

stopwords = set([])
for s in open("stopwords-pt.txt", 'r').readlines():
    stopwords.add(s.strip().lower())


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text, language='portuguese'),
                       stemmer)
Example #18
File: utils.py Project: cyberelfo/casanova
def lema(word):
    # despite the name, RSLPStemmer returns the stem (radical), not a lemma
    lemmatizer = RSLPStemmer()
    return "%s*" % lemmatizer.stem(word)