Example #1
class SRLParser(object):

    def __init__(self):
        self.stemmer = RSLPStemmer()
        self.verbs = {}
        self.verbs[self.stemmer.stem('publicar')] = ['Quem publica?', 'O que publica?', 'Onde publica?', 'Quando publica?', 'Alcance da publicação?', 'Qualidade da publicacao?']
        self.verbs[self.stemmer.stem('registrar')] = ['Quem registra?', 'Quando registra?', 'Onde registra?', 'Qual o meio de registro?', 'O que se registra?']


    def parse_sentence(self, frase):
        tokens = word_tokenize(frase, language='portuguese')
        for t in tokens:
            verb_config = self.verbs.get(self.stemmer.stem(t.lower()), None)
            if verb_config is not None:
                return (t, verb_config, tokens)
        return (None, None, None)

    def generate_question(self, verb, option):
        return option


    def gerar_sentenca(self, sentenca):
        verbo, config, tokens = self.parse_sentence(sentenca)
        if verbo is not None:
            sent = create_sentence(sentenca, verbo)
            sent.questions = []
            for c in config:
                sent.questions.append(create_question(sent.id, tokens, pergunta=self.generate_question(verbo, c), skip_list=[verbo]))
            return sent
        return None
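The dictionary lookup in parse_sentence relies on RSLP mapping inflected verb forms and the infinitive used as a key to the same stem. Below is a minimal, self-contained sketch of that pattern (the question list is shortened and the helper functions create_sentence/create_question from the original are not reproduced); it assumes the NLTK punkt and rslp data packages are installed.

from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize

stemmer = RSLPStemmer()
# Keyed by the stem of the infinitive, mirroring SRLParser.__init__
verbs = {stemmer.stem('registrar'): ['Quem registra?', 'O que se registra?']}

def find_verb(frase):
    # Return the first token whose stem matches a configured verb stem
    for token in word_tokenize(frase, language='portuguese'):
        config = verbs.get(stemmer.stem(token.lower()))
        if config is not None:
            return token, config
    return None, None

print(find_verb('O sistema registra cada ocorrencia automaticamente'))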
Example #2
def Stemming(palavras):
    stemmer = RSLPStemmer()
    palavras_base = []
    for palavra in palavras:
        palavras_base.append(stemmer.stem(palavra))

    return palavras_base
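For context, RSLPStemmer is NLTK's rule-based stemmer for (Brazilian) Portuguese. A minimal sketch of calling it directly, including the one-time data download the snippets on this page assume; the word list is arbitrary:

import nltk
from nltk.stem import RSLPStemmer

nltk.download('rslp')  # one-time download of the RSLP rule files

stemmer = RSLPStemmer()
# Inflected forms of the same verb should collapse to a shared stem
print([stemmer.stem(w) for w in ['cantar', 'cantando', 'cantaria']])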
Example #3
def gera_indice_invertido(docs, base_dir):
    # Utilitários necessários
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    st = RSLPStemmer()
    # ------------------------------------------------

    cont_arquivo = 0
    for file in docs:
        cont_arquivo += 1
        dict_arquivos[cont_arquivo] = file

        caminho_arquivo = os.path.join(base_dir, file)
        with open(caminho_arquivo, 'r') as f:
            txt_arquivo = f.read()

            palavras = tokenizer.tokenize(txt_arquivo)

            palavras = filtra_palavras(palavras)

            radical_palavras = [st.stem(palavra) for palavra in palavras]  # keep only the stem (radical) of each word

            for palavra in radical_palavras:
                if palavra not in dict_indice_invertido:
                    dict_indice_invertido[palavra] = {cont_arquivo: 1}
                elif cont_arquivo not in dict_indice_invertido[palavra]:
                    dict_indice_invertido[palavra][cont_arquivo] = 1
                else:
                    dict_indice_invertido[palavra][cont_arquivo] += 1
Example #4
 def preparingSetToTrain(self, input, N=2000):
     self.all_words = set()
     wordsFreq = {}
     stopWords = stopwords.words('english')
     stemmer = RSLPStemmer()
     for document in input:
         clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',
                             document[1])
         for word in word_tokenize(clean_text, 'english'):
             if word.lower() not in stopWords:
                 stemmed_word = stemmer.stem(word.lower())
                 if stemmed_word in wordsFreq:
                     wordsFreq[stemmed_word] += 1
                 else:
                     wordsFreq[stemmed_word] = 1
     i = 0
     for item in sorted(wordsFreq, key=wordsFreq.get):
         if (i < N):
             self.all_words.add(item)
         i += 1
     t = []
     for document in input:
         clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',
                             document[1])
         aux = {}
         for word in word_tokenize(clean_text, 'portuguese'):
             if word.lower() not in stopWords:
                 stemmed_word = stemmer.stem(word.lower())
                 if stemmed_word in self.all_words:
                     aux[stemmed_word] = True
         for word in self.all_words:
             if word not in aux:
                 aux[word] = False
         t.append((aux, document[0]))
     return t
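The tuples returned above pair a boolean word-presence dictionary with a label, which is the feature format NLTK's classifiers expect. A small illustration of that shape with made-up stems and labels (everything here is hypothetical):

import nltk

# Hypothetical training tuples in the same (feature_dict, label) shape
train_set = [
    ({'produt': True, 'otim': True, 'ruim': False}, 'positivo'),
    ({'produt': True, 'otim': False, 'ruim': True}, 'negativo'),
]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify({'produt': True, 'otim': True, 'ruim': False}))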
Example #5
def normalize_text(text):
    '''
    An example of different ways to normalize a text
    using NLTK tools

    param: text A string with the text to be processed
    '''
    text = text.decode('utf8')
    stemmer = RSLPStemmer()  # Load a stemmer for Brazilian Portuguese
    print(text)
    for sent in sent_tokenize(text):
        # Comparing tokenization approaches
        tokens = wordpunct_tokenize(sent)
        print(sent)
        print('   wordpunct: \t%s' % ' '.join(tokens))

        tokens = word_tokenize(sent)
        print('        word: \t%s' % ' '.join(tokens))

        # Removing stopwords
        tokens = remove_stopwords(tokens)
        print('  -stopwords: \t%s' % ' '.join(tokens))

        # Stemming the remaining words
        tokens = [stemmer.stem(t) for t in tokens]
        print('radicalizado: \t%s' % ' '.join(tokens))

        print('')
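normalize_text depends on a remove_stopwords helper that is not shown; a plausible stand-in (purely an assumption about the original) plus the NLTK downloads the function needs might look like the sketch below. Note that text.decode('utf8') above implies Python 2 byte-string input and can be dropped on Python 3.

import nltk
from nltk.corpus import stopwords

nltk.download('punkt')      # sentence and word tokenizers
nltk.download('stopwords')  # Portuguese stopword list
nltk.download('rslp')       # RSLP stemmer rules

def remove_stopwords(tokens, language='portuguese'):
    # Hypothetical helper: drop Portuguese stopwords from a token list
    stop = set(stopwords.words(language))
    return [t for t in tokens if t.lower() not in stop]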
Example #6
def Stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
        phrase.append(" ")
    return "".join(phrase)
Example #7
def clean_words(words, remove_stopwords=False, language='portuguese'):
    """Stems and removes stopwords from a set of word-level tokens using the RSLPStemmer.

    Args:
        words (list): Tokens to be stemmed.
        remove_stopwords (bool): Whether stopwords should be removed or not.
        language (str): Identifier of stopwords' language.

    Returns:
        List of stemmed tokens.

    """

    # Creates the RSLP stemmer
    stemmer = RSLPStemmer()

    # Checks if stopwords are supposed to be removed
    if remove_stopwords:
        # Gathers the stopwords
        stop_words = stopwords.words(language)

        # Stems and removes the stopwords
        stemmed_words = [
            stemmer.stem(word) for word in words
            if word.lower() not in stop_words
        ]

    # If stopwords are not supposed to be removed
    else:
        # Just stems the words
        stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words
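A short usage sketch of the same pipeline (tokenize, drop stopwords, stem); the sentence is arbitrary and the punkt, stopwords and rslp data packages are assumed to be installed:

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize

words = word_tokenize('As casas antigas foram restauradas', language='portuguese')
stemmer = RSLPStemmer()
stop_words = stopwords.words('portuguese')
print([stemmer.stem(w) for w in words if w.lower() not in stop_words])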
Example #8
def lematizar(text):
    if (text != 'negativo' and text != 'positivo'):
        st = RSLPStemmer()
        w = st.stem(text)
    else:
        w=text 
    return w
Example #9
 def __init__(self):
     self.nlp = spacy.load('pt_core_news_md')
     self.vogais = [
         'a', 'á', 'à', 'â', 'ã', 'e', 'é', 'ê', 'i', 'í', 'î', 'o', 'ó',
         'ô', 'õ', 'u', 'ú', 'û'
     ]
     self.stemmer = RSLPStemmer()
Example #10
def lematizar(text):
    lista_conjuncao=['mas', 'contudo', 'no entanto', 'entretanto', 'porem', 'todavia']
    if (text != 'negativo' and text != 'positivo' and text not in lista_conjuncao):
        st = RSLPStemmer()
        w = st.stem(text)
    else:
        w=text 
    return w
Example #11
 def stemmer(self, processed_text):
     '''Input: processed text
        Output: tokens after stemming
     '''
     st = RSLPStemmer()
     #st = SnowballStemmer("english")
     stemmed_list = set(st.stem(token) for token in processed_text)
     return stemmed_list
Example #12
def get_idword(conn, word):
    result = -1
    stemmer = RSLPStemmer()
    cursor = conn.execute('SELECT idword FROM words WHERE word = %s',
                          stemmer.stem(word))
    if cursor.rowcount > 0:
        result = cursor.fetchone()[0]
    return result
Example #13
def stemming_(text):
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text:
        preprocess = [
            str(stemmer.stem(p)) for p in phrase.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
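Here text is expected to be an iterable of (phrase, emotion) pairs, and stopwords is a plain collection of words rather than the nltk.corpus module. A small sketch of equivalent input and processing, with made-up phrases and labels:

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import RSLPStemmer

stopwords = set(nltk_stopwords.words('portuguese'))  # plain word collection, as used above
base = [('eu adorei o atendimento', 'alegria'),
        ('o produto chegou quebrado', 'raiva')]

stemmer = RSLPStemmer()
stemming = [([stemmer.stem(p) for p in frase.split() if p not in stopwords], emocao)
            for frase, emocao in base]
print(stemming)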
Example #14
def stemming(tokens):

    stemmer = RSLPStemmer()
    pharse = []

    for word in tokens:
        pharse.append(stemmer.stem(word))

    return pharse
Example #15
def word_stemm(word):
    stemmer = RSLPStemmer()

    try:
        word = stemmer.stem(word)
    except Exception:
        # stemming failed; print the offending word and return it unchanged
        print(word)

    return word
Example #16
def stemming(sentence):

	stemmer = RSLPStemmer()
	phrase = []
	for word in sentence:
		word = unicode(word, 'utf-8')
		word = unicodedata.normalize("NFKD", word)
		phrase.append(stemmer.stem(word.lower()))

	return phrase
Example #17
def separaPalavras(texto):
    stop = nltk.corpus.stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile('\\W+')
    lista_palavras = []
    lista = [p for p in splitter.split(texto) if p != '']
    for p in lista:
        if p.lower() not in stop:
            if len(p) > 1:
                lista_palavras.append(stemmer.stem(p).lower())
    return lista_palavras
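One detail worth noting: on Python 3 the \W+ pattern is Unicode-aware by default, so accented Portuguese letters stay inside tokens instead of acting as separators. A quick check, assuming only the standard library and NLTK:

import re
from nltk.stem import RSLPStemmer

splitter = re.compile(r'\W+')
tokens = [p for p in splitter.split('A informação não está disponível') if p != '']
print(tokens)  # accented words such as 'informação' remain whole tokens

stemmer = RSLPStemmer()
print([stemmer.stem(t.lower()) for t in tokens if len(t) > 1])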
Example #18
def getIdPalavra(palavra):
    retorno = -1
    stemmer = RSLPStemmer()
    conexao = pymysql.connect(host=host, user=id_user, passwd=password, db=data_base)
    cursor = conexao.cursor()
    cursor.execute('select idpalavra from palavras where palavra = %s', stemmer.stem(palavra))
    if cursor.rowcount > 0:
        retorno = cursor.fetchone()[0]
    cursor.close()
    conexao.close()
    return retorno
Example #19
def separates_words(text):
    stop_words = stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile(r'\W+')
    list_words = []
    words = [p for p in splitter.split(text) if p != '']
    for word in words:
        if word.lower() not in stop_words:
            if len(word) > 1:
                list_words.append(stemmer.stem(word).lower())
    return list_words
Example #20
 def features(self):
     """Use N gram to extract feature """
     n = 1
     data = self.data.split()
     st = RSLPStemmer()    
     data = [st.stem(word) for word in self.data.split()]
     out = []
     for i  in range(n, len(self.data.split()) - n + 1):
         out.append(data[i - n:i])
         out.append(data[i + 1:i + n])
     return [' '.join(x) for x in out]
Example #21
    def _stem(text):
        """ Convert words to it's stem

        :param text: list of words
        :return: list of stemmed words
        """
        stemmer = RSLPStemmer()
        phrase = []
        for word in text:
            phrase.append(stemmer.stem(word.lower()))
        return phrase
Example #22
def get_doc(folder_name):

    doc_list = get_doc_list(folder_name)
    tokenizer = RegexpTokenizer(r'\w+')
    #en_stop = get_stop_words('en')
    #p_stemmer = PorterStemmer()
    p_stemmer = RSLPStemmer()

    taggeddoc = []

    texts = []
    for index, i in enumerate(doc_list):
        # for tagged doc
        wordslist = []
        tagslist = []

        # clean and tokenize document string
        raw = gensim.utils.to_unicode(i, 'latin1').lower()
        print index, ' - ', raw, '\n'
        tokens = tokenizer.tokenize(raw)
        #print tokens

        # remove stop words from tokens
        #stopped_tokens = [i for i in tokens if not i in en_stop]

        #Remove StopWords
        stopped_tokens = [
            word for word in tokens
            if word not in stopwords.words('portuguese')
        ]
        #print stopped_tokens

        # remove numbers
        number_tokens = [re.sub(r'[\d]', ' ', i) for i in stopped_tokens]
        number_tokens = ' '.join(number_tokens).split()
        #print number_tokens,'\n'

        # stem tokens
        #stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]

        #Stemming
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        print stemmed_tokens, '\n'

        # remove empty
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        # add tokens to list
        texts.append(length_tokens)

        #td = TaggedDocument(gensim.utils.to_unicode(str.encode(' '.join(stemmed_tokens))).split(),str(index))
        td = TaggedDocument(forward_transformer(stemmed_tokens), str(index))
        taggeddoc.append(td)

    return taggeddoc
Example #23
    def __init__(self, db=None):
        if db is None:
            self.db = constantes.ARQ_BANCO

        self.stem_count = dict()
        self.word_count = dict()
        self.unidas_count = dict()
        self.connection = sqlite3.connect(self.db)
        self.cursor = self.connection.cursor()
        self.stemmer = RSLPStemmer()
        self.tokenizer = RegexpTokenizer(r'\w+')
Example #24
def string_steem(text):

    string_steem = []
    stemmer = RSLPStemmer()

    for i in text.split():
        try:
            string_steem.append(
                stemmer.stem(i.lower().translate(remove_punct_dict)))
        except:
            string_steem.append('')
    return ' '.join(i for i in string_steem)
Example #25
def stemming_(text):
    punc = string.punctuation
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text.items():
        nopunc = [str(stemmer.stem(p)) for p in phrase if p not in punc]
        nopunc = ''.join(nopunc)
        preprocess = [
            str(stemmer.stem(p)) for p in nopunc.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
Example #26
def only_stems(keywords):
    st = PorterStemmer()
    os = OrengoStemmer()
    ss = SavoyStemmer()

    rs = RSLPStemmer()
    
    stem1 = [st.getWordStem(x.encode('utf8')) for x in keywords]
    stem2 = [rs.stem(x.encode('utf8')) for x in keywords]
    stem3 = [os.getWordStem(x.encode('utf8')) for x in keywords]
    stem4 = [ss.getWordStem(x.encode('utf8')) for x in keywords]

    return stem1+stem2+stem3+stem4
Example #27
 def preprocess_text(self, text):
     tokens = []
     stemmer = RSLPStemmer()
     for t in text.split():
         # Need a better set of stopwords
         #if t in stopwords.words('portuguese'):
         #continue
         t = unidecode(t)
         t = t.lower()
         t = re.sub(r'\W+', '', t)
         t = stemmer.stem(t)
         tokens.append(t)
     return ' '.join(tokens)
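The same per-token normalization (strip accents with unidecode, lowercase, drop non-word characters, stem) can be sketched on its own; the input string is arbitrary:

import re
from unidecode import unidecode
from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
tokens = []
for t in 'Avaliação péssima, não recomendo'.split():
    t = re.sub(r'\W+', '', unidecode(t).lower())  # remove accents and punctuation, lowercase
    tokens.append(stemmer.stem(t))
print(' '.join(tokens))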
Example #28
def stem_string(string, minimal_length):
    st = RSLPStemmer()
    string = clean_string(string)
    string = string.replace('\n', ' ')
    text = []
    for token in string.split(' '):
        if token != '' and len(token) > minimal_length:
            try:
                text.append(st.stem(clean_word(token)))
            except:
                text.append(clean_word(token).decode('utf8', 'ignore'))

    return ' '.join(text)
Example #29
def search(query, document_base):
    st = RSLPStemmer()

    dict_indice_invertido, dict_arquivos = get_indice_invertido(document_base)

    # Realiza a consulta
    consultas = query.split(
        '|')  # Primeiramente, divide a consulta pelos operadores OR existentes

    conjunto_final = set()
    conjunto = set()
    for consulta in consultas:  # Realiza um consulta separada para cada uma anteriormente dividida
        consulta = consulta.split('&')

        count = 0
        for palavra in consulta:
            palavra = st.stem(palavra.strip())
            if count == 0:
                if '!' in palavra:
                    conjunto = set(dict_arquivos.keys())
                    conjunto = conjunto.difference(
                        dict_indice_invertido[st.stem(
                            palavra.lstrip('!'))].keys())
                else:
                    try:
                        conjunto = set(dict_indice_invertido[palavra].keys())
                    except KeyError:
                        conjunto = set()  # an empty set (not a dict) keeps the later set operations valid
            else:
                if '!' in palavra:
                    conjunto = conjunto.intersection(
                        set(dict_arquivos.keys()).difference(
                            dict_indice_invertido[st.stem(
                                palavra.lstrip('!'))].keys()))
                else:
                    try:
                        conjunto = conjunto.intersection(
                            set(dict_indice_invertido[palavra].keys()))
                    except KeyError:
                        conjunto = set()  # an empty set (not a dict) keeps the later set operations valid
            count += 1

        conjunto_final = conjunto_final.union(conjunto)

    txt_arquivos = ''
    for file in conjunto_final:
        txt_arquivos += dict_arquivos[file] + '\n'

    with open("answer.txt", 'w+') as resposta:
        resposta.write(str(len(conjunto_final)) + '\n' + txt_arquivos)
Example #30
 def stem_report_sents(self, encoded_text_alpha_no_punct_stopword_list):
     decoded_stemmed_list = []
     encoded_stemmed_list = []
     for i in range(len(encoded_text_alpha_no_punct_stopword_list)):
         decoded_stemmed_list.append([])
         encoded_stemmed_list.append([])
     stemmer = RSLPStemmer()
     for c1 in range(len(encoded_text_alpha_no_punct_stopword_list)):
         for c2 in range(len(encoded_text_alpha_no_punct_stopword_list[c1])):
             decoded_stemmed_list[c1].append(stemmer.stem(encoded_text_alpha_no_punct_stopword_list[c1][c2].decode('utf-8')))
     for c1 in range(len(decoded_stemmed_list)):
         for c2 in range(len(decoded_stemmed_list[c1])):
             encoded_stemmed_list[c1].append(decoded_stemmed_list[c1][c2].encode('utf-8'))
     return encoded_stemmed_list
Example #31
 def stem_related_terms(self, uni_encoded_cluster_tokenized_list):
     stemmer = RSLPStemmer()
     decoded_stemmed_cluster = []
     encoded_stemmed_cluster = []
     for i in range(len(uni_encoded_cluster_tokenized_list)):
         decoded_stemmed_cluster.append([])
         encoded_stemmed_cluster.append([])
     for c1 in range(len(uni_encoded_cluster_tokenized_list)):
         for c2 in range(len(uni_encoded_cluster_tokenized_list[c1])):
             decoded_stemmed_cluster[c1].append(stemmer.stem(uni_encoded_cluster_tokenized_list[c1][c2].decode('utf-8')))
     for c1 in range(len(decoded_stemmed_cluster)):
         for c2 in range(len(decoded_stemmed_cluster[c1])):
             encoded_stemmed_cluster[c1].append(decoded_stemmed_cluster[c1][c2].encode('utf-8'))
     return encoded_stemmed_cluster
Example #32
def tokenize(text):
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    text = text.lower()
    text = text.replace(',', '')
    text = text.replace('?', '')
    text = text.replace('!', '')
    text = text.replace('.', '')
    stopwordslist = stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwordslist]
    stems = [stemmer.stem(item) for item in tokens]
    return stems
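A brief self-contained equivalent of the tokenize pipeline above (lowercase, strip punctuation, drop stopwords, stem), assuming the punkt, stopwords and rslp packages are downloaded:

import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

text = 'Gostei muito do produto, chegou rápido!'.lower().replace(',', '').replace('!', '')
stops = stopwords.words('portuguese')
stemmer = RSLPStemmer()
tokens = [t for t in nltk.word_tokenize(text) if t not in stops]
print([stemmer.stem(t) for t in tokens])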
Example #33
 def __init__(self,
              filterStopwords=True,
              stemming=False,
              groupClasses=True):
     self.groupClasses = groupClasses
     if stemming:
         self.stemmer = RSLPStemmer()
     else:
         self.stemmer = lambda: None
         self.stemmer.stem = lambda x: x
     self.stopwords = [
         unicodeToAscii(sw) for sw in stopwords.words('portuguese')
     ] if filterStopwords else []
     super().__init__(preserve_case=False, reduce_len=True)
Example #34
 def preparingToClassify(self, input):
     stopWords = stopwords.words('english')
     stemmer = RSLPStemmer()
     clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',
                         input)
     aux = {}
     for word in word_tokenize(clean_text, 'english'):
         if word.lower() not in stopWords:
             stemmed_word = stemmer.stem(word.lower())
             if stemmed_word in self.all_words:
                 aux[stemmed_word] = True
     for word in self.all_words:
         if word not in aux:
             aux[word] = False
     return aux
Example #35
def stemming(text):
    """
    Receives a list of strings and returns a list containing the stemmed version of them.
   
    # Input:
        - text (list): a list of words (strings).
    
    # Output: 
        - new_text (list): list of string containing stemmed words.
    """
    text = [word for word in text if word != ""]
    stemmer = RSLPStemmer()
    new_text = [stemmer.stem(word) for word in text]

    return new_text
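A usage sketch matching the documented contract (a list of words in, a list of stems out); the input list is arbitrary:

from nltk.stem import RSLPStemmer

palavras = ['meninas', 'correndo', '', 'felizes']
palavras = [w for w in palavras if w != '']   # same empty-string filter as above
stemmer = RSLPStemmer()
print([stemmer.stem(w) for w in palavras])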
Example #36
 def search_file(self, keys, arch):
     for paragraph in arch:
         i = 0
         for k in keys:
             for word in paragraph:
                 if k[0] == word:
                     i += 1
                 stemmer = RSLPStemmer()
                 tok = stemmer.stem(k[0])
                 if re.match(tok, word):
                     i += 1
             if i >= len(keys):
                 return paragraph
         i = 0
     return u"Me desculpe, mas não sei"
Example #37
def nlp_treatment(text_feats, stem=False):
    
    processed = dict()
    
    for key, value in text_feats.items():
        if value != '':
            string_list = word_tokenize(value)
            string_list = [w for w in string_list if w not in stop_pt]
            string_list = list(unique_everseen(string_list))
            if stem:
                stemmer = RSLPStemmer()
                string_list = [stemmer.stem(w) for w in string_list]
            value = " ".join(string_list)
        processed[key] = value
        
    return processed
Example #38
 def compare(self, word, key):
     """
     Metodo que faz comparações para definir se uma palavra é igual, possui o mesmo radical ou se é similar a outra.
     verificar encoding (ueff/) isso está fazendo com que falhe no radical(Na verdade, desde o começo).
     :rtype : object
     """
     if word[0] == key or word[0].lower() == key.lower():
         return 0.5
     else:
         stemmer = RSLPStemmer()
         tok = stemmer.stem(key)
         if re.match(tok, word[0]):
             return 0.3
         syn = self.isSyn(key, word)
         if syn > 0:
             return syn * 0.2
     return 0
Example #39
	def __init__(self):
		self.portugues_stemmer = RSLPStemmer()
		self.tokenizar = WhitespaceTokenizer()
		self.stopwords = stopwords.words('portuguese')
		self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
		self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'),
                 ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'),
                 ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]
Example #40
def stemming(texto):
#############################################################################################################
#Apply the RSLP stemmer to strip affixes from Portuguese words
#(removes affixes from the posInicial and tese texts)

    stemmer = RSLPStemmer()
    st_texto = []
    
#     print texto

    for i in range(len(texto)):
        st_aux = texto[i]
        string_aux = ""
        for sufixo in st_aux:
            string_aux = string_aux + " " + stemmer.stem(sufixo)
        st_texto.append(string_aux)
    
#     print "stemming, st_texto", st_texto
    return st_texto
Example #41
class LimparTexto(object):
	def __init__(self):
		self.portugues_stemmer = RSLPStemmer()
		self.tokenizar = WhitespaceTokenizer()
		self.stopwords = stopwords.words('portuguese')
		self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
		self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'),
                 ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'),
                 ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]

	#Remover acentuação dos textos		
	def removeAccent(self, text):
		para = text
		for (lat, asc) in self.ascii_replace:
			para = para.replace(lat, asc)
		return para

	#Realiza a remoção das stop words que são palavras que não representam significado para o nosso modelo.
	def removerStopWords(self, texto):		
#O decode é necessário se for utilizado o latin-1 no mining
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.stopwords])
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.mais_utilizadas])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas])
		return texto

	#Tokenização das palavras por espaços
	def tokenizarPalavras(self, texto):
		texto = self.tokenizar.tokenize(texto)
		return texto

	#A remoção da pontuação é necessário pois palavras seguidas de pontos difere de palavra iguais sem a pontuação.
	def removerPontuacao(self, texto):
		regex = re.compile('[%s]' % re.escape(string.punctuation))
		texto = regex.sub('',texto)
		return texto
		
		
	#Remoção dos sufixos das palavras
	def removerSufixo(self, para):
		text = ''
		for w in para:
#			text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' '
			text = text + self.portugues_stemmer.stem(w) + ' '
		return text
	
	def removerAcentos(self, texto):
		texto = unicode(texto, 'latin-1')
		para = unidecode.unidecode(texto)
		return para

	def removerCaracteresRepetidos(self, texto):
		texto = re.sub(r'([a-z])\1+', r'\1', texto)
		return texto
Example #42
def preprocessing(corpora):
    stemmer = RSLPStemmer()
    stemmer2 = PorterStemmer()
    stp = stopwords.words('portuguese')
    #stp.append('')
    stp.append('ainda')
    res = []
    for i in range(len(corpora)):
        corpora[i] = str(corpora[i]).lower()
        corpora[i] = corpora[i].translate(None, string.punctuation)
        corpora[i] = corpora[i].decode('utf-8')
        corpora[i] = corpora[i].replace(u'”',u'')
        corpora[i] = corpora[i].replace(u'“',u'')
        corpora[i] = corpora[i].replace(u'–',u'')
        res2 = []
        for t in word_tokenize(corpora[i]):
            if t in stp:
                continue
            if(any(char.isdigit() for char in t)==False):
                res2.append(stemmer2.stem(stemmer.stem(t)))
        res.append(res2)
    return res
Example #43
def identify_question(question):

    st = RSLPStemmer()
    wrds = []

    for token in question.lower().split():
        wrds.append(st.stem(token))

    types = {
                'time': ['quant', 'temp', 'dem', 'esper', 'long', 'pert', 'lev'],
                'location' : ['cad', 'ond', 'est', 'aven', 'rua']
             }

    best_result = None
    last_test = 0

    for key, value in types.iteritems():
        filtered = filter(set(value).__contains__, wrds) 
        if len(filtered) > last_test:
            last_test = len(filtered)
            best_result = key

    return best_result
Example #44
def pre_process(description):
    '''
    pre-processa a descricao
    '''

    # compila regex de caracteres nao-especiais
    vanilla = u'[^\u0041-\u005A \
                  \u0061-\u007A \
                  \u00C0-\u00D6 \
                  \u00D8-\u00F6 \
                  \u00F8-\u00FF \
                  \u0100-\u017F \
                  \u0020]'
    regex = re.compile(vanilla)

    # poe tudo em minusculas
    description = description.encode('utf8').decode('utf8')
    lowercased = description.lower()

    # remove caracteres especiais e numeros
    regexed = regex.sub(' ', lowercased)

    # separa palavras
    tokenized = regexed.split()

    # passa o que esta no plural p/ singular
    st = RSLPStemmer()
    singularized = [st.apply_rule(token, 0) for token in tokenized]

    # remove palavras c/ menos de 2 caracteres
    # e mescla palavras novamente
    remerged = ''
    for word in singularized:
        if len(word) > 1:
            remerged += word + ' '

    return remerged
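pre_process calls st.apply_rule(token, 0), which applies only the first RSLP rule set (plural reduction) rather than the full stem() pipeline, so words are roughly singularized instead of fully stemmed. A small check of that difference (outputs depend on the RSLP rules shipped with NLTK):

from nltk.stem import RSLPStemmer

st = RSLPStemmer()
for token in ['casas', 'flores', 'animais']:
    # apply_rule(word, 0) runs only the plural-reduction rules,
    # while stem() keeps stripping feminine, noun and verb suffixes as well
    print(token, st.apply_rule(token, 0), st.stem(token))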
Example #45
class PreProcessor(object):

    def __init__(self, idf_path=IDF_PATH, use_idf=True):
        self.stemmer = RSLPStemmer()
        self.term_dict, self.freq = self.dict_from_idf(idf_path)
        self.max_freq = float(max(self.freq.values()))
        self.vocab_size = len(self.term_dict)
        self.use_idf = use_idf

    def dict_from_idf(self, idf_path):
        my_dict = Dictionary()
        freq_dict = {}
        with codecs.open(idf_path, mode="rb", encoding="utf8") as in_file:
            for line in in_file:
                splitted_line = line.split(" ")
                stemmed_word = self.stemmer.stem(splitted_line[0])
                frequency = int(splitted_line[1])
                if frequency < 5:
                    break
                else:
                    id_tuple = my_dict.doc2bow([stemmed_word], allow_update=True)
                    word_id = id_tuple[0][0]
                    freq_dict[word_id] = frequency + freq_dict.setdefault(word_id, 0)
        return my_dict, freq_dict

    def idf(self, term_id):
        return math.log(self.max_freq/self.freq[term_id])

    def url_to_bow(self, url):
        print url
        tokenized_doc = http2tokenized_stemmed(url)
        bow_doc = self.term_dict.doc2bow(tokenized_doc)
        new_bow_doc = []
        for i in range(0, len(bow_doc)):
            new_bow_doc.append((bow_doc[i][0], bow_doc[i][1]*self.idf(bow_doc[i][0])))
        if self.use_idf:
            return new_bow_doc
        else:
            return bow_doc

    def corpus_from_urllist(self, url_list, label):
        urls = url_list
        docs_bow = [self.url_to_bow(url) for url in urls]
        labels = [label] * len(urls)
        return NewsCorpus(urls, labels, docs_bow, self.vocab_size)
Example #46
    def __init__(self, corpus_dict, stopword_filename = None, DEFAULT_IDF = 1.5):
        super(TFIDF, self).__init__()
        self.num_docs = 0
        self.term_num_docs = {}  # number of documents in which each term appears
        self.stopwords = set([])
        self.idf_default = DEFAULT_IDF
        self.st = RSLPStemmer()
        # self._tokenizer = PortugueseWordTokenizer()

        if not corpus_dict:
            print "corpus is empty!"
            exit()
        self.num_docs = len(corpus_dict)
        # stem every document in the corpus
        self.corpus = [self.getTokens(doc) for doc in corpus_dict]
        # build the stopword set if a stopword file was provided
        if stopword_filename:
            stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
            self.stopwords = set([line.strip() for line in stopword_file])
Example #47
 def features(self):
     """Stem the word"""
     st = RSLPStemmer()    
     return [st.stem(word) for word in self.data.split()]
Example #48
def clusterArgFinal(idtese):
    #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses
    cursor = connection.cursor()
    cursor2 = connection.cursor()
 
    cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = " + idtese + "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))")
    cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ")
     
    #Variavel e função para tratar tags html e acentos com codificação ISO
    h = HTMLParser.HTMLParser()
     
    #dados retirados da consulta ao banco
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()
     
    #listas para tratar os dados iniciais
    usu = []
    posFinal = []
    dados = []
    tese = []
     
    #lista com dados após a remoção das stopwords
    sw_tese = []
    sw_posFinal = []
    aux_usu = []
 
    #lista com dados após a aplicação de Stemming
    st_posFinal = []
    st_tese = []
       
 
#############################################################################################################    
#Aplicacao de Case Folding
    for d in dadosSql:
        dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
                      re.sub('<[^>]*>', '', h.unescape(d[1])).lower()])
 
    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())
             
 
    #Colocando os textos de posicionamento final em numa lista separada
    for i in dados:
        x = 0
        usu.append(i[x].upper())
        posFinal.append(i[x+1].lower()) #lista com o posicionamento Final
         
 
#############################################################################################################
#Fases de pré-processamento linguistico
# - Remoção de stopwords
# - Troca de caracteres acentuados por caracteres não acentuados
# - Remoção pontuações
    for i in usu:
        aux_usu.append(removeStopWords(i))
 
    for i in tese:
        sw_tese.append(removeStopWords(i))
 
 
    for i in posFinal:
        sw_posFinal.append(removeStopWords(i))
 
#############################################################################################################
#Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa
#retirando afixos dos textos do posFinal e tese
    stemmer = RSLPStemmer()
 
    for i in range(len(sw_posFinal)):
        st_aux = sw_posFinal[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_posFinal.append(string_aux)
 
    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_tese.append(string_aux)
 
#############################################################################################################
#TESTES!!!!!!!
 
 
#############################################################################################################
#retorno da função - usado na views.py para alimentar o template debate.html
#passar parametros que devem ser apresentados na templates debate.html
    return [st_tese, posFinal, sw_tese, aux_usu, st_posFinal]
Example #49
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import sys
from pymongo import Connection
from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()


db = Connection().termos
a = open(sys.argv[1],'r').read().split('\n')
for x in a:
	if x != '':
		x = x.decode('utf-8')
		print db['termos'].insert({'termo':x,'stem':stemmer.stem(x),'tipo':sys.argv[1]})

Example #50
def ferramentaAgrupamento(idtese):
    #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses
    cursor = connection.cursor()
    cursor1 = connection.cursor()
    cursor2 = connection.cursor()
 
    cursor.execute("select distinct `usr`.`primeironome` as `name`, `arg`.`argumento` AS `posicionamentoinicial` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = " + idtese + "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))")
    cursor1.execute("select `usr`.`primeironome` as `NOME`,`arg`.`argumento` AS `ArgINICIAL`,`arg`.`posicionamentoinicial` AS `PosicInicial` from (`argumento` `arg` join `argumentador` `urg` join `usuario` `usr`) where `arg`.`tese_idtese`" +idtese+ " and `arg`.`posicionamentoinicial` is not null and `arg`.`argumentador_idargumentador` = `urg`.`idargumentador` and `urg`.`usuario_idusuario` = `usr`.`idusuario`")
    cursor2.execute("select tese from tese where idtese="+ idtese)
     
    #Variavel e função para tratar tags html e acentos com codificação ISO
    h = HTMLParser.HTMLParser()
     
    #dados retirados da consulta ao banco
    dadosSql = cursor.fetchall()
    allArgumentos = cursor1.fetchall() #todos os argumentos iniciais preenchidos
    textotese = cursor2.fetchall()
     
    #listas para tratar os dados iniciais
    usu = []
    allUsu = []
    posInicial = []
    dados = []
    tese = []
    allArg = []
    allArgIni = []
     
    #lista com dados após a remoção das stopwords
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_allArgIni = []
    allUsuAux = []
 
    #lista com dados após a aplicação de Stemming
    st_posInicial = []
    st_tese = []
    st_allArgIni = []
       
 
#############################################################################################################    
#Aplicacao de Case Folding
    for d in dadosSql:
        dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
                      re.sub('<[^>]*>', '', h.unescape(d[1])).lower()])
 
    for d in allArgumentos:
        allArg.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
                      re.sub('<[^>]*>', '', h.unescape(d[1])).lower()])
         
    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())
             
 
    #Colocando os textos de posicionamento final em numa lista separada
    for i in dados:
        x = 0
        usu.append(i[x].upper())
        posInicial.append(i[x+1].lower()) #lista com o posicionamento Inicial
     
    for i in allArg:
        x = 0
        allUsu.append(i[x].upper())
        allArgIni.append(i[x+1].lower()) #lista com o posicionamento Inicial
     
    print posInicial
    print allArgIni
     
         
 
#############################################################################################################
#Fases de pré-processamento linguistico
# - Remoção de stopwords
# - Troca de caracteres acentuados por caracteres não acentuados
# - Remoção pontuações
    for i in usu:
        aux_usu.append(removeStopWords(i))
 
    for i in tese:
        sw_tese.append(removeStopWords(i))
 
    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))
         
    for i in allArgIni:
        sw_allArgIni.append(removeStopWords(i))
 
#############################################################################################################
#Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa
#retirando afixos dos textos do posInicial e tese
    stemmer = RSLPStemmer()
     
     
    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
          
        st_posInicial.append(string_aux)
  
    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
          
        st_tese.append(string_aux)
     
    #todos os argumentos iniciais
    for i in range(len(sw_allArgIni)):
        st_aux = sw_allArgIni[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
          
        st_allArgIni.append(string_aux)
 
 
#############################################################################################################
# #LSI
#     lsi_posInicial = []
#     for i in range(len(sw_posInicial)):
#         aux = "posIni(%d): %s" %(i, sw_posInicial[i])
#         lsi_posInicial.append(aux)
# 
# 
#     lsi = gensim.models.lsimodel.LsiModel(lsi_posInicial)
# # #     print sw_posInicial
#     lsi.print_topics(10)
 
 
 
 
#############################################################################################################
#retorno da função - usado na views.py para alimentar o template debate.html
#passar parametros que devem ser apresentados na templates debate.html
    return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese]
Example #51
def clusterFinal(idtese):

    #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses
    cursor = connection.cursor()
    cursor2 = connection.cursor()

    cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = " + idtese + "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))")
    cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ")
    
    #Variavel e função para tratar tags html e acentos com codificação ISO
    h = HTMLParser.HTMLParser()
    
    #dados retirados da consulta ao banco
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()
    
    #listas para tratar os dados iniciais
    usu = []
    posInicial = []
    dados = []
    tese = []
    
    #lista com dados pos tagger
    tag_posInicial = []
    tag_comAce_posInicial = []
    
    
    #lista com dados após a remoção das stopwords
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_tagPosInicial = [] #texto marcado e sem stopwords
    sw_tagcomAce_posInicial = [] #texto COM ACENTOS marcado e sem stopwords 


    #lista com dados após a aplicação de Stemming
    st_posInicial = []
    st_tese = []
    st_tagPosInicial = [] #texto marcado, sem stopwords e com stemmer aplicado
    st_tagcomAce_posInicial = [] #texto COM ACENTOS marcado, sem stopwords e com stemmer aplicado
    
#############################################################################################################    
    #LISTA COM OS POSICIONAMENTOS INICIAIS APÓS APLICAÇÃO DA NORMALIZAÇAÕ
    posInicial_Normalizado = []
    normalizacao = []
      

#############################################################################################################    
#Aplicacao de Case Folding

    for d in dadosSql:
        dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
                      re.sub('<[^>]*>', '', h.unescape(d[1])).lower()])

    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())
            

    #Colocando os textos de posicionamento inicial em numa lista separada
    for i in dados:
        x = 0
        usu.append(i[x].upper())
        posInicial.append(i[x+1].lower()) #lista com o posicionamento Inicial com todas as letras em minusculo

#############################################################################################################
### Classificacao das palavras de acordo com sua classe gramatical
### Utilizacao do postagger NLPNET
### http://nilc.icmc.usp.br/nlpnet/index.html#
    
    tagger = nlpnet.POSTagger()
    
    semAce_posInicial = [] #armazena o posInicial apenas sem acentos, sem pontuações, sem endereço web e sem numeros 
    comAce_posInicial = [] #armazena o posInicial apenas COM acentos, sem pontuações, sem endereço web e sem numeros
    
    for i in posInicial:
        semAce_posInicial.append(removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i)))))))
    
    for i in semAce_posInicial:
        tag_posInicial.append(tagger.tag(i))
        
    for i in posInicial:
        comAce_posInicial.append(removePontuacao(removeNum(removeSE(removeEndWeb((i))))))
    
    for i in comAce_posInicial:
        tag_comAce_posInicial.append(tagger.tag(i))
        
 
 #############################################################################################################   
 #APENAS PARA REALIZAR TESTE E COLOCAR NA DISSERTACAO

#     pprint(semAce_posInicial)
#     pprint(comAce_posInicial)
#     exit()

#     tagg_posInicial = []
#     for texto in posInicial:
#         tagg_posInicial.append(tagger.tag(texto))
#     
#     print "posInicial"
#     pprint(posInicial)
#     
#     print "tagg_posInicial"
#     pprint(tagg_posInicial)
    
 #############################################################################################################

#############################################################################################################
### REMOCAO DE STOPWORDS
### Remocao dos termos de acordo com a NLTK
### Remocao dos termos classificados como artigos, verbos, adverbios, etc...
    
    
    for i in usu:
        aux_usu.append(removeStopWords(i))

    for i in tese:
        sw_tese.append(removeStopWords(i))

    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))
        
    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))
    
    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))
    
    
    
####################################################################################################################################
# Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa
# Retirando afixos dos textos do posInicial e tese

    
    stemmer = RSLPStemmer()
 
    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_posInicial.append(string_aux)

    
    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_tese.append(string_aux)
        
    for i in range(len(sw_tagPosInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagPosInicial[i])):
            aux = stemmer.stem(sw_tagPosInicial[i][j][0])
            etiqueta = sw_tagPosInicial[i][j][1]
            termosST = (aux,etiqueta)
            auxST.append(termosST)
        
        st_tagPosInicial.append(auxST)
        
    for i in range(len(sw_tagcomAce_posInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagcomAce_posInicial[i])):
            aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0])
            etiqueta = sw_tagcomAce_posInicial[i][j][1]
            termosST = (aux,etiqueta)
            auxST.append(termosST)
        
        st_tagcomAce_posInicial.append(auxST)


    
####################################################################################################################################
### A NORMALIZACAO DE TERMOS REFERE-SE A TECNICA DE TROCAR PALAVRAS SINONIMAS, OU SEJA, QUE TENHAM SIGNIFICADO                    ##
### SEMELHANTE, POR UM UNICO TERMO REPRESENTATIVO NO CORPUS DE ANALISE. DESSA FORMA, É POSSIVEL AUMENTAR O GRAU                   ##
### DE SIMILARIDADE ENTRE OS TEXTOS ANALISADOS ATRAVES DO USO DE TECNICAS DE ANALISE ESTATISTICAS, COMO SIMILA                    ##
### RIDADE DE COSSENOS OU DISTANCIA EUCLIDIANA.                                                                                   ##
####################################################################################################################################   
### A NORMALIZACAO FOI DESENVOLVIDA COM BASE NOS DADOS DISPONIBILIZADOS PELO PROJETO TEP 2.0 DO NILC/USP                          ##
### http://143.107.183.175:21480/tep2/index.htm                                                                                   ##
###                                                                                                                               ## 
### FORMATO DO ARQUIVO                                                                                                            ##
### NUM1. [Tipo] {termos sinonimos} <NUM2>                                                                                        ##
### 263. [Verbo] {consentir, deixar, permitir} <973>                                                                              ##
### NUM1 = NUMERO DA LINHA DE REFERENCIA PARA TERMO SINONIMO                                                                      ##
### NUM2 = NUMERO DA LINHA DE REFERENCIA PARA TERMO ANTONIMO (SENTIDO OPOSTO)                                                     ##
####################################################################################################################################
    
    #abre o arquivo com as relacoes de sinonimia (termos linhaWordNet) e antonimia (termos contrarios)
    #arquivo apenas com termos classificados como substantivos, adjetivos e verbos 
    base_tep = codecs.open(os.path.join(os.path.dirname(__file__),'../base_tep2/base_tep.txt'), 'r', 'UTF8')
#     dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')
    
    #variavel com conteúdo do arquivo em memoria
    #não imprimir essa variável, MUITO GRANDEE!!!
    wordNet = base_tep.readlines()
    
    #fechar arquivo 
    base_tep.close()
    
####################################################################################################################################
## NORMALIZAÇÃO FEITA COM BASE NOS RADICAIS DE FORMAÇÃO DAS PALAVRAS                                                              ##
## APLICAÇÃO DO RSPL PRIMEIRO PARA DEPOIS BUSCAR NA BASE OS TERMOS SIMILARES                                                      ##
## DENTRO DA BASE_TEP OS TERMOS TAMBÉM FORAM REDUZIDOS AOS SEUS RADICIAIS DE FORMAÇÃO                                             ##
## O DICIONÁRIO ESTÁ COM A REFERÊNCIA PARA A LINHA AONDE ESTÃO OS TERMOS SINÔNIMOS                                                ##
## OS TERMOS SÃO ANALISADOS CONSIDERANDO SUAS ACENTUAÇÕES, PARA APLICAÇÃO CORRETA DO RSLP                                         ##
####################################################################################################################################
    
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()    

    st_WordNetV = [] ##armazena num, tipo, e radical dos sinonimos - APENAS VERBOS
    st_WordNetN = [] ##armazena num, tipo, e radical dos sinonimos - APENAS SUBSTANTIVOS
    st_WordNetA = [] ##armazena num, tipo, e radical dos sinonimos - APENAS ADJETIVOS
    st_WordNetO = [] ##armazena num, tipo, e radical dos sinonimos - APENAS OUTROS
    
    for linhaWordnet in wordNet:
        listaAux = []
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)
        
        
        if tipo[0] == "Substantivo":
            listaAux.append(num)
            listaAux.append(tipo)
            
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetN.append(listaAux)
            
        elif tipo[0] == "Verbo":
            listaAux.append(num)
            listaAux.append(tipo)
            
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetV.append(listaAux)
        
        elif tipo[0] == "Adjetivo":
            listaAux.append(num)
            listaAux.append(tipo)
            
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetA.append(listaAux)
        else:
            listaAux.append(num)
            listaAux.append(tipo)
            
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetO.append(listaAux)
            

 
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type = 'callgrind')
    
####################################################################################################################################
### A ANÁLISE É REALIZADA COM BASE NO TEXTO SEM A EXCLUSÃO DOS ACENTOS                                                            ##
### POIS AO EXCLUÍ-LOS A REDUÇÃO AO RADICAL DE FORMAÇÃO (APLICAÇÃO DO RSLP) É PREJUDICADA                                         ##
### OS TESTES REALIZADOS MOSTRARAM QUE ESSA É UMA MELHOR ABORDAGEM, UMA VEZ QUE NOSSOS TEXTOS SÃO PEQUENOS                        ##
### E PRECISAMOS CHEGAR O MAIS PRÓXIMO POSSÍVEL SEM CONSIDERAR SEUS SENTIDOS E/OU CONTEXTOS                                       ##
####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()    
    
    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV, st_WordNetO, st_tagcomAce_posInicial)
    
###############################################################
# Colocando os textos normalizados numa lista de 1 diemensão
############################################################### 
    stringNorm = ""
    auxNorm = []
    
    for i in range(len(normalizacao)):
        auxNorm = normalizacao[i]
        
        for x in range(len(auxNorm)):           
            stringNorm = stringNorm + " " + auxNorm[x]
        
        posInicial_Normalizado.append(stringNorm)
        stringNorm = ""
    
    
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type = 'callgrind')


####################################################################################################################################

#     print "posInicial"
#     pprint(posInicial)
#     
#     print "comAce_posInicial"
#     pprint(comAce_posInicial)
#     
#     print "tag_comAce_posInicial"
#     pprint(tag_comAce_posInicial)
#         
#     print "sw_tagcomAce_posInicial"
#     pprint(sw_tagcomAce_posInicial)
#     
#     print "st_tagcomAce_posInicial"
#     pprint(st_tagcomAce_posInicial)
    
#     print "posInicial_Normalizado"
#     print len(posInicial_Normalizado)
#     pprint(posInicial_Normalizado)
     
#     exit()
####################################################################################################################################    


    return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado]
Example #52
def stem(term):
    snowball_stemmer = PortugueseStemmer()
    rslp_stemmer = RSLPStemmer()
    print u'[{}] Snowball: {}, RSLP: {}'.format(term,
            snowball_stemmer.stem(term), rslp_stemmer.stem(term))
Example #53
 def __init__(self, idf_path=IDF_PATH, use_idf=True):
     self.stemmer = RSLPStemmer()
     self.term_dict, self.freq = self.dict_from_idf(idf_path)
     self.max_freq = float(max(self.freq.values()))
     self.vocab_size = len(self.term_dict)
     self.use_idf = use_idf
Example #54
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pattern.web import URL, plaintext
from nltk.tokenize import RegexpTokenizer
from nltk.stem import RSLPStemmer

  	

tokenizer = RegexpTokenizer('\w+')
stemmer = RSLPStemmer()

verbos = tokenizer.tokenize(open('verbos','rb').read())

radicais = []
resultados = []

for verbo in verbos:
	radical = stemmer.stem(verbo)
	radicais.append(radical)

for verbo in verbos:
	pg = URL('http://linguistica.insite.com.br/mod_perl/conjugue?verbo='+verbo).download()
	palavras = tokenizer.tokenize(pg)
	for palavra in palavras:
		radical = stemmer.stem(palavra)
		if radical in radicais:
			if palavra not in resultados:
				print palavra.encode('utf-8')
			resultados.append(palavra)
Example #55
class TFIDF(object):
    """docstring for TFIDF"""
    def __init__(self, corpus_dict, stopword_filename = None, DEFAULT_IDF = 1.5):
        super(TFIDF, self).__init__()
        self.num_docs = 0
        self.term_num_docs = {}  # number of documents in which each term appears
        self.stopwords = set([])
        self.idf_default = DEFAULT_IDF
        self.st = RSLPStemmer()
        # self._tokenizer = PortugueseWordTokenizer()

        if not corpus_dict:
            print "corpus is empty!"
            exit()
        self.num_docs = len(corpus_dict)
        # stem every document in the corpus
        self.corpus = [self.getTokens(doc) for doc in corpus_dict]
        # build the stopword set if a stopword file was provided
        if stopword_filename:
            stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
            self.stopwords = set([line.strip() for line in stopword_file])

    def getTokens(self,string):
        # return self._tokenizer.tokenize(string)
        # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", string.lower())
        return [self.st.stem(token) for token in string.split()]

    '''
    Compute the term frequency (tf) of each token over a group of documents
    '''
    def getTf(self,innerIndexes):
        wordFrequence = {}
        wordCount = 0
        for doc in innerIndexes:
            for oneToken in self.corpus[doc]:
                count = wordFrequence.setdefault(oneToken,0) + 1
                wordFrequence[oneToken] = count
                wordCount += 1
        for index,value in wordFrequence.iteritems():
            wordFrequence[index] = float(value)/wordCount
        return wordFrequence

    def getTermDocs(self):
        for oneAricles in self.corpus:
            for word in set(oneAricles):
                articles = self.term_num_docs.get(word,0) + 1
                self.term_num_docs[word] = articles

    '''
    Compute the inverse document frequency (idf) of each term over the whole corpus
    '''
    def getIdf(self):
        self.getTermDocs()
        wordIdf = {}
        for term,value in self.term_num_docs.iteritems():
            if term in self.stopwords:
                wordIdf[term] = 0.0
                continue
            wordIdf[term] =  math.log(float(self.num_docs + 1) / (value + 1))
        return wordIdf

    '''
    Compute the tf-idf score of each term for a given category
    '''
    @staticmethod
    def getTfIdf(tfDict,idfDict):
        resultList = [(key,tfDict[key]*idfDict[key]) for key in tfDict]
        return sorted(resultList,key=lambda x:x[1],reverse=True)
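For reference, getIdf smooths the ratio with +1 on both numerator and denominator before taking the logarithm. A standalone illustration of the same formula with made-up document counts (Python 3):

import math

num_docs = 100                                        # total number of documents
term_num_docs = {'cas': 40, 'registr': 5, 'rar': 1}   # documents containing each stem

for term, df in term_num_docs.items():
    idf = math.log(float(num_docs + 1) / (df + 1))    # same smoothing as TFIDF.getIdf
    print(term, round(idf, 3))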