class SRLParser(object):

    def __init__(self):
        self.stemmer = RSLPStemmer()
        self.verbs = {}
        self.verbs[self.stemmer.stem('publicar')] = [
            'Quem publica?', 'O que publica?', 'Onde publica?',
            'Quando publica?', 'Alcance da publicação?',
            'Qualidade da publicacao?'
        ]
        self.verbs[self.stemmer.stem('registrar')] = [
            'Quem registra?', 'Quando registra?', 'Onde registra?',
            'Qual o meio de registro?', 'O que se registra?'
        ]

    def parse_sentence(self, frase):
        tokens = word_tokenize(frase, language='portuguese')
        for t in tokens:
            verb_config = self.verbs.get(self.stemmer.stem(t.lower()), None)
            if verb_config is not None:
                return (t, verb_config, tokens)
        return (None, None, None)

    def generate_question(self, verb, option):
        return option

    def gerar_sentenca(self, sentenca):
        verbo, config, tokens = self.parse_sentence(sentenca)
        if verbo is not None:
            sent = create_sentence(sentenca, verbo)
            sent.questions = []
            for c in config:
                sent.questions.append(
                    create_question(sent.id, tokens,
                                    pergunta=self.generate_question(verbo, c),
                                    skip_list=[verbo]))
            return sent
        return None
def Stemming(palavras):
    stemmer = RSLPStemmer()
    palavras_base = []
    for palavra in palavras:
        palavras_base.append(stemmer.stem(palavra))
    return palavras_base
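The snippets in this section assume the NLTK data used by RSLPStemmer, the stopword lists, and the tokenizers are already installed. A minimal setup sketch (standard NLTK download calls, needed once per environment):

import nltk

nltk.download('rslp')       # stemming rules used by RSLPStemmer
nltk.download('stopwords')  # stopword lists (e.g. 'portuguese')
nltk.download('punkt')      # models behind word_tokenize / sent_tokenize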
def gera_indice_invertido(docs, base_dir):
    # Required utilities
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    st = RSLPStemmer()
    # ------------------------------------------------
    cont_arquivo = 0
    for file in docs:
        cont_arquivo += 1
        dict_arquivos[cont_arquivo] = file
        caminho_arquivo = os.path.join(base_dir, file)
        with open(caminho_arquivo, 'r') as f:
            txt_arquivo = f.read()
            palavras = tokenizer.tokenize(txt_arquivo)
            palavras = filtra_palavras(palavras)
            # Keep only the stem of each word
            radical_palavras = [st.stem(palavra) for palavra in palavras]
            for palavra in radical_palavras:
                if palavra not in dict_indice_invertido:
                    dict_indice_invertido[palavra] = {cont_arquivo: 1}
                elif cont_arquivo not in dict_indice_invertido[palavra]:
                    dict_indice_invertido[palavra][cont_arquivo] = 1
                else:
                    dict_indice_invertido[palavra][cont_arquivo] += 1
def preparingSetToTrain(self, input, N=2000):
    self.all_words = set()
    wordsFreq = {}
    stopWords = stopwords.words('english')
    stemmer = RSLPStemmer()
    for document in input:
        clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', document[1])
        for word in word_tokenize(clean_text, 'english'):
            if word.lower() not in stopWords:
                stemmed_word = stemmer.stem(word.lower())
                if stemmed_word in wordsFreq:
                    wordsFreq[stemmed_word] += 1
                else:
                    wordsFreq[stemmed_word] = 1
    # Keep the N most frequent stems as the feature vocabulary
    i = 0
    for item in sorted(wordsFreq, key=wordsFreq.get, reverse=True):
        if i < N:
            self.all_words.add(item)
        i += 1
    t = []
    for document in input:
        clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', document[1])
        aux = {}
        for word in word_tokenize(clean_text, 'portuguese'):
            if word.lower() not in stopWords:
                stemmed_word = stemmer.stem(word.lower())
                if stemmed_word in self.all_words:
                    aux[stemmed_word] = True
        for word in self.all_words:
            if word not in aux:
                aux[word] = False
        t.append((aux, document[0]))
    return t
def normalize_text(text):
    '''
    An example of different ways to normalize a text using NLTK tools.

    param: text  A string with the text to be processed
    '''
    if isinstance(text, bytes):
        text = text.decode('utf8')
    stemmer = RSLPStemmer()  # Load a stemmer for Brazilian Portuguese
    print(text)
    for sent in sent_tokenize(text):
        # Trying out different tokenization approaches
        tokens = wordpunct_tokenize(sent)
        print(sent)
        print(' wordpunct: \t%s' % ' '.join(tokens))
        tokens = word_tokenize(sent)
        print(' word: \t%s' % ' '.join(tokens))
        # Remove stopwords
        tokens = remove_stopwords(tokens)
        print(' -stopwords: \t%s' % ' '.join(tokens))
        # Stem the remaining words
        tokens = [stemmer.stem(t) for t in tokens]
        print('radicalizado: \t%s' % ' '.join(tokens))
        print('')
def Stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
        phrase.append(" ")
    return "".join(phrase)
def clean_words(words, remove_stopwords=False, language='portuguese'):
    """Stems and removes stopwords from a set of word-level tokens using the RSLPStemmer.

    Args:
        words (list): Tokens to be stemmed.
        remove_stopwords (bool): Whether stopwords should be removed or not.
        language (str): Identifier of stopwords' language.

    Returns:
        List of stemmed tokens.

    """

    # Creates the RSLP stemmer
    stemmer = RSLPStemmer()

    # Checks if stopwords are supposed to be removed
    if remove_stopwords:
        # Gathers the stopwords
        stop_words = stopwords.words(language)

        # Stems and removes the stopwords
        stemmed_words = [
            stemmer.stem(word) for word in words if word.lower() not in stop_words
        ]

    # If stopwords are not supposed to be removed
    else:
        # Just stems the words
        stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words
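A minimal usage sketch for clean_words above, assuming the tokens come from nltk.word_tokenize and that the Portuguese stopword list is available (the sample sentence is illustrative only):

from nltk.tokenize import word_tokenize

tokens = word_tokenize('As casas amarelas foram pintadas ontem', language='portuguese')
print(clean_words(tokens, remove_stopwords=True))  # stemmed tokens with stopwords dropped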
def lematizar(text):
    if text != 'negativo' and text != 'positivo':
        st = RSLPStemmer()
        w = st.stem(text)
    else:
        w = text
    return w
def __init__(self):
    self.nlp = spacy.load('pt_core_news_md')
    self.vogais = [
        'a', 'á', 'à', 'â', 'ã', 'e', 'é', 'ê', 'i', 'í', 'î',
        'o', 'ó', 'ô', 'õ', 'u', 'ú', 'û'
    ]
    self.stemmer = RSLPStemmer()
def lematizar(text):
    lista_conjuncao = ['mas', 'contudo', 'no entanto', 'entretanto', 'porem', 'todavia']
    if text != 'negativo' and text != 'positivo' and text not in lista_conjuncao:
        st = RSLPStemmer()
        w = st.stem(text)
    else:
        w = text
    return w
def stemmer(self, processed_text):
    '''Input: processed text
       Output: tokens after stemming
    '''
    st = RSLPStemmer()
    # st = SnowballStemmer("english")
    stemmed_list = set(st.stem(token) for token in processed_text)
    return stemmed_list
def get_idword(conn, word):
    result = -1
    stemmer = RSLPStemmer()
    # Query parameters are passed as a 1-tuple, as expected by the DB-API
    cursor = conn.execute('SELECT idword FROM words WHERE word = %s',
                          (stemmer.stem(word),))
    if cursor.rowcount > 0:
        result = cursor.fetchone()[0]
    return result
def stemming_(text):
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text:
        preprocess = [
            str(stemmer.stem(p)) for p in phrase.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
def stemming(tokens):
    stemmer = RSLPStemmer()
    phrase = []
    for word in tokens:
        phrase.append(stemmer.stem(word))
    return phrase
def word_stemm(word):
    stemmer = RSLPStemmer()
    try:
        word = stemmer.stem(word)
    except Exception:
        print(word)
    return word
def stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        word = unicode(word, 'utf-8')
        word = unicodedata.normalize("NFKD", word)
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def separaPalavras(texto):
    stop = nltk.corpus.stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile('\\W+')
    lista_palavras = []
    lista = [p for p in splitter.split(texto) if p != '']
    for p in lista:
        if p.lower() not in stop:
            if len(p) > 1:
                lista_palavras.append(stemmer.stem(p).lower())
    return lista_palavras
def getIdPalavra(palavra):
    retorno = -1
    stemmer = RSLPStemmer()
    conexao = pymysql.connect(host=host, user=id_user, passwd=password, db=data_base)
    cursor = conexao.cursor()
    # Query parameters are passed as a 1-tuple, as expected by the DB-API
    cursor.execute('select idpalavra from palavras where palavra = %s',
                   (stemmer.stem(palavra),))
    if cursor.rowcount > 0:
        retorno = cursor.fetchone()[0]
    cursor.close()
    conexao.close()
    return retorno
def separates_words(text):
    stop_words = stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile(r'\W+')
    list_words = []
    words = [p for p in splitter.split(text) if p != '']
    for word in words:
        if word.lower() not in stop_words:
            if len(word) > 1:
                list_words.append(stemmer.stem(word).lower())
    return list_words
def features(self): """Use N gram to extract feature """ n = 1 data = self.data.split() st = RSLPStemmer() data = [st.stem(word) for word in self.data.split()] out = [] for i in range(n, len(self.data.split()) - n + 1): out.append(data[i - n:i]) out.append(data[i + 1:i + n]) return [' '.join(x) for x in out]
def _stem(text):
    """
    Convert words to their stems
    :param text: list of words
    :return: list of stemmed words
    """
    stemmer = RSLPStemmer()
    phrase = []
    for word in text:
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def get_doc(folder_name): doc_list = get_doc_list(folder_name) tokenizer = RegexpTokenizer(r'\w+') #en_stop = get_stop_words('en') #p_stemmer = PorterStemmer() p_stemmer = RSLPStemmer() taggeddoc = [] texts = [] for index, i in enumerate(doc_list): # for tagged doc wordslist = [] tagslist = [] # clean and tokenize document string raw = gensim.utils.to_unicode(i, 'latin1').lower() print index, ' - ', raw, '\n' tokens = tokenizer.tokenize(raw) #print tokens # remove stop words from tokens #stopped_tokens = [i for i in tokens if not i in en_stop] #Remove StopWords stopped_tokens = [ word for word in tokens if word not in stopwords.words('portuguese') ] #print stopped_tokens # remove numbers number_tokens = [re.sub(r'[\d]', ' ', i) for i in stopped_tokens] number_tokens = ' '.join(number_tokens).split() #print number_tokens,'\n' # stem tokens #stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens] #Stemming stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens] print stemmed_tokens, '\n' # remove empty length_tokens = [i for i in stemmed_tokens if len(i) > 1] # add tokens to list texts.append(length_tokens) #td = TaggedDocument(gensim.utils.to_unicode(str.encode(' '.join(stemmed_tokens))).split(),str(index)) td = TaggedDocument(forward_transformer(stemmed_tokens), str(index)) taggeddoc.append(td) return taggeddoc
def __init__(self, db=None):
    # Fall back to the default database path when none is given
    self.db = db if db is not None else constantes.ARQ_BANCO
    self.stem_count = dict()
    self.word_count = dict()
    self.unidas_count = dict()
    self.connection = sqlite3.connect(self.db)
    self.cursor = self.connection.cursor()
    self.stemmer = RSLPStemmer()
    self.tokenizer = RegexpTokenizer(r'\w+')
def string_steem(text):
    string_steem = []
    stemmer = RSLPStemmer()
    for i in text.split():
        try:
            string_steem.append(
                stemmer.stem(i.lower().translate(remove_punct_dict)))
        except Exception:
            string_steem.append('')
    return ' '.join(i for i in string_steem)
def stemming_(text):
    punc = string.punctuation
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text.items():
        nopunc = [str(stemmer.stem(p)) for p in phrase if p not in punc]
        nopunc = ''.join(nopunc)
        preprocess = [
            str(stemmer.stem(p)) for p in nopunc.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
def only_stems(keywords): st = PorterStemmer() os = OrengoStemmer() ss = SavoyStemmer() rs = RSLPStemmer() stem1 = [st.getWordStem(x.encode('utf8')) for x in keywords] stem2 = [rs.stem(x.encode('utf8')) for x in keywords] stem3 = [os.getWordStem(x.encode('utf8')) for x in keywords] stem4 = [ss.getWordStem(x.encode('utf8')) for x in keywords] return stem1+stem2+stem3+stem4
def preprocess_text(self, text):
    tokens = []
    stemmer = RSLPStemmer()
    for t in text.split():
        # Need a better set of stopwords
        # if t in stopwords.words('portuguese'):
        #     continue
        t = unidecode(t)
        t = t.lower()
        t = re.sub(r'\W+', '', t)
        t = stemmer.stem(t)
        tokens.append(t)
    return ' '.join(tokens)
def stem_string(string, minimal_length):
    st = RSLPStemmer()
    string = clean_string(string)
    string = string.replace('\n', ' ')
    text = []
    for token in string.split(' '):
        if token != '' and len(token) > minimal_length:
            try:
                text.append(st.stem(clean_word(token)))
            except Exception:
                text.append(clean_word(token).decode('utf8', 'ignore'))
    return ' '.join(text)
def search(query, document_base):
    st = RSLPStemmer()
    dict_indice_invertido, dict_arquivos = get_indice_invertido(document_base)
    # Run the query: first split it on the OR operators
    consultas = query.split('|')
    conjunto_final = set()
    conjunto = set()
    for consulta in consultas:
        # Run a separate query for each OR branch obtained above
        consulta = consulta.split('&')
        count = 0
        for palavra in consulta:
            palavra = st.stem(palavra.strip())
            if count == 0:
                if '!' in palavra:
                    conjunto = set(dict_arquivos.keys())
                    conjunto = conjunto.difference(
                        dict_indice_invertido[st.stem(palavra.lstrip('!'))].keys())
                else:
                    try:
                        conjunto = set(dict_indice_invertido[palavra].keys())
                    except KeyError:
                        # Term not indexed: no documents match this branch
                        conjunto = set()
            else:
                if '!' in palavra:
                    conjunto = conjunto.intersection(
                        set(dict_arquivos.keys()).difference(
                            dict_indice_invertido[st.stem(palavra.lstrip('!'))].keys()))
                else:
                    try:
                        conjunto = conjunto.intersection(
                            set(dict_indice_invertido[palavra].keys()))
                    except KeyError:
                        conjunto = set()
            count += 1
        conjunto_final = conjunto_final.union(conjunto)
    txt_arquivos = ''
    for file in conjunto_final:
        txt_arquivos += dict_arquivos[file] + '\n'
    with open("answer.txt", 'w+') as resposta:
        resposta.write(str(len(conjunto_final)) + '\n' + txt_arquivos)
def stem_report_sents(self, encoded_text_alpha_no_punct_stopword_list): decoded_stemmed_list = [] encoded_stemmed_list = [] for i in range(len(encoded_text_alpha_no_punct_stopword_list)): decoded_stemmed_list.append([]) encoded_stemmed_list.append([]) stemmer = RSLPStemmer() for c1 in range(len(encoded_text_alpha_no_punct_stopword_list)): for c2 in range(len(encoded_text_alpha_no_punct_stopword_list[c1])): decoded_stemmed_list[c1].append(stemmer.stem(encoded_text_alpha_no_punct_stopword_list[c1][c2].decode('utf-8'))) for c1 in range(len(decoded_stemmed_list)): for c2 in range(len(decoded_stemmed_list[c1])): encoded_stemmed_list[c1].append(decoded_stemmed_list[c1][c2].encode('utf-8')) return encoded_stemmed_list
def stem_related_terms(self, uni_encoded_cluster_tokenized_list): stemmer = RSLPStemmer() decoded_stemmed_cluster = [] encoded_stemmed_cluster = [] for i in range(len(uni_encoded_cluster_tokenized_list)): decoded_stemmed_cluster.append([]) encoded_stemmed_cluster.append([]) for c1 in range(len(uni_encoded_cluster_tokenized_list)): for c2 in range(len(uni_encoded_cluster_tokenized_list[c1])): decoded_stemmed_cluster[c1].append(stemmer.stem(uni_encoded_cluster_tokenized_list[c1][c2].decode('utf-8'))) for c1 in range(len(decoded_stemmed_cluster)): for c2 in range(len(decoded_stemmed_cluster[c1])): encoded_stemmed_cluster[c1].append(decoded_stemmed_cluster[c1][c2].encode('utf-8')) return encoded_stemmed_cluster
def tokenize(text):
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    text = text.lower()
    text = text.replace(',', '')
    text = text.replace('?', '')
    text = text.replace('!', '')
    text = text.replace('.', '')
    stopwordslist = stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwordslist]
    stems = [stemmer.stem(item) for item in tokens]
    return stems
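An illustrative call for the tokenize helper above (the sample text is made up; the punkt and Portuguese stopword data are assumed to be installed):

texto = 'Ela não gostou do filme, mas adorou a trilha sonora!'
print(tokenize(texto))  # stems of the remaining tokens, punctuation and stopwords removed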
def __init__(self, filterStopwords=True, stemming=False, groupClasses=True):
    self.groupClasses = groupClasses
    if stemming:
        self.stemmer = RSLPStemmer()
    else:
        # Identity "stemmer" used when stemming is disabled
        self.stemmer = lambda: None
        self.stemmer.stem = lambda x: x
    self.stopwords = [
        unicodeToAscii(sw) for sw in stopwords.words('portuguese')
    ] if filterStopwords else []
    super().__init__(preserve_case=False, reduce_len=True)
def preparingToClassify(self, input):
    stopWords = stopwords.words('english')
    stemmer = RSLPStemmer()
    clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', input)
    aux = {}
    for word in word_tokenize(clean_text, 'english'):
        if word.lower() not in stopWords:
            stemmed_word = stemmer.stem(word.lower())
            if stemmed_word in self.all_words:
                aux[stemmed_word] = True
    for word in self.all_words:
        if word not in aux:
            aux[word] = False
    return aux
def stemming(text):
    """
    Receives a list of strings and returns a list containing the stemmed version of them.

    # Input:
        - text (list): a list of words (strings).

    # Output:
        - new_text (list): list of strings containing stemmed words.
    """
    text = [word for word in text if word != ""]
    stemmer = RSLPStemmer()
    new_text = [stemmer.stem(word) for word in text]
    return new_text
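A quick illustrative call for the stemming helper above (the input list is a made-up example; empty strings are filtered out before stemming):

print(stemming(['meninas', 'brincavam', '', 'felizes']))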
def search_file(self, keys, arch):
    for paragraph in arch:
        i = 0
        for k in keys:
            for word in paragraph:
                if k[0] == word:
                    i += 1
                stemmer = RSLPStemmer()
                tok = stemmer.stem(k[0])
                if re.match(tok, word):
                    i += 1
        if i >= len(keys):
            return paragraph
        i = 0
    return u"Me desculpe, mas não sei"
def nlp_treatment(text_feats, stem=False):
    processed = dict()
    for key, value in text_feats.items():
        if value != '':
            string_list = word_tokenize(value)
            string_list = [w for w in string_list if w not in stop_pt]
            string_list = list(unique_everseen(string_list))
            if stem:
                stemmer = RSLPStemmer()
                string_list = [stemmer.stem(w) for w in string_list]
            value = " ".join(string_list)
        processed[key] = value
    return processed
def compare(self, word, key):
    """
    Method that compares two words to decide whether they are equal, share the same
    stem, or are merely similar.

    TODO: check the encoding (ueff/); it is making the stem comparison fail
    (actually, it has been failing from the start).

    :rtype : object
    """
    if word[0] == key or word[0].lower() == key.lower():
        return 0.5
    else:
        stemmer = RSLPStemmer()
        tok = stemmer.stem(key)
        if re.match(tok, word[0]):
            return 0.3
        syn = self.isSyn(key, word)
        if syn > 0:
            return syn * 0.2
    return 0
def __init__(self): self.portugues_stemmer = RSLPStemmer() self.tokenizar = WhitespaceTokenizer() self.stopwords = stopwords.words('portuguese') self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'] self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]
def stemming(texto):
    #############################################################################################################
    # Apply the RSLP stemmer to strip affixes from Portuguese words
    # (strips the affixes from the posInicial and tese texts)
    stemmer = RSLPStemmer()
    st_texto = []
    # print texto
    for i in range(len(texto)):
        st_aux = texto[i]
        string_aux = ""
        for sufixo in st_aux:
            string_aux = string_aux + " " + stemmer.stem(sufixo)
        st_texto.append(string_aux)
    # print "stemming, st_texto", st_texto
    return st_texto
class LimparTexto(object): def __init__(self): self.portugues_stemmer = RSLPStemmer() self.tokenizar = WhitespaceTokenizer() self.stopwords = stopwords.words('portuguese') self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'] self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')] #Remover acentuação dos textos def removeAccent(self, text): para = text for (lat, asc) in self.ascii_replace: para = para.replace(lat, asc) return para #Realiza a remoção das stop words que são palavras que não representam significado para o nosso modelo. def removerStopWords(self, texto): #O decode é necessário se for utilizado o latin-1 no mining texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.stopwords]) texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.mais_utilizadas]) # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords]) # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas]) return texto #Tokenização das palavras por espaços def tokenizarPalavras(self, texto): texto = self.tokenizar.tokenize(texto) return texto #A remoção da pontuação é necessário pois palavras seguidas de pontos difere de palavra iguais sem a pontuação. def removerPontuacao(self, texto): regex = re.compile('[%s]' % re.escape(string.punctuation)) texto = regex.sub('',texto) return texto #Remoção dos sufixos das palavras def removerSufixo(self, para): text = '' for w in para: # text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' ' text = text + self.portugues_stemmer.stem(w) + ' ' return text def removerAcentos(self, texto): texto = unicode(texto, 'latin-1') para = unidecode.unidecode(texto) return para def removerCaracteresRepetidos(self, texto): texto = re.sub(r'([a-z])\1+', r'\1', texto) return texto
def preprocessing(corpora):
    stemmer = RSLPStemmer()
    stemmer2 = PorterStemmer()
    stp = stopwords.words('portuguese')
    # stp.append('')
    stp.append('ainda')
    res = []
    for i in range(len(corpora)):
        corpora[i] = str(corpora[i]).lower()
        corpora[i] = corpora[i].translate(None, string.punctuation)
        corpora[i] = corpora[i].decode('utf-8')
        corpora[i] = corpora[i].replace(u'”', u'')
        corpora[i] = corpora[i].replace(u'“', u'')
        corpora[i] = corpora[i].replace(u'–', u'')
        res2 = []
        for t in word_tokenize(corpora[i]):
            if t in stp:
                continue
            if not any(char.isdigit() for char in t):
                res2.append(stemmer2.stem(stemmer.stem(t)))
        res.append(res2)
    return res
def identify_question(question):
    st = RSLPStemmer()
    wrds = []
    for token in question.lower().split():
        wrds.append(st.stem(token))
    types = {
        'time': ['quant', 'temp', 'dem', 'esper', 'long', 'pert', 'lev'],
        'location': ['cad', 'ond', 'est', 'aven', 'rua']
    }
    best_result = None
    last_test = 0
    for key, value in types.items():
        filtered = [w for w in wrds if w in set(value)]
        if len(filtered) > last_test:
            last_test = len(filtered)
            best_result = key
    return best_result
def pre_process(description):
    '''Pre-processes the description.'''
    # Compile a regex matching non-special characters
    vanilla = u'[^\u0041-\u005A \u0061-\u007A \u00C0-\u00D6 \u00D8-\u00F6 \u00F8-\u00FF \u0100-\u017F \u0020]'
    regex = re.compile(vanilla)
    # Lowercase everything
    description = description.encode('utf8').decode('utf8')
    lowercased = description.lower()
    # Remove special characters and numbers
    regexed = regex.sub(' ', lowercased)
    # Split into words
    tokenized = regexed.split()
    # Reduce plural forms to singular (rule set 0 of the RSLP stemmer)
    st = RSLPStemmer()
    singularized = [st.apply_rule(token, 0) for token in tokenized]
    # Remove words with fewer than 2 characters
    # and merge the words back together
    remerged = ''
    for word in singularized:
        if len(word) > 1:
            remerged += word + ' '
    return remerged
class PreProcessor(object): def __init__(self, idf_path=IDF_PATH, use_idf=True): self.stemmer = RSLPStemmer() self.term_dict, self.freq = self.dict_from_idf(idf_path) self.max_freq = float(max(self.freq.values())) self.vocab_size = len(self.term_dict) self.use_idf = use_idf def dict_from_idf(self, idf_path): my_dict = Dictionary() freq_dict = {} with codecs.open(idf_path, mode="rb", encoding="utf8") as in_file: for line in in_file: splitted_line = line.split(" ") stemmed_word = self.stemmer.stem(splitted_line[0]) frequency = int(splitted_line[1]) if frequency < 5: break else: id_tuple = my_dict.doc2bow([stemmed_word], allow_update=True) word_id = id_tuple[0][0] freq_dict[word_id] = frequency + freq_dict.setdefault(word_id, 0) return my_dict, freq_dict def idf(self, term_id): return math.log(self.max_freq/self.freq[term_id]) def url_to_bow(self, url): print url tokenized_doc = http2tokenized_stemmed(url) bow_doc = self.term_dict.doc2bow(tokenized_doc) new_bow_doc = [] for i in range(0, len(bow_doc)): new_bow_doc.append((bow_doc[i][0], bow_doc[i][1]*self.idf(bow_doc[i][0]))) if self.use_idf: return new_bow_doc else: return bow_doc def corpus_from_urllist(self, url_list, label): urls = url_list docs_bow = [self.url_to_bow(url) for url in urls] labels = [label] * len(urls) return NewsCorpus(urls, labels, docs_bow, self.vocab_size)
def __init__(self, corpus_dict, stopword_filename=None, DEFAULT_IDF=1.5):
    super(TFIDF, self).__init__()
    self.num_docs = 0
    self.term_num_docs = {}  # number of documents in the corpus that contain each term
    self.stopwords = set([])
    self.idf_default = DEFAULT_IDF
    self.st = RSLPStemmer()
    # self._tokenizer = PortugueseWordTokenizer()
    if not corpus_dict:
        print "corpus is empty!"
        exit()
    self.num_docs = len(corpus_dict)
    # Stem every sentence in the corpus
    self.corpus = [self.getTokens(doc) for doc in corpus_dict]
    # Build the stopword set if a stopword file was given
    if stopword_filename:
        stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
        self.stopwords = set([line.strip() for line in stopword_file])
def features(self): """Stem the word""" st = RSLPStemmer() return [st.stem(word) for word in self.data.split()]
def clusterArgFinal(idtese): #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses cursor = connection.cursor() cursor2 = connection.cursor() cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr` where ((`arg`.`tese_idtese` = " + idtese + " ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))") cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ") #Variavel e função para tratar tags html e acentos com codificação ISO h = HTMLParser.HTMLParser() #dados retirados da consulta ao banco dadosSql = cursor.fetchall() textotese = cursor2.fetchall() #listas para tratar os dados iniciais usu = [] posFinal = [] dados = [] tese = [] #lista com dados após a remoção das stopwords sw_tese = [] sw_posFinal = [] aux_usu = [] #lista com dados após a aplicação de Stemming st_posFinal = [] st_tese = [] ############################################################################################################# #Aplicacao de Case Folding for d in dadosSql: dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(), re.sub('<[^>]*>', '', h.unescape(d[1])).lower()]) for t in textotese: tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower()) #Colocando os textos de posicionamento final em numa lista separada for i in dados: x = 0 usu.append(i[x].upper()) posFinal.append(i[x+1].lower()) #lista com o posicionamento Final ############################################################################################################# #Fases de pré-processamento linguistico # - Remoção de stopwords # - Troca de caracteres acentuados por caracteres não acentuados # - Remoção pontuações for i in usu: aux_usu.append(removeStopWords(i)) for i in tese: sw_tese.append(removeStopWords(i)) for i in posFinal: sw_posFinal.append(removeStopWords(i)) ############################################################################################################# #Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa #retirando afixos dos textos do posFinal e tese stemmer = RSLPStemmer() for i in range(len(sw_posFinal)): st_aux = sw_posFinal[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_posFinal.append(string_aux) for i in range(len(sw_tese)): st_aux = sw_tese[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_tese.append(string_aux) ############################################################################################################# #TESTES!!!!!!! ############################################################################################################# #retorno da função - usado na views.py para alimentar o template debate.html #passar parametros que devem ser apresentados na templates debate.html return [st_tese, posFinal, sw_tese, aux_usu, st_posFinal]
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys

from pymongo import Connection
from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
db = Connection().termos

a = open(sys.argv[1], 'r').read().split('\n')
for x in a:
    if x != '':
        x = x.decode('utf-8')
        print db['termos'].insert({'termo': x, 'stem': stemmer.stem(x), 'tipo': sys.argv[1]})
def ferramentaAgrupamento(idtese): #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses cursor = connection.cursor() cursor1 = connection.cursor() cursor2 = connection.cursor() cursor.execute("select distinct `usr`.`primeironome` as `name`, `arg`.`argumento` AS `posicionamentoinicial` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr` where ((`arg`.`tese_idtese` = " + idtese + " ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))") cursor1.execute("select `usr`.`primeironome` as `NOME`,`arg`.`argumento` AS `ArgINICIAL`,`arg`.`posicionamentoinicial` AS `PosicInicial` from (`argumento` `arg` join `argumentador` `urg` join `usuario` `usr`) where `arg`.`tese_idtese`" +idtese+ " and `arg`.`posicionamentoinicial` is not null and `arg`.`argumentador_idargumentador` = `urg`.`idargumentador` and `urg`.`usuario_idusuario` = `usr`.`idusuario`") cursor2.execute("select tese from tese where idtese="+ idtese) #Variavel e função para tratar tags html e acentos com codificação ISO h = HTMLParser.HTMLParser() #dados retirados da consulta ao banco dadosSql = cursor.fetchall() allArgumentos = cursor1.fetchall() #todos os argumentos iniciais preenchidos textotese = cursor2.fetchall() #listas para tratar os dados iniciais usu = [] allUsu = [] posInicial = [] dados = [] tese = [] allArg = [] allArgIni = [] #lista com dados após a remoção das stopwords sw_tese = [] sw_posInicial = [] aux_usu = [] sw_allArgIni = [] allUsuAux = [] #lista com dados após a aplicação de Stemming st_posInicial = [] st_tese = [] st_allArgIni = [] ############################################################################################################# #Aplicacao de Case Folding for d in dadosSql: dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(), re.sub('<[^>]*>', '', h.unescape(d[1])).lower()]) for d in allArgumentos: allArg.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(), re.sub('<[^>]*>', '', h.unescape(d[1])).lower()]) for t in textotese: tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower()) #Colocando os textos de posicionamento final em numa lista separada for i in dados: x = 0 usu.append(i[x].upper()) posInicial.append(i[x+1].lower()) #lista com o posicionamento Inicial for i in allArg: x = 0 allUsu.append(i[x].upper()) allArgIni.append(i[x+1].lower()) #lista com o posicionamento Inicial print posInicial print allArgIni ############################################################################################################# #Fases de pré-processamento linguistico # - Remoção de stopwords # - Troca de caracteres acentuados por caracteres não acentuados # - Remoção pontuações for i in usu: aux_usu.append(removeStopWords(i)) for i in tese: sw_tese.append(removeStopWords(i)) for i in posInicial: sw_posInicial.append(removeStopWords(i)) for i in allArgIni: sw_allArgIni.append(removeStopWords(i)) ############################################################################################################# #Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa 
#retirando afixos dos textos do posInicial e tese stemmer = RSLPStemmer() for i in range(len(sw_posInicial)): st_aux = sw_posInicial[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_posInicial.append(string_aux) for i in range(len(sw_tese)): st_aux = sw_tese[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_tese.append(string_aux) #todos os argumentos iniciais for i in range(len(sw_allArgIni)): st_aux = sw_allArgIni[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_allArgIni.append(string_aux) ############################################################################################################# # #LSI # lsi_posInicial = [] # for i in range(len(sw_posInicial)): # aux = "posIni(%d): %s" %(i, sw_posInicial[i]) # lsi_posInicial.append(aux) # # # lsi = gensim.models.lsimodel.LsiModel(lsi_posInicial) # # # print sw_posInicial # lsi.print_topics(10) ############################################################################################################# #retorno da função - usado na views.py para alimentar o template debate.html #passar parametros que devem ser apresentados na templates debate.html return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese]
def clusterFinal(idtese): #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses cursor = connection.cursor() cursor2 = connection.cursor() cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr` where ((`arg`.`tese_idtese` = " + idtese + " ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))") cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ") #Variavel e função para tratar tags html e acentos com codificação ISO h = HTMLParser.HTMLParser() #dados retirados da consulta ao banco dadosSql = cursor.fetchall() textotese = cursor2.fetchall() #listas para tratar os dados iniciais usu = [] posInicial = [] dados = [] tese = [] #lista com dados pos tagger tag_posInicial = [] tag_comAce_posInicial = [] #lista com dados após a remoção das stopwords sw_tese = [] sw_posInicial = [] aux_usu = [] sw_tagPosInicial = [] #texto marcado e sem stopwords sw_tagcomAce_posInicial = [] #texto COM ACENTOS marcado e sem stopwords #lista com dados após a aplicação de Stemming st_posInicial = [] st_tese = [] st_tagPosInicial = [] #texto marcado, sem stopwords e com stemmer aplicado st_tagcomAce_posInicial = [] #texto COM ACENTOS marcado, sem stopwords e com stemmer aplicado ############################################################################################################# #LISTA COM OS POSICIONAMENTOS INICIAIS APÓS APLICAÇÃO DA NORMALIZAÇAÕ posInicial_Normalizado = [] normalizacao = [] ############################################################################################################# #Aplicacao de Case Folding for d in dadosSql: dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(), re.sub('<[^>]*>', '', h.unescape(d[1])).lower()]) for t in textotese: tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower()) #Colocando os textos de posicionamento inicial em numa lista separada for i in dados: x = 0 usu.append(i[x].upper()) posInicial.append(i[x+1].lower()) #lista com o posicionamento Inicial com todas as letras em minusculo ############################################################################################################# ### Classificacao das palavras de acordo com sua classe gramatical ### Utilizacao do postagger NLPNET ### http://nilc.icmc.usp.br/nlpnet/index.html# tagger = nlpnet.POSTagger() semAce_posInicial = [] #armazena o posInicial apenas sem acentos, sem pontuações, sem endereço web e sem numeros comAce_posInicial = [] #armazena o posInicial apenas COM acentos, sem pontuações, sem endereço web e sem numeros for i in posInicial: semAce_posInicial.append(removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i))))))) for i in semAce_posInicial: tag_posInicial.append(tagger.tag(i)) for i in posInicial: comAce_posInicial.append(removePontuacao(removeNum(removeSE(removeEndWeb((i)))))) for i in comAce_posInicial: tag_comAce_posInicial.append(tagger.tag(i)) 
############################################################################################################# #APENAS PARA REALIZAR TESTE E COLOCAR NA DISSERTACAO # pprint(semAce_posInicial) # pprint(comAce_posInicial) # exit() # tagg_posInicial = [] # for texto in posInicial: # tagg_posInicial.append(tagger.tag(texto)) # # print "posInicial" # pprint(posInicial) # # print "tagg_posInicial" # pprint(tagg_posInicial) ############################################################################################################# ############################################################################################################# ### REMOCAO DE STOPWORDS ### Remocao dos termos de acordo com a NLTK ### Remocao dos termos classificados como artigos, verbos, adverbios, etc... for i in usu: aux_usu.append(removeStopWords(i)) for i in tese: sw_tese.append(removeStopWords(i)) for i in posInicial: sw_posInicial.append(removeStopWords(i)) for i in tag_posInicial: sw_tagPosInicial.append(limpaCorpus(i)) for i in tag_comAce_posInicial: sw_tagcomAce_posInicial.append(limpaCorpus(i)) #################################################################################################################################### # Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa # Retirando afixos dos textos do posInicial e tese stemmer = RSLPStemmer() for i in range(len(sw_posInicial)): st_aux = sw_posInicial[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_posInicial.append(string_aux) for i in range(len(sw_tese)): st_aux = sw_tese[i] string_aux = "" for sufixo in st_aux.split(): string_aux = string_aux + " " + stemmer.stem(sufixo) st_tese.append(string_aux) for i in range(len(sw_tagPosInicial)): termosST = "" auxST = [] for j in range(len(sw_tagPosInicial[i])): aux = stemmer.stem(sw_tagPosInicial[i][j][0]) etiqueta = sw_tagPosInicial[i][j][1] termosST = (aux,etiqueta) auxST.append(termosST) st_tagPosInicial.append(auxST) for i in range(len(sw_tagcomAce_posInicial)): termosST = "" auxST = [] for j in range(len(sw_tagcomAce_posInicial[i])): aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0]) etiqueta = sw_tagcomAce_posInicial[i][j][1] termosST = (aux,etiqueta) auxST.append(termosST) st_tagcomAce_posInicial.append(auxST) #################################################################################################################################### ### A NORMALIZACAO DE TERMOS REFERE-SE A TECNICA DE TROCAR PALAVRAS SINONIMAS, OU SEJA, QUE TENHAM SIGNIFICADO ## ### SEMELHANTE, POR UM UNICO TERMO REPRESENTATIVO NO CORPUS DE ANALISE. DESSA FORMA, É POSSIVEL AUMENTAR O GRAU ## ### DE SIMILARIDADE ENTRE OS TEXTOS ANALISADOS ATRAVES DO USO DE TECNICAS DE ANALISE ESTATISTICAS, COMO SIMILA ## ### RIDADE DE COSSENOS OU DISTANCIA EUCLIDIANA. ## #################################################################################################################################### ### A NORMALIZACAO FOI DESENVOLVIDA COM BASE NOS DADOS DISPONIBILIZADOS PELO PROJETO TEP 2.0 DO NILC/USP ## ### http://143.107.183.175:21480/tep2/index.htm ## ### ## ### FORMATO DO ARQUIVO ## ### NUM1. [Tipo] {termos sinonimos} <NUM2> ## ### 263. 
[Verbo] {consentir, deixar, permitir} <973> ## ### NUM1 = NUMERO DA LINHA DE REFERENCIA PARA TERMO SINONIMO ## ### NUM2 = NUMERO DA LINHA DE REFERENCIA PARA TERMO ANTONIMO (SENTIDO OPOSTO) ## #################################################################################################################################### #abre o arquivo com as relacoes de sinonimia (termos linhaWordNet) e antonimia (termos contrarios) #arquivo apenas com termos classificados como substantivos, adjetivos e verbos base_tep = codecs.open(os.path.join(os.path.dirname(__file__),'../base_tep2/base_tep.txt'), 'r', 'UTF8') # dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w') #variavel com conteúdo do arquivo em memoria #não imprimir essa variável, MUITO GRANDEE!!! wordNet = base_tep.readlines() #fechar arquivo base_tep.close() #################################################################################################################################### ## NORMALIZAÇÃO FEITA COM BASE NOS RADICAIS DE FORMAÇÃO DAS PALAVRAS ## ## APLICAÇÃO DO RSPL PRIMEIRO PARA DEPOIS BUSCAR NA BASE OS TERMOS SIMILARES ## ## DENTRO DA BASE_TEP OS TERMOS TAMBÉM FORAM REDUZIDOS AOS SEUS RADICIAIS DE FORMAÇÃO ## ## O DICIONÁRIO ESTÁ COM A REFERÊNCIA PARA A LINHA AONDE ESTÃO OS TERMOS SINÔNIMOS ## ## OS TERMOS SÃO ANALISADOS CONSIDERANDO SUAS ACENTUAÇÕES, PARA APLICAÇÃO CORRETA DO RSLP ## #################################################################################################################################### yappi.set_clock_type('cpu') yappi.start(builtins=True) start = time.time() st_WordNetV = [] ##armazena num, tipo, e radical dos sinonimos - APENAS VERBOS st_WordNetN = [] ##armazena num, tipo, e radical dos sinonimos - APENAS SUBSTANTIVOS st_WordNetA = [] ##armazena num, tipo, e radical dos sinonimos - APENAS ADJETIVOS st_WordNetO = [] ##armazena num, tipo, e radical dos sinonimos - APENAS OUTROS for linhaWordnet in wordNet: listaAux = [] termos = re.findall(r"\{(.*)\}", linhaWordnet) num = re.findall(r"([0-9]+)\.", linhaWordnet) tipo = re.findall(r"\[(.*)\]", linhaWordnet) if tipo[0] == "Substantivo": listaAux.append(num) listaAux.append(tipo) for T in termos: aux = T.split() auxL = [] for i in aux: aux1 = i.replace(",", "") dadosStem = stemmer.stem(aux1) auxL.append(dadosStem) listaAux.append(auxL) st_WordNetN.append(listaAux) elif tipo[0] == "Verbo": listaAux.append(num) listaAux.append(tipo) for T in termos: aux = T.split() auxL = [] for i in aux: aux1 = i.replace(",", "") dadosStem = stemmer.stem(aux1) auxL.append(dadosStem) listaAux.append(auxL) st_WordNetV.append(listaAux) elif tipo[0] == "Adjetivo": listaAux.append(num) listaAux.append(tipo) for T in termos: aux = T.split() auxL = [] for i in aux: aux1 = i.replace(",", "") dadosStem = stemmer.stem(aux1) auxL.append(dadosStem) listaAux.append(auxL) st_WordNetA.append(listaAux) else: listaAux.append(num) listaAux.append(tipo) for T in termos: aux = T.split() auxL = [] for i in aux: aux1 = i.replace(",", "") dadosStem = stemmer.stem(aux1) auxL.append(dadosStem) listaAux.append(auxL) st_WordNetO.append(listaAux) duration = time.time() - start stats = yappi.get_func_stats() stats.save('stemmWordNet.out', type = 'callgrind') #################################################################################################################################### ### A ANÁLISE É REALIZADA COM BASE NO TEXTO SEM A EXCLUSÃO DOS ACENTOS ## ### POIS AO EXCLUÍ-LOS A REDUÇÃO AO RADICAL DE FORMAÇÃO (APLICAÇÃO DO RSLP) É PREJUDICADA ## ### OS 
TESTES REALIZADOS MOSTRARAM QUE ESSA É UMA MELHOR ABORDAGEM, UMA VEZ QUE NOSSOS TEXTOS SÃO PEQUENOS ## ### E PRECISAMOS CHEGAR O MAIS PRÓXIMO POSSÍVEL SEM CONSIDERAR SEUS SENTIDOS E/OU CONTEXTOS ## #################################################################################################################################### yappi.set_clock_type('cpu') yappi.start(builtins=True) start = time.time() normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV, st_WordNetO, st_tagcomAce_posInicial) ############################################################### # Colocando os textos normalizados numa lista de 1 diemensão ############################################################### stringNorm = "" auxNorm = [] for i in range(len(normalizacao)): auxNorm = normalizacao[i] for x in range(len(auxNorm)): stringNorm = stringNorm + " " + auxNorm[x] posInicial_Normalizado.append(stringNorm) stringNorm = "" duration = time.time() - start stats = yappi.get_func_stats() stats.save('normalizacaoWordnet.out', type = 'callgrind') #################################################################################################################################### # print "posInicial" # pprint(posInicial) # # print "comAce_posInicial" # pprint(comAce_posInicial) # # print "tag_comAce_posInicial" # pprint(tag_comAce_posInicial) # # print "sw_tagcomAce_posInicial" # pprint(sw_tagcomAce_posInicial) # # print "st_tagcomAce_posInicial" # pprint(st_tagcomAce_posInicial) # print "posInicial_Normalizado" # print len(posInicial_Normalizado) # pprint(posInicial_Normalizado) # exit() #################################################################################################################################### return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado]
def stem(term):
    snowball_stemmer = PortugueseStemmer()
    rslp_stemmer = RSLPStemmer()
    print u'[{}] Snowball: {}, RSLP: {}'.format(term, snowball_stemmer.stem(term),
                                                rslp_stemmer.stem(term))
def __init__(self, idf_path=IDF_PATH, use_idf=True):
    self.stemmer = RSLPStemmer()
    self.term_dict, self.freq = self.dict_from_idf(idf_path)
    self.max_freq = float(max(self.freq.values()))
    self.vocab_size = len(self.term_dict)
    self.use_idf = use_idf
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pattern.web import URL, plaintext
from nltk.tokenize import RegexpTokenizer
from nltk.stem import RSLPStemmer

tokenizer = RegexpTokenizer('\w+')
stemmer = RSLPStemmer()
verbos = tokenizer.tokenize(open('verbos', 'rb').read())
radicais = []
resultados = []

for verbo in verbos:
    radical = stemmer.stem(verbo)
    radicais.append(radical)

for verbo in verbos:
    pg = URL('http://linguistica.insite.com.br/mod_perl/conjugue?verbo=' + verbo).download()
    palavras = tokenizer.tokenize(pg)
    for palavra in palavras:
        radical = stemmer.stem(palavra)
        if radical in radicais:
            if palavra not in resultados:
                print palavra.encode('utf-8')
                resultados.append(palavra)
class TFIDF(object):
    """docstring for TFIDF"""

    def __init__(self, corpus_dict, stopword_filename=None, DEFAULT_IDF=1.5):
        super(TFIDF, self).__init__()
        self.num_docs = 0
        self.term_num_docs = {}  # number of documents in the corpus that contain each term
        self.stopwords = set([])
        self.idf_default = DEFAULT_IDF
        self.st = RSLPStemmer()
        # self._tokenizer = PortugueseWordTokenizer()
        if not corpus_dict:
            print "corpus is empty!"
            exit()
        self.num_docs = len(corpus_dict)
        # Stem every sentence in the corpus
        self.corpus = [self.getTokens(doc) for doc in corpus_dict]
        # Build the stopword set if a stopword file was given
        if stopword_filename:
            stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
            self.stopwords = set([line.strip() for line in stopword_file])

    def getTokens(self, string):
        # return self._tokenizer.tokenize(string)
        # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", string.lower())
        return [self.st.stem(token) for token in string.split()]

    def getTf(self, innerIndexes):
        '''Compute each term's tf value within one class of documents'''
        wordFrequence = {}
        wordCount = 0
        for doc in innerIndexes:
            for oneToken in self.corpus[doc]:
                count = wordFrequence.setdefault(oneToken, 0) + 1
                wordFrequence[oneToken] = count
                wordCount += 1
        for index, value in wordFrequence.iteritems():
            wordFrequence[index] = float(value) / wordCount
        return wordFrequence

    def getTermDocs(self):
        for oneAricles in self.corpus:
            for word in set(oneAricles):
                articles = self.term_num_docs.get(word, 0) + 1
                self.term_num_docs[word] = articles

    def getIdf(self):
        '''Compute each term's idf value over the whole document collection'''
        self.getTermDocs()
        wordIdf = {}
        for term, value in self.term_num_docs.iteritems():
            if term in self.stopwords:
                wordIdf[term] = 0.0
                continue
            wordIdf[term] = math.log(float(self.num_docs + 1) / (value + 1))
        return wordIdf

    @staticmethod
    def getTfIdf(tfDict, idfDict):
        '''Compute each term's tf-idf value for a given class'''
        resultList = [(key, tfDict[key] * idfDict[key]) for key in tfDict]
        return sorted(resultList, key=lambda x: x[1], reverse=True)