Example No. 1
def LSA_Kmeans(clusters, textoTreinamento, nomeUsuarios, textoComparacao=None):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    ##########################################################################################
    #  PRÉ-PROCESSAMENTO DO TEXTO ENVIADO PARA CRIAÇÃO DO DICIONÁRIO DE RELACOES SEMANTICAS  #
    ##########################################################################################    
    
    #UTILIZA AS FUNCOES removeA e removePontuacao PARA TRATAR textoTreinamento 
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento] 
    #print textoTrein
    
    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]
    
    #CARREGA A LISTA DE STOPWORDS DA NLTK    
    stop = stopwords.words('portuguese')
    #RETIRA OS ACENTOS DA LISTA DE STOPWORDS   
    stoplist = [(removeA(s)) for s in stop ]
#     print stoplist
    
    #REMOVE AS STOPWORDS E PALAVRAS COM MENOS DE 3 CARACTERES
    textoTrein = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoTrein]
#     print sw_textoTrein

    textoComp = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoComp]
#     print textoComp
##############################################################################################
#     INICIO DE APLICACAO DO LSA - CRIANDO O DICIONARIO DE TERMOS/FREQUENCIA                 #
##############################################################################################

    #DEFINE FREQUENCIA COMO UMA VARIAVEL DO TIPO DICIONARIO DE INTEIROS
    frequencia = defaultdict(int)
    
    #ARMAZENA A QUANTIDADE DE REPETIÇÕES DE UM TERMO EM TODOS OS DOCUMENTOS DA COLECAO
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
#     pprint(frequencia)
   
    #PALAVRAS COM FREQUENCIA 1 NÃO SÃO IMPORTANTES, POIS NÃO POSSUEM RELACOES DE CO-OCORRENCIA
    #Remove todas as palavras que apareceram apenas 1 vez durante a contagem
    textoTrein = [[token for token in palavra if frequencia[token] > 1]\
             for palavra in textoTrein]
#     pprint(textoTrein)
    
    
    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.    #
    # The main function is `doc2bow`, which converts a collection of words to its            #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.             #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
#     print dicionario
    
    # Armazena o ID das palavras que aparecem apenas 1 vez nos textos
    once_ids = [tokenId for tokenId,docfreq in dicionario.dfs.iteritems() if docfreq == 1]
#     print once_ids
    
    #remove todas as palavras com frequencia = 1
    dicionario.filter_tokens(once_ids)
    
    #reorganiza o dicionario, realocando os dados para os indices que foram removidos
    dicionario.compactify()
    
#     print dicionario.token2id # token -> tokenId
#     print dicionario.dfs # document frequencies: tokenId -> in how many documents this token appeared
    
    # Atribui a corpus_textoTrein o textoTrein no formato "bag-of-words"
    # The main function is `doc2bow`, which converts a collection of words to its
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
#     pprint(corpus_textoTrein)
    
    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
#     pprint(corpus_textoComp)
    ##########################################################################################
    # MODELO DE TRANSFORMACAO - BAG-OF-WORDS PARA TF-IDF                                     #
    ##########################################################################################
    
    # TRANSFORMA corpus_textoTrein (bag-of-words) 
    # PARA tfidf_TextoTrein (frequencia termos x inverso da freq no documento 
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
#     print tfidf_TextoTrein
    
    #USA O posIni PARA GERAR A MATRIZ DE COMPARACAO COM OS DADOS DO DICIONARIO
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
#     print list(corpus_tfidf_TextoTrein)
    
    #TRANSFORMA A MATRIZ TF-IDF 
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein, id2word=dicionario,num_topics=len(dicionario))
    
    query = []

    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[vec_bow] #convert a query de comparação num espaço LSI
        query.append(vec_lsi)         
#     print "query"
#     pprint(query)
    
    #TRANSFORMA corpus_textoComp num espaço LSA e indexa 
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
#     print "indexComp"
#     pprint(list(indexComp))


    # To obtain similarities of our query document against the indexed documents:
    # perform a similarity query against the corpus
    sims = indexComp[query]
    
#     pprint(sims)   
        

    ##########################################################################################
    # JUNÇÃO COM K-MEANS PARA REALIZAR AGRUPAMENTOS                                          #
    ##########################################################################################

    ##Valor ideal, após experimentos = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)

    km_model.fit_transform(sims)
    
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
 
### impressões para visualizar no console
#     print "clustering _LSA_KMEANS"
#     pprint(clustering)
    
#     print len(clustering)
    
#     for i in range(len(clustering)):
#         for j in clustering[i]:
#             print "grupo", i
#             print j, nomeUsuarios[j]
#             print textoComparacao[j]
            
    
    return clustering    
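
A minimal usage sketch for the function above (not part of the original project). The document list and user names are invented for illustration, and it assumes the module-level imports used by LSA_Kmeans (gensim, scikit-learn, NLTK stopwords) and the project's removeA and removePontuacao helpers are available.

textos = ["O aquecimento global e causado pela acao humana.",
          "A acao humana contribui para o aquecimento global.",
          "O aquecimento global e um ciclo natural do planeta.",
          "Mudancas climaticas sao parte de um ciclo natural."]
usuarios = ["Ana", "Bruno", "Carla", "Davi"]

grupos = LSA_Kmeans(clusters=2,
                    textoTreinamento=textos,
                    nomeUsuarios=usuarios,
                    textoComparacao=textos)

# grupos maps each cluster label to the indices of the documents assigned to it
for rotulo, indices in grupos.items():
    print("grupo %d: %s" % (rotulo, [usuarios[i] for i in indices]))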
    
Example No. 2
def similaridade_lsa(textoTreinamento, nomeUsuarios, textoComparacao=None):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    ##########################################################################################
    #  PRÉ-PROCESSAMENTO DO TEXTO ENVIADO PARA CRIAÇÃO DO DICIONÁRIO DE RELACOES SEMANTICAS  #
    ##########################################################################################    
    
    #UTILIZA AS FUNCOES removeA e removePontuacao PARA TRATAR textoTreinamento 
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento] 
    #print textoTrein
    
    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]
    
    #CARREGA A LISTA DE STOPWORDS DA NLTK    
    stop = stopwords.words('portuguese')
    #RETIRA OS ACENTOS DA LISTA DE STOPWORDS   
    stoplist = [(removeA(s)) for s in stop ]
#     print stoplist
    
    #REMOVE AS STOPWORDS E PALAVRAS COM MENOS DE 3 CARACTERES
    textoTrein = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoTrein]
#     print sw_textoTrein

    textoComp = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoComp]
#     print textoComp
##############################################################################################
#     INICIO DE APLICACAO DO LSA - CRIANDO O DICIONARIO DE TERMOS/FREQUENCIA                 #
##############################################################################################

    #DEFINE FREQUENCIA COMO UMA VARIAVEL DO TIPO DICIONARIO DE INTEIROS
    frequencia = defaultdict(int)
    
    #ARMAZENA A QUANTIDADE DE REPETIÇÕES DE UM TERMO EM TODOS OS DOCUMENTOS DA COLECAO
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
#     pprint(frequencia)
   
    #PALAVRAS COM FREQUENCIA 1 NÃO SÃO IMPORTANTES, POIS NÃO POSSUEM RELACOES DE CO-OCORRENCIA
    #Remove todas as palavras que apareceram apenas 1 vez durante a contagem
    textoTrein = [[token for token in palavra if frequencia[token] > 1]\
             for palavra in textoTrein]
#     pprint(textoTrein)
    
    
    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.    #
    # The main function is `doc2bow`, which converts a collection of words to its            #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.             #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
#     print dicionario
    
    # Armazena o ID das palavras que aparecem apenas 1 vez nos textos
    once_ids = [tokenId for tokenId,docfreq in dicionario.dfs.iteritems() if docfreq == 1]
#     print once_ids
    
    #remove todas as palavras com frequencia = 1
    dicionario.filter_tokens(once_ids)
    
    #reorganiza o dicionario, realocando os dados para os indices que foram removidos
    dicionario.compactify()
    
#     print dicionario.token2id # token -> tokenId
#     print dicionario.dfs # document frequencies: tokenId -> in how many documents this token appeared
    
    # Atribui a corpus_textoTrein o textoTrein no formato "bag-of-words"
    # The main function is `doc2bow`, which converts a collection of words to its
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
#     pprint(corpus_textoTrein)
    
    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
#     pprint(corpus_textoComp)
    ##########################################################################################
    # MODELO DE TRANSFORMACAO - BAG-OF-WORDS PARA TF-IDF                                     #
    ##########################################################################################
    
    # TRANSFORMA corpus_textoTrein (bag-of-words) 
    # PARA tfidf_TextoTrein (frequencia termos x inverso da freq no documento 
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
#     print tfidf_TextoTrein
    
    #USA O posIni PARA GERAR A MATRIZ DE COMPARACAO COM OS DADOS DO DICIONARIO
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
#     print list(corpus_tfidf_TextoTrein)
    
    #TRANSFORMA A MATRIZ TF-IDF 
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein, id2word=dicionario,num_topics=len(dicionario))
    
    #TRANSFORMA OS DADOS DE TREINAMENTO EM LSA
#     corpus_lsi = modelo_lsa[corpus_tfidf_TextoTrein] 
    
#     for doc in corpus_lsi:
#         pprint(doc)
    
    query = []

    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[vec_bow] #convert a query de comparação num espaço LSI
        query.append(vec_lsi)         
#     print "query"
#     pprint(query)
    
    #TRANSFORMA corpus_textoComp num espaço LSA e indexa 
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
#     print "indexComp"
#     pprint(list(indexComp))


    # To obtain similarities of our query document against the indexed documents:
    # perform a similarity query against the corpus
    sims = indexComp[query]
#     pprint(list(enumerate(sims)))

    
    now = datetime.now()
    resultado = open(os.path.join(os.path.dirname(__file__),"../arquivos/resultado"+now.__str__()+".txt"), "w")
    resultados = []
    
    for i in range(0, len(sims)):
        aux = sims[i]
#         print "sorted",sorted(sims[i], reverse=True)
#         print i, aux 
        for y in range(i+1, len(aux)):
            str_aux = [nomeUsuarios[i] + " " + aux[y].__str__() + "% similar " + nomeUsuarios[y]]
#             print str_aux
#             resultados.append(str_aux)
            resultados.append([aux[y],nomeUsuarios[i],nomeUsuarios[y]])
            resultado.write(nomeUsuarios[i] + " " + aux[y].__str__() + "% similar " + nomeUsuarios[y] + "\n")
# #     
    resultado.close()
#     print "resultados"
#     pprint(resultados)

    return resultados   
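
A hypothetical way to call the function above and consume its return value. The sample texts and names are invented; note that the function also writes a results file under ../arquivos/, so that directory is assumed to exist.

textos = ["O aquecimento global e causado pela acao humana.",
          "A acao humana contribui para o aquecimento global.",
          "O aquecimento global e um ciclo natural do planeta."]
usuarios = ["Ana", "Bruno", "Carla"]

pares = similaridade_lsa(textoTreinamento=textos,
                         nomeUsuarios=usuarios,
                         textoComparacao=textos)

# each entry is [similarity, user_i, user_j]; list the pairs from most to least similar
for sim, usuarioA, usuarioB in sorted(pares, reverse=True):
    print("%.3f  %s <-> %s" % (sim, usuarioA, usuarioB))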
Example No. 3
def clusterArgInicial(idtese):

    #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses
    cursor = connection.cursor()
    cursor2 = connection.cursor()

    cursor.execute(
        "select distinct `usr`.`primeironome` as `name`, `arg`.`argumento` AS `posicionamentoinicial` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = "
        + idtese +
        "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))"
    )
    cursor2.execute("select tese from tese where idtese=" + idtese)

    #Variavel e função para tratar tags html e acentos com codificação ISO
    h = HTMLParser.HTMLParser()

    #dados retirados da consulta ao banco
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()

    #listas para tratar os dados iniciais
    usu = []
    posInicial = []
    dados = []
    tese = []

    #lista com dados pos tagger
    tag_posInicial = []
    tag_comAce_posInicial = []

    #lista com dados após a remoção das stopwords
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_tagPosInicial = []  #texto marcado e sem stopwords
    sw_tagcomAce_posInicial = []  #texto COM ACENTOS marcado e sem stopwords

    #lista com dados após a aplicação de Stemming
    st_posInicial = []
    st_tese = []
    st_tagPosInicial = []  #texto marcado, sem stopwords e com stemmer aplicado
    st_tagcomAce_posInicial = [
    ]  #texto COM ACENTOS marcado, sem stopwords e com stemmer aplicado

    #############################################################################################################
    #LISTA COM OS POSICIONAMENTOS INICIAIS APÓS APLICAÇÃO DA NORMALIZAÇAÕ
    posInicial_Normalizado = []
    normalizacao = []

    #############################################################################################################
    #Aplicacao de Case Folding

    for d in dadosSql:
        dados.append([
            re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
            re.sub('<[^>]*>', '', h.unescape(d[1])).lower()
        ])

    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())

    #Colocando os textos de posicionamento inicial em numa lista separada
    for i in dados:
        x = 0
        usu.append(i[x].upper())
        posInicial.append(
            i[x + 1].lower()
        )  #lista com o posicionamento Inicial com todas as letras em minusculo

#############################################################################################################
### Classificacao das palavras de acordo com sua classe gramatical
### Utilizacao do postagger NLPNET
### http://nilc.icmc.usp.br/nlpnet/index.html#

    tagger = nlpnet.POSTagger()

    semAce_posInicial = [
    ]  #armazena o posInicial apenas sem acentos, sem pontuações, sem endereço web e sem numeros
    comAce_posInicial = [
    ]  #armazena o posInicial apenas COM acentos, sem pontuações, sem endereço web e sem numeros

    for i in posInicial:
        semAce_posInicial.append(
            removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i)))))))

    for i in semAce_posInicial:
        tag_posInicial.append(tagger.tag(i))

    for i in posInicial:
        comAce_posInicial.append(
            removePontuacao(removeNum(removeSE(removeEndWeb((i))))))

    for i in comAce_posInicial:
        tag_comAce_posInicial.append(tagger.tag(i))

#############################################################################################################
#APENAS PARA REALIZAR TESTE E COLOCAR NA DISSERTACAO

#     pprint(semAce_posInicial)
#     pprint(comAce_posInicial)
#     exit()

#     tagg_posInicial = []
#     for texto in posInicial:
#         tagg_posInicial.append(tagger.tag(texto))
#
#     print "posInicial"
#     pprint(posInicial)
#
#     print "tagg_posInicial"
#     pprint(tagg_posInicial)

#############################################################################################################

#############################################################################################################
### REMOCAO DE STOPWORDS
### Remocao dos termos de acordo com a NLTK
### Remocao dos termos classificados como artigos, verbos, adverbios, etc...

    for i in usu:
        aux_usu.append(removeStopWords(i))

    for i in tese:
        sw_tese.append(removeStopWords(i))

    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))

    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))

    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))

####################################################################################################################################
# Aplicação do RSPL Stemmer para remoção dos afixos das palavras da lingua portuguesa
# Retirando afixos dos textos do posInicial e tese

    stemmer = RSLPStemmer()

    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)

        st_posInicial.append(string_aux)

    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)

        st_tese.append(string_aux)

    for i in range(len(sw_tagPosInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagPosInicial[i])):
            aux = stemmer.stem(sw_tagPosInicial[i][j][0])
            etiqueta = sw_tagPosInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)

        st_tagPosInicial.append(auxST)

    for i in range(len(sw_tagcomAce_posInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagcomAce_posInicial[i])):
            aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0])
            etiqueta = sw_tagcomAce_posInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)

        st_tagcomAce_posInicial.append(auxST)

####################################################################################################################################
### TERM NORMALIZATION IS THE TECHNIQUE OF REPLACING SYNONYMOUS WORDS, I.E. WORDS WITH SIMILAR MEANING, BY A SINGLE              ##
### REPRESENTATIVE TERM IN THE CORPUS UNDER ANALYSIS. THIS INCREASES THE DEGREE OF SIMILARITY BETWEEN THE ANALYSED               ##
### TEXTS WHEN STATISTICAL MEASURES SUCH AS COSINE SIMILARITY OR EUCLIDEAN DISTANCE ARE APPLIED.                                 ##
####################################################################################################################################
### THE NORMALIZATION WAS BUILT ON THE DATA MADE AVAILABLE BY THE TEP 2.0 PROJECT FROM NILC/USP                                  ##
### http://143.107.183.175:21480/tep2/index.htm                                                                                  ##
###                                                                                                                              ##
### FILE FORMAT (A SMALL PARSING SKETCH FOR THIS FORMAT APPEARS AFTER THIS FUNCTION)                                             ##
### NUM1. [Type] {synonym terms} <NUM2>                                                                                          ##
### 263. [Verbo] {consentir, deixar, permitir} <973>                                                                             ##
### NUM1 = REFERENCE LINE NUMBER OF THE SYNONYM ENTRY                                                                            ##
### NUM2 = REFERENCE LINE NUMBER OF THE ANTONYM ENTRY (OPPOSITE SENSE)                                                           ##
####################################################################################################################################

#abre o arquivo com as relacoes de sinonimia (termos linhaWordNet) e antonimia (termos contrarios)
#arquivo apenas com termos classificados como substantivos, adjetivos e verbos
    base_tep = codecs.open(
        os.path.join(os.path.dirname(__file__), '../base_tep2/base_tep.txt'),
        'r', 'UTF8')
    #     dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    #variavel com conteúdo do arquivo em memoria
    #não imprimir essa variável, MUITO GRANDEE!!!
    wordNet = base_tep.readlines()

    #fechar arquivo
    base_tep.close()

    ####################################################################################################################################
    ## THE NORMALIZATION IS BASED ON THE WORD STEMS                                                                                   ##
    ## THE RSLP STEMMER IS APPLIED FIRST AND THE SIMILAR TERMS ARE THEN LOOKED UP IN THE BASE                                         ##
    ## INSIDE BASE_TEP THE TERMS WERE ALSO REDUCED TO THEIR STEMS                                                                     ##
    ## THE DICTIONARY KEEPS A REFERENCE TO THE LINE WHERE THE SYNONYM TERMS ARE LOCATED                                               ##
    ## THE TERMS ARE ANALYSED WITH THEIR ACCENTS KEPT, SO THAT THE RSLP STEMMER IS APPLIED CORRECTLY                                  ##
    ####################################################################################################################################

    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    st_WordNetV = [
    ]  ##armazena num, tipo, e radical dos sinonimos - APENAS VERBOS
    st_WordNetN = [
    ]  ##armazena num, tipo, e radical dos sinonimos - APENAS SUBSTANTIVOS
    st_WordNetA = [
    ]  ##armazena num, tipo, e radical dos sinonimos - APENAS ADJETIVOS
    st_WordNetO = [
    ]  ##armazena num, tipo, e radical dos sinonimos - APENAS OUTROS

    for linhaWordnet in wordNet:
        listaAux = []
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)

        if tipo[0] == "Substantivo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetN.append(listaAux)

        elif tipo[0] == "Verbo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetV.append(listaAux)

        elif tipo[0] == "Adjetivo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetA.append(listaAux)
        else:
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetO.append(listaAux)

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type='callgrind')

    ####################################################################################################################################
    ### THE ANALYSIS IS PERFORMED ON THE TEXT WITHOUT STRIPPING THE ACCENTS,                                                          ##
    ### SINCE REMOVING THEM DEGRADES THE REDUCTION TO THE WORD STEM (THE RSLP STEMMER)                                                ##
    ### THE TESTS SHOWED THIS TO BE THE BETTER APPROACH, SINCE OUR TEXTS ARE SHORT                                                    ##
    ### AND WE NEED TO GET AS CLOSE AS POSSIBLE WITHOUT CONSIDERING THEIR SENSES AND/OR CONTEXTS                                      ##
    ####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV,
                                       st_WordNetO, st_tagcomAce_posInicial)

    ###############################################################
    # Colocando os textos normalizados numa lista de 1 diemensão
    ###############################################################
    stringNorm = ""
    auxNorm = []

    for i in range(len(normalizacao)):
        auxNorm = normalizacao[i]

        for x in range(len(auxNorm)):
            stringNorm = stringNorm + " " + auxNorm[x]

        posInicial_Normalizado.append(stringNorm)
        stringNorm = ""

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type='callgrind')

    ####################################################################################################################################

    #     print "posInicial"
    #     pprint(posInicial)
    #
    #     print "comAce_posInicial"
    #     pprint(comAce_posInicial)
    #
    #     print "tag_comAce_posInicial"
    #     pprint(tag_comAce_posInicial)
    #
    #     print "sw_tagcomAce_posInicial"
    #     pprint(sw_tagcomAce_posInicial)
    #
    #     print "st_tagcomAce_posInicial"
    #     pprint(st_tagcomAce_posInicial)

    #     print "posInicial_Normalizado"
    #     print len(posInicial_Normalizado)
    #     pprint(posInicial_Normalizado)

    #     exit()
    ####################################################################################################################################

    #retorno da função - usado na views.py para alimentar o template debate.html
    #passar parametros que devem ser apresentados na templates debate.html
    return [
        st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese,
        posInicial_Normalizado
    ]
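
A small standalone sketch of how one line of base_tep.txt, in the format described in the comment block inside the function, can be parsed with the same regular expressions the function uses. The sample line is the one quoted in that comment block.

import re

linha = "263. [Verbo] {consentir, deixar, permitir} <973>"

num = re.findall(r"([0-9]+)\.", linha)      # ['263'] - reference line number of the synonym entry
tipo = re.findall(r"\[(.*)\]", linha)       # ['Verbo'] - grammatical class
termos = re.findall(r"\{(.*)\}", linha)     # ['consentir, deixar, permitir']

sinonimos = [t.strip() for t in termos[0].split(",")]
print(sinonimos)                            # ['consentir', 'deixar', 'permitir']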
Example No. 4
def normalizacao1(dicSin, termo, radical, etiqueta):

    #     inicio = datetime.now()
    #     print inicio,"normalizacaoWordnet"

    #variáveis locais
    SA_wordnet = []  #armazena a wordnet sem acentos
    listaDicion = [
    ]  #lista com o número da linha de referência dos termos sinominos e com todos os termos sinonimos encontrados

    #abre o arquivo com as relacoes de sinonimia (termos linhaWordNet) e antonimia (termos contrarios)
    base_tep = codecs.open(
        os.path.join(os.path.dirname(__file__),
                     '../../base_tep2/base_tep.txt'), 'r', 'UTF8')
    #     dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    #variavel com conteúdo do arquivo em memoria
    #não imprimir essa variável, MUITO GRANDEE!!!
    wordNet = base_tep.readlines()

    #fechar arquivo
    base_tep.close()

    #retirar acentos da base
    for i in wordNet:
        SA_wordnet.append(removeA(i))

    #teste com busca pelo radical (stemmer)
    stemmer = RSLPStemmer()

    #     print termo, radical, etiqueta

    #     yappi.set_clock_type('cpu')
    #     yappi.start(builtins=True)
    #
    #     start = time.time()
    # busca termo dentro de arquivo
    # armazena termo como chave do dicionario
    # os linhaWordNet são armazenados como uma lista
    if etiqueta == "N":
        for linhaWordNet in wordNet:
            if (linhaWordNet.find("[Substantivo]") >= 0):
                termosSinonimos = re.findall(r'\{(.*\w)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(
                        listaSinonimos)  #lista de linhaWordNet sem as ,
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(
                                r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
        dicSin[termo] = listaDicion
#             pprint(dicSin)
    elif etiqueta == "ADJ":
        for linhaWordNet in wordNet:
            if (linhaWordNet.find("[Adjetivo]") >= 0):
                termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(
                        listaSinonimos)  #lista de linhaWordNet sem as ,
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        #                         auxTermos = sa_listaSinonimos.split()
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(
                                r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
        dicSin[termo] = listaDicion
#         pprint(dicSin)

    elif etiqueta == "V" or etiqueta == "VAUX":
        for linhaWordNet in wordNet:
            if (linhaWordNet.find("[Verbo]") >= 0):
                termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(
                        listaSinonimos)  #lista de linhaWordNet sem as ,
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        #                         auxTermos = sa_listaSinonimos.split()
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(
                                r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
        dicSin[termo] = listaDicion
#         pprint(dicSin)

    else:  #PARA TRATAR OS ADVÉRBIOS
        for linhaWordNet in wordNet:
            termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
            for listaSinonimos in termosSinonimos:
                sa_listaSinonimos = removePontuacao(
                    listaSinonimos)  #lista de linhaWordNet sem as ,
                for palavraSinonima in sa_listaSinonimos.split():
                    st_palavraSinonima = stemmer.stem(palavraSinonima)
                    #                     auxTermos = sa_listaSinonimos.split()
                    if radical == st_palavraSinonima:
                        numETerm = re.findall(r"([0-9]+). \[\w+\] \{(.*)\}",
                                              linhaWordNet)
                        listaDicion.append(numETerm)
        dicSin[termo] = listaDicion
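
An illustrative call of the function above, with hypothetical arguments. It assumes base_tep.txt exists at the relative path hard-coded in the function and that the project's removePontuacao and removeA helpers are importable.

from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
dicSin = {}
termo = "permitir"

# "V" marks the term as a verb, so only the [Verbo] lines of base_tep.txt are scanned
normalizacao1(dicSin, termo, stemmer.stem(termo), "V")

# dicSin[termo] now holds the (line number, synonym list) matches whose stems agreed
print(dicSin[termo])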
Example No. 5
def LSA_Kmeans(clusters, textoTreinamento, nomeUsuarios, textoComparacao=None):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    ##########################################################################################
    #  PRÉ-PROCESSAMENTO DO TEXTO ENVIADO PARA CRIAÇÃO DO DICIONÁRIO DE RELACOES SEMANTICAS  #
    ##########################################################################################

    #UTILIZA AS FUNCOES removeA e removePontuacao PARA TRATAR textoTreinamento
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento]
    #print textoTrein

    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]

    #CARREGA A LISTA DE STOPWORDS DA NLTK
    stop = stopwords.words('portuguese')
    #RETIRA OS ACENTOS DA LISTA DE STOPWORDS
    stoplist = [(removeA(s)) for s in stop]
    #     print stoplist

    #REMOVE AS STOPWORDS E PALAVRAS COM MENOS DE 3 CARACTERES
    textoTrein = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoTrein]
    #     print sw_textoTrein

    textoComp = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoComp]
    #     print textoComp
    ##############################################################################################
    #     INICIO DE APLICACAO DO LSA - CRIANDO O DICIONARIO DE TERMOS/FREQUENCIA                 #
    ##############################################################################################

    #DEFINE FREQUENCIA COMO UMA VARIAVEL DO TIPO DICIONARIO DE INTEIROS
    frequencia = defaultdict(int)

    #ARMAZENA A QUANTIDADE DE REPETIÇÕES DE UM TERMO EM TODOS OS DOCUMENTOS DA COLECAO
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
#     pprint(frequencia)

#PALAVRAS COM FREQUENCIA 1 NÃO SÃO IMPORTANTES, POIS NÃO POSSUEM RELACOES DE CO-OCORRENCIA
#Remove todas as palavras que apareceram apenas 1 vez durante a contagem
    textoTrein = [[token for token in palavra if frequencia[token] > 1]\
             for palavra in textoTrein]
    #     pprint(textoTrein)

    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.    #
    # The main function is `doc2bow`, which converts a collection of words to its            #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.             #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
    #     print dicionario

    # Armazena o ID das palavras que aparecem apenas 1 vez nos textos
    once_ids = [
        tokenId for tokenId, docfreq in dicionario.dfs.iteritems()
        if docfreq == 1
    ]
    #     print once_ids

    #remove todas as palavras com frequencia = 1
    dicionario.filter_tokens(once_ids)

    #reorganiza o dicionario, realocando os dados para os indices que foram removidos
    dicionario.compactify()

    #     print dicionario.token2id # token -> tokenId
    #     print dicionario.dfs # document frequencies: tokenId -> in how many documents this token appeared

    # Atribui a corpus_textoTrein o textoTrein no formato "bag-of-words"
    # The main function is `doc2bow`, which converts a collection of words to its
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
    #     pprint(corpus_textoTrein)

    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
    #     pprint(corpus_textoComp)
    ##########################################################################################
    # MODELO DE TRANSFORMACAO - BAG-OF-WORDS PARA TF-IDF                                     #
    ##########################################################################################

    # TRANSFORMA corpus_textoTrein (bag-of-words)
    # PARA tfidf_TextoTrein (frequencia termos x inverso da freq no documento
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
    #     print tfidf_TextoTrein

    #USA O posIni PARA GERAR A MATRIZ DE COMPARACAO COM OS DADOS DO DICIONARIO
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
    #     print list(corpus_tfidf_TextoTrein)

    #TRANSFORMA A MATRIZ TF-IDF
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein,
                                 id2word=dicionario,
                                 num_topics=len(dicionario))

    query = []

    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[
            vec_bow]  #convert a query de comparação num espaço LSI
        query.append(vec_lsi)
#     print "query"
#     pprint(query)

#TRANSFORMA corpus_textoComp num espaço LSA e indexa
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
    #     print "indexComp"
    #     pprint(list(indexComp))

    # To obtain similarities of our query document against the indexed documents:
    # perform a similarity query against the corpus
    sims = indexComp[query]

    #     pprint(sims)

    ##########################################################################################
    # JUNÇÃO COM K-MEANS PARA REALIZAR AGRUPAMENTOS                                          #
    ##########################################################################################

    ##Valor ideal, após experimentos = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)

    km_model.fit_transform(sims)

    clustering = collections.defaultdict(list)

    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)

### impressões para visualizar no console
#     print "clustering _LSA_KMEANS"
#     pprint(clustering)

#     print len(clustering)

#     for i in range(len(clustering)):
#         for j in clustering[i]:
#             print "grupo", i
#             print j, nomeUsuarios[j]
#             print textoComparacao[j]

    return clustering
Example No. 6
def gruposArgumentacao(auxResult, qtdeGrupos=3, LSA=None, Normalizacao=True, TAGs=True):
    inicio = datetime.now()
    print inicio,"gruposArgumentacao"
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()
    

    grupos = []
    tese = auxResult[5]

    posInicial_Normalizado = auxResult[6]
    
    
## dicSin = holds the dictionary with the synonym terms already related (it relates the words typed by the students to
## the wordnet file, highlights the synonymy relations, and presents each term's stem (after stemming) linked to the
## line numbers where its similar terms appear in the wordnet
    
    st_tese = auxResult[0] #texto da tese com aplicação de stemmer
    posIni = auxResult[1] #texto original da argumentação
    sw_tese = auxResult[2] 
    aux_usu = auxResult[3]
    st_posInicial = auxResult[4]
    
    
    base_treinamento = codecs.open(os.path.join(os.path.dirname(__file__),'../arquivos/baseTreinamento.txt'), 'r', 'UTF8')
        
    treinamento = [removeA(removePontuacao(i)) for i in base_treinamento] 
    # ALTERAR PARA PEGAR DADOS DA INTERFACE (CAIXA DE TEXTO)
    # OU COLOCAR OPÇÃO DE ENVIO DE ARQUIVO .TXT E ABRIR ESSES PARA USAR COMO BASE
    
    base_treinamento.close()

    
##########################################################################################
### ABORDAGEM (1): UTILIZAR O ARGUMENTO COMO BASE PARA CRIAÇÃO DOS DICIONÁRIOS DO LSA  ###
##########################################################################################

#BASE DE TREINAMENTO COMPOSTA PELAS ARGUMENTAÇÕES DOS ALUNOS
    if LSA == True and Normalizacao == False:
        
        print "if LSA == True and Normalizacao == False:"
        
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos==6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"

###########################################################################################
### ABORDAGEM (2): UTILIZAR OUTROS TEXTOS COMO BASE PARA CRIAÇÃO DOS DICIONÁRIOS DO LSA ###
###########################################################################################

#BASE DE TREINAMENTO COMPOSTA DE MATERIAIS DIDÁTICOS INDICADOS PELO PROFESSOR         
    elif LSA == False and Normalizacao == False:
        
        print "elif LSA == False and Normalizacao == False:"
        
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"
            exit()    
            
#######################################################################################################
### ABORDAGEM (3): UTILIZAR O ARGUMENTO NORMALIZADO COMO BASE PARA CRIAÇÃO DOS DICIONÁRIOS DO LSA  ###
######################################################################################################

#BASE DE TREINAMENTO COMPOSTA DE MATERIAIS DIDÁTICOS INDICADOS PELO PROFESSOR         
    elif LSA == True and Normalizacao == True:
        
        print "elif LSA == True and Normalizacao == True:"
        
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        else:
            print "ERRO"
            exit()    
            
            

##########################################################################################
### ABORDAGEM (4): UTILIZAÇÃO DO K-MEANS PURO COM TF-IDF                               ###
##########################################################################################
    
    elif LSA == None and Normalizacao == False:
        
        print "elif LSA == None and Normalizacao == False:"
        test_set = st_posInicial
        train_set = st_tese
    
### Utilização das funções para calculo do TF-IDF com a tese e o posInicial
### Funções implementadas com base na SkLearn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set) 
        count_vectorizer.vocabulary_
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)
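        # note: this tf_idf_matrix is not used by the tfIdf_Kmeans call below
        # (the commented-out validation block at the end of the function recomputes its own matrix)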
       
        
        if qtdeGrupos == 3:
            grupos = tfIdf_Kmeans(st_posInicial, 3)
        elif qtdeGrupos == 4:
            grupos = tfIdf_Kmeans(st_posInicial, 4)
        elif qtdeGrupos == 5:
            grupos = tfIdf_Kmeans(st_posInicial, 5)
        elif qtdeGrupos == 6:
            grupos = tfIdf_Kmeans(st_posInicial, 6)
        else:
            print "ERRO"
            exit()

##########################################################################################
### ABORDAGEM (5): UTILIZAÇÃO DO K-MEANS PURO COM TF-IDF                               ###
### COM DADOS NORMALIZADOS                                                             ###
##########################################################################################

### Calculo com base nos textos normalizados!!!    
    
    elif LSA == None and Normalizacao == True:
        
        print "elif LSA == None and Normalizacao == True:"
        
        test_set = posInicial_Normalizado
        train_set = st_tese
    
### Utilização das funções para calculo do TF-IDF com a tese e o posInicial
### Funções implementadas com base na SkLearn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set) 
        count_vectorizer.vocabulary_
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)
       
        
        if qtdeGrupos == 3:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 3)
        elif qtdeGrupos == 4:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 4)
        elif qtdeGrupos == 5:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 5)
        elif qtdeGrupos == 6:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 6)
        else:
            print "ERRO"
            exit()

##########################################################################################
### RESULTADOS - INDEPENDEM DA ABORDAGEM                                               ###
##########################################################################################
    grupo1 = []
    grupo2 = []
    grupo3 = []
    grupo4 = []
    grupo5 = []
    grupo6 = []
    indices = []
    ind_aux = 0
    ind_aux2 = 0
    ind_aux3 = 0
    ind_aux4 = 0
    ind_aux5 = 0
    ind_aux6 = 0
    
    for i in range(len(grupos)):
        for j in range(len(grupos[i])):
            if i == 0:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo1.append(texto)
                indices.append(grupos[i][j])                
            elif i == 1:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo2.append(texto)        
                indices.append(grupos[i][j])    
            elif i == 2:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo3.append(texto)
                indices.append(grupos[i][j])
            #para n_clusters = 4
            elif i == 3:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo4.append(texto)
                indices.append(grupos[i][j])
            #para n_clusters = 5
            elif i == 4:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo5.append(texto)
                indices.append(grupos[i][j])
            #para n_clusters = 6
            elif i == 5:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>"+ aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " +  posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo6.append(texto)
                indices.append(grupos[i][j])
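
    # `indices` concatenates the member indices of grupo1, grupo2, ... in that order,
    # so the slices below recover the original document positions of each cluster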
    
    if qtdeGrupos == 3:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(ind_aux):len(ind_aux)+len(grupo2)]
        ind_aux3 = indices[len(ind_aux)+len(grupo2):]
        
    elif qtdeGrupos == 4:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(grupo1):len(grupo1)+len(grupo2)]
        ind_aux3 = indices[len(grupo1)+len(grupo2):(len(grupo1)+len(grupo2))+len(grupo3)]
        ind_aux4 = indices[(len(grupo1)+len(grupo2))+len(grupo3):]
        print "GRUPOS", grupos
        print "INDICES", indices
    elif qtdeGrupos == 5:        
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1)+len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1)+len(grupo2):(len(grupo1)+len(grupo2))+len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1)+len(grupo2)+len(grupo3)):(len(grupo1)+len(grupo2)+len(grupo3))+len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1)+len(grupo2)+len(grupo3))+len(grupo4):]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
    elif qtdeGrupos == 6:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1)+len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1)+len(grupo2):(len(grupo1)+len(grupo2))+len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1)+len(grupo2)+len(grupo3)):(len(grupo1)+len(grupo2)+len(grupo3))+len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1)+len(grupo2)+len(grupo3))+len(grupo4):(len(grupo1)+len(grupo2)+len(grupo3)+len(grupo4))+len(grupo5)]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
        ind_aux6 = indices[(len(grupo1)+len(grupo2)+len(grupo3)+len(grupo4))+len(grupo5):]
        print "ind_aux", ind_aux6
        print "len_g6", len(grupo6)
    else:
        print "ERRO"
        exit()
    
# ##########################################################################################
# ### IMPRESSÃO DOS GRUPOS NO CONSOLE - PARA CONFERÊNCIA (COMENTAR DEPOIS)               ###
# ##########################################################################################
# 
# ##########################################################################################
# ## UTILIZADO PARA VALIDAR O CÁLCULO REALIZADO E IMPRIMI-LO                              ##
# ##########################################################################################
#     test_set = st_posInicial
#     train_set = st_tese
#     vectorizer = CountVectorizer()
#     vectorizer.fit_transform(train_set)
#     count_vectorizer = CountVectorizer()
#     count_vectorizer.fit_transform(train_set) 
#     count_vectorizer.vocabulary_
#     freq_term_matrix = count_vectorizer.transform(test_set)
#     tfidf = TfidfTransformer(norm="l2")
#     tfidf.fit(freq_term_matrix)
#     tf_idf_matrix = tfidf.transform(freq_term_matrix)
# ##########################################################################################
#     
#      
#     print "grupo 1", len(grupo1)
#     cos = []
#     lsaPosIni = []
#     lsaUsu =[]
#  
#     for y in range(len(ind_aux)):
#         print "posIni[y]", aux_usu[ind_aux[y]],posIni[ind_aux[y]]
#         lsaPosIni.append(posIni[ind_aux[y]])
#         lsaUsu.append(aux_usu[ind_aux[y]])
#         for x in range(y+1, len(ind_aux)):
#             num1 = ind_aux[y]
#             num2 = ind_aux[x]
#             cos.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#  
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#  
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#     print "cos",cos
#     print "len_cos",len(cos)
#     sum_cos = 0
#  
#     if len(cos) != 0:
#         for i in cos:
#             sum_cos = i + sum_cos
#   
#         print "media = ", sum_cos / len(cos)
#     else:
#         print "sem média"
#  
# ##########################################################################################
#     print "grupo 2", len(grupo2)
#     cos2 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#  
#     for y in range(len(ind_aux2)):
#         lsaPosIni.append(posIni[ind_aux2[y]])
#         lsaUsu.append(aux_usu[ind_aux2[y]])
#         for x in range(y+1, len(ind_aux2)):
#             num1 = ind_aux2[y]
#             num2 = ind_aux2[x]
#             cos2.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#     print "cos",cos2
#     print "len_cos",len(cos2)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#  
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#  
#  
#     sum_cos = 0
#     if len(cos2) != 0:
#         for i in cos2:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos2)
#     else:
#         print "sem média"
#  
# ##########################################################################################    
#     print "grupo 3", len(grupo3)
#     cos3 = []    
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#  
#     for y in range(len(ind_aux3)):
#         lsaPosIni.append(posIni[ind_aux3[y]])
#         lsaUsu.append(aux_usu[ind_aux3[y]])
#         for x in range(y+1, len(ind_aux3)):
#             num1 = ind_aux3[y]
#             num2 = ind_aux3[x]
#             cos3.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#  
#     print "cos",cos3
#     print "len_cos",len(cos3)
#  
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#  
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#  
#     sum_cos = 0
#     if len(cos3) != 0:
#         for i in cos3:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos3)
#     else:
#         print "sem média"
 
#########################################################################################
#     print "grupo 4", len(grupo4)
#     cos4 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#     for y in range(len(ind_aux4)):
#         lsaPosIni.append(posIni[ind_aux4[y]])
#         lsaUsu.append(aux_usu[ind_aux4[y]])
#         for x in range(y+1, len(ind_aux4)):
#             num1 = ind_aux4[y]
#             num2 = ind_aux4[x]
#             cos4.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#    
#     print "cos",cos4
#     print "len_cos",len(cos4)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#    
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#    
#     sum_cos = 0
#     if len(cos4) != 0:
#         for i in cos4:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos4)
#     else:
#         print "sem média"
#   
#   
# #########################################################################################    
#     print "grupo 5", len(grupo5)
#     cos5 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#    
#     for y in range(len(ind_aux5)):
#         lsaPosIni.append(posIni[ind_aux5[y]])
#         lsaUsu.append(aux_usu[ind_aux5[y]])
#         for x in range(y+1, len(ind_aux5)):
#             num1 = ind_aux5[y]
#             num2 = ind_aux5[x]
#             cos5.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#    
#     print "cos",cos5
#     print "len_cos", len(cos5)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#    
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#    
#     sum_cos = 0
#     if len(cos5) != 0:
#         for i in cos5:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos5)
#     else:
#         print "sem média"
#   
# #########################################################################################
#     print "grupo 6", len(grupo6)
#     cos6 = []
#     lsaPosIni = []
#     lsaUsu =[]
#    
#     for y in range(len(ind_aux6)):
#         lsaPosIni.append(posIni[ind_aux6[y]])
#         lsaUsu.append(aux_usu[ind_aux6[y]])
#         for x in range(y+1, len(ind_aux6)):
#             num1 = ind_aux6[y]
#             num2 = ind_aux6[x]
#             cos6.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#    
#     print "cos",cos6
#     print "len_cos",len(cos6)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#    
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#    
#     sum_cos = 0
#     if len(cos6) != 0:
#         for i in cos6:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos6)
#     else:
#         print "sem média"
 
     
##########################################################################################
    fim = datetime.now()
    print fim,"gruposArgumentacao"
    
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('gruposArgumentacao.out', type = 'callgrind')
    
    
    return grupo1, grupo2, grupo3, grupo4, grupo5, grupo6, tese
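
The profiling pattern used at the end of the function above (and again in "Exemplo n.º 9" below) is yappi plus a callgrind dump. A minimal, self-contained sketch of that pattern, with an illustrative workload and output file name:

import time
import yappi

def trabalho_exemplo():
    return sum(i * i for i in range(100000))

yappi.set_clock_type('cpu')
yappi.start(builtins=True)
start = time.time()

trabalho_exemplo()

duration = time.time() - start
stats = yappi.get_func_stats()
stats.save('exemplo_profiling.out', type='callgrind')
print("duracao: %.3fs" % duration)
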
Exemplo n.º 7
0
def clusterFinal(idtese):

    #Variables and cursors for connecting to the Debate de Teses database
    cursor = connection.cursor()
    cursor2 = connection.cursor()

    cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = " + idtese + "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))")
    cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ")
    
    #Parser used to handle HTML tags and ISO-encoded accented characters
    h = HTMLParser.HTMLParser()
    
    #data returned by the database queries
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()
    
    #lists used to hold the raw data
    usu = []
    posInicial = []
    dados = []
    tese = []
    
    #lists with POS-tagged data
    tag_posInicial = []
    tag_comAce_posInicial = []
    
    
    #lists with data after stopword removal
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_tagPosInicial = [] #tagged text without stopwords
    sw_tagcomAce_posInicial = [] #tagged text WITH ACCENTS and without stopwords


    #lists with data after stemming
    st_posInicial = []
    st_tese = []
    st_tagPosInicial = [] #tagged text, without stopwords and with stemming applied
    st_tagcomAce_posInicial = [] #tagged text WITH ACCENTS, without stopwords and with stemming applied
    
#############################################################################################################    
    #LIST WITH THE INITIAL POSITIONS AFTER NORMALIZATION
    posInicial_Normalizado = []
    normalizacao = []
      

#############################################################################################################    
#Case-folding step

    for d in dadosSql:
        dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
                      re.sub('<[^>]*>', '', h.unescape(d[1])).lower()])

    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())
            

    #Putting the initial-position texts into separate lists
    for i in dados:
        usu.append(i[0].upper())
        posInicial.append(i[1].lower()) #list with the initial positions, all in lower case
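    # Illustration of the cleanup above (hypothetical value, not taken from the data):
    # an answer stored as 'Concordo &lt;b&gt;plenamente&lt;/b&gt;' becomes, after
    # h.unescape, the tag-stripping regex and .lower(), simply 'concordo plenamente'.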

#############################################################################################################
### Classification of words according to their grammatical class
### using the NLPNET POS tagger
### http://nilc.icmc.usp.br/nlpnet/index.html#
    
    tagger = nlpnet.POSTagger()
    
    semAce_posInicial = [] #stores posInicial without accents, punctuation, web addresses and numbers
    comAce_posInicial = [] #stores posInicial WITH accents, without punctuation, web addresses and numbers
    
    for i in posInicial:
        semAce_posInicial.append(removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i)))))))
    
    for i in semAce_posInicial:
        tag_posInicial.append(tagger.tag(i))
        
    for i in posInicial:
        comAce_posInicial.append(removePontuacao(removeNum(removeSE(removeEndWeb((i))))))
    
    for i in comAce_posInicial:
        tag_comAce_posInicial.append(tagger.tag(i))
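
    # Note (an assumption about nlpnet's output format, not verified against this
    # project's model): tagger.tag() returns one list of (token, tag) tuples per
    # sentence, e.g. [[(u'os', u'ART'), (u'alunos', u'N'), ...]], which is why the
    # stemming step further below indexes [i][j][0] for the token and [i][j][1] for the tag.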
        
 
 #############################################################################################################   
 #ONLY FOR TESTING AND FOR INCLUSION IN THE DISSERTATION

#     pprint(semAce_posInicial)
#     pprint(comAce_posInicial)
#     exit()

#     tagg_posInicial = []
#     for texto in posInicial:
#         tagg_posInicial.append(tagger.tag(texto))
#     
#     print "posInicial"
#     pprint(posInicial)
#     
#     print "tagg_posInicial"
#     pprint(tagg_posInicial)
    
 #############################################################################################################

#############################################################################################################
### STOPWORD REMOVAL
### Removal of the terms listed by NLTK
### Removal of terms classified as articles, verbs, adverbs, etc...
    
    
    for i in usu:
        aux_usu.append(removeStopWords(i))

    for i in tese:
        sw_tese.append(removeStopWords(i))

    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))
        
    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))
    
    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))
    
    
    
####################################################################################################################################
# Application of the RSLP stemmer to strip the affixes from Portuguese words
# Removing affixes from the posInicial and tese texts

    
    stemmer = RSLPStemmer()
 
    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_posInicial.append(string_aux)

    
    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
         
        st_tese.append(string_aux)
        
    for i in range(len(sw_tagPosInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagPosInicial[i])):
            aux = stemmer.stem(sw_tagPosInicial[i][j][0])
            etiqueta = sw_tagPosInicial[i][j][1]
            termosST = (aux,etiqueta)
            auxST.append(termosST)
        
        st_tagPosInicial.append(auxST)
        
    for i in range(len(sw_tagcomAce_posInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagcomAce_posInicial[i])):
            aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0])
            etiqueta = sw_tagcomAce_posInicial[i][j][1]
            termosST = (aux,etiqueta)
            auxST.append(termosST)
        
        st_tagcomAce_posInicial.append(auxST)


    
####################################################################################################################################
### TERM NORMALIZATION IS THE TECHNIQUE OF REPLACING SYNONYMOUS WORDS (WORDS WITH SIMILAR MEANING)
### BY A SINGLE REPRESENTATIVE TERM IN THE CORPUS UNDER ANALYSIS. THIS INCREASES THE DEGREE OF
### SIMILARITY BETWEEN THE ANALYSED TEXTS WHEN STATISTICAL MEASURES SUCH AS COSINE SIMILARITY
### OR EUCLIDEAN DISTANCE ARE APPLIED.
####################################################################################################################################   
### THE NORMALIZATION WAS BUILT ON TOP OF THE DATA MADE AVAILABLE BY THE TEP 2.0 PROJECT (NILC/USP)
### http://143.107.183.175:21480/tep2/index.htm
###
### FILE FORMAT
### NUM1. [Class] {synonymous terms} <NUM2>
### 263. [Verbo] {consentir, deixar, permitir} <973>
### NUM1 = LINE NUMBER USED AS THE REFERENCE FOR THIS SYNONYM ENTRY
### NUM2 = LINE NUMBER OF THE CORRESPONDING ANTONYM ENTRY (OPPOSITE MEANING)
####################################################################################################################################
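    # Illustration (values shown are what re.findall returns for the sample line above,
    # using the same regexes as the parsing loop below):
    #     linha_exemplo = "263. [Verbo] {consentir, deixar, permitir} <973>"
    #     re.findall(r"([0-9]+)\.", linha_exemplo)   # ['263']
    #     re.findall(r"\[(.*)\]", linha_exemplo)     # ['Verbo']
    #     re.findall(r"\{(.*)\}", linha_exemplo)     # ['consentir, deixar, permitir']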
    
    #opens the file with the synonymy relations (linhaWordNet terms) and antonymy relations (opposite terms)
    #file containing only terms classified as nouns, adjectives and verbs
    base_tep = codecs.open(os.path.join(os.path.dirname(__file__),'../base_tep2/base_tep.txt'), 'r', 'UTF8')
#     dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')
    
    #variable holding the whole file contents in memory
    #do not print this variable, it is HUGE!!!
    wordNet = base_tep.readlines()
    
    #close the file
    base_tep.close()
    
####################################################################################################################################
## NORMALIZATION IS DONE ON THE FORMATION STEMS OF THE WORDS
## RSLP IS APPLIED FIRST AND ONLY THEN ARE SIMILAR TERMS LOOKED UP IN THE BASE
## INSIDE BASE_TEP THE TERMS WERE ALSO REDUCED TO THEIR FORMATION STEMS
## THE DICTIONARY KEEPS A REFERENCE TO THE LINE WHERE THE SYNONYMOUS TERMS ARE
## THE TERMS ARE ANALYSED WITH THEIR ACCENTS KEPT, SO THAT RSLP IS APPLIED CORRECTLY
####################################################################################################################################
    
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()    

    st_WordNetV = [] ##stores num, class and the stems of the synonyms - VERBS ONLY
    st_WordNetN = [] ##stores num, class and the stems of the synonyms - NOUNS ONLY
    st_WordNetA = [] ##stores num, class and the stems of the synonyms - ADJECTIVES ONLY
    st_WordNetO = [] ##stores num, class and the stems of the synonyms - EVERYTHING ELSE
    
    for linhaWordnet in wordNet:
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)

        #skip lines that do not follow the "NUM. [Class] {terms}" format
        if not tipo:
            continue

        #line number, grammatical class and one list of stems per synonym set
        listaAux = [num, tipo]
        for T in termos:
            auxL = []
            for i in T.split():
                aux1 = i.replace(",", "")
                auxL.append(stemmer.stem(aux1))
            listaAux.append(auxL)

        #group the entry by grammatical class
        if tipo[0] == "Substantivo":
            st_WordNetN.append(listaAux)
        elif tipo[0] == "Verbo":
            st_WordNetV.append(listaAux)
        elif tipo[0] == "Adjetivo":
            st_WordNetA.append(listaAux)
        else:
            st_WordNetO.append(listaAux)
            

 
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type = 'callgrind')
    
####################################################################################################################################
### THE ANALYSIS IS PERFORMED ON THE TEXT WITHOUT REMOVING THE ACCENTS,
### BECAUSE REMOVING THEM HURTS THE REDUCTION TO THE FORMATION STEM (THE RSLP STEP)
### THE TESTS CARRIED OUT SHOWED THIS TO BE THE BETTER APPROACH, SINCE OUR TEXTS ARE SHORT
### AND WE NEED TO GET AS CLOSE AS POSSIBLE WITHOUT CONSIDERING THEIR MEANINGS AND/OR CONTEXTS
####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()    
    
    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV, st_WordNetO, st_tagcomAce_posInicial)
    
###############################################################
# Putting the normalized texts into a one-dimensional list
############################################################### 
    stringNorm = ""
    auxNorm = []
    
    for i in range(len(normalizacao)):
        auxNorm = normalizacao[i]
        
        for x in range(len(auxNorm)):           
            stringNorm = stringNorm + " " + auxNorm[x]
        
        posInicial_Normalizado.append(stringNorm)
        stringNorm = ""
    
    
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type = 'callgrind')


####################################################################################################################################

#     print "posInicial"
#     pprint(posInicial)
#     
#     print "comAce_posInicial"
#     pprint(comAce_posInicial)
#     
#     print "tag_comAce_posInicial"
#     pprint(tag_comAce_posInicial)
#         
#     print "sw_tagcomAce_posInicial"
#     pprint(sw_tagcomAce_posInicial)
#     
#     print "st_tagcomAce_posInicial"
#     pprint(st_tagcomAce_posInicial)
    
#     print "posInicial_Normalizado"
#     print len(posInicial_Normalizado)
#     pprint(posInicial_Normalizado)
     
#     exit()
####################################################################################################################################    


    return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado]
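
A short, self-contained sketch of the RSLP stemming step used above in clusterFinal (the sentence is illustrative; RSLPStemmer is NLTK's Portuguese stemmer and needs the 'rslp' resource installed):

from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
frase = "os alunos argumentaram sobre as tecnologias educacionais"
radicais = " ".join(stemmer.stem(palavra) for palavra in frase.split())
print(radicais)
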
Exemplo n.º 8
0
def normalizacao(dicSin, termo, radical, etiqueta):
    #local variables
    SA_wordnet = [] #stores the wordnet with accents removed
    listaTodosSin = [] #list with every synonymous term found
    listaNumRef = [] #list with the reference line numbers of the synonymous terms
    
    #opens the file with the synonymy relations (linhaWordNet terms) and antonymy relations (opposite terms)
    base_tep = codecs.open(os.path.join(os.path.dirname(__file__),'../../base_tep2/base_tep.txt'), 'r', 'UTF8')
    dicionario = open(os.path.join(os.path.dirname(__file__),'../../base_tep2/dicionarioSinonimos.txt'), 'w')
    
    #variable holding the whole file contents in memory
    #do not print this variable, it is HUGE!!!
    wordNet = base_tep.readlines()
    
    #close the file
    base_tep.close()
    
    #strip the accents from the base
    for i in wordNet:
        SA_wordnet.append(removeA(i))
    
    #lookup is done by the stem (RSLP stemmer)
    stemmer = RSLPStemmer()
    
#     termoStm = stemmer.stem(termo)

    
#     print termo, radical, etiqueta

    # looks the term up in the file
    # stores the term as a dictionary key
    # the linhaWordNet entries are stored as a list
    if etiqueta == "N":
        for linhaWordNet in SA_wordnet:
            if(linhaWordNet.find("[Substantivo]")>=0):
                if(linhaWordNet.find(termo)>=0):
                    listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                    for palavraSinonima in listaSinonimos:
                        numRefSin = re.findall('^[0-9]*.', linhaWordNet) #returns the reference number of the linhaWordNet entry
                        sa_palavraSinonima = removePontuacao(palavraSinonima) #linhaWordNet synonym list without the {}
                        for termSinWordNet in sa_palavraSinonima.split():
                            st_termSinWordNet = stemmer.stem(termSinWordNet)
                            if radical == st_termSinWordNet:
                                listaNumRef.append(numRefSin)
                            listaTodosSin.append(termSinWordNet)
        dicSin[termo] = listaNumRef,listaTodosSin

    elif etiqueta == "ADJ":
        for linhaWordNet in wordNet:
            if(linhaWordNet.find("[Adjetivo]")>=0):
                if(linhaWordNet.find(termo)>=0):
                    listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                    for palavraSinonima in listaSinonimos:
                        numRefSin = re.findall('^[0-9]*.', linhaWordNet) #returns the reference number of the linhaWordNet entry
                        sa_palavraSinonima = removePontuacao(palavraSinonima) #linhaWordNet synonym list without the {}
                        for termSinWordNet in sa_palavraSinonima.split():
                            st_termSinWordNet = stemmer.stem(termSinWordNet)
                            if radical == st_termSinWordNet:
                                listaNumRef.append(numRefSin)
                            listaTodosSin.append(sa_palavraSinonima)
        dicSin[termo] = listaNumRef,listaTodosSin

    elif etiqueta == "V" or etiqueta == "VAUX":
        for linhaWordNet in wordNet:
            if(linhaWordNet.find("[Verbo]")>=0):
                if(linhaWordNet.find(termo)>=0):            
                    listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                    for palavraSinonima in listaSinonimos:
                        numRefSin = re.findall('^[0-9]*.', linhaWordNet) #returns the reference number of the linhaWordNet entry
                        sa_palavraSinonima = removePontuacao(palavraSinonima)
                        for termSinWordNet in sa_palavraSinonima.split():
                            st_termSinWordNet = stemmer.stem(termSinWordNet)
                            if radical == st_termSinWordNet:
                                listaNumRef.append(numRefSin)
                                listaTodosSin.append(sa_palavraSinonima)
        dicSin[termo] = listaNumRef
    else: #HANDLES ADVERBS (and any remaining class)
        for linhaWordNet in wordNet: 
            if(linhaWordNet.find(termo)>=0):
                listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                for palavraSinonima in listaSinonimos:
                    numRefSin = re.findall('^[0-9]*.', linhaWordNet) #returns the reference number of the linhaWordNet entry
                    sa_palavraSinonima = removePontuacao(palavraSinonima)
                    for termSinWordNet in sa_palavraSinonima.split():
                        st_termSinWordNet = stemmer.stem(termSinWordNet)
                        if radical == st_termSinWordNet:
                            listaNumRef.append(numRefSin)
                            listaTodosSin.append(sa_palavraSinonima)
        dicSin[termo] = listaNumRef
    
    
### TODO: check how to print this to a file
### TODO: check how to print a dictionary to a txt file
    listaux = []
    for termo, listaNumRef in dicSin.items():
        temp = '{}: {}'.format(termo, listaNumRef)
#         print '{}: {}'.format(termo, listaNumRef)
        listaux.append(temp)
        
        dicionario.write(temp + "\n") #one synonym entry per line
    
    dicionario.close()
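
A minimal sketch for the TODO above about dumping the synonym dictionary to a text file (file name and contents are illustrative only):

dicionario_exemplo = {"permitir": (["263"], ["consentir", "deixar"])}
with open("dicionarioSinonimos_exemplo.txt", "w") as saida:
    for termo, refs in dicionario_exemplo.items():
        saida.write("{}: {}\n".format(termo, refs))
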
Exemplo n.º 9
0
def gruposArgumentacao(auxResult,
                       qtdeGrupos=3,
                       LSA=None,
                       Normalizacao=True,
                       TAGs=True):
    inicio = datetime.now()
    print inicio, "gruposArgumentacao"
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    grupos = []
    tese = auxResult[5]

    posInicial_Normalizado = auxResult[6]

    ## dicSin = contains the dictionary with the synonymous terms already linked (it relates the words typed by the students with
    ## the wordnet file, highlights the synonymy relations and gives the stem of each term (stemmer applied) together with the
    ## numbers of the lines where its similar terms are in the wordnet

    st_tese = auxResult[0]  #thesis text with stemming applied
    posIni = auxResult[1]  #original text of the argumentation
    sw_tese = auxResult[2]
    aux_usu = auxResult[3]
    st_posInicial = auxResult[4]

    base_treinamento = codecs.open(
        os.path.join(os.path.dirname(__file__),
                     '../arquivos/baseTreinamento.txt'), 'r', 'UTF8')

    treinamento = [removeA(removePontuacao(i)) for i in base_treinamento]
    # TODO: CHANGE THIS TO TAKE THE DATA FROM THE INTERFACE (TEXT BOX)
    # OR ADD AN OPTION TO UPLOAD A .TXT FILE AND OPEN IT TO USE AS THE TRAINING BASE

    base_treinamento.close()

    ##########################################################################################
    ### APPROACH (1): USE THE ARGUMENTS AS THE BASE FOR BUILDING THE LSA DICTIONARIES      ###
    ##########################################################################################

    #TRAINING BASE MADE UP OF THE STUDENTS' ARGUMENTATIONS
    if LSA == True and Normalizacao == False:

        print "if LSA == True and Normalizacao == False:"

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos,
                                textoTreinamento=posIni,
                                nomeUsuarios=aux_usu,
                                textoComparacao=posIni)
        else:
            print "ERRO"

###########################################################################################
### APPROACH (2): USE OTHER TEXTS AS THE BASE FOR BUILDING THE LSA DICTIONARIES         ###
###########################################################################################

#TRAINING BASE MADE UP OF TEACHING MATERIALS SUGGESTED BY THE TEACHER
    elif LSA == False and Normalizacao == False:

        print "elif LSA == False and Normalizacao == False:"

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos,
                                textoTreinamento=treinamento,
                                nomeUsuarios=aux_usu,
                                textoComparacao=posIni)
        else:
            print "ERRO"
            exit()

#######################################################################################################
### APPROACH (3): USE THE NORMALIZED ARGUMENTS AS THE BASE FOR BUILDING THE LSA DICTIONARIES        ###
######################################################################################################

#TRAINING BASE MADE UP OF THE STUDENTS' NORMALIZED ARGUMENTATIONS
    elif LSA == True and Normalizacao == True:

        print "elif LSA == True and Normalizacao == True:"

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos,
                                textoTreinamento=posInicial_Normalizado,
                                nomeUsuarios=aux_usu,
                                textoComparacao=posInicial_Normalizado)
        else:
            print "ERRO"
            exit()

##########################################################################################
### APPROACH (4): PLAIN K-MEANS OVER TF-IDF                                            ###
### (a standalone sketch of this approach appears after this example)                  ###
##########################################################################################

    elif LSA == None and Normalizacao == False:

        print "elif LSA == None and Normalizacao == False:"
        test_set = st_posInicial
        train_set = st_tese

        ### TF-IDF computed over the tese and posInicial texts
        ### functions implemented with scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        count_vectorizer.vocabulary_
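        # the bare attribute access above has no effect; it looks like a leftover from
        # inspecting the learned vocabulary while debugging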
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = tfIdf_Kmeans(st_posInicial, qtdeGrupos)
        else:
            print "ERRO"
            exit()

##########################################################################################
### APPROACH (5): PLAIN K-MEANS OVER TF-IDF                                            ###
### WITH NORMALIZED DATA                                                               ###
##########################################################################################

### Computation based on the normalized texts!!!

    elif LSA == None and Normalizacao == True:

        print "elif LSA == None and Normalizacao == True:"

        test_set = posInicial_Normalizado
        train_set = st_tese

        ### TF-IDF computed over the tese and posInicial texts
        ### functions implemented with scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        count_vectorizer.vocabulary_
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = tfIdf_Kmeans(posInicial_Normalizado, qtdeGrupos)
        else:
            print "ERRO"
            exit()

##########################################################################################
### RESULTS - THEY DO NOT DEPEND ON THE APPROACH                                       ###
##########################################################################################
    grupo1 = []
    grupo2 = []
    grupo3 = []
    grupo4 = []
    grupo5 = []
    grupo6 = []
    indices = []
    ind_aux = 0
    ind_aux2 = 0
    ind_aux3 = 0
    ind_aux4 = 0
    ind_aux5 = 0
    ind_aux6 = 0

    #assemble each group's display text and collect the document indices;
    #the output is identical for every cluster, only the target list changes
    lista_grupos = [grupo1, grupo2, grupo3, grupo4, grupo5, grupo6]
    for i in range(len(grupos)):
        if i >= len(lista_grupos):
            continue
        for j in range(len(grupos[i])):
            aux = grupos[i][j]
            if TAGs:
                texto = "Aluno: <span>" + aux_usu[
                    aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
            else:
                texto = aux_usu[aux] + "#$#" + posIni[aux]
            lista_grupos[i].append(texto)
            indices.append(grupos[i][j])

    if qtdeGrupos == 3:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(ind_aux):len(ind_aux) + len(grupo2)]
        ind_aux3 = indices[len(ind_aux) + len(grupo2):]

    elif qtdeGrupos == 4:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        ind_aux3 = indices[len(grupo1) +
                           len(grupo2):(len(grupo1) + len(grupo2)) +
                           len(grupo3)]
        ind_aux4 = indices[(len(grupo1) + len(grupo2)) + len(grupo3):]
        print "GRUPOS", grupos
        print "INDICES", indices
    elif qtdeGrupos == 5:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) +
                           len(grupo2):(len(grupo1) + len(grupo2)) +
                           len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(
            len(grupo1) + len(grupo2) +
            len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) +
                           len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) +
                           len(grupo4):]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
    elif qtdeGrupos == 6:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) +
                           len(grupo2):(len(grupo1) + len(grupo2)) +
                           len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(
            len(grupo1) + len(grupo2) +
            len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) +
                           len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) +
                           len(grupo4):(len(grupo1) + len(grupo2) +
                                        len(grupo3) + len(grupo4)) +
                           len(grupo5)]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
        ind_aux6 = indices[(len(grupo1) + len(grupo2) + len(grupo3) +
                            len(grupo4)) + len(grupo5):]
        print "ind_aux", ind_aux6
        print "len_g6", len(grupo6)
    else:
        print "ERRO"
        exit()

# ##########################################################################################
# ### PRINTING THE GROUPS TO THE CONSOLE - FOR CHECKING (COMMENT OUT LATER)               ###
# ##########################################################################################
#
# ##########################################################################################
# ## USED TO VALIDATE THE COMPUTED SIMILARITY AND PRINT IT                                ##
# ##########################################################################################
#     test_set = st_posInicial
#     train_set = st_tese
#     vectorizer = CountVectorizer()
#     vectorizer.fit_transform(train_set)
#     count_vectorizer = CountVectorizer()
#     count_vectorizer.fit_transform(train_set)
#     count_vectorizer.vocabulary_
#     freq_term_matrix = count_vectorizer.transform(test_set)
#     tfidf = TfidfTransformer(norm="l2")
#     tfidf.fit(freq_term_matrix)
#     tf_idf_matrix = tfidf.transform(freq_term_matrix)
# ##########################################################################################
#
#
#     print "grupo 1", len(grupo1)
#     cos = []
#     lsaPosIni = []
#     lsaUsu =[]
#
#     for y in range(len(ind_aux)):
#         print "posIni[y]", aux_usu[ind_aux[y]],posIni[ind_aux[y]]
#         lsaPosIni.append(posIni[ind_aux[y]])
#         lsaUsu.append(aux_usu[ind_aux[y]])
#         for x in range(y+1, len(ind_aux)):
#             num1 = ind_aux[y]
#             num2 = ind_aux[x]
#             cos.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#     print "cos",cos
#     print "len_cos",len(cos)
#     sum_cos = 0
#
#     if len(cos) != 0:
#         for i in cos:
#             sum_cos = i + sum_cos
#
#         print "media = ", sum_cos / len(cos)
#     else:
#         print "sem média"
#
# ##########################################################################################
#     print "grupo 2", len(grupo2)
#     cos2 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#
#     for y in range(len(ind_aux2)):
#         lsaPosIni.append(posIni[ind_aux2[y]])
#         lsaUsu.append(aux_usu[ind_aux2[y]])
#         for x in range(y+1, len(ind_aux2)):
#             num1 = ind_aux2[y]
#             num2 = ind_aux2[x]
#             cos2.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#     print "cos",cos2
#     print "len_cos",len(cos2)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#
#
#     sum_cos = 0
#     if len(cos2) != 0:
#         for i in cos2:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos2)
#     else:
#         print "sem média"
#
# ##########################################################################################
#     print "grupo 3", len(grupo3)
#     cos3 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#
#     for y in range(len(ind_aux3)):
#         lsaPosIni.append(posIni[ind_aux3[y]])
#         lsaUsu.append(aux_usu[ind_aux3[y]])
#         for x in range(y+1, len(ind_aux3)):
#             num1 = ind_aux3[y]
#             num2 = ind_aux3[x]
#             cos3.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#
#     print "cos",cos3
#     print "len_cos",len(cos3)
#
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#
#     sum_cos = 0
#     if len(cos3) != 0:
#         for i in cos3:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos3)
#     else:
#         print "sem média"

#########################################################################################
#     print "grupo 4", len(grupo4)
#     cos4 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#     for y in range(len(ind_aux4)):
#         lsaPosIni.append(posIni[ind_aux4[y]])
#         lsaUsu.append(aux_usu[ind_aux4[y]])
#         for x in range(y+1, len(ind_aux4)):
#             num1 = ind_aux4[y]
#             num2 = ind_aux4[x]
#             cos4.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#
#     print "cos",cos4
#     print "len_cos",len(cos4)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#
#     sum_cos = 0
#     if len(cos4) != 0:
#         for i in cos4:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos4)
#     else:
#         print "sem média"
#
#
# #########################################################################################
#     print "grupo 5", len(grupo5)
#     cos5 = []
#     lsaPosIni = []
#     lsaUsu =[]
#     print lsaPosIni
#     print lsaUsu
#
#     for y in range(len(ind_aux5)):
#         lsaPosIni.append(posIni[ind_aux5[y]])
#         lsaUsu.append(aux_usu[ind_aux5[y]])
#         for x in range(y+1, len(ind_aux5)):
#             num1 = ind_aux5[y]
#             num2 = ind_aux5[x]
#             cos5.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#
#     print "cos",cos5
#     print "len_cos", len(cos5)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#
#     sum_cos = 0
#     if len(cos5) != 0:
#         for i in cos5:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos5)
#     else:
#         print "sem média"
#
# #########################################################################################
#     print "grupo 6", len(grupo6)
#     cos6 = []
#     lsaPosIni = []
#     lsaUsu =[]
#
#     for y in range(len(ind_aux6)):
#         lsaPosIni.append(posIni[ind_aux6[y]])
#         lsaUsu.append(aux_usu[ind_aux6[y]])
#         for x in range(y+1, len(ind_aux6)):
#             num1 = ind_aux6[y]
#             num2 = ind_aux6[x]
#             cos6.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
#             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2],squared=True)
#             print aux_usu[num1],aux_usu[num2]
#             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
#             print "euc", euc
#
#     print "cos",cos6
#     print "len_cos",len(cos6)
#     simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
#     print "simLSA"
#     pprint(sorted(simLSA, reverse=True))
#
#     simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
#     print "simLSA1"
#     pprint(sorted(simLSA1, reverse=True))
#
#     sum_cos = 0
#     if len(cos6) != 0:
#         for i in cos6:
#             sum_cos = i + sum_cos
#         print "media = ", sum_cos / len(cos6)
#     else:
#         print "sem média"

##########################################################################################
    fim = datetime.now()
    print fim, "gruposArgumentacao"

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('gruposArgumentacao.out', type='callgrind')

    return grupo1, grupo2, grupo3, grupo4, grupo5, grupo6, tese
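
A self-contained sketch (not the project's tfIdf_Kmeans) of the "plain K-means over TF-IDF" idea described in approach (4) above, including the per-group mean cosine similarity that the commented-out validation block computes; texts, names and the number of groups are illustrative only:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_kmeans_sketch(textos, n_grupos):
    # TF-IDF matrix over the input texts
    vectorizer = TfidfVectorizer()
    matriz = vectorizer.fit_transform(textos)
    # cluster the rows and group the document indices by label, mirroring grupo1..grupo6
    rotulos = KMeans(n_clusters=n_grupos, random_state=0).fit_predict(matriz)
    grupos = [[] for _ in range(n_grupos)]
    for indice, rotulo in enumerate(rotulos):
        grupos[rotulo].append(indice)
    # mean pairwise cosine similarity inside each group ("media" in the validation code)
    medias = []
    for grupo in grupos:
        pares = [cosine_similarity(matriz[a], matriz[b])[0][0]
                 for i, a in enumerate(grupo) for b in grupo[i + 1:]]
        medias.append(sum(pares) / len(pares) if pares else None)
    return grupos, medias

textos_exemplo = ["concordo com a tese", "concordo plenamente com a tese",
                  "discordo da tese", "discordo totalmente"]
print(tfidf_kmeans_sketch(textos_exemplo, 2))
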