Example #1
import math  # the TfIdf helper and dir_out come from the surrounding module

def save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_parl):
    # Write each word's tf-idf-like score and its component measures to a CSV file.
    dic = dict(sort_tfidf_like)
    with open(dir_out + "tfidf_like_parametros.csv", 'w') as f:
        f.write("palavra;valor;frequencia;entropia maxima;entropia da palavra;"
                "prob_politica;entropia entre deputados\n")
        for word in parl_counter:
            f.write(word + ";" + str(dic[word]) + ";" +
                    '%.4f' % TfIdf.tf(word, parl_counter) + ";" +
                    '%.4f' % math.log2(len(counter_list)) + ";" +
                    '%.4f' % TfIdf.entropy(word, tot_counter, counter_list) + ";" +
                    '%.4f' % TfIdf.parl_prob(word, parl_counter, counter_list) + ";" +
                    '%.4f' % TfIdf.parl_entropy(word, tot_counter, counter_list_parl) + "\n")
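
A minimal usage sketch, assuming the counters are collections.Counter objects, that sort_tfidf_like is a list of (word, score) pairs already sorted by the idf-like score, and that the TfIdf helper and dir_out are available as in the surrounding module; the toy words and scores below are purely illustrative.

from collections import Counter

parl_counter = Counter({"reforma": 10, "saude": 4})      # one parliamentarian's word counts
tot_counter = Counter({"reforma": 120, "saude": 300})    # whole-corpus word counts
counter_list = [Counter({"reforma": 5, "saude": 7}),     # one Counter per document
                Counter({"reforma": 2})]
counter_list_parl = [Counter({"reforma": 9}),            # one Counter per deputy
                     Counter({"saude": 6})]

sort_tfidf_like = [("reforma", 2.31), ("saude", 0.87)]   # (word, score), highest first
save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_parl)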
Example #2
def idf_like(word, parl_counter, tot_counter, doc_counter, counter_list_parl):
    # Information gain of the word over the documents (maximum entropy minus the word's
    # entropy), scaled by its usage probability for this parliamentarian and its entropy across deputies.
    return ((math.log2(len(doc_counter)) - TfIdf.entropy(word, tot_counter, doc_counter))
            * TfIdf.parl_prob(word, parl_counter, doc_counter)
            * TfIdf.parl_entropy(word, tot_counter, counter_list_parl))
# Driver script at module level; load_counters, TextProcessor and TfIdf are
# project-local helpers from the surrounding package.
import configparser
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np

# Read the input/output directories from the properties file.
cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']
dir_out = path['dir_out']
dir_ale = path['dir_ale']
dir_rob = path['dir_rob']

# Load the corpus-wide counter and the list of per-deputy counters.
tot_counter, parl_counter_list = load_counters(dir_out)
tp = TextProcessor()
tfidf = TfIdf()

# Entropy of each word's usage across deputies.
word_entropy = dict()
for word in tot_counter:
    word_entropy[word] = tfidf.parl_entropy(word, tot_counter, parl_counter_list)

# 2**entropy is the perplexity, roughly the effective number of deputies using each word.
freq = [int(math.pow(2, x)) for x in word_entropy.values()]
plt.hist(freq, 15)
plt.xticks(np.arange(0,max(freq),20))
#plt.gca().set_yscale("log")
plt.xlabel("# de deputados que utilizaram a palavra" )
plt.ylabel("# palavras utilizadas pelos deputados" )
plt.show()
plt.clf()


# Normalize the per-word entropy values into a distribution over words.
frequencies = {key: float(value) / sum(word_entropy.values())
               for (key, value) in word_entropy.items()}

with open(dir_out+"word_entropy.pck", 'wb') as handle:
    pickle.dump(word_entropy, handle)
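
For context, a hedged sketch of how idf_like and save_tfidf_like from the two examples could fit together for a single deputy, assuming counter_list holds one Counter per document (it is not loaded in the script above) and parl_counter_list holds one Counter per deputy; everything except those two functions is illustrative.

parl_counter = parl_counter_list[0]   # e.g. the first deputy's word counts
scores = {w: idf_like(w, parl_counter, tot_counter, counter_list, parl_counter_list)
          for w in parl_counter}
sort_tfidf_like = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, parl_counter_list)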