def save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter,
                    counter_list_parl):
    """Write per-word tf-idf-like statistics to ``tfidf_like_parametros.csv``.

    One CSV row (``;``-separated) per word in ``parl_counter``, with the
    precomputed score from ``sort_tfidf_like`` plus term frequency, maximum
    entropy, word entropy, political probability and per-deputy entropy, all
    formatted to 4 decimal places.

    Args:
        parl_counter: word -> count mapping for one parliamentarian.
        sort_tfidf_like: iterable of (word, score) pairs; converted to a dict
            for lookup.
        counter_list: list of per-document counters.
        tot_counter: aggregate word counter over all documents.
        counter_list_parl: list of per-parliamentarian counters.

    Side effects:
        Writes ``dir_out + "tfidf_like_parametros.csv"`` (module-level
        ``dir_out`` must be defined).
    """
    dic = dict(sort_tfidf_like)
    # "with" guarantees the file is closed even if a write raises; the
    # original open()/close() pair leaked the handle on error.
    with open(dir_out + "tfidf_like_parametros.csv", 'w') as f:
        # NOTE(review): "palvra" is a typo for "palavra", kept byte-identical
        # because downstream consumers may match on this header.
        f.write("palavra" + ";" + "valor" + ";" + "frequencia" + ";" +
                "entropia maxima" + ";" + "entropia da palvra" + ";" +
                "prob_politica" + ";" + "entropia entre deputados" + "\n")
        for word in parl_counter:
            fields = [
                word,
                str(dic[word]),
                '%.4f' % TfIdf.tf(word, parl_counter),
                '%.4f' % math.log2(len(counter_list)),
                '%.4f' % TfIdf.entropy(word, tot_counter, counter_list),
                '%.4f' % TfIdf.parl_prob(word, parl_counter, counter_list),
                '%.4f' % TfIdf.parl_entropy(word, tot_counter,
                                            counter_list_parl),
            ]
            f.write(";".join(fields) + "\n")
def idf_like(word, parl_counter, tot_counter, doc_counter, counter_list_parl):
    """Return the idf-like score of *word*.

    Combines three factors (all computed by ``TfIdf`` helpers):
    the gap between maximum document entropy and the word's entropy,
    the word's political probability, and its entropy across deputies.
    """
    max_entropy = math.log2(len(doc_counter))
    word_entropy = TfIdf.entropy(word, tot_counter, doc_counter)
    political_prob = TfIdf.parl_prob(word, parl_counter, doc_counter)
    deputy_entropy = TfIdf.parl_entropy(word, tot_counter, counter_list_parl)
    return (max_entropy - word_entropy) * political_prob * deputy_entropy
# --- Script entry: load config, compute per-word deputy entropy, plot, persist.

cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']
dir_out = path['dir_out']
dir_ale = path['dir_ale']
dir_rob = path['dir_rob']

tot_counter, parl_counter_list = load_counters(dir_out)
tp = TextProcessor()
tfidf = TfIdf()

# Entropy of each word's usage across parliamentarians.
word_entropy = dict()
for word in tot_counter:
    word_entropy[word] = tfidf.parl_entropy(word, tot_counter,
                                            parl_counter_list)

# 2**entropy -- presumably the effective number of deputies using the word
# (perplexity); TODO confirm against TfIdf.parl_entropy's base.
freq = [int(math.pow(2, x)) for x in word_entropy.values()]
plt.hist(freq, 15)
plt.xticks(np.arange(0, max(freq), 20))
#plt.gca().set_yscale("log")
plt.xlabel("# de deputados que utilizaram a palavra")
plt.ylabel("# palavras utilizadas pelos deputados")
plt.show()
plt.clf()

# BUG FIX: the original comprehension referenced an undefined name `y`,
# raising NameError before the pickle dump below ever ran. The only mapping
# in scope is word_entropy, so normalize over it; the sum is hoisted out of
# the comprehension (the original recomputed it for every key).
entropy_total = sum(word_entropy.values())
frequencies = {key: float(value) / entropy_total
               for (key, value) in word_entropy.items()}

with open(dir_out + "word_entropy.pck", 'wb') as handle:
    pickle.dump(word_entropy, handle)