# NOTE(review): this chunk begins inside a `with open(...)` block from the
# preceding part of the file -- `handle` on the first line is that file object.
pickle.dump(alea_processed, handle)
with open(dir_out + "list_alea_trigrams.pck", 'wb') as handle:
    pickle.dump(alea_tri_processed, handle)

# Bigram frequency distributions: one counter for the parliament corpus and
# one per random ("alea") document.  The parliament counter is appended so
# the idf document collection includes it as well.
bgr_counter = parl_bigrams.ngram_fd
parl_bgr_counter = [l.ngram_fd for l in parl_processed]
docs_bgr_counter = [l.ngram_fd for l in alea_processed]
docs_bgr_counter.append(bgr_counter)

tfidf = TfIdf()

# tf * smoothed idf for every bigram of the parliament counter,
# ranked descending by score.
tfidf_smooth = [
    tfidf.tf(bgr, bgr_counter) * tfidf.idf_smooth(bgr, docs_bgr_counter)
    for bgr in bgr_counter
]
dic_tfidf_smooth = sorted(
    zip(bgr_counter.keys(), tfidf_smooth), key=lambda x: x[1], reverse=True
)

# Aggregate counts of every bigram across all document counters.
# FIX: replaces the fragile `cond and a or b` idiom (which silently picks the
# wrong branch whenever the accumulated value is falsy) with an explicit get().
tot_counter = dict()
for y in docs_bgr_counter:
    for k, v in y.items():
        tot_counter[k] = tot_counter.get(k, 0) + v

# tf * "like" idf variant, ranked the same way.
tfidf_like = [
    tfidf.tf(bgr, bgr_counter)
    * tfidf.idf_like(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter)
    for bgr in bgr_counter
]
dic_tfidf_like = sorted(
    zip(bgr_counter.keys(), tfidf_like), key=lambda x: x[1], reverse=True
)
print("process tfidf")
# Build one word->score dict per category, for each of the three idf variants.
tfidf_entropy = []
tfidf_smooth = []
tfidf_like = []
for i, data in enumerate(categories_counter):
    scores_smooth = {}
    scores_like = {}
    scores_entropy = {}
    print("dataset: " + str(i))
    for word in data:
        # The term frequency is shared by all three weighting schemes.
        tf = TfIdf.tf(word, data)
        scores_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
        scores_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
        scores_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
    tfidf_smooth.append(scores_smooth)
    tfidf_like.append(scores_like)
    tfidf_entropy.append(scores_entropy)

print("processing softmax confusion matrix")
# Square confusion matrices: rows = true category, columns = predicted one.
n_classes = len(test_data)
confusion_like = np.zeros(shape=(n_classes, n_classes))
confusion_smooth = np.zeros(shape=(n_classes, n_classes))
# NOTE(review): confusion_entropy is allocated but never filled in this chunk;
# presumably the entropy classification pass follows later in the file -- confirm.
confusion_entropy = np.zeros(shape=(n_classes, n_classes))
for i, data in enumerate(test_data):
    for tw in data:
        j, value = classifier_s(tw, tfidf_like)
        confusion_like[i, j] += 1
        j, value = classifier_s(tw, tfidf_smooth)
        confusion_smooth[i, j] += 1
print("process tfidf")
# One word->score dict per dataset, for each of the three idf variants.
tfidf_entropy = []
tfidf_smooth = []
tfidf_like = []
for i, data in enumerate(dataset):
    scores_smooth = {}
    scores_like = {}
    scores_entropy = {}
    print("dataset: " + str(i))
    for word in data:
        # Term frequency is computed once and reused by the three schemes.
        tf = TfIdf.tf(word, data)
        scores_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset)
        scores_smooth[word] = tf * TfIdf.idf_smooth(word, dataset)
        scores_like[word] = tf * TfIdf.idf_like(word, i, dataset)
    tfidf_smooth.append(scores_smooth)
    tfidf_like.append(scores_like)
    tfidf_entropy.append(scores_entropy)

print("save tfidf")
# Persist each score table under dir_out; filenames match the variant names.
for fname, scores in (
    ("tfidf_entropy.pck", tfidf_entropy),
    ("tfidf_smooth.pck", tfidf_smooth),
    ("tfidf_like.pck", tfidf_like),
):
    with open(dir_out + fname, 'wb') as handle:
        pickle.dump(scores, handle)
# Path to a pickled word counter for the parliament ("deputados") corpus.
file_parl = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/deputados.pck"
tfidf_n = list()      # plain tf*idf scores, one per word
tf_log_idf = list()   # tf * smoothed (log) idf scores
tfidf_like = list()   # tf * "like" idf variant scores
corr = ""             # correlation report text, built further down the file
with open(file_parl, 'rb') as handle:
    # NOTE(review): pickle.load on an external file -- only safe for trusted data.
    parl_counter = pickle.load(handle)
# Reference counters for the two document collections; the third return
# value of the first call is unused here.
tot_counter, counter_list, _ = loadCounters(dir_in)
tot_counter_dep, counter_list_dep, pck = loadCounters(dir_parl)
tfidf = TfIdf()
# Score every word of the parliament counter under the three idf variants.
for word in parl_counter:
    tf = tfidf.tf(word, parl_counter)
    idf = tfidf.idf(word, counter_list)
    log_idf = tfidf.idf_smooth(word, counter_list)
    ent_idf = tfidf.idf_like(word, parl_counter, tot_counter, counter_list, counter_list_dep)
    tfidf_n.append(tf * idf)
    tf_log_idf.append(tf * log_idf)
    tfidf_like.append(tf * ent_idf)
# Pair each word with its score for each weighting scheme.
dic_tfidf = list(zip(parl_counter.keys(), tfidf_n))
dic_tf_log_idf = list(zip(parl_counter.keys(), tf_log_idf))
dic_tfidf_like = list(zip(parl_counter.keys(), tfidf_like))
# Commented-out correlation report kept as a string literal; the closing
# triple quote lies beyond this chunk.
"""
corr += "tfidf X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf] ,[v for i,v in dic_tf_log_idf]))+"\n"
corr += "tfidf X tfidf_like: "+str(stats.spearmanr([v for i,v in dic_tfidf],[v for i,v in dic_tfidf_like]))+"\n"
corr += "tfidf_like X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf_like] , [v for i,v in dic_tf_log_idf]))+"\n"
corr += "tfidf X tfidf_smooth: "+str(stats.pearsonr(tfidf_n,tf_log_idf))+"\n"