import itertools
from collections import Counter

# TfIdf and docs_counters come from the project's own modules (their
# imports are not part of this excerpt).

def tfidf_month(tw_month, random_list):
    # Flatten the doubly nested month structure (sub-collections ->
    # tweets -> tokens) into a single token list and count it.
    tweets = list(itertools.chain.from_iterable(
        itertools.chain.from_iterable(tw_month)))
    tot_counter = Counter(tweets)

    # One token Counter per sub-collection in tw_month.
    dep_counts = list()
    for dep in tw_month:
        tw = list(itertools.chain.from_iterable(dep))
        dep_counts.append(Counter(tw))

    docs_counter = docs_counters(random_list, tot_counter)

    # Score every term with tf * idf_like, then sort by descending score.
    tfidf = TfIdf()
    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word, tot_counter) *
                          tfidf.idf_like(word, tot_counter, tot_counter,
                                         docs_counter, dep_counts))
    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1],
                             reverse=True)
    return sort_tfidf_like
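# docs_counters is defined elsewhere in the repo. The sketch below is a
# hypothetical reconstruction based only on the call above and on the
# inline pattern used further down in this file set: one Counter per
# random document, with the aggregate counter appended as an extra
# "document".
def docs_counters_sketch(random_list, tot_counter):
    counters = []
    for doc in random_list:
        tokens = list(itertools.chain.from_iterable(doc))
        counters.append(Counter(tokens))
    counters.append(tot_counter)
    return counters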
tfidf = TfIdf()

# tf * smoothed idf for every bigram.
tfidf_smooth = list()
for bgr in bgr_counter:
    tfidf_smooth.append(tfidf.tf(bgr, bgr_counter) *
                        tfidf.idf_smooth(bgr, docs_bgr_counter))
dic_tfidf_smooth = list(zip(bgr_counter.keys(), tfidf_smooth))
dic_tfidf_smooth = sorted(dic_tfidf_smooth, key=lambda x: x[1], reverse=True)

# Merge the per-document bigram counters into one total counter.
tot_counter = dict()
for y in docs_bgr_counter:
    for k in y.keys():
        tot_counter[k] = tot_counter.get(k, 0) + y[k]

# tf * idf_like for every bigram.
tfidf_like = list()
for bgr in bgr_counter:
    tfidf_like.append(tfidf.tf(bgr, bgr_counter) *
                      tfidf.idf_like(bgr, bgr_counter, tot_counter,
                                     docs_bgr_counter, parl_bgr_counter))
dic_tfidf_like = list(zip(bgr_counter.keys(), tfidf_like))
dic_tfidf_like = sorted(dic_tfidf_like, key=lambda x: x[1], reverse=True)

# Process the trigrams of the tweets from the random (aleatory) documents.
alea_tri_processed = list()
for l in list_aleatory:
    temp = add_separator(tp.text_process(l, text_only=True))
    temp = get_trigrams(temp, 2, True)
    alea_tri_processed.append(temp)
with open(dir_out + "list_alea_trigrams.pck", 'wb') as handle:
    pickle.dump(alea_tri_processed, handle)
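# add_separator and get_trigrams are project helpers that are not shown in
# this excerpt (the meaning of the 2 and True arguments is not recoverable
# here). For reference only, a minimal trigram extraction over a token
# list looks like this:
def trigrams_sketch(tokens):
    # Slide a three-token window across the list and join each window
    # into a single "w1 w2 w3" string.
    return [" ".join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]

# trigrams_sketch(["reforma", "da", "previdencia", "ja"])
# -> ['reforma da previdencia', 'da previdencia ja']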
print("process tfidf") tfidf_entropy = list() tfidf_smooth = list() tfidf_like = list() for i , data in enumerate(dataset): tmp_smooth = dict() tmp_like = dict() tmp_entropy = dict() print("dataset: " + str(i)) for word in data: tf = TfIdf.tf(word, data) tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset) tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset) tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset) tfidf_smooth.append(tmp_smooth) tfidf_like.append(tmp_like) tfidf_entropy.append(tmp_entropy) print("save tfidf") with open(dir_out+"tfidf_entropy.pck", 'wb') as handle: pickle.dump(tfidf_entropy, handle) with open(dir_out+"tfidf_smooth.pck", 'wb') as handle: pickle.dump(tfidf_smooth, handle) with open(dir_out+"tfidf_like.pck", 'wb') as handle: pickle.dump(tfidf_like, handle)
print("process tfidf") tfidf_entropy = list() tfidf_smooth = list() tfidf_like = list() for i , data in enumerate(categories_counter): tmp_smooth = dict() tmp_like = dict() tmp_entropy = dict() print("dataset: " + str(i)) for word in data: tf = TfIdf.tf(word, data) tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter) tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter) tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter) tfidf_smooth.append(tmp_smooth) tfidf_like.append(tmp_like) tfidf_entropy.append(tmp_entropy) print("processing softmax confusion matrix") confusion_like = np.zeros(shape=(len(test_data), len(test_data))) confusion_smooth = np.zeros(shape=(len(test_data), len(test_data))) confusion_entropy = np.zeros(shape=(len(test_data), len(test_data))) for i, data in enumerate(test_data): for tw in data: j, value = classifier_s(tw, tfidf_like) confusion_like[i, j] += 1 j, value = classifier_s(tw, tfidf_smooth) confusion_smooth[i, j] += 1 j, value = classifier_s(tw, tfidf_entropy)
tfidf_n = list()
tf_log_idf = list()
tfidf_like = list()
corr = ""
with open(file_parl, 'rb') as handle:
    parl_counter = pickle.load(handle)
tot_counter, counter_list, _ = loadCounters(dir_in)
tot_counter_dep, counter_list_dep, pck = loadCounters(dir_parl)

# Score every word with the three weighting schemes.
tfidf = TfIdf()
for word in parl_counter:
    tf = tfidf.tf(word, parl_counter)
    idf = tfidf.idf(word, counter_list)
    log_idf = tfidf.idf_smooth(word, counter_list)
    ent_idf = tfidf.idf_like(word, parl_counter, tot_counter,
                             counter_list, counter_list_dep)
    tfidf_n.append(tf * idf)
    tf_log_idf.append(tf * log_idf)
    tfidf_like.append(tf * ent_idf)
dic_tfidf = list(zip(parl_counter.keys(), tfidf_n))
dic_tf_log_idf = list(zip(parl_counter.keys(), tf_log_idf))
dic_tfidf_like = list(zip(parl_counter.keys(), tfidf_like))
"""
corr += "tfidf X tfidf_smooth: " + str(stats.spearmanr(
    [v for i, v in dic_tfidf], [v for i, v in dic_tf_log_idf])) + "\n"
corr += "tfidf X tfidf_like: " + str(stats.spearmanr(
    [v for i, v in dic_tfidf], [v for i, v in dic_tfidf_like])) + "\n"
corr += "tfidf_like X tfidf_smooth: " + str(stats.spearmanr(
    [v for i, v in dic_tfidf_like], [v for i, v in dic_tf_log_idf])) + "\n"
corr += "tfidf X tfidf_smooth: " + str(stats.pearsonr(tfidf_n, tf_log_idf)) + "\n"
corr += "tfidf X tfidf_like: " + str(stats.pearsonr(tfidf_n, tfidf_like)) + "\n"
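# For reference, the calls in the commented-out block above are from
# scipy.stats; both return a (statistic, p-value) pair:
from scipy import stats

rho, p_rho = stats.spearmanr([1, 2, 3, 4], [1, 3, 2, 4])    # rank correlation
r, p_r = stats.pearsonr([1.0, 2.0, 3.0], [1.1, 1.9, 3.2])   # linear correlation
print(rho, r)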
# One token Counter per parliamentarian.
parl_counters = list()
for parl in parl_tw_processed:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))

docs_counter = list()
docs_counter.append(tot_counter)
docs_counter.append(coleta1)
docs_counter.append(coleta2)

# Score every term with tf * idf_like and sort by descending score.
tfidf = TfIdf()
tfidf_like = list()
for word in tot_counter:
    tfidf_like.append(tfidf.tf(word, tot_counter) *
                      tfidf.idf_like(word, tot_counter, tot_counter,
                                     docs_counter, parl_counters))
sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)

with open(dir_rob + "sort_tfidf_like.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)
with open(dir_rob + "tfidf_like.pck", 'wb') as handle:
    pickle.dump(tfidf_like, handle)
with open(dir_rob + "parl_tw_processed.pck", 'wb') as handle:
    pickle.dump(parl_tw_processed, handle)

# Dump the 10,000 highest-scoring terms to a plain-text file.
f = open(dir_rob + "10k_tfidf_like.txt", 'w')
for w, i in sort_tfidf_like[:10000]:
    # Loop body assumed (term and score per line); the excerpt is
    # truncated at this point.
    f.write(str(w) + " " + str(i) + "\n")
f.close()
# Flatten all parliamentarians' tweets into one token list and count it.
tweets = list(itertools.chain.from_iterable(
    itertools.chain.from_iterable(parl_tweets)))
tot_counter = Counter(tweets)

# One Counter per random (aleatory) sample, plus the total counter.
docs_counter = list()
for alea_tw in alea_tweets:
    tw = list(itertools.chain.from_iterable(alea_tw))
    docs_counter.append(Counter(tw))
docs_counter.append(tot_counter)

# One Counter per parliamentarian.
parl_counters = list()
for parl in parl_tweets:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))

# Score every bi/trigram with tf * idf_like and sort by descending score.
tfidf = TfIdf()
tfidf_like_bi_trigrams = list()
for word in tot_counter:
    tfidf_like_bi_trigrams.append(
        tfidf.tf(word, tot_counter) *
        tfidf.idf_like(word, tot_counter, tot_counter,
                       docs_counter, parl_counters))
sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like_bi_trigrams))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
with open(dir_out + "sort_tfidf_like_bi_trigram.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)
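# The pickled ranking can then be inspected downstream; for example (file
# name taken from above, everything else illustrative):
import pickle

with open(dir_out + "sort_tfidf_like_bi_trigram.pck", 'rb') as handle:
    ranking = pickle.load(handle)

# Print the 20 highest-weighted bi/trigrams with their scores.
for term, score in ranking[:20]:
    print("{:<40} {:.6f}".format(term, score))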