Example #1
import itertools
from collections import Counter
# TfIdf and docs_counters are assumed to be project-local helpers.

def tfidf_month(tw_month, random_list):
    # Flatten the doubly nested month structure into one token list and count it.
    tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(tw_month)))
    tot_counter = Counter(tweets)

    # One Counter per department.
    dep_counts = list()
    for dep in tw_month:
        tw = list(itertools.chain.from_iterable(dep))
        dep_counts.append(Counter(tw))

    # Counters for the random reference documents.
    docs_counter = docs_counters(random_list, tot_counter)

    # Weight every word with tf * idf_like, then sort by score, descending.
    tfidf = TfIdf()
    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word, tot_counter) * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, dep_counts))

    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
    return sort_tfidf_like
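All of these examples lean on a TfIdf helper class that is never shown, and the snippets disagree on its interface: Examples #3 and #4 call the methods statically with a document index, while the rest use an instance with explicit counters. Below is a minimal sketch of the instance-style interface, with standard textbook formulas assumed for tf, idf, and idf_smooth; the idf_like and idf_entropy variants are project-specific, so idf_like is only stubbed.

import math

class TfIdf:
    # Hypothetical reconstruction for illustration; only the call shapes are
    # taken from the snippets, the formulas themselves are assumptions.

    def tf(self, word, counter):
        # Relative frequency of `word` inside one document's Counter.
        return counter[word] / sum(counter.values())

    def idf(self, word, docs_counters):
        # Plain idf: log(N / df); assumes `word` occurs in at least one document.
        df = sum(1 for c in docs_counters if word in c)
        return math.log(len(docs_counters) / df)

    def idf_smooth(self, word, docs_counters):
        # Smoothed idf; safe even when `word` is missing from every document.
        df = sum(1 for c in docs_counters if word in c)
        return math.log(len(docs_counters) / (1 + df)) + 1

    def idf_like(self, word, counter, tot_counter, docs_counters, group_counters):
        # Project-specific "like" variant; its formula cannot be recovered
        # from the snippets, so it is left unimplemented here.
        raise NotImplementedError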
Example #2
    tfidf = TfIdf()
    tfidf_smooth = list()
    for bgr in bgr_counter:
        tfidf_smooth.append(tfidf.tf(bgr, bgr_counter) * tfidf.idf_smooth(bgr, docs_bgr_counter))

    dic_tfidf_smooth = list(zip(bgr_counter.keys(), tfidf_smooth))
    dic_tfidf_smooth = sorted(dic_tfidf_smooth, key=lambda x: x[1], reverse=True)

    # Merge the per-document bigram counters into one total counter.
    tot_counter = dict()
    for y in docs_bgr_counter:
        for k in y.keys():
            tot_counter[k] = tot_counter.get(k, 0) + y[k]
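    # Note: the merge above is equivalent to accumulating into a
    # collections.Counter:
    #     tot_counter = Counter()
    #     for y in docs_bgr_counter:
    #         tot_counter.update(y)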

    tfidf_like = list()
    for bgr in bgr_counter:
        tfidf_like.append(tfidf.tf(bgr, bgr_counter) * tfidf.idf_like(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter))

    dic_tfidf_like = list(zip(bgr_counter.keys(), tfidf_like))
    dic_tfidf_like = sorted(dic_tfidf_like, key=lambda x: x[1], reverse=True)


    # Process the trigrams of the tweets from the random documents.
    alea_tri_processed = list()
    for l in list_aleatory:
        temp = add_separator(tp.text_process(l, text_only=True))
        temp = get_trigrams(temp, 2, True)
        alea_tri_processed.append(temp)
    with open(dir_out+"list_alea_trigrams.pck", 'wb') as handle:
        pickle.dump(alea_tri_processed, handle)

Example #3
    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    for i, data in enumerate(dataset):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            tf = TfIdf.tf(word, data)
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)


    print("save tfidf")
    with open(dir_out+"tfidf_entropy.pck", 'wb') as handle:
        pickle.dump(tfidf_entropy, handle)

    with open(dir_out+"tfidf_smooth.pck", 'wb') as handle:
        pickle.dump(tfidf_smooth, handle)
    
    with open(dir_out+"tfidf_like.pck", 'wb') as handle:
        pickle.dump(tfidf_like, handle) 
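Each of the three pickles holds a list with one word-to-score dict per dataset. A quick way to inspect them afterwards (reusing dir_out from above; a sketch, not part of the original snippet):

import pickle

# Reload the smoothed weights and print the ten highest-scoring words
# of the first dataset.
with open(dir_out + "tfidf_smooth.pck", 'rb') as handle:
    tfidf_smooth = pickle.load(handle)

top10 = sorted(tfidf_smooth[0].items(), key=lambda x: x[1], reverse=True)[:10]
print(top10)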
Example #4
    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    for i, data in enumerate(categories_counter):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            tf = TfIdf.tf(word, data)
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)

    print("processing softmax confusion matrix")
    confusion_like = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_smooth = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_entropy = np.zeros(shape=(len(test_data), len(test_data)))
    for i, data in enumerate(test_data):
        for tw in data:
            j, value = classifier_s(tw, tfidf_like)
            confusion_like[i, j] += 1
            j, value = classifier_s(tw, tfidf_smooth)
            confusion_smooth[i, j] += 1
            j, value = classifier_s(tw, tfidf_entropy)
            confusion_entropy[i, j] += 1
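classifier_s is not shown in any of these snippets. Given the "softmax confusion matrix" message above, one plausible reading is a classifier that sums each category's tf-idf weights over the tweet's tokens and softmaxes the scores; the sketch below, including its scoring rule, is an assumption, not the original implementation.

import math

def classifier_s(tweet, category_weights):
    # Score a tokenized tweet against each category by summing that
    # category's tf-idf weights for the tweet's tokens.
    scores = [sum(w.get(tok, 0.0) for tok in tweet) for w in category_weights]
    # Numerically stable softmax over the category scores.
    m = max(scores)
    exps = [math.exp(s - m) for s in scores]
    total = sum(exps)
    probs = [e / total for e in exps]
    # Return the winning category index and its softmax probability.
    best = max(range(len(probs)), key=lambda j: probs[j])
    return best, probs[best]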
Example #5
import pickle
from scipy import stats
# TfIdf and loadCounters are assumed to be project-local helpers.

tfidf_n = list()
tf_log_idf = list()
tfidf_like = list()
corr = ""

with open(file_parl, 'rb') as handle:
    parl_counter = pickle.load(handle)

tot_counter, counter_list, _ = loadCounters(dir_in)
tot_counter_dep, counter_list_dep, pck = loadCounters(dir_parl)

tfidf = TfIdf()
for word in parl_counter:
    tf = tfidf.tf(word, parl_counter)
    idf = tfidf.idf(word, counter_list)
    log_idf = tfidf.idf_smooth(word, counter_list)
    ent_idf = tfidf.idf_like(word, parl_counter, tot_counter, counter_list, counter_list_dep)
    tfidf_n.append(tf * idf)
    tf_log_idf.append(tf * log_idf)
    tfidf_like.append(tf * ent_idf)

dic_tfidf = list(zip(parl_counter.keys(), tfidf_n))
dic_tf_log_idf = list(zip(parl_counter.keys(), tf_log_idf))
dic_tfidf_like = list(zip(parl_counter.keys(), tfidf_like))

"""
corr +=  "tfidf X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf] ,[v for i,v in dic_tf_log_idf]))+"\n"
corr +=  "tfidf X tfidf_like: "+str(stats.spearmanr([v for i,v in dic_tfidf],[v for i,v in dic_tfidf_like]))+"\n"
corr +=  "tfidf_like X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf_like] , [v for i,v in dic_tf_log_idf]))+"\n"

    corr +=  "tfidf X tfidf_smooth: "+str(stats.pearsonr(tfidf_n,tf_log_idf))+"\n"
    corr +=  "tfidf X tfidf_like: "+str(stats.pearsonr(tfidf_n,tfidf_like))+"\n"
Example #6
    parl_counters = list()
    for parl in parl_tw_processed:
        tw = list(itertools.chain.from_iterable(parl))
        parl_counters.append(Counter(tw))


    docs_counter = list()
    docs_counter.append(tot_counter)
    docs_counter.append(coleta1)
    docs_counter.append(coleta2)

    tfidf = TfIdf()

    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word, tot_counter) * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters))

    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)

    with open(dir_rob+"sort_tfidf_like.pck", 'wb') as handle:
        pickle.dump(sort_tfidf_like, handle)

    with open(dir_rob+"tfidf_like.pck", 'wb') as handle:
        pickle.dump(tfidf_like, handle)

    with open(dir_rob+"parl_tw_processed.pck", 'wb') as handle:
        pickle.dump(parl_tw_processed, handle)

    f = open(dir_rob+"10k_tfidf_like.txt", 'w')
    for w, i in sort_tfidf_like[:10000]:
        # The source snippet cuts off here; writing one scored term per line
        # is an assumed completion.
        f.write(w + "\t" + str(i) + "\n")
    f.close()
Example #7

import itertools
import pickle
from collections import Counter
# TfIdf is assumed to be a project-local helper class.

# Flatten the doubly nested tweet structure and count all tokens.
tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(parl_tweets)))
tot_counter = Counter(tweets)

docs_counter = list()
for alea_tw in alea_tweets:
    tw = list(itertools.chain.from_iterable(alea_tw))
    docs_counter.append(Counter(tw))
docs_counter.append(tot_counter)

parl_counters = list()
for parl in parl_tweets:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))

tfidf = TfIdf()
tfidf_like_bi_trigrams = list()
for word in tot_counter:
    tfidf_like_bi_trigrams.append(tfidf.tf(word, tot_counter) * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters))

sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like_bi_trigrams))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)

with open(dir_out+"sort_tfidf_like_bi_trigram.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)
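The flatten-then-count pattern recurs in nearly every example above; it could be factored into a small helper, sketched below (the helper name is hypothetical):

import itertools
from collections import Counter

def flatten_counter(nested):
    # Flatten one level of nesting and count the resulting tokens.
    return Counter(itertools.chain.from_iterable(nested))

# e.g. the per-parliamentarian counters above become:
#     parl_counters = [flatten_counter(parl) for parl in parl_tweets]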