Example #1
        # snippet starts mid-loop: each pickled file is cleaned by the
        # externally defined remove_irrelevant() and collected into `dataset`
        dataset.append(remove_irrelevant(pickle.load(data_file)))


    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    for i, data in enumerate(dataset):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            # term frequency of `word` within this document
            tf = TfIdf.tf(word, data)
            # three tf-idf variants: entropy-, smoothing-, and likelihood-based idf
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)


    print("save tfidf")
    with open(dir_out+"tfidf_entropy.pck", 'wb') as handle:
        pickle.dump(tfidf_entropy, handle)

    with open(dir_out+"tfidf_smooth.pck", 'wb') as handle:
        pickle.dump(tfidf_smooth, handle)
    
    with open(dir_out+"tfidf_like.pck", 'wb') as handle:
        pickle.dump(tfidf_like, handle)
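
The `TfIdf` helper used in both examples is not part of the snippet. Below is a minimal sketch of what its methods could look like, assuming each document is a word-count mapping such as a `collections.Counter` (consistent with `categories_counter` in Example #2); the formulas for `idf_entropy` and `idf_like` are assumptions chosen to match the call signatures above, not the original implementations.

import math

class TfIdf:
    # Hypothetical reconstruction; only the method names and signatures
    # are taken from the calling code above.

    @staticmethod
    def tf(word, doc):
        # raw count normalised by document length
        return doc[word] / sum(doc.values())

    @staticmethod
    def idf_smooth(word, docs):
        # classic smoothed idf: log(1 + N / (1 + df))
        df = sum(1 for d in docs if word in d)
        return math.log(1 + len(docs) / (1 + df))

    @staticmethod
    def idf_entropy(word, i, docs):
        # entropy-weighted idf (assumption): a word spread evenly across
        # documents gets low weight, a concentrated word gets high weight;
        # `i` is accepted to match the call site but unused in this sketch
        counts = [d.get(word, 0) for d in docs]
        total = sum(counts)
        if total == 0:
            return 0.0
        probs = [c / total for c in counts if c > 0]
        entropy = -sum(p * math.log(p) for p in probs)
        return math.log(len(docs)) - entropy  # max entropy minus observed

    @staticmethod
    def idf_like(word, i, docs):
        # likelihood-style idf (assumption): frequency in document i
        # relative to frequency everywhere else
        in_doc = docs[i].get(word, 0)
        elsewhere = sum(d.get(word, 0) for d in docs) - in_doc
        return math.log((1 + in_doc) / (1 + elsewhere))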
Example #2
        # snippet starts mid-loop: keep the first k items of each category
        # as held-out test data
        test_data.append(categ[:k])


    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    for i, data in enumerate(categories_counter):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            # term frequency of `word` within this category's counter
            tf = TfIdf.tf(word, data)
            # same three tf-idf variants, computed per category
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)

    print("processing softmax confusion matrix")
    confusion_like = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_smooth = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_entropy = np.zeros(shape=(len(test_data), len(test_data)))
    for i, data in enumerate(test_data):
        for tw in data:
            j, value = classifier_s(tw, tfidf_like)
            confusion_like[i, j] += 1
            j, value = classifier_s(tw, tfidf_smooth)
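
`classifier_s` is also not included in the snippet. Below is a minimal sketch of a softmax scorer consistent with how it is called above, assuming a test item `tw` is an iterable of words and each entry of `tfidf` is a per-category word-weight dict; the entire body is an assumption, not the original code.

import math

def classifier_s(tw, tfidf):
    # Hypothetical reconstruction: score each category by the summed tf-idf
    # weight of the item's words, softmax-normalise, and return the argmax
    # together with its probability.
    scores = [sum(weights.get(word, 0.0) for word in tw) for weights in tfidf]
    exp_scores = [math.exp(s - max(scores)) for s in scores]  # stable softmax
    total = sum(exp_scores)
    probs = [e / total for e in exp_scores]
    j = max(range(len(probs)), key=probs.__getitem__)
    return j, probs[j]

Subtracting max(scores) before exponentiating keeps the softmax numerically stable without changing the resulting probabilities.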