# NOTE(review): this chunk starts mid-script — the append below is presumably
# the tail of an unseen loop that unpickles one dataset per file; data_file,
# remove_irrelevant, dataset, dir_out and TfIdf are all defined upstream.
# Indentation here is reconstructed (the source arrived collapsed onto a
# single line) — confirm against the original file.
dataset.append(remove_irrelevant(pickle.load(data_file)))

# --- TF-IDF pass: one word->score dict per dataset, three IDF variants ---
print("process tfidf")
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
for i , data in enumerate(dataset):
    # Per-dataset accumulators mapping word -> tf * idf score.
    tmp_smooth = dict()
    tmp_like = dict()
    tmp_entropy = dict()
    print("dataset: " + str(i))
    for word in data:
        # Term frequency is computed once and shared by all three variants;
        # only the IDF term differs. idf_entropy/idf_like also take the
        # current dataset index i, idf_smooth does not.
        tf = TfIdf.tf(word, data)
        tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset)
        tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset)
        tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset)
    tfidf_smooth.append(tmp_smooth)
    tfidf_like.append(tmp_like)
    tfidf_entropy.append(tmp_entropy)

# --- Persist each variant's per-dataset score dicts under dir_out ---
print("save tfidf")
with open(dir_out+"tfidf_entropy.pck", 'wb') as handle:
    pickle.dump(tfidf_entropy, handle)
with open(dir_out+"tfidf_smooth.pck", 'wb') as handle:
    pickle.dump(tfidf_smooth, handle)
# NOTE(review): the body of this final `with` (presumably
# pickle.dump(tfidf_like, handle)) lies beyond this chunk.
with open(dir_out+"tfidf_like.pck", 'wb') as handle:
# NOTE(review): this chunk starts mid-script — test_data, categ and k come
# from an unseen upstream loop (categ[:k] keeps the first k items of the
# current category); categories_counter, TfIdf, np and classifier_s are
# defined upstream. Indentation is reconstructed from a collapsed source
# line — confirm against the original file.
test_data.append(categ[:k])

# --- TF-IDF pass over per-category counters, three IDF variants ---
print("process tfidf")
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
for i , data in enumerate(categories_counter):
    # Per-category accumulators mapping word -> tf * idf score.
    tmp_smooth = dict()
    tmp_like = dict()
    tmp_entropy = dict()
    print("dataset: " + str(i))
    for word in data:
        # tf is computed once and shared; only the IDF term differs.
        # idf_entropy/idf_like also take the current category index i.
        tf = TfIdf.tf(word, data)
        tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
        tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
        tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
    tfidf_smooth.append(tmp_smooth)
    tfidf_like.append(tmp_like)
    tfidf_entropy.append(tmp_entropy)

# --- Confusion matrices: row i = true category, column j = predicted ---
print("processing softmax confusion matrix")
confusion_like = np.zeros(shape=(len(test_data), len(test_data)))
confusion_smooth = np.zeros(shape=(len(test_data), len(test_data)))
confusion_entropy = np.zeros(shape=(len(test_data), len(test_data)))
for i, data in enumerate(test_data):
    for tw in data:
        # classifier_s returns (predicted index, score); only the index is
        # used here to tally the matrix — `value` is discarded.
        j, value = classifier_s(tw, tfidf_like)
        confusion_like[i, j] += 1
        # NOTE(review): the increment for confusion_smooth (and presumably
        # the whole entropy variant) lies beyond this chunk.
        j, value = classifier_s(tw, tfidf_smooth)