def testKnownTFIDF(self):
    """Check tf-idf values for hand-picked words against manually
    computed log10(N / df) * tf figures."""
    article_tfs = [tfidf.tf(text) for text in self.strings]
    the_tfs = [tfidf.tf(text) for text in self.theTwentyFive]

    article_idf = tfidf.idf(article_tfs)
    the_idf = tfidf.idf(the_tfs)

    article_tfidf = tfidf.tfidf(article_idf, article_tfs)
    the_tfidf = tfidf.tfidf(the_idf, the_tfs)

    # idf here is log10(total docs / docs containing the word).
    self.assertEqual(article_tfidf[1]["Meditation"], math.log10(6/1) * (1/19))
    self.assertEqual(article_tfidf[2]["books"], math.log10(6/1) * (1/18))
    self.assertEqual(article_tfidf[5]["the"], math.log10(6/3) * (5/5))
    # "the" appears in all five documents, so its idf — and tf-idf — is zero.
    self.assertEqual(the_tfidf[3]["the"], math.log10(5/5) * (5/5))
def weight(d):
    """Compute tf-idf weights for the word counts in *d*.

    Parameters:
        d: dict mapping word -> raw occurrence count within one document.

    Returns:
        (d1, dsq): ``d1`` maps each word of ``d`` that is present in the
        module-level ``idf`` table to its tf-idf weight; ``dsq`` is the
        sum of the squared weights (the squared Euclidean norm of the
        document vector), useful for cosine-similarity normalization.
    """
    # BUG FIX: the original declared `global idf, x, sq`, but `x` and `sq`
    # are never read or assigned here — only the idf table is used.
    global idf
    total_words = sum(d.values())
    dsq = 0
    d1 = {}
    for word in d:
        # Words absent from the idf table are skipped entirely.
        if word in idf:
            d1[word] = tfidf.tf(d[word], total_words) * idf[word]
            dsq += d1[word] * d1[word]
    return d1, dsq
def testKnownTF(self):
    """Check term frequencies against hand-computed count/total ratios."""
    cases = [
        (self.string1, "meditation", 1/19),
        (self.string2, "be", 3/18),
        (self.string3, "dog", 0/11),
        (self.string4, "bureaucracy.", 1/12),
        (self.string5, "the", 5/5),
    ]
    for text, word, expected in cases:
        tf_map = tfidf.tf(text)
        self.assertEqual(tf_map[word], expected)
def testArticleOrder(self):
    """Verify that articleList preserves the order of self.strings;
    testKnownTFIDF relies on this ordering."""
    article_tfs = [tfidf.tf(text) for text in self.strings]
    expected = [
        (1, "Meditation", 1/19),
        (2, "be", 3/18),
        (3, "can't", 1/11),
        (4, "bureaucracy.", 1/12),
        (5, "the", 5/5),
    ]
    for index, word, value in expected:
        self.assertEqual(article_tfs[index][word], value)
def chapterSummary(chap):
    """Summarize every unit of *chap*, persist (tf-dict, summary) pairs to
    the Mongo ``collection``, and return the list of per-unit summaries.

    Units whose summarizer output is empty are skipped entirely.
    """
    unit_summaries = []
    for unit in chap:
        summary_parts = summarization.generate_summary(unit, 2)
        if summary_parts == []:
            continue
        query_terms = tfidf.tf(unit)
        answer = ''.join(summary_parts)
        print(query_terms, '------> ', answer)
        # Store the tf dictionary as the query key, the joined summary
        # as the answer.
        result = collection.insert_one({"query": query_terms, "ans": answer})
        print(result.inserted_id)
        unit_summaries.append(summary_parts)
    return unit_summaries
def testLength(self):
    """Each tf dictionary should hold exactly one entry per unique word."""
    expectations = [
        (self.emptyString, 0),
        (self.string1, 17),
        (self.string2, 13),
        (self.string3, 11),
        (self.string4, 10),
        (self.string5, 1),
    ]
    for text, unique_word_count in expectations:
        self.assertEqual(len(tfidf.tf(text)), unique_word_count)
# FIX: the original used Python 2 print statements, inconsistent with the
# rest of the file; single-argument print(...) calls behave identically on
# Python 2 and are valid Python 3. Manual append loops replaced with
# comprehensions (same output, same order).

# tf-idf of selected terms across all files.
for term in ["knn", "neural", "network", "deep"]:
    print("TFIDF for " + term)
    print(tfidf.fast_tf_idf(files, term))

# Classification by TF: one term-frequency feature vector per file.
Y = ["learning", "knn", "neural", "network", "deep"]
X = [[tfidf.tf(f, word) for word in Y] for f in files]
print("TF for [learning, knn, neural, network, deep]")
print(X)

# Pairwise Hamming-distance matrix between all files, shown as a heatmap.
X = [[distance.hammingDistance(f1, f2) for f2 in files] for f1 in files]
print("Matrix of Hamming Distance between all files")
plt.matshow(X)
plt.show()
# Import the tf and idf helper functions from the tfidf module.
from tfidf import tf
from tfidf import idf

# Example inputs: a term that occurs 3 times in a 100-term document, drawn
# from a corpus of 10,000,000 documents of which 1,000 contain the term.
# FIX: the original named the document-frequency value `total_docs`, which
# misleadingly suggested the corpus size; renamed to match the sibling
# script's `n_docs_with_term`.
n_term = 3
total_term = 100
n_docs = 10000000
n_docs_with_term = 1000

# tf: relative frequency of the term within the document.
tf_value = tf(n_term, total_term)
# idf: log-scaled inverse fraction of documents containing the term.
idf_value = idf(n_docs, n_docs_with_term)

print("Term frequency : {0}".format(tf_value))
print("IDF : {0}".format(idf_value))

# The tf-idf weight is the product of the two components.
bobot = tf_value * idf_value
print("Weight : {0}".format(bobot))
# Pull the tf and idf helpers in from the tfidf module.
from tfidf import tf, idf

# Sample figures: a term appearing 3 times in a 100-term document, within a
# 10,000,000-document corpus where 1,000 documents contain the term.
n_terms, total_terms = 3, 100
n_docs, n_docs_with_term = 10000000, 1000

# Term frequency within the single document.
tf_value = tf(n_terms, total_terms)
# Inverse document frequency across the corpus.
idf_value = idf(n_docs, n_docs_with_term)

print("Term frequency: {0}".format(tf_value))
print("Inverse document frequency: {0}".format(idf_value))

# The tf-idf score is simply the product of the two.
tfidf_value = tf_value * idf_value
print("Tf * idf: {0}".format(tfidf_value))
# -*- coding: utf-8 -*-
# @Time : 2017/5/21 04:30 PM
# @Author : Yuhsuan
# @File : test.py
# @Software: PyCharm Community Edition

# Print one Reuters article and its category labels, then show the term
# frequency of 'Portland' within that article.
# FIX: `import tfidf as ti` sat in the middle of the script body; hoisted
# to the top per PEP 8 import conventions.
from nltk.corpus import reuters

import tfidf as ti

fileid = 'training/3386'
print(" ".join(reuters.words(fileids=fileid)))
print(reuters.categories(fileids=fileid))
print(ti.tf('Portland', fileid))