示例#1
0
文件: test_tfidf.py 项目: inpho/uroc
    def testKnownTFIDF(self):
        """
        Verify that tf-idf values for arbitrarily chosen words in the
        articles match manually computed values.
        """
        # Per-document term frequencies for both corpora.
        article_tfs = [tfidf.tf(text) for text in self.strings]
        the_tfs = [tfidf.tf(text) for text in self.theTwentyFive]

        # Inverse document frequencies over each corpus.
        article_idf = tfidf.idf(article_tfs)
        the_idf = tfidf.idf(the_tfs)

        # Combine into per-document tf-idf dictionaries.
        article_tfidf = tfidf.tfidf(article_idf, article_tfs)
        the_tfidf = tfidf.tfidf(the_idf, the_tfs)

        self.assertEqual(article_tfidf[1]["Meditation"], math.log10(6/1) * (1/19))
        self.assertEqual(article_tfidf[2]["books"], math.log10(6/1) * (1/18))
        self.assertEqual(article_tfidf[5]["the"], math.log10(6/3) * (5/5))

        self.assertEqual(the_tfidf[3]["the"], math.log10(5/5) * (5/5))
def weight(d):
    """Compute tf-idf weights for one document's word counts.

    Parameters
    ----------
    d : dict
        Mapping of word -> raw count for a single document.

    Returns
    -------
    tuple
        ``(weights, sq_sum)`` where ``weights`` maps each word that is
        present in the module-level ``idf`` table to its tf-idf weight,
        and ``sq_sum`` is the sum of squared weights (useful for
        cosine normalization).
    """
    # NOTE(review): the original declared `global idf, x, sq`, but `x` and
    # `sq` were never used here and `idf` is only read (reads do not need a
    # `global` statement), so the declaration is dropped.
    total_words = sum(d.values())
    weights = {}
    sq_sum = 0.0
    for word, count in d.items():
        # Words absent from the idf table are skipped entirely.
        if word in idf:
            w = tfidf.tf(count, total_words) * idf[word]
            weights[word] = w
            sq_sum += w * w
    return weights, sq_sum
示例#3
0
文件: test_tfidf.py 项目: inpho/uroc
    def testKnownTF(self):
        """
        Check that term frequencies computed by tfidf.tf agree with
        hand-calculated values for selected words.
        """
        # (source string, word, expected term frequency)
        expectations = [
            (self.string1, "meditation", 1/19),
            (self.string2, "be", 3/18),
            (self.string3, "dog", 0/11),
            (self.string4, "bureaucracy.", 1/12),
            (self.string5, "the", 5/5),
        ]
        for text, word, expected in expectations:
            self.assertEqual(tfidf.tf(text)[word], expected)
示例#4
0
文件: test_tfidf.py 项目: inpho/uroc
    def testArticleOrder(self):
        """
        Ensure the articles in the tf list keep their original order —
        the order testKnownTFIDF relies on.
        """
        article_tfs = [tfidf.tf(text) for text in self.strings]

        # (index into the list, word, expected term frequency)
        expected = [
            (1, "Meditation", 1/19),
            (2, "be", 3/18),
            (3, "can't", 1/11),
            (4, "bureaucracy.", 1/12),
            (5, "the", 5/5),
        ]
        for index, word, value in expected:
            self.assertEqual(article_tfs[index][word], value)
示例#5
0
def chapterSummary(chap):
    """Summarize each unit of a chapter, persist (tf query, summary) pairs
    to the Mongo collection, and return the per-unit summaries."""
    unit_summaries = []
    for unit in chap:
        summary_parts = summarization.generate_summary(unit, 2)
        # Skip units that produced no summary. Deliberately compares to []
        # (not truthiness) to match the upstream contract exactly.
        if summary_parts == []:
            continue
        query = tfidf.tf(unit)
        sentence = ''.join(summary_parts)
        print(query, '------> ', sentence)
        result = collection.insert_one({"query": query, "ans": sentence})
        print(result.inserted_id)
        unit_summaries.append(summary_parts)
    return unit_summaries
示例#6
0
文件: test_tfidf.py 项目: inpho/uroc
    def testLength(self):
        """
        Confirm each tf dictionary's length equals the number of unique
        words in its source string.
        """
        # (source string, expected number of unique words)
        cases = [
            (self.emptyString, 0),
            (self.string1, 17),
            (self.string2, 13),
            (self.string3, 11),
            (self.string4, 10),
            (self.string5, 1),
        ]
        for text, unique_count in cases:
            self.assertEqual(len(tfidf.tf(text)), unique_count)
示例#7
0
# TF-IDF scores for selected query terms.
# NOTE(review): converted Python-2-only `print` statements to `print(...)`
# calls, which behave identically on Python 2 for a single argument and
# also run on Python 3.
print("TFIDF for knn")
print(tfidf.fast_tf_idf(files, "knn"))
print("TFIDF for neural")
print(tfidf.fast_tf_idf(files, "neural"))
print("TFIDF for network")
print(tfidf.fast_tf_idf(files, "network"))
print("TFIDF for deep")
print(tfidf.fast_tf_idf(files, "deep"))

# Classification by TF: one row of term frequencies per file.
Y = ["learning", "knn", "neural", "network", "deep"]
X = [[tfidf.tf(f, word) for word in Y] for f in files]
print("TF for [learning, knn, neural, network, deep]")
print(X)

# Hamming Distance: pairwise distance matrix between all files.
X = []
for f1 in files:
    X.append([distance.hammingDistance(f1, f2) for f2 in files])

print("Matrix of Hamming Distance between all files")
plt.matshow(X)
plt.show()
示例#8
0
文件: stbi.py 项目: elgarsn14/TF-IDF
# Import the tf and idf helpers from the tfidf module.
from tfidf import tf, idf

# Inputs: the term occurs n_term times among total_term words;
# presumably total_docs of the n_docs documents contain the term —
# verify against tfidf.idf's signature.
n_term = 3
total_term = 100
n_docs = 10000000
total_docs = 1000

# Term frequency and inverse document frequency.
tf_value = tf(n_term, total_term)
idf_value = idf(n_docs, total_docs)

print("Term frequency : {0}".format(tf_value))
print("IDF : {0}".format(idf_value))

# Final weight is the product of the two.
bobot = tf_value * idf_value
print("Weight : {0}".format(bobot))
示例#9
0
# Bring in the tf and idf functions from the local tfidf module.
from tfidf import tf
from tfidf import idf

# Inputs for the computation.
n_terms = 3
total_terms = 100
n_docs = 10000000
n_docs_with_term = 1000

# Compute term frequency and inverse document frequency.
tf_value = tf(n_terms, total_terms)
idf_value = idf(n_docs, n_docs_with_term)

print("Term frequency: {0}".format(tf_value))
print("Inverse document frequency: {0}".format(idf_value))

# Combined tf-idf weight.
tfidf_value = tf_value * idf_value

print("Tf * idf: {0}".format(tfidf_value))
示例#10
0
# -*- coding: utf-8 -*-
# @Time    : 2017/5/21 04:30 PM
# @Author  : Yuhsuan
# @File    : test.py
# @Software: PyCharm Community Edition
"""Quick check: print one Reuters article, its categories, and the tf
score of 'Portland' for that file."""

from nltk.corpus import reuters

fileid = 'training/3386'
article_words = reuters.words(fileids=fileid)
print(" ".join(article_words))
print(reuters.categories(fileids=fileid))

import tfidf as ti

print(ti.tf('Portland', fileid))