def test_basic(self):
    # Each scenario is (document, labelled instances, expected label).
    # Every scenario carries its own freshly built instances mapping, so
    # no state is shared between the individual `classify` calls.
    scenarios = [
        ('a', {'A': ['a'], 'B': ['b']}, 'A'),
        ('a a a', {'A': ['a'], 'B': ['b']}, 'A'),
        ('a a b', {'A': ['a'], 'B': ['b']}, 'A'),
        ('a a b', {'A': ['a', 'a'], 'B': ['b']}, 'A'),
        ('a b b', {'A': ['a', 'a'], 'B': ['b']}, 'B'),
        ('b b b', {'A': ['a', 'a'], 'B': ['b']}, 'B'),
    ]
    for document, instances, expected in scenarios:
        self.assertEqual(classify(document, instances), expected)
def run():
    """Evaluate the Bayesian, KNN and sum-rule classifiers and log the results.

    Performs 10 outer repetitions. Each one reloads the data with
    ``data_proccessing.loadData()`` and then walks 10 fold indices, building
    the train/test sets with ``prepareSets(H, i)``. For every fold the
    Bayesian classifier is run once, and the KNN and sum-rule classifiers
    are run once per value in ``K``.

    Per-round results are appended to ``part2-results-bayesian.txt``,
    ``part2-results-knn.txt`` and ``part2-results-sum.txt``. The error rates
    are collected per classifier name and finally handed to ``compare``.
    """
    classificadores = ["Bayesian", "KNN 1", "KNN 5", "KNN 10", "KNN 20",
                       "KNN 30", "Sum 1", "Sum 5", "Sum 10", "Sum 20",
                       "Sum 30"]
    # One list of error rates per classifier name.
    errorResults = {c: [] for c in classificadores}
    K = [30, 20, 10, 5, 1]

    # The context manager guarantees all three files are closed even when a
    # classifier raises part-way through the run.
    with open('part2-results-bayesian.txt', 'a') as resultsBay, \
            open('part2-results-knn.txt', 'a') as resultsKn, \
            open('part2-results-sum.txt', 'a') as resultsSum:
        for j in range(10):
            H = data_proccessing.loadData()
            # Folds
            for i in range(10):
                # Rounds are numbered 1..100 across all repetitions.
                header = "Round %d\n\n" % (j * 10 + i + 1)
                resultsBay.write(header)
                resultsKn.write(header)
                resultsSum.write(header)

                (trainX, trainY, testX, testY) = prepareSets(H, i)

                (P_bay, E_bay, e_rate_bay, se_bay, interval_bay) = \
                    bayesian.classify(trainX, trainY, testX, testY)
                resultsBay.write("- Bayesian\n")
                writeResults(resultsBay, e_rate_bay, se_bay, interval_bay)
                errorResults["Bayesian"].append(e_rate_bay)

                for k in K:
                    (P_kn, E_kn, e_rate_kn, se_kn, interval_kn) = \
                        knn.classify(trainX, trainY, testX, testY, k)
                    # The sum rule combines the Bayesian and KNN outputs.
                    (P_sum, E_sum, e_rate_sum, se_sum, interval_sum) = \
                        sum_rule.classify([P_bay, P_kn], testX, testY)

                    # NOTE(review): the two per-k header writes below are
                    # assumed to be live statements (mirroring the Bayesian
                    # header above) -- confirm against the intended output.
                    resultsKn.write("- KNN (n = %d)\n" % k)
                    writeResults(resultsKn, e_rate_kn, se_kn, interval_kn)
                    errorResults["KNN %i" % k].append(e_rate_kn)

                    resultsSum.write("- Sum (n = %d)\n" % k)
                    writeResults(resultsSum, e_rate_sum, se_sum, interval_sum)
                    errorResults["Sum %i" % k].append(e_rate_sum)

    compare(errorResults)
# Dump the known age ranges and the number of training samples per class.
for i in agerangeset:
    print(i)
print("age class size:")
for k, v in ageclass.items():
    # Single-argument print: same output on Python 2 and Python 3.
    print("%s : %d" % (k, len(v)))

print("begin classify")
rescol = []
count1 = 1
# Classify at most 3000 rows still marked "program". The mark is passed as
# a bound parameter: a double-quoted token in SQL is an identifier, not a
# string literal.
for i in c.execute(
        "select * from knowledge_detail where mark = ? limit 3000",
        ("program",)):
    print(count1)
    count1 += 1
    print(i[0])
    print(i[1])
    curclass = classify(i[1], ageclass, extractor=cutword)
    print(curclass)
    # The class label is split on '*' into its (min, max) parts.
    (curmin, curmax) = curclass.split('*')
    print("curmin: %s" % curmin)
    print("curmax: %s" % curmax)
    # Updates are collected and applied after the select has been fully
    # consumed, so the cursor is not reused while it is being iterated.
    rescol.append([i[0], curmin, curmax])

# Store the result and flip the mark to "programa" so these rows no longer
# match the select above.
c.executemany(
    "update knowledge_detail set min=?, max=?, mark=? where serial=?",
    [(r[1], r[2], "programa", r[0]) for r in rescol])
conn.commit()
conn.close()
"spam" : numberOfSpam, "ham" : numberOfHam } ham = readMailsFromFile("./corpus/nospam2.txt") spam = readMailsFromFile("./corpus/spam2.txt") tests = readTestMailsFromFile("./corpus/test-final.txt") print("Messages to filter: %d" % len(tests["messages"])) hamCount = 0 spamCount= 0 goodHam = 0 goodSpam = 0 for index, test in enumerate(tests["messages"]): classif = classify(test, {'spam': spam, 'ham': ham}) if test[:3] == "TAK" and classif == 'spam': goodSpam = goodSpam + 1 if test[:3] == "NIE" and classif == 'ham': goodHam = goodHam + 1 if classif == 'ham': hamCount = hamCount + 1 else: spamCount = spamCount + 1 test = test[4:] print("%d. \"%s...\" is" % (index,test[:50]), classif) print("SPAM: %d" % spamCount) print("HAM: %d" % hamCount) print("Accuracy: ", ((goodSpam+goodHam)/float(len(tests["messages"])))*100, "%")
'foot size': 7 }, { 'height': 5.75, 'weight': 150, 'foot size': 9 }] })) print('') print(' -- Spam Detection With `Classify` -- ') spams = ["buy viagra", "dear recipient", "meet sexy singles"] # etc genuines = ["let's meet tomorrow", "remember to buy milk"] message = "remember the meeting tomorrow" # Classify as "genuine" because of the words "remember" and "tomorrow". print(classify(message, {'spam': spams, 'genuine': genuines})) # Classifies "unknown_file" as either a Python or Java file, considering # you have directories with examples of each language. #print classify_file("unknown_file", ["java_files", "python_files"]) # Classifies every file under "folder" as either a Python or Java file, # considering you have subdirectories with examples of each language. #print classify_folder("folder") print('') print(' == Low Level Functions == ') print(' -- Classic Cancer Test Problem --') # 1% chance of having cancer.
def get_language(s):
    """Classify ``s`` against the training set from ``_get_train_set()``.

    Returns whatever ``classify`` returns for ``s`` and that training set.
    """
    return classify(s, _get_train_set())
def test_sample(self):
    # Spam-vs-genuine example: the message must be labelled 'genuine'.
    instances = {
        'spam': ["buy viagra", "dear recipient", "meet sexy singles"],
        'genuine': ["let's meet tomorrow", "remember to buy milk"],
    }
    message = "remember the meeting tomorrow"
    self.assertEqual(classify(message, instances), 'genuine')
def test_with_extraction(self):
    # Instances given as multi-word strings: 'a' must be labelled 'A'
    # for both of these instance mappings.
    instance_sets = (
        {'A': ['a a a'], 'B': ['b']},
        {'A': ['a', 'a'], 'B': ['b b b']},
    )
    for instances in instance_sets:
        self.assertEqual(classify('a', instances), 'A')
def test_single(self):
    # With 'A' as the only label, 'a' must be labelled 'A' whatever the
    # instances are -- including an empty instance list.
    for known in ([], ['a'], ['a', 'a'], ['a', 'b']):
        self.assertEqual(classify('a', {'A': known}), 'A')
# -*- coding: utf-8 -*-
from bayesian import classify, classify_file
from cutword import cutword

# Demo: classify a Chinese message, using `cutword` as the feature
# extractor in place of the classifier's default one.
spams = [u"什么", u"哦 好的", u"是嘛 这样啊 你好啊"]  # etc
genuines = [u"明天看电影", u"想吃饭了"]
message = u"看电影能吃饭吗"

# Presumably classified as "genuine": the message shares the text "看电影"
# and "吃饭" with the genuine examples -- the actual result depends on how
# `cutword` segments the strings; confirm.
# Single-argument print call: behaves the same on Python 2 and Python 3.
print(classify(message, {'spam': spams, 'genuine': genuines}, extractor=cutword))
{'height': 5.92, 'weight': 190, 'foot size': 11}, {'height': 5.58, 'weight': 170, 'foot size': 12}, {'height': 5.92, 'weight': 165, 'foot size': 10}], 'female': [{'height': 5, 'weight': 100, 'foot size': 6}, {'height': 5.5, 'weight': 150, 'foot size': 8}, {'height': 5.42, 'weight': 130, 'foot size': 7}, {'height': 5.75, 'weight': 150, 'foot size': 9}]})) print('') print(' -- Spam Detection With `Classify` -- ') spams = ["buy viagra", "dear recipient", "meet sexy singles"] # etc genuines = ["let's meet tomorrow", "remember to buy milk"] message = "remember the meeting tomorrow" # Classify as "genuine" because of the words "remember" and "tomorrow". print(classify(message, {'spam': spams, 'genuine': genuines})) # Classifies "unknown_file" as either a Python or Java file, considering # you have directories with examples of each language. #print classify_file("unknown_file", ["java_files", "python_files"]) # Classifies every file under "folder" as either a Python or Java file, # considering you have subdirectories with examples of each language. #print classify_folder("folder") print('') print(' == Low Level Functions == ') print(' -- Classic Cancer Test Problem --') # 1% chance of having cancer.