def simpleTest():
    """Classify the first message found under ``./test/`` and print the result.

    The trained model is fetched with ``naiveBayes.getTrainedModelInfo()``,
    which yields five values; the fifth one is not needed by
    ``naiveBayes.classify`` and is ignored here, as are the labels returned
    by ``naiveBayes.loadMailData``.
    """
    trainedModel = naiveBayes.getTrainedModelInfo()
    vocabulary, spamWordProbs, hamWordProbs, spamPrior, _unusedDS = trainedModel

    messages, _unusedLabels = naiveBayes.loadMailData('./test/')
    firstMessage = messages[0]

    print(naiveBayes.classify(vocabulary, spamWordProbs, hamWordProbs,
                              spamPrior, firstMessage))
def testClassifyErrorRateMSE():
    """Print the mean squared error of the AdaBoost-weighted classifier.

    Loads the mails under ``./public/``, selects the fixed hold-out indices
    listed below, scores each selected mail with
    ``naiveBayes.adaboostClassify`` and prints the squared error averaged
    over the hold-out mails, prefixed with ``mse->``.

    For every mail the two returned scores are divided by 20000 and compared
    with the target pair ``(1, 0)`` when the label equals 1 and ``(0, 1)``
    otherwise; the per-mail error is the mean of the two squared differences.
    """
    fileFolder = './public/'
    mailWords, classLables = naiveBayes.loadMailData(fileFolder)

    # Fixed hold-out sample (positions into the loaded mail list).
    test_index = [
        2, 6, 7, 8, 13, 16, 19, 29, 35, 37, 40, 42, 43, 45, 46, 49, 51, 52, 64,
        65, 71, 72, 78, 79, 80, 84, 85, 90, 91, 98, 103, 109, 111, 117, 123,
        129, 135, 138, 142, 149, 169, 188, 191, 192, 203, 221, 225, 226, 229,
        232, 236, 243, 250, 254, 257, 258, 259, 264, 268, 281, 298, 300, 308,
        319, 322, 329, 333, 335, 338, 339, 340, 344, 347, 358, 359, 362, 382,
        385, 391, 394, 402, 410, 415, 417, 418, 422, 423, 424, 425, 428, 437,
        441, 456, 461, 462, 470, 472, 477, 480, 481
    ]

    testWords = [mailWords[i] for i in test_index]
    testWordsType = [classLables[i] for i in test_index]

    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam, DS = \
        naiveBayes.getTrainedModelInfo()

    # NOTE(review): both scores are divided by this constant before being
    # compared with the 0/1 targets -- presumably a normalisation of the raw
    # classifier scores; confirm against adaboostClassify.
    scoreScale = 20000

    se = 0
    for words, actual in zip(testWords, testWordsType):
        wordVector = np.array(
            naiveBayes.setOfWordsToVecTor(vocabularyList, words))
        # The third returned value (the predicted label) is not needed for
        # the squared-error computation.
        p1, p0, _predicted = naiveBayes.adaboostClassify(
            vocabularyList, pWordsSpamicity, pWordsHealthy, DS, pSpam,
            wordVector)

        if actual == 1:
            se += (pow((p1 / scoreScale - 1), 2) +
                   pow((p0 / scoreScale), 2)) / 2
        else:
            se += (pow((p1 / scoreScale), 2) +
                   pow((p0 / scoreScale - 1), 2)) / 2

    print("mse->", se / len(test_index))
def baselineCrossValidateEvaluate():
    """Cross-validate ``naiveBayes.baselinePredict`` and print its accuracy.

    The mails under ``./public/`` are split into ``k_fold_num`` stratified
    folds. For each fold the held-out mails are passed to
    ``naiveBayes.baselinePredict`` and the accuracy against their labels is
    recorded. The running accuracy list is printed after every fold; the
    final list, its mean and the elapsed wall-clock time are printed at the
    end. The training indices of each fold are only printed, not used.
    """
    startedAt = datetime.datetime.now()
    # Every mail as a word list, together with its class label.
    mails, labels = naiveBayes.loadMailData('./public/')

    foldAccuracies = []
    folds = StratifiedKFold(labels, k_fold_num)

    for train_index, test_index in folds:
        print("train_index->", train_index)
        print("test_index->", test_index)

        heldOutMails = [mails[idx] for idx in test_index]
        predicted = naiveBayes.baselinePredict(heldOutMails)

        heldOutLabels = [labels[idx] for idx in test_index]
        foldAccuracies.append(accuracy_score(heldOutLabels, predicted))
        print("acc_per_fold:", foldAccuracies)

    print("acc_per_fold:", foldAccuracies)
    print("avg acc:", np.mean(foldAccuracies))
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - startedAt))
# Example no. 4
# 0
import numpy as np
import simpleNavie as naiveBayes
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import SVC

# Module-level script: loads the mail corpus once and iterates over
# stratified folds, building a WFO-filtered vocabulary per fold.

# Number of folds; passed as the second positional argument of
# StratifiedKFold below and read by the evaluation functions in this file.
k_fold_num = 10
filename = './public/'
# Load data: every word of every mail, plus one class label per mail.
mailWords, classLables = naiveBayes.loadMailData(filename)

skf = StratifiedKFold(classLables, k_fold_num)
# Per-fold metric accumulators.
# NOTE(review): nothing in the visible loop below appends to these lists --
# the snippet looks truncated; confirm against the full script.
acc_per_fold = []
f1_per_fold = []
recall_per_fold = []
precision_per_fold = []

for train_index, test_index in skf:
    print("train_index->", train_index)
    print("test_index->", test_index)

    # Vocabulary is built from the training mails of this fold only.
    preVocabularyList = naiveBayes.createVocabularyList(
        [mailWords[i] for i in train_index])
    # Apply the WFO filter to the vocabulary, using the training mails and
    # their labels.
    vocabularyList = naiveBayes.wfoFilter(
        preVocabularyList, [mailWords[i] for i in train_index],
        [classLables[i] for i in train_index])
def trainingAdaboostGetDS(iterateNum=40):
    """Train naive Bayes on ``./public/`` and tune the per-word weights ``DS``.

    The vocabulary (after the WFO filter) and the naive-Bayes word
    probabilities are computed from *all* loaded mails. ``DS`` starts as a
    vector of ones, one entry per vocabulary word. In each of up to
    ``iterateNum`` rounds every mail of the fixed hold-out sample is
    classified with ``naiveBayes.adaboostClassify``; for each misclassified
    mail the weights of the words present in that mail are rescaled. The
    loop stops early once a round produces no errors.

    Args:
        iterateNum: Maximum number of tuning rounds.

    Returns:
        dict with the keys ``'vocabularyList'``, ``'pWordsSpamicity'``,
        ``'pWordsHealthy'`` and ``'pSpam'``, plus ``'minErrorRate'`` and
        ``'DS'`` (a snapshot of the weights taken at the end of the round
        with the lowest error rate) when at least one round was run.
    """
    # Fixed hold-out sample (positions into the loaded mail list).
    test_index = [
        2, 6, 7, 8, 13, 16, 19, 29, 35, 37, 40, 42, 43, 45, 46, 49, 51, 52, 64,
        65, 71, 72, 78, 79, 80, 84, 85, 90, 91, 98, 103, 109, 111, 117, 123,
        129, 135, 138, 142, 149, 169, 188, 191, 192, 203, 221, 225, 226, 229,
        232, 236, 243, 250, 254, 257, 258, 259, 264, 268, 281, 298, 300, 308,
        319, 322, 329, 333, 335, 338, 339, 340, 344, 347, 358, 359, 362, 382,
        385, 391, 394, 402, 410, 415, 417, 418, 422, 423, 424, 425, 428, 437,
        441, 456, 461, 462, 470, 472, 477, 480, 481
    ]
    filename = './public/'
    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)

    preVocabularyList = naiveBayes.createVocabularyList(mailWords)
    # do wfo filter
    vocabularyList = naiveBayes.wfoFilter(preVocabularyList, mailWords,
                                          classLables)
    print("length of vocabularyList", len(vocabularyList))

    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, mailWords)
    print("trainMarkedWords finished")

    # change it to array
    trainMarkedWords = np.array(trainMarkedWords)
    print("data to matrix finished")
    # calculate the per-word probabilities for spam and ham,
    # P(wi|s) and P(wi|h), plus the spam prior
    pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    DS = np.ones(len(vocabularyList))

    ds_result = {}
    minErrorRate = np.inf
    for i in range(iterateNum):
        errorCount = 0.0
        for j in test_index:
            testWordsCount = naiveBayes.setOfWordsToVecTor(
                vocabularyList, mailWords[j])
            testWordsMarkedArray = np.array(testWordsCount)
            ps, ph, mailType = naiveBayes.adaboostClassify(
                vocabularyList, pWordsSpamicity, pWordsHealthy, DS, pSpam,
                testWordsMarkedArray)

            if mailType != classLables[j]:
                errorCount += 1
                alpha = ps - ph
                # Only the weights of words present in this mail are touched.
                present = testWordsMarkedArray != 0
                if alpha > 0:  # actual: ham; predict: spam
                    DS[present] = np.abs(
                        (DS[present] - np.exp(alpha)) / DS[present])
                else:  # actual: spam; predict: ham
                    DS[present] = (DS[present] + np.exp(alpha)) / DS[present]

        print('DS:', DS)
        # Errors are counted over the hold-out sample only, so that is the
        # denominator of the rate.
        errorRate = errorCount / len(test_index)
        if errorRate < minErrorRate:
            minErrorRate = errorRate
            ds_result['minErrorRate'] = minErrorRate
            # DS is updated in place by later rounds; store a copy so the
            # recorded weights stay those of the best round.
            ds_result['DS'] = DS.copy()
        print('# %d,errorcount %d ,errorrate %f' % (i, errorCount, errorRate))
        if errorRate == 0.0:
            break

    ds_result['vocabularyList'] = vocabularyList
    ds_result['pWordsSpamicity'] = pWordsSpamicity
    ds_result['pWordsHealthy'] = pWordsHealthy
    ds_result['pSpam'] = pSpam
    return ds_result
def crossValidateEvaluate():
    """Cross-validate the naive-Bayes classifier and print per-fold metrics.

    The mails under ``./public/`` are split into ``k_fold_num`` stratified
    folds. For every fold the model is trained on the training mails and
    written to ``vocabularyList.txt``, ``pSpam.txt``, ``pWordsSpamicity.txt``
    and ``pWordsHealthy.txt`` in the working directory (overwritten on each
    fold). The held-out mails are then classified with
    ``naiveBayes.predict`` and accuracy, F1, recall and precision are
    recorded. The metric lists are printed after every fold and once more at
    the end, followed by the elapsed wall-clock time.
    """
    beginTime = datetime.datetime.now()
    filename = './public/'
    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)

    skf = StratifiedKFold(classLables, k_fold_num)
    acc_per_fold = []
    f1_per_fold = []
    recall_per_fold = []
    precision_per_fold = []

    for train_index, test_index in skf:
        print("train_index->", train_index)
        print("test_index->", test_index)

        # Build each per-fold subset once instead of on every use.
        trainMails = [mailWords[i] for i in train_index]
        trainLabels = [classLables[i] for i in train_index]
        testMails = [mailWords[i] for i in test_index]
        testLabels = [classLables[i] for i in test_index]

        preVocabularyList = naiveBayes.createVocabularyList(trainMails)
        # do wfo filter
        vocabularyList = naiveBayes.wfoFilter(preVocabularyList, trainMails,
                                              trainLabels)
        # NOTE(review): the filtered vocabulary is replaced by the unfiltered
        # one right away, so the filter result is unused here. Kept as-is to
        # preserve behaviour; confirm whether this override is intentional.
        vocabularyList = preVocabularyList
        print("length of vocabularyList", len(vocabularyList))
        # Context manager guarantees the file is closed even if a write fails.
        with open('vocabularyList.txt', 'w') as fw:
            fw.writelines(word + '\n' for word in vocabularyList)
        print("vocabularyList finished")

        trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
            vocabularyList, trainMails)
        print("trainMarkedWords finished")

        # change it to array
        trainMarkedWords = np.array(trainMarkedWords)
        print("data to matrix finished")
        # calculate the per-word probabilities for spam and ham,
        # P(wi|s) and P(wi|h), plus the spam prior
        pWordsSpamicity, pWordsHealthy, pSpam = \
            naiveBayes.trainingNaiveBayes(trainMarkedWords, trainLabels)
        with open('pSpam.txt', 'w') as fpSpam:
            fpSpam.write(str(pSpam))

        np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
        np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')

        # NOTE(review): predict() receives only the mails, so it presumably
        # reloads the model files written above -- the files must be fully
        # written and closed before this call; confirm against naiveBayes.
        predict = naiveBayes.predict(testMails)
        acc_per_fold.append(accuracy_score(testLabels, predict))
        f1_per_fold.append(f1_score(testLabels, predict))
        recall_per_fold.append(recall_score(testLabels, predict))
        precision_per_fold.append(precision_score(testLabels, predict))
        print("acc_per_fold:", acc_per_fold)
        print("f1_per_fold:", f1_per_fold)
        print("recall_per_fold:", recall_per_fold)
        print("precision_per_fold:", precision_per_fold)

    print("acc_per_fold:", acc_per_fold)
    print("f1_per_fold:", f1_per_fold)
    print("recall_per_fold:", recall_per_fold)
    print("precision_per_fold:", precision_per_fold)
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - beginTime))