# Example #1
# Cross-validation setup: stratified k-fold split over the mail corpus.
k_fold_num = 10
filename = './public/'
# Load every email as a list of words plus its spam/ham label.
mailWords, classLables = naiveBayes.loadMailData(filename)

skf = StratifiedKFold(classLables, k_fold_num)
# Per-fold metric accumulators.
acc_per_fold, f1_per_fold = [], []
recall_per_fold, precision_per_fold = [], []

for train_index, test_index in skf:
    print("train_index->", train_index)
    print("test_index->", test_index)

    # Materialize the training subset once instead of re-slicing it
    # for every helper call below.
    trainWords = [mailWords[i] for i in train_index]
    trainLabels = [classLables[i] for i in train_index]

    preVocabularyList = naiveBayes.createVocabularyList(trainWords)
    # Shrink the candidate vocabulary with the WFO feature-selection filter.
    vocabularyList = naiveBayes.wfoFilter(preVocabularyList, trainWords,
                                          trainLabels)

    print("vocabularyList finished")

    # Vectorize: one bag-of-words row per email, for train and test folds.
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList,
                                                         trainWords)
    print("trainMarkedWords finished")
    testMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, [mailWords[i] for i in test_index])
def trainingAdaboostGetDS(iterateNum=40, test_index=None):
    """Train the naive-Bayes spam model and tune per-word weights (DS)
    with an AdaBoost-style loop over a hold-out set.

    Args:
        iterateNum: maximum number of boosting passes over the hold-out set.
        test_index: indices of the mails that drive the weight updates.
            Defaults to the historical fixed hold-out split, so existing
            callers keep the old behavior.

    Returns:
        dict with the best weight vector ('DS'), its error rate
        ('minErrorRate'), the vocabulary ('vocabularyList') and the
        naive-Bayes probabilities ('pWordsSpamicity', 'pWordsHealthy',
        'pSpam').
    """
    if test_index is None:
        # Historical fixed hold-out split, kept as the default.
        test_index = [
            2, 6, 7, 8, 13, 16, 19, 29, 35, 37, 40, 42, 43, 45, 46, 49, 51,
            52, 64, 65, 71, 72, 78, 79, 80, 84, 85, 90, 91, 98, 103, 109,
            111, 117, 123, 129, 135, 138, 142, 149, 169, 188, 191, 192, 203,
            221, 225, 226, 229, 232, 236, 243, 250, 254, 257, 258, 259, 264,
            268, 281, 298, 300, 308, 319, 322, 329, 333, 335, 338, 339, 340,
            344, 347, 358, 359, 362, 382, 385, 391, 394, 402, 410, 415, 417,
            418, 422, 423, 424, 425, 428, 437, 441, 456, 461, 462, 470, 472,
            477, 480, 481
        ]
    filename = './public/'
    # Load data: all the words in all the emails, plus spam/ham labels.
    mailWords, classLables = naiveBayes.loadMailData(filename)

    preVocabularyList = naiveBayes.createVocabularyList(mailWords)
    # Shrink the vocabulary with the WFO feature-selection filter.
    vocabularyList = naiveBayes.wfoFilter(preVocabularyList, mailWords,
                                          classLables)
    print("length of vocabularyList", len(vocabularyList))

    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, mailWords)
    print("trainMarkedWords finished")

    # Change it to an array for vectorized training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("data to matrix finished")
    # Calculate each probability of spam and ham: P(wi|s), P(wi|h).
    pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    # One multiplicative weight per vocabulary word, all starting at 1.
    DS = np.ones(len(vocabularyList))

    ds_result = {}
    minErrorRate = np.inf
    for i in range(iterateNum):
        errorCount = 0.0
        for j in test_index:
            testWordsCount = naiveBayes.setOfWordsToVecTor(
                vocabularyList, mailWords[j])
            testWordsMarkedArray = np.array(testWordsCount)
            ps, ph, mailType = naiveBayes.adaboostClassify(
                vocabularyList, pWordsSpamicity, pWordsHealthy, DS, pSpam,
                testWordsMarkedArray)

            if mailType != classLables[j]:
                errorCount += 1
                # Update only the weights of words present in the
                # misclassified mail, in the direction of the error.
                alpha = ps - ph
                if alpha > 0:  # actual: ham; predict: spam
                    DS[testWordsMarkedArray != 0] = np.abs(
                        (DS[testWordsMarkedArray != 0] - np.exp(alpha)) /
                        DS[testWordsMarkedArray != 0])
                else:  # actual: spam; predict: ham
                    DS[testWordsMarkedArray != 0] = (
                        DS[testWordsMarkedArray != 0] +
                        np.exp(alpha)) / DS[testWordsMarkedArray != 0]

        print('DS:', DS)
        # NOTE(review): only len(test_index) mails are classified per pass,
        # but the rate divides by len(mailWords) — kept as-is for backward
        # compatibility; confirm which denominator is intended.
        errorRate = errorCount / len(mailWords)
        if errorRate < minErrorRate:
            minErrorRate = errorRate
            ds_result['minErrorRate'] = minErrorRate
            # BUG FIX: store a snapshot, not a live reference — DS keeps
            # being mutated in place on later iterations, which used to
            # corrupt the saved "best" weight vector.
            ds_result['DS'] = DS.copy()
        print('# %d,errorcount %d ,errorrate %f' % (i, errorCount, errorRate))
        if errorRate == 0.0:
            break

    ds_result['vocabularyList'] = vocabularyList
    ds_result['pWordsSpamicity'] = pWordsSpamicity
    ds_result['pWordsHealthy'] = pWordsHealthy
    ds_result['pSpam'] = pSpam
    return ds_result
# Example #3 — file: train.py, project: zetaby/spam-filter
import numpy as np
import datetime
import simpleNavie as naiveBayes

# Timestamp the start so total runtime can be measured later.
beginTime = datetime.datetime.now()
filename = './public/'
# Load every email as a list of words plus its spam/ham label.
smsWords, classLables = naiveBayes.loadMailData(filename)

# Build the non-repeated candidate features (vocabulary) ...
preVocabularyList = naiveBayes.createVocabularyList(smsWords)
# ... then shrink it with the WFO feature-selection filter.
vocabularyList = naiveBayes.wfoFilter(preVocabularyList, smsWords, classLables)

print("length of vocabularyList", len(vocabularyList))
# Persist the selected vocabulary, one word per line.
with open('vocabularyList.txt', 'w') as fw:
    fw.writelines(word + '\n' for word in vocabularyList)
print("vocabularyList finished")

# Change to vectors: each email becomes one bag-of-words row.
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("trainMarkedWords finished")
# Change it to an array for vectorized training.
trainMarkedWords = np.array(trainMarkedWords)
print("data to matrix finished")

# Estimate each probability of spam and ham: P(wi|s), P(wi|h).
pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
    trainMarkedWords, classLables)
print("length of pWordsSpamicity:", len(pWordsSpamicity))
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score, log_loss)
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import simpleNavie as naiveBayes


filename = './public/'
# Load every email as a list of words plus its spam/ham label.
mailWords, classLables = naiveBayes.loadMailData(filename)
preVocabularyList = naiveBayes.createVocabularyList(mailWords)
# Shrink the vocabulary with the WFO feature-selection filter.
vocabularyList = naiveBayes.wfoFilter(preVocabularyList, mailWords, classLables)
print("vocabularyList finished")

# Vectorize: one bag-of-words row per email.
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, mailWords)
print("trainMarkedWords finished")

test_index = [2, 6, 7]

# NOTE(review): the training set below includes every sample — the
# test indices are NOT held out, so scores on X_test measure training
# fit, not generalization. Kept as-is; confirm whether this is intended.
X_train = [trainMarkedWords[i] for i in range(len(trainMarkedWords))]
X_test = [trainMarkedWords[i] for i in test_index]
y_train = [classLables[i] for i in range(len(trainMarkedWords))]
y_test = [classLables[i] for i in test_index]

lr = LogisticRegression()
def crossValidateEvaluate():
    """Run stratified k-fold cross-validation of the naive-Bayes spam
    filter and print per-fold accuracy, F1, recall and precision.

    Relies on module-level names being in scope: ``k_fold_num``,
    ``StratifiedKFold`` and the sklearn metric functions.  Side effects:
    writes vocabularyList.txt, pSpam.txt, pWordsSpamicity.txt and
    pWordsHealthy.txt on every fold (overwriting the previous fold's
    files).
    """
    beginTime = datetime.datetime.now()
    filename = './public/'
    # Load data: all the words in all the emails, plus spam/ham labels.
    mailWords, classLables = naiveBayes.loadMailData(filename)

    skf = StratifiedKFold(classLables, k_fold_num)
    acc_per_fold = []
    f1_per_fold = []
    recall_per_fold = []
    precision_per_fold = []

    for train_index, test_index in skf:
        print("train_index->", train_index)
        print("test_index->", test_index)
        # Materialize the training subset once for the helper calls below.
        trainWords = [mailWords[i] for i in train_index]
        trainLabels = [classLables[i] for i in train_index]
        preVocabularyList = naiveBayes.createVocabularyList(trainWords)
        # Do the WFO feature-selection filter.
        # BUG FIX: the filtered result used to be immediately overwritten
        # with the unfiltered preVocabularyList (a leftover debug line),
        # which silently disabled the WFO filter; keep the filtered list.
        vocabularyList = naiveBayes.wfoFilter(preVocabularyList, trainWords,
                                              trainLabels)
        print("length of vocabularyList", len(vocabularyList))
        # Persist the selected vocabulary, one word per line.
        with open('vocabularyList.txt', 'w') as fw:
            fw.writelines(word + '\n' for word in vocabularyList)
        print("vocabularyList finished")

        trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
            vocabularyList, trainWords)
        print("trainMarkedWords finished")

        # Change it to an array for vectorized training.
        trainMarkedWords = np.array(trainMarkedWords)
        print("data to matrix finished")
        # Calculate each probability of spam and ham: P(wi|s), P(wi|h).
        pWordsSpamicity, pWordsHealthy, pSpam = \
            naiveBayes.trainingNaiveBayes(trainMarkedWords, trainLabels)
        # Persist the model; naiveBayes.predict presumably reads these
        # files back — keep the writes before the predict call (TODO
        # confirm against simpleNavie).
        with open('pSpam.txt', 'w') as fpSpam:
            fpSpam.write(str(pSpam))

        np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
        np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')

        testLabels = [classLables[i] for i in test_index]
        predict = naiveBayes.predict([mailWords[i] for i in test_index])
        acc_per_fold.append(accuracy_score(testLabels, predict))
        f1_per_fold.append(f1_score(testLabels, predict))
        recall_per_fold.append(recall_score(testLabels, predict))
        precision_per_fold.append(precision_score(testLabels, predict))
        print("acc_per_fold:", acc_per_fold)
        print("f1_per_fold:", f1_per_fold)
        print("recall_per_fold:", recall_per_fold)
        print("precision_per_fold:", precision_per_fold)

    # Final summary over all folds, plus total wall-clock time.
    print("acc_per_fold:", acc_per_fold)
    print("f1_per_fold:", f1_per_fold)
    print("recall_per_fold:", recall_per_fold)
    print("precision_per_fold:", precision_per_fold)
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - beginTime))