Example #1
def tarea1(entrenamiento, prueba):
    # entrenamiento = training set, prueba = test set
    d = Main()
    (t_0, t_1) = d.split(entrenamiento)
    nb = NaiveBayes.NaiveBayes(entrenamiento, t_1, t_0, prueba)
    nb.plot()
    b = Bayes.Bayes(entrenamiento, t_1, t_0, prueba)
    b.plot()
    return
Example #2
def test():
    labels = []
    label_ids = set()
    doc_matrix = []

    dir = 'data/'
    train_file = 'train.txt'
    test_file = 'test.txt'
    word_set_file = 'all_words.txt'
    model_file = 'model.txt'

    with open(dir + train_file) as f:
        for l in f:
            l = l.replace('\n','')
            if l == '':
                continue
            comps = l.split('\t')
            assert(len(comps) == 2)
            if comps[1] == '':
                continue
            labels.append(comps[0])
            doc_matrix.append(comps[1].split(','))
            label_ids.add(comps[0])
    bayes_model = Bayes.Bayes(dir + word_set_file)
    bayes_model.train(doc_matrix, labels, list(label_ids), dir + model_file)

    #open the test file
    expect_labels = []
    predict_docs = []
    with open(dir + test_file) as f:
        for l in f:
            l = l.replace('\n', '')
            if l == '':
                continue
            comps = l.split('\t')
            assert(len(comps) == 2)
            if comps[1] == '':
                continue
            expect_labels.append(comps[0])
            predict_docs.append(comps[1].split(','))
    predict_labels = bayes_model.predict(dir + model_file, predict_docs)
    post_analysis(predict_labels, expect_labels)
Example #3
File: 3b.py  Project: sunnyeyre/ML
from Bayes import *
import commands
import re

print '3b'
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

print '3c'

c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

print '3d'
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
    s_test = []
    s_train = []
    with open('../data/arxiv/arxiv.norm.train', 'r') as f:
        for i, l in enumerate(f):
            if i % nfold == d:
                s_test.append(l)
            else:
                s_train.append(l)
    with open('../data/arxiv/arxiv.norm%d.test' % d, 'w') as test:
        for t in s_test:
            test.write(t)
        print("Dictionary classes created")

        print("Creating and completing positive and negative dictionaries...")
        if SIZED_DCT is False:
            dictionary1.create_dictionary()
            print(f"Positive dictionary created")
            dictionary0.create_dictionary()
            print(f"Negative dictionary created \n \n")
        else:
            dictionary1.create_sized_dictionary(SIZE)
            print("Positive dictionary created")
            dictionary0.create_sized_dictionary(SIZE)
            print("Negative dictionary created \n \n")

        print("Creating BAYES class...")
        bayes = Bayes(dictionary1, dictionary0, testing_set)
        print("Bayes class created")

        print("Predicting sentiments for testing set...")
        nb_undetermined = bayes.predict_sentiments(LAPLACE_SMOOTHING,
                                                   pos_spl_nb, neg_spl_nb)
        print("Prediction of sentiments for testing set done")
        print(
            f"Number of tweets with undetermined sentiments : {nb_undetermined}"
        )

        print(
            "Comparing sentiments from the dataset with predicted sentiments..."
        )
        metrics, conf_matrix = bayes.compare_sentiments()
Example #5

import SVM
import lr
import Bayes
import LDA
LDA.LDA()
Bayes.Bayes()
SVM.svmwch()
lr.lr()
Example #6
def performBayes(inputDataClass,
                 drawPrecisionRecall=False,
                 drawConfusion=False):
    """################################# Bayes Classifier #############################################"""

    ##Sklearn
    # print("\nSklearn Naive Bayes")
    # clf = GaussianNB()
    # clf.fit(inputDataClass.Train[:,:-1], inputDataClass.Train[:,-1])

    # Ypred = clf.predict(inputDataClass.Train[:,:-1])
    # Ytrue = inputDataClass.Train[:,-1]
    # print("Training Accuracy = "+str(performanceAnalyser.calcAccuracyTotal(Ypred,Ytrue)))

    # Ypred = clf.predict(inputDataClass.Test[:,:-1])
    # Ytrue = inputDataClass.Test[:,-1]
    # print("Testing Accuracy = "+str(performanceAnalyser.calcAccuracyTotal(Ypred,Ytrue)))

    print("\nMy Naive Bayes")
    bayesClassifier = Bayes.Bayes(
        isNaive=False,
        distribution=[0 for i in range(inputDataClass.Train.shape[1] - 1)])
    # bayesClassifier = Bayes.Bayes(isNaive = True, distribution =[-1,0,0,1,1,0])
    bayesClassifier.train(inputDataClass.Train)
    print("Training of model done.")

    Ypred = bayesClassifier.fit(inputDataClass.Train)
    Ytrue = inputDataClass.Train[:, -1]
    print("Training Accuracy = " +
          str(performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)))

    Ypred = bayesClassifier.fit(inputDataClass.Test)
    Ytrue = inputDataClass.Test[:, -1]
    print("Testing Accuracy = " +
          str(performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)))

    print("Prediction done.")

    if drawConfusion:
        confusion = performanceAnalyser.getConfusionMatrix(Ytrue, Ypred)
        Visualization.visualizeConfusion(confusion)

    if drawPrecisionRecall:
        ############################ precision-recall curve #############################
        threshold = np.arange(0.9, 0.1, -0.1)
        probas = bayesClassifier.get_probas()
        for dic in probas:
            sums = 0.0
            for item in dic:
                sums += dic[item]
            for item in dic:
                dic[item] = dic[item] / sums
        roc = ROC.Roc(Ytrue, probas, threshold, '')
        roc.Roc_gen()

        precision, recall, _ = precision_recall_curve(Ytrue, probas)

        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision Recall Curve')

    return Ytrue, Ypred
Example #7
def main():
    path_boy = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boynew.txt"
    path_girl = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girlnew.txt"
    # height = []
    # weight = []
    # feetsize = []
    x_boy = []
    x_girl = []
    label_boy = []  # 1 = male, 0 = female
    label_girl = []
    readdata1(path_boy, x_boy, label_boy, 1)
    readdata1(path_girl, x_girl, label_girl, 0)
    x_boy = np.mat(x_boy)
    x_girl = np.mat(x_girl)
    m1 = x_boy.mean(0)
    m0 = x_girl.mean(0)
    S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
    S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
    Sw = S1 + S0
    S_inverse = Sw.I
    W = S_inverse * (m1 - m0).T
    M1 = float(W.T * m1.T)
    M0 = float(W.T * m0.T)
    w_decision0 = (M0 + M1) / 2
    path_boy_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boy.txt"
    path_girl_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girl.txt"
    x = []
    label = []
    readdata1(path_boy_test, x, label, 1)
    readdata1(path_girl_test, x, label, 0)
    label_test = []
    y = x * W
    errorcount = 0
    for i in range(len(label)):
        if float(y[i]) > w_decision0:
            label_test.append(1)
            if label[i] != 1:
                errorcount = errorcount + 1
        else:
            label_test.append(0)
            if label[i] != 0:
                errorcount = errorcount + 1

    e_percentage = errorcount / len(label_test)
    print('fisher test set error rate: %f' % e_percentage)

    # leave-one-out cross-validation
    loo = LeavePOut(p=1)
    error = 0
    for train, test in loo.split(x, label):
        x_boy = []
        x_girl = []
        label_boy = []  # 1 = male, 0 = female
        label_girl = []
        for i in train:
            if label[i] == 1:
                x_boy.append(x[i])
                label_boy.append(1)
            else:
                x_girl.append(x[i])
                label_girl.append(0)
        x_boy = np.mat(x_boy)
        x_girl = np.mat(x_girl)
        m1 = x_boy.mean(0)
        m0 = x_girl.mean(0)
        S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
        S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
        Sw = S1 + S0
        S_inverse = Sw.I
        W = S_inverse * (m1 - m0).T
        M1 = float(W.T * m1.T)
        M0 = float(W.T * m0.T)
        w_decision0 = (M0 + M1) / 2

        for j in test:
            if float(x[j] * W) > w_decision0:
                if label[j] != 1:
                    error = error + 1
            else:
                if label[j] != 0:
                    error = error + 1

    print('fisher leave-one-out error rate: %f' % (error / len(label)))

    figure(3)
    FPR, TPR = get_roc_fisher(W, w_decision0, x, label)
    plot(FPR, TPR, label='fisher')

    figure(5)
    x1 = np.arange(130, 190, 0.01)
    y1 = (w_decision0 - W[0] * x1) / W[1]
    plot(x1, array(y1)[0])
    plot(x1, x1 * float(W[1]) / float(W[0]))
    for i in range(len(label)):
        if label[i] == 1:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='r')
        else:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='g')
        a=(float(x[i][1])+float(x[i][0])*float(W[0])/float(W[1]))/\
            (float(W[1])/float(W[0])+float(W[0])/float(W[1]))
        b = a * float(W[1]) / float(W[0])
        plot([float(x[i][0]), a], [float(x[i][1]), b], '--', color='0.75')

    axis([140, 190, 35, 85])

    Bayes()
Example #8
class Test(object):

    bayes = Bayes.Bayes()

    def testingNB(self):
        listOPosts, listClasses = self.bayes.loadDataSet()
        myVocabList = self.bayes.createVocabList(listOPosts)
        trainMat = []
        for postinDoc in listOPosts:
            trainMat.append(self.bayes.setOfWords2Vec(myVocabList, postinDoc))
        p0V, p1V, pAb = self.bayes.trainNB0(array(trainMat),
                                            array(listClasses))
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = array(self.bayes.setOfWords2Vec(myVocabList, testEntry))
        print testEntry, 'classified as: ', self.bayes.classifyNB(
            thisDoc, p0V, p1V, pAb)
        testEntry = ['stupid', 'garbage']
        thisDoc = array(self.bayes.setOfWords2Vec(myVocabList, testEntry))
        print testEntry, 'classified as: ', self.bayes.classifyNB(
            thisDoc, p0V, p1V, pAb)

    # test function
    def spamTest(self):
        docList = []
        classList = []
        fullText = []
        for i in range(1, 26):
            # load each file and parse it into a list of words
            wordList = self.bayes.textParse(
                open(Config.DATAS + 'NaiveBayes/email/spam/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            wordList = self.bayes.textParse(
                open(Config.DATAS + 'NaiveBayes/email/ham/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        vocabList = self.bayes.createVocabList(docList)  # create vocabulary
        trainingSet = range(50)
        testSet = []  # randomly build the test set
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del (trainingSet[randIndex])
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  # train the classifier
            trainMat.append(
                self.bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V, p1V, pSpam = self.bayes.trainNB0(array(trainMat),
                                              array(trainClasses))
        errorCount = 0
        for docIndex in testSet:  # classify the remaining items
            wordVector = self.bayes.bagOfWords2VecMN(vocabList,
                                                     docList[docIndex])
            if self.bayes.classifyNB(array(wordVector), p0V, p1V,
                                     pSpam) != classList[docIndex]:
                errorCount += 1
                print "classification error", docList[docIndex]
        e = float(errorCount) / len(testSet)
        print 'the error rate is: ', e
        return e