Example #1
import random

import Bayes  # naive-Bayes helpers in the style of Machine Learning in Action (assumed importable)

def spamTest():
    # loadDataSet is assumed to parse every email file in the given directory into a word list
    hamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\ham")  # raw string, since the path contains backslashes
    hamclassList = [0] * len(hamemail)
    spamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\spam")
    spamclassList = [1] * len(spamemail)
    Allemail = []
    Allemail.extend(hamemail)
    Allemail.extend(spamemail)
    AllList = []
    AllList.extend(hamclassList)
    AllList.extend(spamclassList)
    VocalbList = Bayes.createVocabList(Allemail)
    # print(VocalbList)
    testMat = []
    realclass = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(Allemail)))
        testMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[randIndex]))
        del (Allemail[randIndex])
        realclass.append(AllList[randIndex])
        del (AllList[randIndex])
    trainMat = []
    for i in range(len(Allemail)):
        trainMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[i]))
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, AllList)
    # print(p0vect,'\n',p1vect,'\n',pA)
    for i in range(10):
        print("test_result=", Bayes.classifyNB(testMat[i], p0vect, p1vect, pA),
              ",real_result=", realclass[i])
def spamTestOfvoc():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):  # 50 files in total: 25 spam, 25 ham
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)  # add one parsed word-list sample to docList
        fullText.extend(wordList)  # extend() keeps fullText a flat list of single words; see https://www.cnblogs.com/tzuxung/p/5706245.html
        classList.append(1)  # every sample under spam/ is spam
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)  # vocabulary of all words (features)
    trainingSet = list(range(50))  # ids 0-49; list() so elements can be deleted on Python 3
    testSet = []
    for i in range(10):  # hold out 10 samples for testing
        randIndex = int(random.uniform(0, len(trainingSet)))  # random index in [0, len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # remove the test sample from the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # build the training matrix and its labels
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # measure the error rate on the held-out samples
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTestOfbag():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # extend() keeps fullText a flat list of single words; see https://www.cnblogs.com/tzuxung/p/5706245.html
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)
    trainingSet = list(range(50)); testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWord2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
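The helpers these snippets call but never define (textParse, createVocabList, setOfWords2Vec, bagOfWords2Vec) follow the well-known Machine Learning in Action implementations. Minimal sketches under that assumption, not code from the original projects; spamTestOfvoc and spamTestOfbag differ only in whether the vectorizer records presence or counts:

import re

def textParse(bigString):
    # split on non-word characters; keep lower-cased tokens longer than 2 characters
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of every document's words
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1   # set-of-words: presence only
    return returnVec

def bagOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # bag-of-words: occurrence counts
    return returnVec

bagOfWord2VecMN, used by spamTestOfbag above, appears to be the same counting vectorizer under a different name; counts matter when repeated words carry signal.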
Example #4
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('testDemo/email/span/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('testDemo/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = Bayes.createVocabList(docList)
    trainingSet = list(range(50))  # list() so elements can be deleted below
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    # train once on the remaining 40 samples, then evaluate on the 10 held out
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
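Every variant above hands its training matrix to Bayes.trainNB0. A minimal sketch of what that function is assumed to compute, following the Machine Learning in Action version: Laplace-smoothed per-class word probabilities in log space, plus the class prior.

from numpy import ones, log

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior P(class = 1)
    p0Num = ones(numWords); p1Num = ones(numWords)       # Laplace smoothing: counts start at 1
    p0Denom = 2.0; p1Denom = 2.0                         # ...and denominators at 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # log probabilities guard against underflow
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

Starting the counts at 1 keeps a single unseen word from zeroing out an entire class posterior.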
Example #5
def calculate_accurracy(root, noOfAcids, kMers, train_file, test_file, laplace_alpha, train_end_index = -1):
    csv_path = os.path.join(root, test_file)
    test_x, test_y = bs._load_dataset(csv_path)
    res, _, _ = bs.result_bayes(root, train_file, test_x, kMers, noOfAcids, laplace_alpha, train_end_index)
    #Indices of test samples whose true label is cleavable
    trueIndices = np.where(np.array(test_y) == 1)
    #Indices of test samples whose true label is nonCleavable
    falseIndices = np.where(np.array(test_y) == 0)
    #Generate results
    accuracy = ((np.sum(res[0,trueIndices]) + (np.size(falseIndices) - np.sum(res[0,falseIndices])))/len(test_x))
    return accuracy
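The accuracy expression above is just (true positives + true negatives) / N, where res carries a row of 0/1 predictions. A small self-contained check of the same arithmetic, with purely illustrative values:

import numpy as np

preds = np.array([1, 0, 1, 1, 0])   # a res-style row of 0/1 predictions
labels = np.array([1, 0, 0, 1, 1])  # the true test_y labels
trueIdx = np.where(labels == 1)[0]
falseIdx = np.where(labels == 0)[0]
tp = np.sum(preds[trueIdx])                   # predicted 1 among the true 1s
tn = falseIdx.size - np.sum(preds[falseIdx])  # predicted 0 among the true 0s
print((tp + tn) / labels.size)                # 0.6, identical to np.mean(preds == labels)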
Example #6
File: TextParser.py Project: baojiong/ml
def spamTest():
    docList = []
    classList = []
    fullText = []

    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = bayes.createVocabList(docList)
    """
    trainingSet =  [1, 49]
    生成10个50以内的随机数,加入testSet
    从trainingSet中删掉这些数。
    结果就是把【1...49],1分为2,10个作为 testSet, 其他作为 trainingSet
    trainingSet = [0, 1, 2, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 38, 39, 41, 43, 44, 45, 46, 47, 48, 49]
    testSet = [36, 3, 40, 31, 10, 42, 7, 37, 15, 34]
    """
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0

    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1

    print 'the error rate is: ', float(errorCount) / len(testSet)
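classifyNB, relied on by every spamTest variant, compares the two class log-posteriors and picks the larger. A minimal sketch under the same Machine Learning in Action assumption (vec2Classify is a numpy word vector):

from numpy import log

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # sums of logs replace products of probabilities, so nothing underflows
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

With the sketches above, classifyNB(array(wordVector), p0V, p1V, pSpam) returns 1 for spam and 0 for ham.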
Example #7
def eigenface(trainData, testData, dataVariety):
    # standardize train data
    dropTrainData = trainData.drop("variety", axis=1)
    trainMean = dropTrainData.sum()
    trainMean = trainMean.values.reshape([dropTrainData.shape[1], 1])
    trainMean = trainMean / dropTrainData.shape[0]
    newtrainData = PCA.normalize(trainData, trainMean)

    # calculate xT * x and its eigenvector
    normTrainData = newtrainData.drop("variety", axis=1)
    normTrainData = np.array(normTrainData)
    X = np.transpose(normTrainData)

    tempMat = np.zeros([X.shape[1], X.shape[1]])
    np.matmul(np.transpose(X), X, tempMat)
    eigValX, eigVecX = np.linalg.eigh(tempMat)

    # calculate X * eigenvector
    newEigVecX = np.matmul(X, eigVecX)

    # normalize eigenvector
    newEigVecX = np.transpose(newEigVecX)
    length = np.linalg.norm(newEigVecX, axis=1)
    for i in range(newEigVecX.shape[0]):
        newEigVecX[i] /= length[i]
    normEigVec = np.transpose(newEigVecX)


    # calculate A
    L = 20
    maxEigIdx = np.argsort(-eigValX)
    A = []
    for i in range(L):
        A.append(normEigVec[:, maxEigIdx[i]])
    A = np.array(A)
    A = np.transpose(A)

    newtestData = PCA.normalize(testData, trainMean)

    # projection of train data
    projTrainFrame = PCA.project(A, newtrainData)

    # projection of test data
    projTestFrame = PCA.project(A, newtestData)

    # # classify test data by likelihood
    # g1, testIdx1, success1, confusion_mat1 = Likelihood.likelihood(projTrainFrame, projTestFrame, dataVariety)
    # Header.calAccuracy(success1, projTestFrame)
    # Header.ROC_AUC(projTestFrame, dataVariety, g1, testIdx1)
    # Header.drawConfusionMat(confusion_mat1, dataVariety)

    # classify test data by bayes
    names = []
    for i in range(projTestFrame.shape[1] - 1):
        names.append('0')
    names.append('variety')
    g2, testIdx2, success2, confusion_mat2 = Bayes.bayes(projTrainFrame, projTestFrame, dataVariety, names)
    Header.calAccuracy(success2, projTestFrame)
    Header.drawConfusionMat(confusion_mat2, dataVariety)
Example #8
def tarea1(entrenamiento, prueba):
    d = Main()
    (t_0, t_1) = d.split(entrenamiento)
    nb = NaiveBayes.NaiveBayes(entrenamiento, t_1, t_0, prueba)
    nb.plot()
    b = Bayes.Bayes(entrenamiento, t_1, t_0, prueba)
    b.plot()
    return
Example #9
def testandscore(word):
    word_array = bayes.build_word_array(word)
    asfaiajioaf = bayes.setOfWordsListToVecTor(vocabList, word_array)
    aa, bb = ada_real.predict(asfaiajioaf)[0], ada_real.predict_proba(
        asfaiajioaf)[0]
    total = {}
    total["type"] = int(aa)  # 需要转化一下int跟int32是不同的,int32不能序列化
    temp = []
    ggg = {}
    ccc = {}
    ddd = {}
    print(len(str(bb[0])))
    print("end")

    a = float('%.5f' % bb[0])
    b = float('%.5f' % bb[1])
    c = float('%.5f' % bb[2])
    max_value = str(max([a, b, c]))
    min_value = str(min([a, b, c]))
    same = ''
    for i in range(0, len(min_value)):
        if max_value[i] == min_value[i]:
            same = same + min_value[i]
        else:
            break
    print(same)
    kkkk = pow(10, (len(same) - 2))
    a = (a - float(same)) * kkkk
    b = (b - float(same)) * kkkk
    c = (c - float(same)) * kkkk
    a = float('%.5f' % a)
    b = float('%.5f' % b)
    c = float('%.5f' % c)
    print(a, b, c)
    ggg["key"] = "正向"
    ggg["value"] = a
    ccc["key"] = "负向"
    ccc["value"] = b
    ddd["key"] = "客观"
    ddd["value"] = c
    temp.append(ggg)
    temp.append(ccc)
    temp.append(ddd)
    total["data"] = temp
    return total
Example #10
def construction():
    try:
        s = seg.Segmentation()
        s.segmentation()

        b = by.NBayes()
        b.initForTest()
    except Exception, e:
        raise
Example #11
def part2(root = './Dataset', trainfile = 'q2_train_set.txt', gagfile = 'q2_gag_sequence.txt'):
    def create_8mers(filename):
        with open(filename, 'r') as file:
            data = list(file.read())
            _8mer = [None] * kMers
            _8mers = [None] * (len(data) - kMers + 1)
            for char_i in range(len(data) - kMers + 1):
                for i in range(kMers):
                    _8mer[i] = data[i + char_i]
                _8mers[char_i] = _8mer
                _8mer = [None] * kMers
            return _8mers, len(data) - kMers + 1
        
    def read_amino_sequence(filename):
        with open(filename, 'r') as file:
            return list(file.read())
        
    def onehot_initialization(a):
        ncols = a.max()+1
        out = np.zeros(a.shape + (ncols,), dtype=int)
        out[all_idx(a, axis=2)] = 1
        return out
    
    def all_idx(idx, axis):
        grid = np.ogrid[tuple(map(slice, idx.shape))]
        grid.insert(axis, idx)
        return tuple(grid)
    
    noOfAcids = 20
    kMers = 8
    #Load Datasets
    gag_path = os.path.join(root, gagfile)
    mers, noOfMers = create_8mers(gag_path)
    aa_names_arr = ["g", "p", "a", "v", "l", "i", "m", "c", "f", "y", "w", "h", "k",
                "r", "q", "n", "e", "d", "s", "t"]
    aa_names = dict(zip(aa_names_arr, range(len(aa_names_arr))))
    mers = np.matrix([[aa_names[x] for x in mer_i] for mer_i in mers])
    mers_encoded = onehot_initialization(mers).reshape(noOfMers, noOfAcids * kMers)
    res, res1, res2 = bs.result_bayes(root, trainfile, mers_encoded, kMers, noOfAcids, 0)
    cleavableMers = np.where(res == 1 )[1]
    cleavableIndicesPrev = cleavableMers + 3
    cleavableIndicesNext = cleavableIndicesPrev + 1
    am_seq = read_amino_sequence(gag_path)
    cleaveAminoPrev = np.array(am_seq)[cleavableIndicesPrev]
    cleaveAminoNext = np.array(am_seq)[cleavableIndicesNext]
    cleavableAminoPairs = list(map(lambda x, y:(x,y), cleaveAminoPrev, cleaveAminoNext))
    cleavableIndexPairs = list(map(lambda x, y:(x,y), cleavableIndicesPrev, cleavableIndicesNext))
    maxCleavableIndex = np.where(res1 == np.max(res1[np.where(res == 1)]))[1]
    minNonCleavableIndex = np.where(res2 ==np.min(res2[np.where(res == 0)]))[1]
    maxCleavable8mer = [aa_names_arr[x] for x in np.squeeze(np.asarray(mers[maxCleavableIndex]))]
    minNonCleavable8mer = [aa_names_arr[x] for x in np.squeeze(np.asarray(mers[minNonCleavableIndex]))]
    print("cleavableAminoPairs:\n", cleavableAminoPairs, "\ncleavableIndexPairs:\n", cleavableIndexPairs)
    print("maxCleavableIndex:\n", maxCleavableIndex, "\nminCleavableIndex:\n", minNonCleavableIndex)
    print("maxCleavable8mer:\n", maxCleavable8mer, "\nminNonCleavable8mer:\n", minNonCleavable8mer)
Example #12
File: sqamEmail.py Project: chenruoxi/ML
def spamDict():

    docList = []  # list of parsed word lists, e.g. [['his','xx','xx'], ['xx','xx','xx'], ...]
    classList = []  # class of each mail: spam is 1, normal mail is 0
    for i in range(1, 26):
        with open('ham/%d.txt' % i) as f:
            wordList = textParse(f.read())
            docList.append(wordList)
            classList.append(0)  # ham
        with open('spam/%d.txt' % i) as f:
            wordList = textParse(f.read())
            docList.append(wordList)
            classList.append(1)  # spam
    vocabList = Bayes.createVocabList(docList)  # build the vocabulary from docList
    '''
      Randomly pick 10 of the 50 emails as the test set; the rest form the training set.
    '''
    trainingSet = range(50)
    testSet = []

    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMarix = []  # training matrix built from the training-set samples
    trainingClass = []  # class labels of the training samples
    for docIndex in trainingSet:
        trainMarix.append(Bayes.setOfWord2Vector(vocabList, docList[docIndex]))
        trainingClass.append(classList[docIndex])
    pAb, p1v, p0v = Bayes.TrainingNB1(array(trainMarix), array(trainingClass))

    errorCount = 0.0
    for docIndex in testSet:
        thisDoc = array(Bayes.setOfWord2Vector(vocabList, docList[docIndex]))

        if classifyNB(array(thisDoc), p0v, p1v, pAb) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is :', float(errorCount) / len(testSet)
Example #13
def localWords(feed1, feed0):
    docList = []  # every sample's word list, as a 2-D list
    classList = []  # every sample's class label
    fullText = []  # every sample's words, flattened into a 1-D list
    minLen = min(len(feed1['entries']), len(feed0['entries']))  # length of the shorter RSS feed
    for i in range(minLen):
        # parse feed1['entries'][i]['summary']: extract words longer than 2 characters, lower-cased
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)  # add this sample's words to docList
        fullText.extend(wordList)  # append this sample's words to fullText
        classList.append(1)  # record this sample's class in classList
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)  # list of all unique words in docList
    # Language is largely redundant and structural, so a small part of the vocabulary
    # accounts for a large share of all text; those high-frequency words are removed.
    # Another common approach also removes structural helper words taken from a
    # predefined list, known as a stop word list.
    top30Words = calcMostFreq(vocabList, fullText)  # the 30 most frequent words in fullText
    for pairW in top30Words:  # drop those 30 words from the vocabulary vocabList
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = range(2 * minLen)
    # list holding the training-sample ids
    print 'minLen : %d' % minLen
    if minLen < 20:
        print 'the len is too small.'
    testSet = []  # holds the test-sample ids
    for i in range(20):  # move 20 random samples from the training set into the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    # build the training set: word-count vectors go into trainMat, class labels into trainClasses
    for docIndex in trainingSet:
        # occurrence counts of docList[docIndex]'s words over the vocabulary vocabList
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, docList[docIndex]))
        # this sample's class label classList[docIndex]
        trainClasses.append(classList[docIndex])
    # train the naive-Bayes classifier on the training samples:
    # per-class word frequencies plus the class prior
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # evaluate the learned model on the test samples
    for docIndex in testSet:
        # occurrence counts of docList[docIndex]'s words over the vocabulary vocabList
        wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex])
        # classify the current test sample and compare against its known class
        if Bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)  # print the error rate
    return vocabList, p0V, p1V  # return the vocabulary and per-class word probabilities
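localWords calls calcMostFreq without defining it. A minimal sketch in the spirit of the Machine Learning in Action version; it returns (word, count) pairs, which is why the caller reads pairW[0]:

import operator

def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # occurrences of each vocabulary word
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # most frequent first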
def testingNB():
    listOPosts, listClasses = loadDataSet()
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'my', 'garbage']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
Example #15
def part1(root='./Dataset',
          trainfile='q2_train_set.txt',
          testfile='q2_test_set.txt'):
    #Load Datasets
    noOfMers = 8
    noOfAcids = 20
    csv_path = os.path.join(root, trainfile)
    train_x, train_y = bs._load_dataset(csv_path)
    csv_path = os.path.join(root, testfile)
    test_x, test_y = bs._load_dataset(csv_path)
    #Train
    myRes, _, _ = bs.result_bayes(root, trainfile, test_x, noOfMers, noOfAcids)
    #Indices of test samples whose true label is cleavable
    trueIndices = np.where(np.array(test_y) == 1)
    #Indices of test samples whose true label is nonCleavable
    falseIndices = np.where(np.array(test_y) == 0)
    #Generate results
    print("Real cleavable number:  \t",
          np.size(trueIndices), "\t Number predicted true cleavable:\t",
          np.sum(myRes[0, trueIndices]), "\t Accuracy:\t",
          np.sum(myRes[0, trueIndices]) / np.size(trueIndices))
    print("Real nonCleavable number:\t", np.size(falseIndices),
          "\t Number predicted true nonCleavable:\t",
          np.size(falseIndices) - np.sum(myRes[0, falseIndices]),
          "\t Accuracy:\t",
          (np.size(falseIndices) - np.sum(myRes[0, falseIndices])) /
          np.size(falseIndices))
    print(
        "Total test size:\t\t", len(test_x),
        "\t Number predicted true in total:\t",
        np.sum(myRes[0, trueIndices]) +
        (np.size(falseIndices) - np.sum(myRes[0, falseIndices])),
        "\t Accuracy:\t",
        ((np.sum(myRes[0, trueIndices]) +
          (np.size(falseIndices) - np.sum(myRes[0, falseIndices]))) /
         len(test_x)))
Example #16
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # parse every text file under the spam and ham folders
        # read each file as one string and parse it into a word list
        wordList = Bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)  # add this file's word list to docList
        fullText.extend(wordList)  # append all of this file's words to fullText
        classList.append(1)  # record class 1 in classList
        wordList = Bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # record class 0 in classList
    vocabList = Bayes.createVocabList(docList)  # vocabulary of every word appearing in docList
    trainingSet = range(50)  # the ids 0-49, one per text file under spam/ and ham/
    testSet = []
    for i in range(10):  # draw 10 test-sample ids
        # uniform() returns a random real number in the range [x, y),
        # so this yields a random index in [0, len(trainingSet))
        randIndex = int(random.uniform(0, len(trainingSet)))
        print randIndex
        # move the chosen sample id from trainingSet into testSet
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    # loop over the 40 training samples to gather per-class word statistics
    for docIndex in trainingSet:
        # 0/1 vector marking which vocabulary words occur in this document
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        # add the document's class label to trainClasses
        trainClasses.append(classList[docIndex])
    # per-class word probabilities and the overall probability of class 1
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    print classList
    errorCount = 0
    # check the Bayes classifier on the 10 test samples
    for docIndex in testSet:
        # 0/1 vector for the current test sample over vocabList
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        # classify the current test sample and check whether the result is correct
        if Bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)  # print the classification error rate
def localWord(feed0, feed1):

    minLen = min(len(feed1['entries']), len(feed0['entries']))
    listOfPost = []; classVec = []; fullText = []
    for i in range(minLen):
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])  # feed1's entries come first
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(1)

        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(0)

    vocabList = Bayes.creatVocabList(listOfPost)

    top30Words = calMostFreq(vocabList, fullText)

    for pairW in top30Words:
        if pairW in vocabList: vocabList.remove(pairW)

    trainingSet = range(2 * minLen); dataSet = []

    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))  # both bounds given, so index 0 can be drawn
        dataSet.append(trainingSet[randIndex])  # store the sample id, not the loop index
        del (trainingSet[randIndex])

    trainMat = []; trainClass = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex]))
        trainClass.append(classVec[docIndex])

    p0V, p1V, pSpam = Bayes.trainNB0(np.array(trainMat), trainClass)

    errorCount = 0.0

    for docIndex in dataSet:
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])

        if Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam) != classVec[docIndex]:
            errorCount += 1
    # print "the error rate is :", errorCount / float(len(dataSet))

    return vocabList, p0V, p1V
Example #18
def test():
    labels = []
    label_ids = set()
    doc_matrix = []

    dir = 'data/'
    train_file = 'train.txt'
    test_file = 'test.txt'
    word_set_file = 'all_words.txt'
    model_file = 'model.txt'

    with open(dir + train_file) as f:
        for l in f:
            l = l.replace('\n','')
            if l == '':
                continue
            comps = l.split('\t')
            assert(len(comps) == 2)
            if comps[1] == '':
                continue
            labels.append(comps[0])
            doc_matrix.append(comps[1].split(','))
            label_ids.add(comps[0])
    bayes_model = Bayes.Bayes(dir + word_set_file)
    bayes_model.train(doc_matrix, labels, list(label_ids), dir + model_file)

    #open the test file
    expect_labels = []
    predict_docs = []
    with open(dir + test_file) as f:
        for l in f:
            l = l.replace('\n', '')
            if l == '':
                continue
            comps = l.split('\t')
            if comps[1] == '':
                continue
            assert(len(comps) == 2)
            expect_labels.append(comps[0])
            predict_docs.append(comps[1].split(','))
    predict_labels = bayes_model.predict(dir + model_file, predict_docs)
    post_analysis(predict_labels, expect_labels)
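From the parsing loop, each non-empty line of train.txt and test.txt is expected to hold a label, a tab, and comma-separated tokens. A hypothetical two-line sample in that layout:

sports	ball,team,score
tech	cpu,cache,compiler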
Example #19
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = Bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        #trainMat.append(Bayes.setOfWords2Vec(myVocabList, postinDoc))
        trainMat.append(Bayes.bagOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry))
    thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', Bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry))
    thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', Bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
Example #20
    def setUp(self):
        self.naiveBayes = Bayes.NaiveBayes()
        posts, classes = self.loadDataSet()
        self.vocabList = self.naiveBayes.composeList(posts)
        logging.log(logging.INFO, "Vocabulary List: " + str(self.vocabList))

        #start = time.time()

        vecMatrix = []
        for post in posts:
            binarizedVocab = self.naiveBayes.binarize(self.vocabList, post)
            #logging.log(logging.DEBUG, "Post: " + str(post))
            logging.log(logging.DEBUG,
                        "Binarized vector: " + str(binarizedVocab))
            vecMatrix.append(binarizedVocab)

        #stop = time.time()
        #logging.log(logging.INFO, "Consume %s seconds" % str(stop - start))

        self.p0, self.p1, self.pAbusive = self.naiveBayes.train(
            vecMatrix, classes)

        logging.log(logging.INFO, "P0: \n" + str(self.p0))
        logging.log(logging.INFO, "P1: \n" + str(self.p1))
Example #21
File: 3b.py Project: sunnyeyre/ML
from Bayes import *
import commands
import re

print '3b'
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

print '3c'

c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

print '3d'
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
    s_test = []
    s_train = []
    with open('../data/arxiv/arxiv.norm.train', 'r') as f:
        for i, l in enumerate(f):
            if i % nfold == d:
                s_test.append(l)
            else:
                s_train.append(l)
    with open('../data/arxiv/arxiv.norm%d.test' % d, 'w') as test:
        for t in s_test:
            test.write(t)
Example #22
import Bayes
import feedparser

listOPosts, listClassed = Bayes.loadDataSet()

vocabList = Bayes.createVocabList(listOPosts)

# print vocabList

# print Bayes.setOfWordsToVec(vocabList, listOPosts[0])

# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(Bayes.setOfWordsToVec(vocabList, postinDoc))

# p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClassed)
# print p0V
# print p1V
# print pAb

# Bayes.testingNB()

# Bayes.spamTest("E:/TestDatas/MachineLearningInAction/Ch04/");

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

# vocabList, pNY, pSF = Bayes.localWords(ny, sf)

Bayes.getTopWords(ny, sf)
Example #23
    def learn(self, data=[], rare=""):
        """
    PREPROCESSING
    """
        if data == []:  # for test
            print "Enter name of file containing learning set: "
            learning_name = raw_input()
            learning_data = u.read_learning_data(learning_name, 0)
        else:
            learning_data = data
        fields = list(learning_data[0])
        num_fields = len(fields)
        num_nonevents = len(learning_data[0][fields[0]])
        num_events = len(learning_data[1][fields[0]])
        num_learning = num_nonevents + num_events

        if rare == "":
            print "Enter name of rare words file; leave blank to default to rares.txt"
            rare_name = raw_input()
            if rare_name == "":
                rare_name = "rares.txt"
            if not os.path.isfile(rare_name):
                print "Rare file does not exist. Set rares or check filename."
                return
        else:
            rare_name = rare
        """
    FEATURES
    """

        # get rares
        with open(rare_name) as rf:
            rares = rf.readlines()
        rares = [r.lower().rstrip() for r in rares]

        # get features
        learning_nonevent_features = [[] for i in xrange(num_nonevents)]
        learning_event_features = [[] for i in xrange(num_events)]

        for i in xrange(num_nonevents):
            for k in fields:
                learning_nonevent_features[i] = (
                    learning_nonevent_features[i] +
                    f.get_features(learning_data[0][k][i], k, rares))

        for i in xrange(num_events):
            for k in fields:
                learning_event_features[i] = (
                    learning_event_features[i] +
                    f.get_features(learning_data[1][k][i], k, rares))
        """
    DISTRIBUTION STORAGE
    """

        # get prior
        prior = (num_events / float(num_learning),
                 num_nonevents / float(num_learning))

        # get posterior
        event_by_features = u.by_features(learning_event_features)
        nonevent_by_features = u.by_features(learning_nonevent_features)
        event_posterior = [(b.mean(feature), b.stdev(feature))
                           for feature in event_by_features]
        nonevent_posterior = [(b.mean(feature), b.stdev(feature))
                              for feature in nonevent_by_features]

        # store in file
        distributionfile = open('distribution.txt', 'w')
        distributionfile.write("Prior (Event/Nonevent):\n")
        distributionfile.write(str(prior[0]) + " " + str(prior[1]) + "\n")
        distributionfile.write("Event posterior means: \n")
        for i in event_posterior:
            distributionfile.write(str(i[0]) + " ")
        distributionfile.write("\n")
        distributionfile.write("Event posterior stdevs: \n")
        for i in event_posterior:
            distributionfile.write(str(i[1]) + " ")
        distributionfile.write("\n")
        distributionfile.write("Nonevent posterior means: \n")
        for i in nonevent_posterior:
            distributionfile.write(str(i[0]) + " ")
        distributionfile.write("\n")
        distributionfile.write("Nonevent posterior stdevs: \n")
        for i in nonevent_posterior:
            distributionfile.write(str(i[1]) + " ")
        distributionfile.write("\n")
        distributionfile.close()
Example #24
    def guess(self, data=[], rare=""):
        """
    PREPROCESSING
    """

        if data == []:
            print "Enter name of file containing guess set: "
            guess_name = raw_input()
            guess_data = u.read_test_data(guess_name, 0)
        else:
            guess_data = data

        fields = list(guess_data)
        num_fields = len(fields)
        num_guesses = len(guess_data[fields[0]])

        if rare == "":
            print "Enter name of rare words file; leave blank to default to rares.txt"
            rare_name = raw_input()
            if rare_name == "":
                rare_name = "rares.txt"
            if not os.path.isfile(rare_name):
                print "Rare file does not exist. Set rares or check filename."
                return
        else:
            rare_name = rare
        """
    CALCULATE TEST FEATURES
    """

        # get rares
        with open(rare_name) as rf:
            rares = rf.readlines()
        rares = [r.lower().rstrip() for r in rares]

        guess_features = [[] for i in xrange(num_guesses)]
        for i in xrange(num_guesses):
            for k in fields:
                guess_features[i] = (
                    guess_features[i] +
                    f.get_features(guess_data[k][i], k, rares))
        """
    RETRIEVE DISTRIBUTION AND RUN BAYESIAN
    """

        if not os.path.isfile('distribution.txt'):
            print "Distribution not yet set. Run learn first."
            return

        try:
            distributionfile = open('distribution.txt', 'r')
            distributionfile.readline()  # prior title
            prior = map(float, distributionfile.readline().split())
            distributionfile.readline()  # event posterior means title
            event_means = map(float, distributionfile.readline().split())
            distributionfile.readline()  # event posterior stdevs title
            event_stdevs = map(float, distributionfile.readline().split())
            distributionfile.readline()  # nonevent posterior means title
            nonevent_means = map(float, distributionfile.readline().split())
            distributionfile.readline()  # nonevent posterior stdevs title
            nonevent_stdevs = map(float, distributionfile.readline().split())
            distributionfile.close()

            event_posterior = [(event_means[i], event_stdevs[i])
                               for i in xrange(len(event_means))]
            nonevent_posterior = [(nonevent_means[i], nonevent_stdevs[i])
                                  for i in xrange(len(nonevent_means))]
            two_posterior = (event_posterior, nonevent_posterior)
        except Exception as e:
            print e
            print "Problem reading distribution file. Rerun learn."
            return

        if len(guess_features[0]) != len(event_means):
            print "Number of features does not match distribution. Check guess set and rerun learn."
            return

        # # guess!
        guesses = [
            b.two_bayesian(prior, i, two_posterior) for i in guess_features
        ]

        # fi = open('skl.txt','r')
        # a = int(fi.readline().rstrip())
        # c = int(fi.readline().rstrip())
        # ef = [map(float, fi.readline().split()) for i in xrange(a)]
        # nef = [map(float, fi.readline().split()) for i in xrange(c)]
        # fi.close()
        # e = [1 for i in ef]
        # y = e + [0 for i in nef]
        # x = ef + nef
        # from sklearn.naive_bayes import MultinomialNB
        # clf = MultinomialNB()
        # clf.fit(x, y)
        # guesses = [clf.predict(i) for i in guess_features]
        """
    RESULTS
    """

        print "Is Event (First 40 Emails) | First 10 Words of Email Subject | First 10 Words of Email Body"
        for i in xrange(min(40, len(guesses))):
            subject = ' '.join(guess_data["subject"][i]
                               [:min(10, len(guess_data["subject"][i]))])
            message = ' '.join(guess_data["message"][i]
                               [:min(10, len(guess_data["message"][i]))])
            print guesses[i], " | ", subject, " | ", message

        # print "guesses: ", guesses
        # print "posteriror: ", two_posterior
        print "prior: ", prior

        return guesses
Example #26
def Bayes_id(root_dir):
    global video_type_num, feature_dim
    global emotion_type_num, valabel_type_num
    splits_num = 5
    #load all of the data first
    x = []
    y = []
    for i in range(splits_num):
        #read this split's data from its files
        subjects, videos = get_subject(root_dir + 'subject/subject_video_' +
                                       str(i) + '.txt')
        features = np.loadtxt(root_dir + 'feature/EEG_feature_' + str(i) +
                              '.txt')
        va_labels = np.loadtxt(root_dir + 'valabel/valence_arousal_label_' +
                               str(i) + '.txt',
                               dtype=int)
        if 'HCI' in root_dir:
            emotions = np.loadtxt(root_dir + 'emotion/EEG_emotion_category_' +
                                  str(i) + '.txt',
                                  dtype=int)

        #normalize
        # features /= np.max(features)

        #number of options per attribute, 0 marking a continuous value;
        #built once, since the layout is identical for every sample
        attri_option_nums = [video_type_num]
        attri_option_nums.extend(valabel_type_num)
        if 'HCI' in root_dir:
            attri_option_nums.append(emotion_type_num)
        attri_option_nums.extend([0] * feature_dim)

        x_temp = []
        y_temp = []
        length = len(subjects)
        for j in range(length):
            x_temp.append([])
            x_temp[j].append(videos[j])
            x_temp[j].extend(va_labels[j])
            if 'HCI' in root_dir:
                x_temp[j].append(emotions[j])
            x_temp[j].extend(features[j])
            y_temp.append(subjects[j] - 1)
        x.append(x_temp)
        y.append(y_temp)

    cvscores = []
    #cross-validation
    for i in range(splits_num):
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        for j in range(splits_num):
            if j == i:
                x_test.extend(x[j])
                y_test.extend(y[j])
            else:
                x_train.extend(x[j])
                y_train.extend(y[j])
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        model = Bayes.Mix_NB()
        model.fit(x_train, y_train, attri_option_nums)
        score = model.score(x_test, y_test)
        # temp = model.predict_log_proba(x_test)
        print("     %s: %.2f%%" % ('acc', score * 100))
        cvscores.append(score * 100)

        # model = GaussianNB()
        # model.fit(x_train,y_train)
        # score = model.score(x_test,y_test)
        # print("     %s: %.2f%%" % ('acc', score*100))
        # cvscores.append(score * 100)

    average_score = sum(cvscores) / len(cvscores)
    write_score(cvscores, average_score,
                root_dir + 'NB/subject_id_cvscores.txt')
    return average_score
Example #27
File: 3b.py Project: amnawaseem/ML
from Bayes import *
import commands
import re

print '3b'
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

print '3c'

c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

print '3d'
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
  s_test = []
  s_train = []
  with open('../data/arxiv/arxiv.norm.train', 'r') as f:
    for i, l in enumerate(f):
      if i%nfold == d:
        s_test.append(l)
      else:
        s_train.append(l)
  with open('../data/arxiv/arxiv.norm%d.test' %d, 'w') as test:
    for t in s_test:
      test.write(t)
        print("Dictionary classes created")

        print("Creating and completing positive and negative dictionaries...")
        if SIZED_DCT is False:
            dictionary1.create_dictionary()
            print(f"Positive dictionary created")
            dictionary0.create_dictionary()
            print(f"Negative dictionary created \n \n")
        else:
            dictionary1.create_sized_dictionary(SIZE)
            print("Positive dictionary created")
            dictionary0.create_sized_dictionary(SIZE)
            print("Negative dictionary created \n \n")

        print("Creating BAYES class...")
        bayes = Bayes(dictionary1, dictionary0, testing_set)
        print("Bayes class created")

        print("Predicting sentiments for testing set...")
        nb_undetermined = bayes.predict_sentiments(LAPLACE_SMOOTHING,
                                                   pos_spl_nb, neg_spl_nb)
        print("Prediction of sentiments for testing set done")
        print(
            f"Number of tweets with undetermined sentiments : {nb_undetermined}"
        )

        print(
            "Comparing sentiments from the dataset with predicted sentiments..."
        )
        metrics, conf_matrix = bayes.compare_sentiments()
Example #29
File: 4.py Project: niumeng07/ML
#!/usr/bin/env python

# Chinese-language support
import Bayes

listOPost, listClasses = Bayes.loadDataSet()
# listOPost: N*M nested list returned by the function, one row of M words per sentence, N sentences in all
# listClasses: whether each sentence contains abusive words (0 or 1), hand-labeled inside Bayes
print("listOPost:")
print(listOPost)
print("listClasses:")
print(listClasses)
myVocabList = Bayes.createVocabList(listOPost)
# myVocabList: the vocabulary of unique words built from the N*M list
print("myVocabList:")
print(myVocabList)
print("listOPost[0]:")
print(listOPost[0])
print("listOPost:")
print(listOPost)
print(Bayes.setOfWords2Vec(myVocabList, listOPost[0]))
# Bayes.setOfWords2Vec() takes two arguments: the vocabulary and one sentence to test.
# It checks, for each word in the vocabulary, whether that word appears in the tested sentence.
print(Bayes.setOfWords2Vec(myVocabList, listOPost[3]))

from numpy import *
import numpy as np  # np.array is used by testingNB below

trainMat = []
for postinDoc in listOPost:  # listOPost is an M*N matrix
    print(postinDoc)  # one row
    trainMat.append(Bayes.setOfWords2Vec(myVocabList, postinDoc))  # append whether each of this row's words is in the vocabulary
'''
listOPosts, listClasses = loadDataSet()
vocabList = createVocabList(listOPosts)
print(vocabList)
vec = setOfWords2Vec(vocabList, listOPosts[0])
print(vec)
trainMat = words2Mat(vocabList, listOPosts)
print(trainMat.shape)
p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
print('p1V', p1V)
print('p0V', p0V)
print('pAb', pAb)
'''

def testingNB():
    listOPosts, listClasses = loadDataSet()
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'my', 'garbage']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    
#testingNB()
    
Bayes.spamTest()