def spamTest():
    """Train a bag-of-words naive-Bayes spam filter on ham/spam folders and
    print predicted vs. real class for 10 randomly held-out emails.

    Relies on module-level `loadDataSet`, `Bayes` and `random` — not shown here.
    """
    # Load ham (class 0) and spam (class 1) emails from fixed Windows paths.
    hamemail = loadDataSet("D:\学习资料\machinelearninginaction\Ch04\email\ham")
    hamclassList = [0] * len(hamemail)
    spamemail = loadDataSet("D:\学习资料\machinelearninginaction\Ch04\email\spam")
    spamclassList = [1] * len(spamemail)
    # Concatenate emails and labels into parallel lists.
    Allemail = []
    Allemail.extend(hamemail)
    Allemail.extend(spamemail)
    AllList = []
    AllList.extend(hamclassList)
    AllList.extend(spamclassList)
    VocalbList = Bayes.createVocabList(Allemail)
    # print(VocalbList)
    testMat = []
    realclass = []
    # Hold out 10 random emails as the test set.  AllList[randIndex] is read
    # BEFORE AllList is shrunk, so it still matches the email just removed
    # from Allemail (the two lists stay parallel after both deletions).
    for i in range(10):
        randIndex = int(random.uniform(0, len(Allemail)))
        testMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[randIndex]))
        del (Allemail[randIndex])
        realclass.append(AllList[randIndex])
        del (AllList[randIndex])
    # Vectorize the remaining 40 emails as the training matrix.
    trainMat = []
    for i in range(len(Allemail)):
        trainMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[i]))
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, AllList)
    # print(p0vect,'\n',p1vect,'\n',pA)
    for i in range(10):
        print("test_result=", Bayes.classifyNB(testMat[i], p0vect, p1vect, pA), ",real_result=", realclass[i])
def spamTestOfvoc():
    """Cross-validate a naive-Bayes spam filter using set-of-words vectors.

    50 emails total (25 spam, 25 ham); 10 random emails are held out for
    testing, the remaining 40 train the classifier.  Prints the error rate.
    Relies on module-level `textParse`, `Bayes`, `random` and numpy `array`.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Spam samples -> class 1.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # flat list of individual words
        classList.append(1)
        # Ham samples -> class 0.
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)  # all unique words (features)
    # FIX: in Python 3 `range(50)` is a lazy sequence and does not support
    # item deletion, so `del trainingSet[randIndex]` below raised TypeError.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # hold out 10 random emails for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # build training matrix and labels
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # evaluate on the held-out emails
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTestOfbag():
    """Cross-validate a naive-Bayes spam filter using bag-of-words vectors.

    Identical to spamTestOfvoc() except word vectors carry counts
    (bagOfWord2VecMN) instead of presence flags.  Prints the error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Spam samples -> class 1.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # flat list of individual words
        classList.append(1)
        # Ham samples -> class 0.
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)
    # FIX: in Python 3 `range(50)` does not support item deletion, so the
    # `del trainingSet[randIndex]` below raised TypeError; materialize it.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # hold out 10 random emails for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWord2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest():
    """Hold-out evaluation of a naive-Bayes spam filter on testDemo/email.

    Prints the error rate over 10 randomly selected test emails.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # NOTE(review): directory is spelled 'span' — presumably 'spam';
        # confirm against the actual folder name before changing.
        wordList = textParse(open('testDemo/email/span/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # spam -> 1
        wordList = textParse(open('testDemo/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # ham -> 0
    vocabList = Bayes.createVocabList(docList)
    # FIX: in Python 3 `range(50)` does not support item deletion, so the
    # `del trainingSet[randIndex]` below raised TypeError; materialize it.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # NOTE(review): 'trainNBO' (letter O) looks like a typo for trainNB0 used
    # by sibling code — verify against the Bayes module before renaming.
    p0V, p1V, pSpam = Bayes.trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def calculate_accurracy(root, noOfAcids, kMers, train_file, test_file, laplace_alpha, train_end_index = -1):
    """Return the accuracy of the Bayes classifier on `test_file`.

    NOTE(review): the name keeps the original misspelling ('accurracy') so
    callers are not broken.
    """
    csv_path = os.path.join(root, test_file)
    test_x, test_y = bs._load_dataset(csv_path)
    # res holds the 0/1 predictions for test_x (row vector, indexed res[0, i]).
    res, _, _ = bs.result_bayes(root, train_file, test_x, kMers, noOfAcids, laplace_alpha, train_end_index)
    # Indices of the GROUND-TRUTH cleavable samples (label == 1).
    # (Original comment said "predicted" — these come from test_y, the labels.)
    trueIndices = np.where(np.array(test_y) == 1)
    # Indices of the ground-truth non-cleavable samples (label == 0).
    falseIndices = np.where(np.array(test_y) == 0)
    # accuracy = (true positives + true negatives) / total test size.
    accuracy = ((np.sum(res[0,trueIndices]) + (np.size(falseIndices) - np.sum(res[0,falseIndices])))/len(test_x))
    return accuracy
def spamTest(): docList = [] classList = [] fullText = [] for i in range(1, 26): wordList = textParse(open('email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(docList) classList.append(1) wordList = textParse(open('email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(docList) classList.append(0) vocabList = bayes.createVocabList(docList) """ trainingSet = [1, 49] 生成10个50以内的随机数,加入testSet 从trainingSet中删掉这些数。 结果就是把【1...49],1分为2,10个作为 testSet, 其他作为 trainingSet trainingSet = [0, 1, 2, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 38, 39, 41, 43, 44, 45, 46, 47, 48, 49] testSet = [36, 3, 40, 31, 10, 42, 7, 37, 15, 34] """ trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(np.random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses)) errorCount = 0 for docIndex in testSet: wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex]) if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet)
def eigenface(trainData, testData, dataVariety):
    """PCA ('eigenface') dimensionality reduction followed by Bayes classification.

    Assumes trainData/testData are pandas DataFrames with a 'variety' label
    column (TODO confirm — inferred from the .drop/.sum usage).
    """
    # ---- standardize train data: column means of the feature columns ----
    dropTrainData = trainData.drop("variety", axis=1)
    trainMean = dropTrainData.sum()
    trainMean = trainMean.values.reshape([dropTrainData.shape[1], 1])
    trainMean = trainMean / dropTrainData.shape[0]  # sum / n == mean
    newtrainData = PCA.normalize(trainData, trainMean)
    # ---- eigen-decomposition of X^T X (the smaller Gram matrix) ----
    normTrainData = newtrainData.drop("variety", axis=1)
    normTrainData = np.array(normTrainData)
    X = np.transpose(normTrainData)  # features x samples
    tempMat = np.zeros([X.shape[1], X.shape[1]])
    np.matmul(np.transpose(X), X, tempMat)  # tempMat = X^T X (written in place)
    eigValX, eigVecX = np.linalg.eigh(tempMat)
    # ---- lift eigenvectors of X^T X to eigenvectors of X X^T: v' = X v ----
    newEigVecX = np.zeros([X.shape[0], eigVecX.shape[1]])
    newEigVecX = np.matmul(X, eigVecX)
    # ---- normalize each eigenvector to unit length ----
    newEigVecX = np.transpose(newEigVecX)
    length = np.linalg.norm(newEigVecX, axis=1)
    for i in range(newEigVecX.shape[0]):
        newEigVecX[i] /= length[i]
    normEigVec = np.transpose(newEigVecX)
    # ---- projection matrix A: the L eigenvectors with largest eigenvalues ----
    L = 20
    maxEigIdx = np.argsort(-eigValX)  # descending eigenvalue order
    A = []
    for i in range(L):
        A.append(normEigVec[:, maxEigIdx[i]])
    A = np.array(A)
    A = np.transpose(A)
    # Test data is centered with the TRAIN mean (standard practice).
    newtestData = PCA.normalize(testData, trainMean)
    # projection of train data
    projTrainFrame = PCA.project(A, newtrainData)
    # projection of test data
    projTestFrame = PCA.project(A, newtestData)
    # # classify test data by likelihood
    # g1, testIdx1, success1, confusion_mat1 = Likelihood.likelihood(projTrainFrame, projTestFrame, dataVariety)
    # Header.calAccuracy(success1, projTestFrame)
    # Header.ROC_AUC(projTestFrame, dataVariety, g1, testIdx1)
    # Header.drawConfusionMat(confusion_mat1, dataVariety)
    # ---- classify projected test data with the Bayes classifier ----
    # Column names: '0' placeholders for features, plus the label column.
    names = []
    for i in range(projTestFrame.shape[1] - 1):
        names.append('0')
    names.append('variety')
    g2, testIdx2, success2, confusion_mat2 = Bayes.bayes(projTrainFrame, projTestFrame, dataVariety, names)
    Header.calAccuracy(success2, projTestFrame)
    Header.drawConfusionMat(confusion_mat2, dataVariety)
def tarea1(entrenamiento, prueba):
    """Split the training set by class, then plot a NaiveBayes fit and a Bayes fit.

    `entrenamiento` is the training set, `prueba` the test set.
    """
    splitter = Main()
    t_0, t_1 = splitter.split(entrenamiento)
    # Naive Bayes model over the same split.
    naive_model = NaiveBayes.NaiveBayes(entrenamiento, t_1, t_0, prueba)
    naive_model.plot()
    # Full Bayes model for comparison.
    bayes_model = Bayes.Bayes(entrenamiento, t_1, t_0, prueba)
    bayes_model.plot()
    return
def testandscore(word):
    """Classify `word` with the AdaBoost model and return class + rescaled scores.

    Returns {"type": predicted class, "data": [{key, value} x 3]} where the three
    values are the per-class probabilities with their common decimal prefix
    stripped and the remainder rescaled (to magnify tiny differences for display).
    """
    word_array = bayes.build_word_array(word)
    # Word-presence feature vector over the module-level vocabList.
    asfaiajioaf = bayes.setOfWordsListToVecTor(vocabList, word_array)
    aa, bb = ada_real.predict(asfaiajioaf)[0], ada_real.predict_proba(
        asfaiajioaf)[0]
    total = {}
    # Cast needed: numpy int32 is not JSON-serializable, a plain int is.
    total["type"] = int(aa)
    temp = []
    ggg = {}
    ccc = {}
    ddd = {}
    print(len(str(bb[0])))
    print("end")
    # Round the three class probabilities to 5 decimal places.
    a = float('%.5f' % bb[0])
    b = float('%.5f' % bb[1])
    c = float('%.5f' % bb[2])
    # Find the common leading string prefix shared by the max and min
    # probabilities (e.g. "0.33" if all are ~0.33xxx).
    max_value = str(max([a, b, c]))
    min_value = str(min([a, b, c]))
    same = ''
    for i in range(0, len(min_value)):
        if max_value[i] == min_value[i]:
            same = same + min_value[i]
        else:
            break
    print(same)
    # Strip the common prefix and scale the residuals up so small differences
    # become visible.  NOTE(review): fragile — assumes `same` parses as a float
    # and len(same) >= 2 (e.g. starts with "0."); confirm for edge inputs.
    kkkk = pow(10, (len(same) - 2))
    a = (a - float(same)) * kkkk
    b = (b - float(same)) * kkkk
    c = (c - float(same)) * kkkk
    a = float('%.5f' % a)
    b = float('%.5f' % b)
    c = float('%.5f' % c)
    print(a, b, c)
    # Keys are user-facing Chinese labels: positive / negative / objective.
    ggg["key"] = "正向"
    ggg["value"] = a
    ccc["key"] = "负向"
    ccc["value"] = b
    ddd["key"] = "客观"
    ddd["value"] = c
    temp.append(ggg)
    temp.append(ccc)
    temp.append(ddd)
    total["data"] = temp
    return total
def construction():
    """Run word segmentation, then initialize the naive-Bayes model for testing.

    Python 2 syntax (`except Exception, e`).  Any failure is re-raised to the
    caller; the try/except adds no handling beyond a hook point.
    """
    try:
        s = seg.Segmentation()
        s.segmentation()
        b = by.NBayes()
        b.initForTest()
    except Exception, e:  # NOTE: 'e' is captured but unused; exception propagates
        raise
def part2(root = './Dataset', trainfile = 'q2_train_set.txt', gagfile = 'q2_gag_sequence.txt'):
    """Slide an 8-mer window over the gag sequence, one-hot encode each window,
    classify cleavability with the Bayes model, and print the cleavable
    amino-acid pairs plus the most/least confidently classified 8-mers.
    """
    def create_8mers(filename):
        """Return (list of overlapping 8-char windows, window count)."""
        with open(filename, 'r') as file:
            data = list(file.read())
            _8mer = [None] * kMers
            _8mers = [None] * (len(data) - kMers + 1)
            for char_i in range(len(data) - kMers + 1):
                for i in range(kMers):
                    _8mer[i] = data[i + char_i]
                _8mers[char_i] = _8mer
                # Re-allocate so the stored window is not overwritten next pass.
                _8mer = [None] * kMers
            return _8mers, len(data) - kMers + 1

    def read_amino_sequence(filename):
        """Return the sequence file as a list of characters."""
        with open(filename, 'r') as file:
            return list(file.read())

    def onehot_initialization(a):
        """One-hot encode integer matrix `a` along a new trailing axis."""
        ncols = a.max()+1
        out = np.zeros(a.shape + (ncols,), dtype=int)
        out[all_idx(a, axis=2)] = 1
        return out

    def all_idx(idx, axis):
        """Build the fancy-index tuple that places `idx` values on `axis`."""
        grid = np.ogrid[tuple(map(slice, idx.shape))]
        grid.insert(axis, idx)
        return tuple(grid)

    noOfAcids = 20  # amino-acid alphabet size
    kMers = 8       # window length
    # Load the gag sequence and window it.
    gag_path = os.path.join(root, gagfile)
    mers, noOfMers = create_8mers(gag_path)
    # Map amino-acid letters to integer codes 0..19.
    aa_names_arr = ["g", "p", "a", "v", "l", "i", "m", "c", "f", "y", "w", "h", "k", "r", "q", "n", "e", "d", "s", "t"]
    aa_names = dict(zip(aa_names_arr, range(len(aa_names_arr))))
    mers = np.matrix([[aa_names[x] for x in mer_i] for mer_i in mers])
    # One-hot: each row becomes kMers * noOfAcids binary features.
    mers_encoded = onehot_initialization(mers).reshape(noOfMers, noOfAcids * kMers)
    # res: 0/1 cleavable predictions; res1/res2: per-class scores.
    res, res1, res2 = bs.result_bayes(root, trainfile, mers_encoded, kMers, noOfAcids, 0)
    cleavableMers = np.where(res == 1)[1]
    # The cleavage site sits between positions 3 and 4 of each 8-mer window.
    cleavableIndicesPrev = cleavableMers + 3
    cleavableIndicesNext = cleavableIndicesPrev + 1
    am_seq = read_amino_sequence(gag_path)
    cleaveAminoPrev = np.array(am_seq)[cleavableIndicesPrev]
    cleaveAminoNext = np.array(am_seq)[cleavableIndicesNext]
    cleavableAminoPairs = list(map(lambda x, y:(x,y), cleaveAminoPrev, cleaveAminoNext))
    cleavableIndexPairs = list(map(lambda x, y:(x,y), cleavableIndicesPrev, cleavableIndicesNext))
    # Most confidently cleavable / least confidently non-cleavable windows.
    maxCleavableIndex = np.where(res1 == np.max(res1[np.where(res == 1)]))[1]
    minNonCleavableIndex = np.where(res2 == np.min(res2[np.where(res == 0)]))[1]
    maxCleavable8mer = [aa_names_arr[x] for x in np.squeeze(np.asarray(mers[maxCleavableIndex]))]
    minNonCleavable8mer = [aa_names_arr[x] for x in np.squeeze(np.asarray(mers[minNonCleavableIndex]))]
    print("cleavableAminoPairs:\n", cleavableAminoPairs, "\ncleavableIndexPairs:\n", cleavableIndexPairs)
    print("maxCleavableIndex:\n", maxCleavableIndex, "\nminCleavableIndex:\n", minNonCleavableIndex)
    print("maxCleavable8mer:\n", maxCleavable8mer, "\nminNonCleavable8mer:\n", minNonCleavable8mer)
def spamDict(): docList = [ ] # 切分好的词组成的列表[['his','xx','xx'],['xx','xx','xx'],['xx','xx','xx']] classList = [] # 邮件的类别,垃圾邮件和正常邮件,垃圾邮件为1,正常邮件为0 for i in range(1, 26): with open('ham/%d.txt' % i) as f: wordList = textParse(f.read()) docList.append(wordList) classList.append(1) with open('spam/%d.txt' % i) as f: wordList = textParse(f.read()) docList.append(wordList) classList.append(0) vocabList = Bayes.createVocabList(docList) # 将docList组成词典 ''' 从50封电子邮件中随机选出10封作为测试集,剩下的作为训练集 ''' trainingSet = range(50) testSet = [] for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del trainingSet[randIndex] trainMarix = [] # 训练集数据组成的训练矩阵 trainingClass = [] # 训练数据集中的类别 for docIndex in trainingSet: trainMarix.append(Bayes.setOfWord2Vector(vocabList, docList[docIndex])) trainingClass.append(classList[docIndex]) pAb, p1v, p0v = Bayes.TrainingNB1(array(trainMarix), array(trainingClass)) errorCount = 0.0 for docIndex in testSet: thisDoc = array(Bayes.setOfWord2Vector(vocabList, docList[docIndex])) if classifyNB(array(thisDoc), p0v, p1v, pAb) != classList[docIndex]: errorCount += 1 print 'the error rate is :', float(errorCount) / len(testSet)
def localWords(feed1, feed0): docList = [] #以二维数组形式存储所有样本的词汇表 classList = [] #存储所有样本的类别信息 fullText = [] #以一维数组形式存储所有样本的词汇表 minLen = min(len(feed1['entries']), len(feed0['entries'])) #获取两个RSS源的最小长度 for i in range(minLen): #解析feed1['entries'][i]['summary'],将长度大于2的单词提取出来,并全转换为小写 wordList = Bayes.textParse(feed1['entries'][i]['summary']) docList.append(wordList) #将该样本词汇添加到docList中 fullText.extend(wordList) #将该样本词汇追加到fullText中 classList.append(1) #将样本类别信息添加到classList wordList = Bayes.textParse(feed0['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = Bayes.createVocabList(docList) #获取docList中所有不重复的单词列表 #由于语言中大部分都是冗余和结构辅助性内容,导致词汇表中一小部分单词却占据了所有文本用词的一大部分。需要去除冗余词汇。 #另一个常用的方法是不仅移除高频词,同时从某个预定词表中移除结构上的辅助词。该词表称为停用词表(stop word list)。 top30Words = calcMostFreq(vocabList, fullText) #获取在fullText中出现次数最多的30个词汇信息 for pairW in top30Words: #从词汇表vocabList中去除出现次数最多的30个单词 if pairW[0] in vocabList: vocabList.remove(pairW[0]) trainingSet = range(2 * minLen) #定义列表变量存储训练样本id print 'minLen : %d' % minLen if minLen < 20: print 'the len is too small.' 
testSet = [] #用于存储测试样本id for i in range(20): #从训练样本中随机获取20个样本信息作为测试样本集,并从训练样本中去除这些样本 randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] #从文本样本集中获取训练样本集,将相关文本样本的词汇出现次数信息存储到矩阵trainMat中,样本分类信息存储到trainClasses中 for docIndex in trainingSet: #获取样本docList[docIndex]在词汇表vocabList中各个单词出现次数情况 trainMat.append(Bayes.bagOfWords2Vec(vocabList, docList[docIndex])) #获取当前样本的分类信息classList[docIndex] trainClasses.append(classList[docIndex]) #通过贝叶斯分类器对训练样本进行学习 #获取两个类别各自单词的出现频率,以及样本集的概率 p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 #使用测试样本集对学习结果进行测试 for docIndex in testSet: #获取样本docList[docIndex]在词汇表vocabList中各个单词出现次数情况 wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex]) #对当前测试样本进行分类,判断是否与已知类型相同 if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet) #打印出错误率 return vocabList, p0V, p1V #返回词汇表和各个词汇的出现概率
def testingNB():
    """Train on the toy post data set and classify two sample word lists."""
    # FIX: was unpacked as `listPosts` but referenced below as `listOPosts`,
    # raising NameError on every call.
    listOPosts, listClasses = loadDataSet()
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'my', 'garbage']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def part1(root='./Dataset', trainfile='q2_train_set.txt', testfile='q2_test_set.txt'):
    """Evaluate the Bayes cleavability classifier on the train/test split and
    print per-class and overall accuracy statistics.
    """
    noOfMers = 8
    noOfAcids = 20
    # Load both datasets.
    train_x, train_y = bs._load_dataset(os.path.join(root, trainfile))
    test_x, test_y = bs._load_dataset(os.path.join(root, testfile))
    # Train and predict on the test set.
    myRes, _, _ = bs.result_bayes(root, trainfile, test_x, noOfMers, noOfAcids)
    labels = np.array(test_y)
    trueIndices = np.where(labels == 1)    # ground-truth cleavable
    falseIndices = np.where(labels == 0)   # ground-truth non-cleavable
    n_true = np.size(trueIndices)
    n_false = np.size(falseIndices)
    true_positives = np.sum(myRes[0, trueIndices])
    true_negatives = n_false - np.sum(myRes[0, falseIndices])
    print("Real cleavable number: \t", n_true,
          "\t Number predicted true cleavable:\t", true_positives,
          "\t Accuracy:\t", true_positives / n_true)
    print("Real nonCleavable number:\t", n_false,
          "\t Number predicted true nonCleavable:\t", true_negatives,
          "\t Accuracy:\t", true_negatives / n_false)
    print("Total test size:\t\t", len(test_x),
          "\t Number predicted true in total:\t", true_positives + true_negatives,
          "\t Accuracy:\t", ((true_positives + true_negatives) / len(test_x)))
def spamTest():
    """Hold-out evaluation of a naive-Bayes spam filter (Python 2 code).

    Parses 25 spam and 25 ham emails, holds out 10 at random for testing,
    trains on the rest, and prints the error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Parse every text file under email/spam and email/ham:
        # read the raw string and split it into a word list.
        wordList = Bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)     # word list for this email
        fullText.extend(wordList)    # all words, flattened
        classList.append(1)          # spam -> class 1
        wordList = Bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)          # ham -> class 0
    vocabList = Bayes.createVocabList(docList)  # every word seen in docList
    # 50 ids, one per email under the spam/ham folders (Python 2: a list).
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        # random.uniform(x, y) samples a real number in [x, y);
        # here: a random integer index in [0, len(trainingSet)).
        randIndex = int(random.uniform(0, len(trainingSet)))
        print randIndex
        # Move the chosen id from the training set to the test set.
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    # Vectorize the 40 training emails and collect their labels.
    for docIndex in trainingSet:
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Per-class word probabilities and the spam prior.
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    print classList
    errorCount = 0
    # Score the classifier on the 10 held-out emails.
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
def localWord(feed0, feed1):
    """Train/test a naive-Bayes classifier on two RSS feeds (Python 2 code).

    feed1 entries are class 1, feed0 entries class 0.  Returns
    (vocabList, p0V, p1V).
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    listOfPost = []
    classVec = []
    fullText = []
    for i in range(minLen):
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])  # feed1 -> class 1
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(1)
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(0)
    vocabList = Bayes.creatVocabList(listOfPost)
    # Drop the 30 most frequent words (crude stop-word removal).
    # NOTE(review): if calMostFreq returns (word, count) pairs, this must use
    # pairW[0] — confirm its return type.
    top30Words = calMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW in vocabList:
            vocabList.remove(pairW)
    trainingSet = range(2 * minLen)  # sample ids (Python 2: a real list)
    dataSet = []  # held-out sample ids
    for i in range(20):
        # FIX: np.random.uniform(n) treats n as `low` (high defaults to 1.0),
        # so indices were drawn from the wrong range; pass (0, n) explicitly.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # FIX: store the document id trainingSet[randIndex], not the raw loop
        # index — positions shift after each deletion, so the old code tested
        # documents that could still be in the training set.
        dataSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex]))
        trainClass.append(classVec[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(np.array(trainMat), trainClass)
    errorCount = 0.0
    for docIndex in dataSet:
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
        if Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam) != classVec[docIndex]:
            errorCount += 1
    # print "the error rate is :",errorCount/float(len(dataSet))
    return vocabList, p0V, p1V
def test():
    """Train a Bayes model from data/train.txt, predict data/test.txt, and run
    post-analysis on the predictions.

    Each input line is "label<TAB>word1,word2,..."; blank lines and lines with
    an empty document part are skipped.
    """
    data_dir = 'data/'  # renamed from `dir` (shadowed the builtin)
    train_file = 'train.txt'
    test_file = 'test.txt'
    word_set_file = 'all_words.txt'
    model_file = 'model.txt'

    labels = []
    label_ids = set()
    doc_matrix = []
    with open(data_dir + train_file) as f:
        for line in f:
            line = line.replace('\n', '')
            if line == '':
                continue
            fields = line.split('\t')
            assert (len(fields) == 2)
            if fields[1] == '':
                continue
            labels.append(fields[0])
            doc_matrix.append(fields[1].split(','))
            label_ids.add(fields[0])

    bayes_model = Bayes.Bayes(data_dir + word_set_file)
    bayes_model.train(doc_matrix, labels, list(label_ids), data_dir + model_file)

    # Read the test file.
    expect_labels = []
    predict_docs = []
    with open(data_dir + test_file) as f:
        for line in f:
            line = line.replace('\n', '')
            if line == '':
                continue
            fields = line.split('\t')
            if fields[1] == '':
                continue
            assert (len(fields) == 2)
            expect_labels.append(fields[0])
            predict_docs.append(fields[1].split(','))

    predict_labels = bayes_model.predict(data_dir + model_file, predict_docs)
    post_analysis(predict_labels, expect_labels)
def testingNB():
    """Train on the toy post data set and classify two sample word lists
    (Python 2 code; uses bag-of-words vectors — set-of-words calls kept
    commented for comparison).
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = Bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        #trainMat.append(Bayes.setOfWords2Vec(myVocabList, postinDoc))
        trainMat.append(Bayes.bagOfWords2Vec(myVocabList, postinDoc))
    # Per-class word probabilities and the abusive-class prior.
    p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry))
    thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', Bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry))
    thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', Bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
def setUp(self):
    """Test fixture: build the vocabulary from the sample posts, binarize each
    post against it, and train the naive-Bayes classifier.

    Stores naiveBayes, vocabList, p0, p1 and pAbusive on self for the tests.
    """
    self.naiveBayes = Bayes.NaiveBayes()
    posts, classes = self.loadDataSet()
    self.vocabList = self.naiveBayes.composeList(posts)
    logging.log(logging.INFO, "Vocabulary List: " + str(self.vocabList))
    #start = time.time()
    vecMatrix = []
    # One binary word-presence vector per post.
    for post in posts:
        binarizedVocab = self.naiveBayes.binarize(self.vocabList, post)
        #logging.log(logging.DEBUG, "Post: " + str(post))
        logging.log(logging.DEBUG, "Binaried vector: " + str(binarizedVocab))
        vecMatrix.append(binarizedVocab)
    #stop = time.time()
    #logging.log(logging.INFO, "Consume %s seconds" % str(stop - start))
    # p0/p1: per-class word probabilities; pAbusive: class prior.
    self.p0, self.p1, self.pAbusive = self.naiveBayes.train(
        vecMatrix, classes)
    logging.log(logging.INFO, "P0: \n" + str(self.p0))
    logging.log(logging.INFO, "P1: \n" + str(self.p1))
# Python 2 driver script: train/predict the Bayes classifier on the arxiv
# data (parts 3b/3c with different smoothing args), then build n-fold splits.
from Bayes import *
import commands
import re

# 3b: predict with smoothing parameter 1.
print '3b'
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

# 3c: same, with smoothing parameter 10.
print '3c'
c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

# 3d: split the normalized training file into nfold cross-validation folds;
# every nfold-th line (offset d) goes to the test split.
print '3d'
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
    s_test = []
    s_train = []
    with open('../data/arxiv/arxiv.norm.train', 'r') as f:
        for i, l in enumerate(f):
            if i % nfold == d:
                s_test.append(l)
            else:
                s_train.append(l)
    # NOTE(review): s_train is collected but never written in this span —
    # presumably a matching arxiv.norm%d.train write follows; confirm.
    with open('../data/arxiv/arxiv.norm%d.test' % d, 'w') as test:
        for t in s_test:
            test.write(t)
# Python 2 driver script: exercises the Bayes module on the toy data set,
# then pulls two craigslist RSS feeds and prints their top words.
import Bayes
import feedparser

# Toy posts and their 0/1 labels from the Bayes module.
listOPosts, listClassed = Bayes.loadDataSet()
vocabList = Bayes.createVocabList(listOPosts)
# print vocabList
# print Bayes.setOfWordsToVec(vocabList, listOPosts[0])
# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(Bayes.setOfWordsToVec(vocabList, postinDoc))
# p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClassed)
# print p0V
# print p1V
# print pAb
# Bayes.testingNB()
# Bayes.spamTest("E:/TestDatas/MachineLearningInAction/Ch04/");

# Two regional personals feeds used as the two classes.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# vocabList, pNY, pSF = Bayes.localWords(ny, sf)
Bayes.getTopWords(ny, sf)
def learn(self, data=[], rare=""):
    """Learn Gaussian naive-Bayes distributions from a labelled learning set
    and persist them to distribution.txt (Python 2 code).

    data: ({field: [nonevent samples]}, {field: [event samples]}) pair; if
    empty, the user is prompted for a file name.
    rare: rare-words file name; if empty, prompts (default rares.txt).

    NOTE(review): `data=[]` is a mutable default argument — harmless here
    because it is only compared, never mutated, but worth fixing.
    """
    """ PREPROCESSING """
    if data == []:
        # for test
        print "Enter name of file containing learning set: "
        learning_name = raw_input()
        learning_data = u.read_learning_data(learning_name, 0)
    else:
        learning_data = data
    fields = list(learning_data[0])
    num_fields = len(fields)
    # learning_data[0] holds nonevents, learning_data[1] holds events.
    num_nonevents = len(learning_data[0][fields[0]])
    num_events = len(learning_data[1][fields[0]])
    num_learning = num_nonevents + num_events
    if rare == "":
        print "Enter name of rare words file; leave blank to default to rares.txt"
        rare_name = raw_input()
        if rare_name == "":
            rare_name = "rares.txt"
        if not os.path.isfile(rare_name):
            print "Rare file does not exist. Set rares or check filename."
            return
    else:
        rare_name = rare
    """ FEATURES """
    # get rares
    with open(rare_name) as rf:
        rares = rf.readlines()
    rares = [r.lower().rstrip() for r in rares]
    # get features: per-sample feature vectors, concatenated across fields
    learning_nonevent_features = [[] for i in xrange(num_nonevents)]
    learning_event_features = [[] for i in xrange(num_events)]
    for i in xrange(num_nonevents):
        for k in fields:
            learning_nonevent_features[i] = (
                learning_nonevent_features[i] +
                f.get_features(learning_data[0][k][i], k, rares))
    for i in xrange(num_events):
        for k in fields:
            learning_event_features[i] = (
                learning_event_features[i] +
                f.get_features(learning_data[1][k][i], k, rares))
    """ DISTRIBUTION STORAGE """
    # get prior: (P(event), P(nonevent)) from class frequencies
    prior = (num_events / float(num_learning),
             num_nonevents / float(num_learning))
    # get posterior: per-feature (mean, stdev) for each class
    event_by_features = u.by_features(learning_event_features)
    nonevent_by_features = u.by_features(learning_nonevent_features)
    event_posterior = [(b.mean(feature), b.stdev(feature))
                       for feature in event_by_features]
    nonevent_posterior = [(b.mean(feature), b.stdev(feature))
                          for feature in nonevent_by_features]
    # store in file: the fixed line layout below is what guess() re-parses
    distributionfile = open('distribution.txt', 'w')
    distributionfile.write("Prior (Event/Nonevent):\n")
    distributionfile.write(str(prior[0]) + " " + str(prior[1]) + "\n")
    distributionfile.write("Event posterior means: \n")
    for i in event_posterior:
        distributionfile.write(str(i[0]) + " ")
    distributionfile.write("\n")
    distributionfile.write("Event posterior stdevs: \n")
    for i in event_posterior:
        distributionfile.write(str(i[1]) + " ")
    distributionfile.write("\n")
    distributionfile.write("Nonevent posterior means: \n")
    for i in nonevent_posterior:
        distributionfile.write(str(i[0]) + " ")
    distributionfile.write("\n")
    distributionfile.write("Nonevent posterior stdevs: \n")
    for i in nonevent_posterior:
        distributionfile.write(str(i[1]) + " ")
    distributionfile.write("\n")
    distributionfile.close()
def guess(self, data=[], rare=""):
    """Classify a guess set using the distributions stored by learn()
    (Python 2 code).  Returns the list of guesses, or None on setup errors.

    data: {field: [samples]} dict; if empty, the user is prompted for a file.
    rare: rare-words file name; if empty, prompts (default rares.txt).
    """
    """ PREPROCESSING """
    if data == []:
        print "Enter name of file containing guess set: "
        guess_name = raw_input()
        guess_data = u.read_test_data(guess_name, 0)
    else:
        guess_data = data
    fields = list(guess_data)
    num_fields = len(fields)
    num_guesses = len(guess_data[fields[0]])
    if rare == "":
        print "Enter name of rare words file; leave blank to default to rares.txt"
        rare_name = raw_input()
        if rare_name == "":
            rare_name = "rares.txt"
        if not os.path.isfile(rare_name):
            print "Rare file does not exist. Set rares or check filename."
            return
    else:
        rare_name = rare
    """ CALCULATE TEST FEATURES """
    # get rares
    with open(rare_name) as rf:
        rares = rf.readlines()
    rares = [r.lower().rstrip() for r in rares]
    # Per-sample feature vectors, concatenated across fields (same layout
    # that learn() used, so lengths must match the stored distribution).
    guess_features = [[] for i in xrange(num_guesses)]
    for i in xrange(num_guesses):
        for k in fields:
            guess_features[i] = (
                guess_features[i] +
                f.get_features(guess_data[k][i], k, rares))
    """ RETRIEVE DISTRIBUTION AND RUN BAYESIAN """
    if not os.path.isfile('distribution.txt'):
        print "Distribution not yet set. Run learn first."
        return
    try:
        # Parse the fixed line layout written by learn(): each title line is
        # skipped, each data line is a space-separated float list.
        distributionfile = open('distribution.txt', 'r')
        distributionfile.readline()  # prior title
        prior = map(float, distributionfile.readline().split())
        distributionfile.readline()  # event posterior means title
        event_means = map(float, distributionfile.readline().split())
        distributionfile.readline()  # event posterior stdevs title
        event_stdevs = map(float, distributionfile.readline().split())
        distributionfile.readline()  # nonevent posterior means title
        nonevent_means = map(float, distributionfile.readline().split())
        distributionfile.readline()  # nonevent posterior stdevs title
        nonevent_stdevs = map(float, distributionfile.readline().split())
        distributionfile.close()
        # Re-zip means and stdevs into per-feature (mean, stdev) tuples.
        event_posterior = [(event_means[i], event_stdevs[i])
                           for i in xrange(len(event_means))]
        nonevent_posterior = [(nonevent_means[i], nonevent_stdevs[i])
                              for i in xrange(len(nonevent_means))]
        two_posterior = (event_posterior, nonevent_posterior)
    except Exception as e:
        print e
        print "Problem reading distribution file. Rerun learn."
        return
    if len(guess_features[0]) != len(event_means):
        print "Number of features does not match distribution. Check guess set and rerun learn."
        return
    # # guess!
    guesses = [
        b.two_bayesian(prior, i, two_posterior) for i in guess_features
    ]
    # fi = open('skl.txt','r')
    # a = int(fi.readline().rstrip())
    # c = int(fi.readline().rstrip())
    # ef = [map(float, fi.readline().split()) for i in xrange(a)]
    # nef = [map(float, fi.readline().split()) for i in xrange(c)]
    # fi.close()
    # e = [1 for i in ef]
    # y = e + [0 for i in nef]
    # x = ef + nef
    # from sklearn.naive_bayes import MultinomialNB
    # clf = MultinomialNB()
    # clf.fit(x, y)
    # guesses = [clf.predict(i) for i in guess_features]
    """ RESULTS """
    print "Is Event (First 40 Emails) | First 10 Words of Email Subject | First 10 Words of Email Body"
    for i in xrange(min(40, len(guesses))):
        subject = ' '.join(guess_data["subject"][i]
                           [:min(10, len(guess_data["subject"][i]))])
        message = ' '.join(guess_data["message"][i]
                           [:min(10, len(guess_data["message"][i]))])
        print guesses[i], " | ", subject, " | ", message
    # print "guesses: ", guesses
    # print "posteriror: ", two_posterior
    print "prior: ", prior
    return guesses
def Bayes_id(root_dir):
    """Subject identification from EEG features with a mixed naive-Bayes model,
    evaluated by 5-fold cross-validation.  Returns the average accuracy (%).
    """
    global video_type_num, feature_dim
    global emotion_type_num, valabel_type_num
    splits_num = 5
    # First collect all data, one entry per pre-made split file.
    x = []
    y = []
    for i in range(splits_num):
        # Load this split: subject/video ids, EEG features, valence/arousal
        # labels, and (HCI dataset only) emotion categories.
        subjects, videos = get_subject(root_dir + 'subject/subject_video_' + str(i) + '.txt')
        features = np.loadtxt(root_dir + 'feature/EEG_feature_' + str(i) + '.txt')
        va_labels = np.loadtxt(root_dir + 'valabel/valence_arousal_label_' + str(i) + '.txt', dtype=int)
        if 'HCI' in root_dir:
            emotions = np.loadtxt(root_dir + 'emotion/EEG_emotion_category_' + str(i) + '.txt', dtype=int)
        # Normalization (disabled):
        # features /= np.max(features)
        # attri_option_nums holds, per attribute, the number of discrete
        # options; 0 marks a continuous-valued attribute.
        attri_option_nums = []
        x_temp = []
        y_temp = []
        length = len(subjects)
        for j in range(length):
            # Sample layout: [video id, VA labels, (emotion), EEG features].
            x_temp.append([])
            x_temp[j].append(videos[j])
            attri_option_nums.append(video_type_num)
            x_temp[j].extend(va_labels[j])
            attri_option_nums.extend(valabel_type_num)
            if 'HCI' in root_dir:
                x_temp[j].append(emotions[j])
                attri_option_nums.append(emotion_type_num)
            x_temp[j].extend(features[j])
            attri_option_nums.extend([0] * feature_dim)  # continuous features
            y_temp.append(subjects[j] - 1)  # subject ids are 1-based -> 0-based
        x.append(x_temp)
        y.append(y_temp)
    cvscores = []
    # Cross-validation: fold i tests, the other folds train.
    for i in range(splits_num):
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        for j in range(splits_num):
            if j == i:
                x_test.extend(x[j])
                y_test.extend(y[j])
            else:
                x_train.extend(x[j])
                y_train.extend(y[j])
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)
        model = Bayes.Mix_NB()
        model.fit(x_train, y_train, attri_option_nums)
        score = model.score(x_test, y_test)
        # temp = model.predict_log_proba(x_test)
        print(" %s: %.2f%%" % ('acc', score * 100))
        cvscores.append(score * 100)
        # model = GaussianNB()
        # model.fit(x_train,y_train)
        # score = model.score(x_test,y_test)
        # print(" %s: %.2f%%" % ('acc', score*100))
        # cvscores.append(score * 100)
    average_score = sum(cvscores) / len(cvscores)
    write_score(cvscores, average_score, root_dir + 'NB/subject_id_cvscores.txt')
    return average_score
# Python 2 driver script (duplicate of an earlier copy in this file):
# train/predict the Bayes classifier on the arxiv data, then build
# n-fold cross-validation splits.
from Bayes import *
import commands
import re

# 3b: predict with smoothing parameter 1.
print '3b'
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

# 3c: same, with smoothing parameter 10.
print '3c'
c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

# 3d: every nfold-th line (offset d) becomes test data for fold d.
print '3d'
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
    s_test = []
    s_train = []
    with open('../data/arxiv/arxiv.norm.train', 'r') as f:
        for i, l in enumerate(f):
            if i % nfold == d:
                s_test.append(l)
            else:
                s_train.append(l)
    # NOTE(review): s_train is collected but never written in this span —
    # presumably a matching arxiv.norm%d.train write follows; confirm.
    with open('../data/arxiv/arxiv.norm%d.test' % d, 'w') as test:
        for t in s_test:
            test.write(t)
# Pipeline fragment: build the positive/negative dictionaries, construct the
# Bayes classifier, predict sentiments for the testing set, and compare them
# to the ground truth.  Relies on names defined elsewhere (SIZED_DCT, SIZE,
# dictionary0/1, testing_set, LAPLACE_SMOOTHING, pos_spl_nb, neg_spl_nb).
print("Dictionary classes created")
print("Creating and completing positive and negative dictionaries...")
if SIZED_DCT is False:
    # Full dictionaries from the whole corpus.
    dictionary1.create_dictionary()
    print(f"Positive dictionary created")
    dictionary0.create_dictionary()
    print(f"Negative dictionary created \n \n")
else:
    # Dictionaries truncated to SIZE entries.
    dictionary1.create_sized_dictionary(SIZE)
    print("Positive dictionary created")
    dictionary0.create_sized_dictionary(SIZE)
    print("Negative dictionary created \n \n")
print("Creating BAYES class...")
bayes = Bayes(dictionary1, dictionary0, testing_set)
print("Bayes class created")
print("Predicting sentiments for testing set...")
# Returns the count of tweets whose sentiment could not be determined.
nb_undetermined = bayes.predict_sentiments(LAPLACE_SMOOTHING, pos_spl_nb, neg_spl_nb)
print("Prediction of sentiments for testing set done")
print(
    f"Number of tweets with undetermined sentiments : {nb_undetermined}"
)
print(
    "Comparing sentiments from the dataset with predicted sentiments..."
)
metrics, conf_matrix = bayes.compare_sentiments()
#!/usr/bin/env python # 中文支持 import Bayes listOPost, listClasses = Bayes.loadDataSet() # listOPost 由函数返回的N*M的数组 每一行为每个句子的M个词 一个N个句子 # listClasses 为每个句子是否有侮辱性词汇 0或1 在Bayes中人工确定 print("listOPost:") print(listOPost) print("listClasses:") print(listClasses) myVocabList = Bayes.createVocabList(listOPost) # myVocabList由N*M的表创建的不重复的词汇表 print("myVocabList:") print(myVocabList) print("listOPost[0]:") print(listOPost[0]) print("listOPost:") print(listOPost) print(Bayes.setOfWords2Vec(myVocabList, listOPost[0])) # Bayes.setOfWords2Vec()两个参数,第一个是字典词汇表,第二个是要测试的一个句子 # 该函数测试词汇表中的每一个词是否出现在这个被测试的句子中 print(Bayes.setOfWords2Vec(myVocabList, listOPost[3])) from numpy import * trainMat = [] for postinDoc in listOPost: # listOPost为一个M*N的矩阵 print(postinDoc) # 每一行 trainMat.append(Bayes.setOfWords2Vec(myVocabList, postinDoc)) # 把每一行的每个词是否包含在字典中加入到trainMat中
'''
listOPosts, listClasses = loadDataSet()
vocabList = createVocabList(listOPosts)
print(vocabList)
vec = setOfWords2Vec(vocabList, listOPosts[0])
print(vec)
trainMat = words2Mat(vocabList, listOPosts)
print(trainMat.shape)
p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
print('p1V', p1V)
print('p0V', p0V)
print('pAb', pAb)
'''


def testingNB():
    """Train on the toy post data set and classify two sample word lists."""
    # FIX: was unpacked as `listPosts` but referenced below as `listOPosts`,
    # raising NameError on every call.
    listOPosts, listClasses = loadDataSet()
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'my', 'garbage']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))


#testingNB()
Bayes.spamTest()