def svmCV(k, filetype, groups, cp, ke):
    """Run k-fold cross-validation of an SVM classifier, saving each fold's model.

    For every fold ``i`` the data set is loaded with
    ``dv.createDataVoca(k, i, filetype)``, split into a test slice (the i-th
    block of ``len(label) // k`` documents) and a training set (everything
    else), turned into feature vectors with ``fvec.mulFeatGen``, and used to
    fit and evaluate an ``svm.SVC``. The fitted classifier of each fold is
    written to ``../modelSave/`` with ``joblib.dump``.

    NOTE(review): a second ``svmCV`` definition with the same logic appears
    later in this file; the later one is what the name is bound to after the
    module has been executed.

    Args:
        k: Number of folds.
        filetype: Forwarded to ``dv.createDataVoca`` and embedded in the name
            of the saved model file (must be a string, it is concatenated).
        groups: Not used by this function; kept for interface compatibility.
        cp: Passed to ``svm.SVC`` as ``C``.
        ke: Passed to ``svm.SVC`` as ``kernel``.

    Returns:
        A length-``k`` array holding, per fold, the value returned by
        ``fvec.accPredict(testLabel, predictions)``. The mean is not computed
        here.
    """
    predictAcc = np.zeros(k)
    for i in range(k):
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data, voca, label = result[0], result[1], result[2]

        # Floor division: the fold size is used as a slice bound and must stay
        # an integer regardless of the interpreter's division semantics.
        # Documents left over when len(label) is not a multiple of k end up in
        # the training set of every fold.
        subsetSize = len(label) // k
        testStart = i * subsetSize
        testStop = testStart + subsetSize

        testLabel = label[testStart:testStop]
        trainLabel = np.concatenate(
            (label[:testStart], label[testStop:]), axis=0)
        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)
        print('len(testLabel): %s' % lenTestLab)
        print('len(trainLabel): %s' % lenTrainLab)

        # Column 0 of ``data`` is compared against label positions with
        # ``>`` / ``<=`` — presumably a 1-based document id; verify against
        # dv.createDataVoca. Rows are three columns wide (see the offsets).
        docIds = data[:, 0]

        # Test rows: ids in (testStart, testStop], re-based to start after 0.
        inTest = np.logical_and(docIds > testStart, docIds <= testStop)
        testData = data[inTest] - np.array([testStart, 0, 0])

        # Training rows: ids above the test block are shifted down by one
        # fold so that ids stay aligned with positions in ``trainLabel``;
        # ids below the test block are kept unchanged.
        trainDatahi = data[docIds > testStop] - np.array([subsetSize, 0, 0])
        trainDatalo = data[docIds <= testStart]
        trainData = np.concatenate((trainDatahi, trainDatalo), axis=0)

        # Generate features for all the reviews.
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)
        print("accuracy: " + str(predictAcc[i]))

        joblib.dump(
            clf,
            '../modelSave/svmModelAll_K' + str(k) + '_Run' + str(i) + '_'
            + filetype + '.pkl')
    return predictAcc
def svmCV(k, filetype, groups, cp, ke):
    """Run k-fold cross-validation of an SVM classifier, saving each fold's model.

    For every fold ``i`` the data set is loaded with
    ``dv.createDataVoca(k, i, filetype)``, split into a test slice (the i-th
    block of ``len(label) // k`` documents) and a training set (everything
    else), turned into feature vectors with ``fvec.mulFeatGen``, and used to
    fit and evaluate an ``svm.SVC``. The fitted classifier of each fold is
    written to ``../modelSave/`` with ``joblib.dump``.

    NOTE(review): an earlier ``svmCV`` definition with the same logic
    precedes this one in the file; this later definition replaces it when the
    module is executed.

    Args:
        k: Number of folds.
        filetype: Forwarded to ``dv.createDataVoca`` and embedded in the name
            of the saved model file (must be a string, it is concatenated).
        groups: Not used by this function; kept for interface compatibility.
        cp: Passed to ``svm.SVC`` as ``C``.
        ke: Passed to ``svm.SVC`` as ``kernel``.

    Returns:
        A length-``k`` array holding, per fold, the value returned by
        ``fvec.accPredict(testLabel, predictions)``. The mean is not computed
        here.
    """
    predictAcc = np.zeros(k)
    for i in range(k):
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data, voca, label = result[0], result[1], result[2]

        # Floor division: the fold size is used as a slice bound and must stay
        # an integer regardless of the interpreter's division semantics.
        # Documents left over when len(label) is not a multiple of k end up in
        # the training set of every fold.
        subsetSize = len(label) // k
        testStart = i * subsetSize
        testStop = testStart + subsetSize

        testLabel = label[testStart:testStop]
        trainLabel = np.concatenate(
            (label[:testStart], label[testStop:]), axis=0)
        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)
        print('len(testLabel): %s' % lenTestLab)
        print('len(trainLabel): %s' % lenTrainLab)

        # Column 0 of ``data`` is compared against label positions with
        # ``>`` / ``<=`` — presumably a 1-based document id; verify against
        # dv.createDataVoca. Rows are three columns wide (see the offsets).
        docIds = data[:, 0]

        # Test rows: ids in (testStart, testStop], re-based to start after 0.
        inTest = np.logical_and(docIds > testStart, docIds <= testStop)
        testData = data[inTest] - np.array([testStart, 0, 0])

        # Training rows: ids above the test block are shifted down by one
        # fold so that ids stay aligned with positions in ``trainLabel``;
        # ids below the test block are kept unchanged.
        trainDatahi = data[docIds > testStop] - np.array([subsetSize, 0, 0])
        trainDatalo = data[docIds <= testStart]
        trainData = np.concatenate((trainDatahi, trainDatalo), axis=0)

        # Generate features for all the reviews.
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)
        print("accuracy: " + str(predictAcc[i]))

        joblib.dump(
            clf,
            '../modelSave/svmModelAll_K' + str(k) + '_Run' + str(i) + '_'
            + filetype + '.pkl')
    return predictAcc
def navieBayesMulTest(groups, voca, testData, testLabel, probY, probIyMul):
    """Predict a label for every test document with multinomial Naive Bayes.

    The score of document ``x`` for class ``y`` is computed in log space:
    ``log p(y|x) ~ sum_i(x_i * log p(i|y)) + log p(y)``. The class with the
    highest score wins.

    Args:
        groups: Number of classes (second dimension of the score matrix).
        voca: Vocabulary, forwarded unchanged to ``fvec.mulFeatGen``.
        testData: Raw test data, forwarded unchanged to ``fvec.mulFeatGen``.
        testLabel: Test labels; only their count is used, as the number of
            documents.
        probY: Class priors ``p(y)``; added per class, so presumably of
            length ``groups``.
        probIyMul: Per-class feature probabilities ``p(i|y)``; summed along
            axis 1, so presumably shaped ``(groups, vocabulary size)``.

    Returns:
        Array with one predicted label per document. Labels are 1-based
        (``argmax + 1``).
    """
    # Number of testing docs.
    numDocs = len(testLabel)

    # Generate the feature vector for each doc.
    testMulFeat = fvec.mulFeatGen(testData, voca, numDocs)

    # The log-probabilities do not depend on the document, so compute them
    # once instead of once per loop iteration.
    logProbIyMul = np.log(probIyMul)
    logProbY = np.log(probY)

    probYXmul = np.zeros((numDocs, groups))
    for index, item in enumerate(testMulFeat):
        probYXmul[index, :] = np.sum(item * logProbIyMul, axis=1) + logProbY

    # Best-scoring class per doc, shifted to 1-based labels.
    predictMul = np.argmax(probYXmul, axis=1) + 1
    return predictMul
import numpy as np

import featureVector as fvec

# Load the learned multinomial Naive Bayes parameters. The arrays are read
# inside the ``with`` block because the archive is closed on exit.
with np.load("learnMulAlp.npz") as data:
    groups = data["params"][0]
    voca = data["params"][1]
    # Iterated below one entry per alpha value; each entry is used as a
    # p(i|y) table.
    probIyMulAlpha = data["probIyMulAlpha"]
    probY = data["probY"]

testData = np.loadtxt("../data/test.data", delimiter=' ', dtype=int)
#testLabel = np.loadtxt("../data/test.label", delimiter=' ', dtype=int)

# Generate feature vector for each doc
testMulFeat = fvec.mulFeatGen(testData, voca)

# Number of testing docs
numDocs = len(testMulFeat)

# Apply Navie Bayes classifier to test data
# Multinomial: log(p(x|y)) = sum(xi*log(Pi|y)) + log(p(y))
# p(y=label|x) ~ p(y=label)*p(x|y=label)
alpRan = probIyMulAlpha.shape[0]
probYXmul = np.zeros((alpRan, numDocs, groups))

# log(p(y)) is the same for every alpha and every document; log(p(i|y)) is
# the same for every document of a given alpha. Compute each only once.
logProbY = np.log(probY)
for i, alp in enumerate(probIyMulAlpha):
    logAlp = np.log(alp)
    for index, item in enumerate(testMulFeat):
        probYXmul[i, index, :] = np.sum(item * logAlp, axis=1) + logProbY

# Find the best prediction label for each doc