def svmCV(k, filetype, groups, cp, ke):
    """Run k-fold cross-validation of an SVM classifier and save each model.

    For every fold ``i`` in ``range(k)`` the data, vocabulary and labels are
    obtained from ``dv.createDataVoca(k, i, filetype)``, split into a test
    slice and a training remainder, turned into feature vectors with
    ``fvec.mulFeatGen`` and used to fit/evaluate an ``svm.SVC``.

    Args:
        k: Number of folds (also the number of runs).
        filetype: Passed through to ``dv.createDataVoca`` and embedded in the
            file name of the saved model.
        groups: Not used by this function; kept for interface compatibility.
        cp: Value for the ``C`` parameter of ``svm.SVC``.
        ke: Value for the ``kernel`` parameter of ``svm.SVC``.

    Returns:
        A NumPy array of length ``k`` holding, per fold, the value returned
        by ``fvec.accPredict(testLabel, predictions)``.

    Side effects:
        Prints progress messages and dumps every fitted classifier with
        ``joblib.dump`` under ``../modelSave/``.
    """
    predictAcc = np.zeros(k)

    for i in range(k):

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('k: %s run: %s' % (k, i))
        result = dv.createDataVoca(k, i, filetype)
        data = result[0]
        voca = result[1]
        label = result[2]

        # Floor division keeps the fold size an int on Python 3 as well,
        # so it stays usable as a slice index.
        subsetSize = len(label) // k
        foldStart = i * subsetSize
        foldEnd = (i + 1) * subsetSize

        testLabel = label[foldStart:foldEnd]
        trainLabel = np.append(label[:foldStart], label[foldEnd:], axis=0)

        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)

        print('len(testLabel): %s' % (lenTestLab,))
        print('len(trainLabel): %s' % (lenTrainLab,))

        # NOTE(review): column 0 of `data` is presumably a 1-based document
        # id and `data` has three columns -- confirm against createDataVoca.
        docIds = data[:, 0]

        # Rows of the test fold, with ids shifted down by the fold start.
        mask = np.logical_and(docIds > foldStart, docIds <= foldEnd)
        testData = data[mask] - np.array([foldStart, 0, 0])

        # Rows after the test fold have their ids shifted down by one fold
        # size; rows before the test fold are used unchanged.
        trainDatahi = data[docIds > foldEnd] - np.array([subsetSize, 0, 0])
        trainDatalo = data[docIds <= foldStart]
        trainData = np.append(trainDatahi, trainDatalo, axis=0)

        # generate features for all the reviews
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        print("accuracy: " + str(predictAcc[i]))
        joblib.dump(
            clf, '../modelSave/svmModelAll_K' + str(k) + '_Run' + str(i) +
            '_' + filetype + '.pkl')

    return predictAcc
def svmCV(k, filetype, groups, cp, ke):
    """Cross-validate an SVM over ``k`` folds, saving the model of each run.

    Each run ``i`` loads ``(data, voca, label)`` from
    ``dv.createDataVoca(k, i, filetype)``, holds out the ``i``-th slice of
    ``len(label) // k`` labels as the test set, trains ``svm.SVC`` on the
    rest and records the score returned by ``fvec.accPredict``.

    Args:
        k: Number of folds / runs.
        filetype: Forwarded to ``dv.createDataVoca``; also part of the saved
            model's file name.
        groups: Unused here; retained so existing callers keep working.
        cp: ``C`` parameter handed to ``svm.SVC``.
        ke: ``kernel`` parameter handed to ``svm.SVC``.

    Returns:
        NumPy array of length ``k`` with one ``fvec.accPredict`` result per
        run.

    Side effects:
        Prints progress to stdout and writes one ``.pkl`` file per run into
        ``../modelSave/`` through ``joblib.dump``.
    """
    predictAcc = np.zeros(k)

    for i in range(k):
        # print(...) with one pre-formatted string is valid on Python 2 and 3
        # and produces the same text as the former print statements.
        print('k: %s run: %s' % (k, i))
        data, voca, label = dv.createDataVoca(k, i, filetype)[:3]

        # `//` instead of `/`: true division on Python 3 would give a float
        # and break the slicing below.
        subsetSize = len(label) // k
        lo = i * subsetSize
        hi = (i + 1) * subsetSize

        testLabel = label[lo:hi]
        trainLabel = np.concatenate((label[:lo], label[hi:]), axis=0)

        lenTestLab = len(testLabel)
        lenTrainLab = len(trainLabel)
        print('len(testLabel): %s' % (lenTestLab,))
        print('len(trainLabel): %s' % (lenTrainLab,))

        # NOTE(review): assumes the first column of `data` is a 1-based
        # document index and that rows have three columns -- TODO confirm.
        firstCol = data[:, 0]

        # Held-out rows, re-based so their ids start right after 0.
        mask = np.logical_and(firstCol > lo, firstCol <= hi)
        tem = np.array([lo, 0, 0])
        testData = data[mask] - tem

        # Rows above the held-out range are shifted down by one fold size.
        tem1 = np.array([subsetSize, 0, 0])
        maskhi = firstCol > hi
        trainDatahi = data[maskhi] - tem1

        # Rows below the held-out range keep their ids.
        masklo = firstCol <= lo
        trainDatalo = data[masklo]

        trainData = np.concatenate((trainDatahi, trainDatalo), axis=0)

        # generate features for all the reviews
        trainFeat = fvec.mulFeatGen(trainData, voca, lenTrainLab)
        testFeat = fvec.mulFeatGen(testData, voca, lenTestLab)

        clf = svm.SVC(C=cp, kernel=ke, cache_size=1000)
        print("we are training our data:")
        clf.fit(trainFeat, trainLabel)
        print("we are testing our data:")
        predictMul = clf.predict(testFeat)
        print("we are calculating accuracy:")
        predictAcc[i] = fvec.accPredict(testLabel, predictMul)

        print("accuracy: " + str(predictAcc[i]))
        modelPath = ('../modelSave/svmModelAll_K' + str(k) + '_Run' + str(i)
                     + '_' + filetype + '.pkl')
        joblib.dump(clf, modelPath)

    return predictAcc
def navieBayesMulTest(groups, voca, testData, testLabel, probY, probIyMul):
    """Predict a label for every test document with multinomial Naive Bayes.

    Args:
        groups: Number of classes; width of the per-document score table.
        voca: Vocabulary, forwarded to ``fvec.mulFeatGen``.
        testData: Test data, forwarded to ``fvec.mulFeatGen``.
        testLabel: Test labels; only their count is used here.
        probY: Class priors p(y).
        probIyMul: Per-class feature probabilities p(i|y).
            NOTE(review): presumably shaped (groups, vocabulary size) --
            confirm against the training code.

    Returns:
        Array with one predicted label per document, computed as the argmax
        over classes plus 1 (labels are 1-based).
    """
    # Number of testing docs
    numDocs = len(testLabel)

    # Generate feature vector for each doc
    testMulFeat = fvec.mulFeatGen(testData, voca, numDocs)

    # Apply Naive Bayes classifier to test data
    # Multinomial: log(p(x|y)) = sum(xi*log(Pi|y)) + log(p(y))
    # p(y=label|x) ~ p(y=label)*p(x|y=label)

    # The logs do not depend on the document, so compute them once instead
    # of once per loop iteration.
    logProbIyMul = np.log(probIyMul)
    logProbY = np.log(probY)

    probYXmul = np.zeros((numDocs, groups))
    for index, item in enumerate(testMulFeat):
        probYXmul[index, :] = np.sum(item * logProbIyMul, axis=1) + logProbY

    # Find the best prediction label for each doc
    predictMul = np.argmax(probYXmul, axis=1) + 1

    return predictMul
def navieBayesMulTest(groups, voca, testData, testLabel, probY, probIyMul):
    """Classify test documents using a multinomial Naive Bayes model.

    For each document feature vector ``x`` produced by ``fvec.mulFeatGen``
    the score of class ``y`` is ``sum(x_i * log(p(i|y))) + log(p(y))``; the
    class with the highest score wins.

    Args:
        groups: Number of classes (columns of the score matrix).
        voca: Vocabulary passed to ``fvec.mulFeatGen``.
        testData: Test data passed to ``fvec.mulFeatGen``.
        testLabel: Test labels; used only to determine the document count.
        probY: Prior probability of each class.
        probIyMul: Feature probabilities given each class.
            NOTE(review): looks like one row per class -- verify with the
            code that learns it.

    Returns:
        Array of predicted labels, one per document, obtained as
        ``argmax + 1`` so that labels start at 1.
    """
    # Number of testing docs
    numDocs = len(testLabel)

    # Generate feature vector for each doc
    testMulFeat = fvec.mulFeatGen(testData, voca, numDocs)

    # Loop-invariant terms of the log-posterior, hoisted out of the loop so
    # the logarithms are evaluated a single time.
    logLikelihood = np.log(probIyMul)
    logPrior = np.log(probY)

    # Multinomial: log(p(x|y)) = sum(xi*log(Pi|y)) + log(p(y))
    # p(y=label|x) ~ p(y=label)*p(x|y=label)
    probYXmul = np.zeros((numDocs, groups))
    for index, item in enumerate(testMulFeat):
        weighted = item * logLikelihood
        probYXmul[index, :] = np.sum(weighted, axis=1) + logPrior

    # Find the best prediction label for each doc
    predictMul = np.argmax(probYXmul, axis=1) + 1

    return predictMul
예제 #5
0
import numpy as np
import featureVector as fvec


# Load the learned model: the first two entries of "params" are used as the
# number of classes and the vocabulary; "probIyMulAlpha" is iterated below
# with one entry per alpha value, and "probY" holds the class priors.
with np.load("learnMulAlp.npz") as data:
    groups = data["params"][0]
    voca = data["params"][1]
    probIyMulAlpha = data["probIyMulAlpha"]
    probY = data["probY"]

testData = np.loadtxt("../data/test.data", delimiter=' ', dtype=int)
#testLabel = np.loadtxt("../data/test.label", delimiter=' ', dtype=int)

# Generate feature vector for each doc
testMulFeat = fvec.mulFeatGen(testData, voca)

# Number of testing docs
numDocs = len(testMulFeat)


# Apply Naive Bayes classifier to test data
# Multinomial: log(p(x|y)) = sum(xi*log(Pi|y)) + log(p(y))
# p(y=label|x) ~ p(y=label)*p(x|y=label)
alpRan = probIyMulAlpha.shape[0]
probYXmul = np.zeros((alpRan, numDocs, groups))

# log(p(y)) is the same for every alpha and every document: compute it once.
logProbY = np.log(probY)
for i, alp in enumerate(probIyMulAlpha):
    # log(p(i|y)) only depends on the current alpha, not on the document,
    # so it is evaluated once per alpha instead of once per document.
    logAlp = np.log(alp)
    for index, item in enumerate(testMulFeat):
        probYXmul[i, index, :] = np.sum(item * logAlp, axis=1) + logProbY


# Find the best prediction label for each doc