def testRBF():
    """Train an RBF-kernel SVM via SMO and report training and test error rates.

    Loads data/testSetRBF.txt for training and data/testSetRBF2.txt for testing.
    Classification uses only the support vectors (samples with alpha > 0).
    """
    dataArr, labelArr = readData.loadDataSet('data/testSetRBF.txt')
    smo = SMO(dataArr, labelArr, 200, 0.0001, 'rbf', 1.3)
    b, alphas = smo.train(100)
    X = np.array(dataArr)
    y = np.array(labelArr).T[:, np.newaxis]
    # Support vectors are exactly the samples with non-zero Lagrange multipliers.
    svInd = np.nonzero(alphas > 0)[0]
    sVs = X[svInd]
    labelSV = y[svInd]
    print("there are %d Support Vectors" % np.shape(sVs)[0])
    m, n = np.shape(X)
    errorCount = 0
    for i in range(m):
        kernelEval = smo.kernel(sVs, X[i, :], 'rbf', 1.3)
        # f(x) = sum_j alpha_j * y_j * K(sv_j, x) + b
        predict = np.dot(kernelEval, labelSV * alphas[svInd]) + b
        if np.sign(predict) != np.sign(y[i]):
            errorCount += 1
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    print("the training error rate is: %f" % (float(errorCount) / m))
    dataArr, labelArr = readData.loadDataSet('data/testSetRBF2.txt')
    errorCount = 0
    X = np.array(dataArr)
    y = np.array(labelArr).T
    m, n = np.shape(X)
    for i in range(m):
        kernelEval = smo.kernel(sVs, X[i, :], 'rbf', 1.3)
        # NOTE(review): the training loop above uses kernelEval WITHOUT .T —
        # confirm which orientation smo.kernel returns; the two loops should match.
        predict = np.dot(kernelEval.T, labelSV * alphas[svInd]) + b
        if np.sign(predict) != np.sign(y[i]):
            errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
def processTrainData(filename):
    """Load training comments, tokenize them, and compute TF/DF/IDF statistics.

    Each comment is normalized, tokenized, stop-word-filtered, and stemmed;
    its term-frequency dict is attached via setTokenList.

    Returns:
        tuple: (listOfTrainComments, listOfUniqueTokens,
                invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
    """
    listOfTrainComments = []
    listOfUniqueTokens = []  # corpus tokens (NOTE: contains duplicates, see below)
    documentFrequencyOfTokens = {}
    xVal, yVal = rd.loadDataSet(filename)
    for i in range(xVal.shape[0]):
        comment = Comment(i)
        comment.setContent(xVal[i])
        comment.setStatus(yVal[i])
        listOfTrainComments.append(comment)
    for comment in listOfTrainComments:
        content = fext.commentNormalizer(comment.getContent())
        tokenList = fext.commentTokenizer(content)
        tokenList = fext.removeStopWords(tokenList)
        tokenList = fext.commentStemmer(tokenList)
        listOfUniqueTokens = listOfUniqueTokens + tokenList
        dicTokens = calculateTermFrequency(tokenList)
        comment.setTokenList(dicTokens)
        # Each token counts once per document towards its document frequency.
        for key in dicTokens:
            if key in documentFrequencyOfTokens:
                documentFrequencyOfTokens[key] += 1
            else:
                documentFrequencyOfTokens[key] = 1
    # Tokens appearing in at least 5 documents are appended a second time.
    # NOTE(review): listOfUniqueTokens therefore holds duplicates despite its
    # name — confirm with callers whether only the df >= 5 tokens were intended.
    for key, val in documentFrequencyOfTokens.items():
        if val >= 5:
            listOfUniqueTokens.append(key)
    # Smoothed inverse document frequency: 1 + log2(N / df).
    invertedDocumentFrequencyOfTokens = {}
    totalNumberOfDoc = len(listOfTrainComments)
    for key, val in documentFrequencyOfTokens.items():
        invertedDocumentFrequencyOfTokens[key] = 1 + np.log2(totalNumberOfDoc / val)
    return (listOfTrainComments, listOfUniqueTokens,
            invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
def processTrainData(filename):
    """Load training comments and compute term/document-frequency statistics.

    NOTE(review): this appears to be a duplicate of an earlier
    processTrainData definition in this file; in Python the later definition
    shadows the earlier one — confirm whether one copy should be removed.

    Returns:
        tuple: (listOfTrainComments, listOfUniqueTokens,
                invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
    """
    listOfTrainComments = []
    listOfUniqueTokens = []  # corpus tokens; accumulates duplicates (see note below)
    documentFrequencyOfTokens = {}
    xVal, yVal = rd.loadDataSet(filename)
    for idx in range(xVal.shape[0]):
        cmnt = Comment(idx)
        cmnt.setContent(xVal[idx])
        cmnt.setStatus(yVal[idx])
        listOfTrainComments.append(cmnt)
    for cmnt in listOfTrainComments:
        # Normalize -> tokenize -> drop stop words -> stem.
        normalized = fext.commentNormalizer(cmnt.getContent())
        tokens = fext.commentTokenizer(normalized)
        tokens = fext.removeStopWords(tokens)
        tokens = fext.commentStemmer(tokens)
        listOfUniqueTokens = listOfUniqueTokens + tokens
        tf = calculateTermFrequency(tokens)
        cmnt.setTokenList(tf)
        # Document frequency: one count per document containing the token.
        for token in tf:
            if token in documentFrequencyOfTokens:
                documentFrequencyOfTokens[token] += 1
            else:
                documentFrequencyOfTokens[token] = 1
    # NOTE(review): tokens with df >= 5 get appended again, so the list is
    # not actually unique — verify the intended contents with callers.
    for token, df in documentFrequencyOfTokens.items():
        if df >= 5:
            listOfUniqueTokens.append(token)
    # Smoothed IDF: 1 + log2(N / df).
    invertedDocumentFrequencyOfTokens = {}
    totalNumberOfDoc = len(listOfTrainComments)
    for token, df in documentFrequencyOfTokens.items():
        invertedDocumentFrequencyOfTokens[token] = 1 + np.log2(totalNumberOfDoc / df)
    return (listOfTrainComments, listOfUniqueTokens,
            invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
def testLinear():
    """Train a linear SVM via SMO on data/testSet.txt and plot the result.

    Scatters the positive class in red and the negative class in green,
    then draws the separating line derived from the learned weights.
    """
    dataArr, labelArr = readData.loadDataSet('data/testSet.txt')
    smo = SMO(dataArr, labelArr, 0.6, 0.001, 'linear')
    b, alphas = smo.train(40)
    w = smo.calcLinearWs()
    X = np.array(dataArr)
    y = np.array(labelArr).T
    X_pos = X[y > 0]
    X_neg = X[y < 0]
    x = np.linspace(-7, 12, 100)
    plt.figure()
    plt.scatter(X_pos[:, 0], X_pos[:, 1], c='r')
    # BUG FIX: the negative class was plotted as (column 1, column 1),
    # collapsing it onto the diagonal; plot (column 0, column 1) like X_pos.
    plt.scatter(X_neg[:, 0], X_neg[:, 1], c='g')
    print(w[0], w[1])
    # Decision boundary: w0*x + w1*y + b = 0  =>  y = (w0*x + b) / -w1.
    plt.plot(x, (w[0] * x + b) / -w[1])
    plt.show()
def processTestData(filename):
    """Load test comments, run the tokenization pipeline, and attach TF dicts.

    Mirrors the training pipeline (normalize -> tokenize -> remove stop words
    -> stem) but computes no corpus-level statistics.

    Returns:
        list: Comment objects with content, status, and token-list set.
    """
    listOfTestComments = []
    xVal, yVal = rd.loadDataSet(filename)
    for i in range(xVal.shape[0]):
        comment = Comment(i)
        comment.setContent(xVal[i])
        comment.setStatus(yVal[i])
        listOfTestComments.append(comment)
    for comment in listOfTestComments:
        content = fext.commentNormalizer(comment.getContent())
        tokenList = fext.commentTokenizer(content)
        tokenList = fext.removeStopWords(tokenList)
        tokenList = fext.commentStemmer(tokenList)
        comment.setTokenList(calculateTermFrequency(tokenList))
    return listOfTestComments