def main():
    """Fit the class/feature-probability classifier and report its error rates.

    Loads the hockey and baseball corpora through ``Dataset`` (cutoff=2000),
    splits them with ``getTrainAndTestSets(0.8, seed=100)``, estimates the
    class probabilities for labels -1 and 1 plus one feature probability per
    word for each label, and prints the train and test error rates returned
    by ``computeError``.

    NOTE(review): the helpers look like the pieces of a naive Bayes
    classifier, and 0.8 is presumably the training fraction -- confirm
    against their definitions.
    """
    dataset = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    Xtrain, Ytrain, Xtest, Ytest = dataset.getTrainAndTestSets(0.8, seed=100)

    # Class probabilities estimated from the training labels.
    probNeg = getClassProb(Ytrain, -1)
    probPos = getClassProb(Ytrain, 1)

    vocabulary = dataset.getWordList()
    vocabularySize = len(vocabulary)

    def featureProbVector(label):
        # One getFeatureProb value per word index, packed into an ndarray.
        probs = []
        for wordIndex in range(vocabularySize):
            probs.append(getFeatureProb(Xtrain, Ytrain, label, wordIndex))
        return np.asarray(probs)

    featNeg = featureProbVector(-1)
    featPos = featureProbVector(1)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    trainError = computeError(Xtrain, Ytrain, probNeg, probPos, featNeg, featPos)
    print('Train error rate is ' + str(trainError))

    testError = computeError(Xtest, Ytest, probNeg, probPos, featNeg, featPos)
    print('Test error rate is ' + str(testError))
def main():
    """Greedy forward feature selection scored by ridge training error.

    Loads the hockey and baseball corpora through ``Dataset`` (cutoff=200)
    and splits them with ``getTrainAndTestSets(0.8, seed=100)``. For up to 40
    rounds, every column not yet selected is tried as an addition to the
    current column set; the column giving the lowest training error (as
    reported by ``computeError`` on weights from ``trainRidge`` with
    lam=100) is kept. Selection stops early once the best candidate no
    longer strictly lowers the training error.

    Afterwards the model is refit on the selected columns, train and test
    error rates are printed, and the (up to) ten words whose selection
    produced the largest error reduction are printed, largest first.

    NOTE(review): 0.8 is presumably the training fraction and ``lam`` the
    ridge penalty -- confirm against ``Dataset`` / ``trainRidge``.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)
    lam = 100
    maxFeatures = 40
    cols = []          # selected column indices, in selection order
    selected = set()   # same indices, for O(1) membership tests
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}           # column index -> error reduction when it was added

    for _ in range(maxFeatures):
        bestJ = None
        bestErrorRate = 1
        for j in range(n):
            # Skip columns already chosen: re-adding one would feed a
            # duplicated column to the ridge fit, which can change the
            # error and let the same feature be selected twice.
            if j in selected:
                continue
            trialCols = cols + [j]
            Xtrial = Xtrain[:, trialCols]
            w = trainRidge(Xtrial, Ytrain, lam)
            errorRate = computeError(Xtrial, Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate

        # Stop when no candidate strictly improves on the current error
        # (this also covers the case where no candidate was available).
        if bestJ is None or bestErrorRate >= currentError:
            break

        cols.append(bestJ)
        selected.add(bestJ)
        dic[bestJ] = currentError - bestErrorRate
        currentError = bestErrorRate
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Current error rate is ' + str(currentError))

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Top 10 features: largest error reduction first, ties broken by the
    # larger column index (same ordering as sorting (value, key) descending).
    wordList = d.getWordList()
    topCols = sorted(dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:10]
    topFeatures = [wordList[index] for (index, _value) in topCols]
    for f in topFeatures:
        print(f)
def main():
    """Single-pass feature filter scored by ridge training error.

    Loads the hockey and baseball corpora through ``Dataset`` (cutoff=1000)
    and splits them with ``getTrainAndTestSets(0.8, seed=100)``. Columns are
    visited once, in index order: each is tentatively added to the kept set,
    a model is fit with ``trainRidge`` (lam=100), and the column is dropped
    again if the training error from ``computeError`` is not strictly lower
    than the best error so far. The running best error is printed every
    tenth column.

    Afterwards the model is refit on the kept columns, train and test error
    rates are printed, and the (up to) ten words whose acceptance produced
    the largest error reduction are printed, largest first.

    NOTE(review): the source's indentation was lost; the progress print is
    assumed to sit at loop level (not inside the accept branch) -- confirm.
    """
    dataset = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    Xtrain, Ytrain, Xtest, Ytest = dataset.getTrainAndTestSets(0.8, seed=100)

    ridgeLambda = 100
    keptCols = []
    lowestError = 1
    numCols = Xtrain.shape[1]
    gains = {}  # column index -> error reduction when it was accepted

    for col in range(numCols):
        keptCols.append(col)
        trainSubset = Xtrain[:, keptCols]
        weights = trainRidge(trainSubset, Ytrain, ridgeLambda)
        candidateError = computeError(trainSubset, Ytrain, weights)

        if candidateError >= lowestError:
            # No strict improvement: undo the tentative addition.
            keptCols.pop()
        else:
            gains[col] = lowestError - candidateError
            lowestError = candidateError

        # Periodic progress report; single-argument print(...) emits the
        # same text on Python 2 and 3.
        if col % 10 == 0:
            print(lowestError)

    finalTrain = Xtrain[:, keptCols]
    weights = trainRidge(finalTrain, Ytrain, ridgeLambda)
    trainError = computeError(finalTrain, Ytrain, weights)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, keptCols], Ytest, weights)
    print('Test error rate is ' + str(testError))

    # Top 10 features: largest error reduction first, ties broken by the
    # larger column index.
    vocabulary = dataset.getWordList()
    ranked = sorted(gains.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    topWords = [vocabulary[index] for index, _gain in ranked[:10]]
    for word in topWords:
        print(word)