Example #1
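These listings are Python 2 excerpts from a larger module, so no imports are shown. A minimal header they appear to rely on is sketched below; the module paths assume a scikit-learn release in the 0.18 to 1.1 range (matching the base_estimator keyword used in Example #2), and SimpleTimer, outputScores, and plot_learning_curve are project-specific helpers sketched after the examples that call them.

# assumed imports, not part of the original listing
import numpy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve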
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    outFile = open('knnLog25.txt','a')
    print 'running mashable knn simulation'
    outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_M.shape[0],test_M.shape[1]))
    with SimpleTimer('time to train', outFile):
        clf = KNeighborsClassifier(weights='distance').fit(train_M, dataTrain.target)
    plot_learning_curve(clf, 'knn with %d neighbors' % clf.n_neighbors, train_M, dataTrain.target, cv=5, n_jobs=4)
    
    baseScore = clf.score(test_M, dataTest.target)
    baseParams = clf.get_params(True)
    baseNeighbors = baseParams['n_neighbors']
    print 'baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors)
    outFile.write('baseline score %.3f base n_neighbors %d \n' % (baseScore, baseNeighbors))
    
    res = []
    with SimpleTimer('time to fine tune number of neighbors', outFile):
        for neighbors in range(2,baseNeighbors * 10):
#             print 'training for neighbors %d' % neighbors
            clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance').fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, neighbors))
            outFile.write('%d %.3f \n' % (neighbors, score))
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print res[:5]
    bestNeighbors = res[0][1]
    print ('best number of neighbors is %d' % bestNeighbors)
    outFile.write('best number of neighbors is %d  and score is %.3f\n' % (bestNeighbors, res[0][0]))
    
    bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
    bestClf.fit(train_M, dataTrain.target)
    
    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)
    
    results = predicted == dataTest.target
    print numpy.mean(results)
    res = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in res[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
    '''
    train_sizes, train_scores, valid_scores = learning_curve(DecisionTreeClassifier(), train_M, dataTrain.target, train_sizes=[50, 80, 110], cv=5)
    print train_sizes
    print train_scores
    print valid_scores
    '''
       
    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors, train_M, dataTrain.target, cv=5, n_jobs=4)
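SimpleTimer and outputScores are not scikit-learn names; they are helpers defined elsewhere in the project. Hypothetical minimal versions, reconstructed only from how they are called in these examples (a timing context manager and a metrics printer), could look like this:

import time
from sklearn import metrics

class SimpleTimer(object):
    # hypothetical helper: context manager that logs elapsed wall-clock time
    def __init__(self, label, outFile):
        self.label = label
        self.outFile = outFile
    def __enter__(self):
        self.start = time.time()
        return self
    def __exit__(self, excType, excValue, tb):
        elapsed = time.time() - self.start
        print '%s: %.2f seconds' % (self.label, elapsed)
        self.outFile.write('%s: %.2f seconds \n' % (self.label, elapsed))

def outputScores(target, predicted, outFile):
    # hypothetical helper: per-class precision/recall/F1 report to stdout and the log
    report = metrics.classification_report(target, predicted)
    print report
    outFile.write(report + '\n')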
Example #2
def runBoosting(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    outFile = open('boostingLog.txt','a')
    print 'running boosting algo'
    outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_M.shape[0],test_M.shape[1]))
    # takes a very long time to run
#     score, bestDepth, num = tryVariousHyperParams(dataTrain, dataTest, train_M, test_M)
    bestDepth = 7
    bestNum = 10000
    with SimpleTimer('time to train', outFile):
        estimator = DecisionTreeClassifier(max_depth=bestDepth)
        bestClf = AdaBoostClassifier(base_estimator=estimator,  n_estimators=bestNum)
        bestClf.fit(train_M, dataTrain.target)
    
    bestScore = bestClf.score(test_M, dataTest.target)
    print 'the best score %.3f' % bestScore
    outFile.write('depth %d, num %d score %.3f \n'%(bestDepth, bestNum, bestScore))
    predicted = bestClf.predict(test_M)
    
    trainPredict = bestClf.predict(train_M)
    
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)
    
    results = predicted == dataTest.target
    res = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in res[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
    plot_learning_curve(bestClf, 'boosting with %d trees' % bestNum, train_M, dataTrain.target, cv=3, n_jobs=4)
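The bestDepth and bestNum values above are hard-coded, and the commented-out call suggests they originally came from tryVariousHyperParams, whose definition is not included in the listing. A hypothetical grid search consistent with the (score, bestDepth, num) shape expected at the call site:

def tryVariousHyperParams(dataTrain, dataTest, train_M, test_M):
    # hypothetical reconstruction: scan tree depth and ensemble size,
    # keeping the combination with the best test-set accuracy
    best = (0.0, 1, 100)
    for depth in range(1, 10):
        for num in [100, 1000, 10000]:
            clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=depth),
                                     n_estimators=num)
            clf.fit(train_M, dataTrain.target)
            score = clf.score(test_M, dataTest.target)
            if score > best[0]:
                best = (score, depth, num)
    return best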
Example #3
def runSVMSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    kernel = "linear"
    outFile = open('svmSarinLog%s.txt' % kernel,'a')
    print 'running svm code'
    
    outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_M.shape[0],test_M.shape[1]))
    
    penalty = 0.025
    with SimpleTimer('time to train', outFile):
#         clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=30, random_state=42)
#         clf = LinearSVC(C=1.0)
        clf = SVC(kernel=kernel, C=penalty, degree=1)
        clf.fit(train_M, dataTrain.target)
    
    baseScore = clf.score(test_M, dataTest.target)
    print 'baseline score %.3f base C %.3f' % (baseScore, penalty)
    outFile.write('baseline score %.3f base C %.3f \n' % (baseScore, penalty))
    
    res = []
    with SimpleTimer('time to tune penalty', outFile):
        for pen in [1,5,10,15,20,30]:
            print 'training for penalty %.3f' % pen
            clf = SVC(kernel=kernel, C=pen, degree=1)
#             clf = LinearSVC(loss='squared_hinge', C=1.0)
            clf.fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, pen))
            trainScore = clf.score(train_M, dataTrain.target)
            outFile.write('test %.3f %.3f \n' % (pen, score))
            outFile.write('train %.3f %.3f \n' % (pen, trainScore))
            
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print res[:5]
    
    bestPen = res[0][1]
    print 'best penalty is %.3f' % bestPen
    
    bestClf = SVC(kernel=kernel, C=bestPen, degree=1)
    bestClf.fit(train_M, dataTrain.target)
    
    predicted = bestClf.predict(test_M)
    
    trainPredict = bestClf.predict(train_M)
    
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)
    
    results = predicted == dataTest.target
    res = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in res[:10]:
        print dataTest.data[i], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))
        

    
    plot_learning_curve(bestClf, 'svm with %s kernel & penalty %.3f' % (kernel, bestPen), train_M, dataTrain.target, cv=5, n_jobs=4)
def runDecisionTreeSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    print 'running decision tree'
    outFile = open('decisionTreeLog30.txt','a')
    
    
    outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1]))
    outFile.write('test==>  %d, %d \n'%(test_M.shape[0],test_M.shape[1]))
    with SimpleTimer('time to train', outFile):
        clf = DecisionTreeClassifier().fit(train_M, dataTrain.target)
    
    baseScore = clf.score(test_M, dataTest.target)
    initHeight = clf.tree_.max_depth
    
    print 'baseline score %.3f base height %d' % (baseScore, initHeight)
    outFile.write('baseline score %.3f base height %d \n' % (baseScore, initHeight))
    
    res = []
    with SimpleTimer('time to prune', outFile):
        for height in range(initHeight, 2 , -1):
#             print 'training for height %d' % height
            clf = DecisionTreeClassifier(max_depth=height).fit(train_M, dataTrain.target)
            score = clf.score(hold_M, holdout.target)
            res.append((score, height))
            outFile.write('%d %.3f \n' % (height, score))
    res = sorted(res, key=lambda x:x[0], reverse=True)
    print res[:5]
    '''
    train_sizes, train_scores, valid_scores = learning_curve(DecisionTreeClassifier(), train_M, dataTrain.target, train_sizes=[50, 80, 110], cv=5)
    print train_sizes
    print train_scores
    print valid_scores
    '''
    bestDepth = res[0][1]
    print 'best depth is %d' % bestDepth
    outFile.write('best depth is %d  and score is %.3f \n' % (bestDepth, res[0][0]))
    bestClf = DecisionTreeClassifier(max_depth=bestDepth)
    bestClf.fit(train_M, dataTrain.target)
    predicted = bestClf.predict(test_M)
    trainPredict = bestClf.predict(train_M)
    # compare per-class counts in the training labels vs. the tree's predictions
    print len(filter(lambda x:x==0, dataTrain.target)), len(filter(lambda x:x==0, trainPredict))
    print len(filter(lambda x:x==1, dataTrain.target)), len(filter(lambda x:x==1, trainPredict))
    print len(filter(lambda x:x==2, dataTrain.target)), len(filter(lambda x:x==2, trainPredict))
    
    print 'testing score'
    outFile.write('testing score \n')
    outputScores(dataTest.target, predicted, outFile)
    
    print 'training score'
    outFile.write('training score \n')
    outputScores(dataTrain.target, trainPredict, outFile)
    
    results = predicted == dataTest.target
    wrong = [i for i in range(len(results)) if not results[i]]
    print 'classifier got these wrong:'
    for i in wrong[:10]:
        print dataTest.data[i][0], dataTest.target[i]
        outFile.write('%s %d \n' % (dataTest.data[i][0], dataTest.target[i]))
    plot_learning_curve(bestClf, 'decision tree after pruning from %d to %d depth' % (initHeight, bestDepth), train_M, dataTrain.target, cv=5, n_jobs=4)
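Each example ends by calling plot_learning_curve, which is also defined outside this excerpt. Its call signature matches the well-known helper from the scikit-learn documentation, so a compatible sketch (an assumption, not the author's exact implementation) is:

import matplotlib.pyplot as plt

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=1):
    # assumed helper: plot mean training and cross-validation accuracy
    # as a function of the number of training examples
    train_sizes, train_scores, valid_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs)
    plt.figure()
    plt.title(title)
    plt.xlabel('training examples')
    plt.ylabel('score')
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='training score')
    plt.plot(train_sizes, valid_scores.mean(axis=1), 'o-', label='cross-validation score')
    plt.legend(loc='best')
    plt.show()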