Example #1
def test_run(xTrainFile, yTrainFile):
    X = np.genfromtxt(xTrainFile,delimiter=",")
    Y = np.genfromtxt(yTrainFile,delimiter=",")
    
    M = X.shape[0]

    #maxDepth
    ########################

    nBags = 3
    YHat = np.zeros((M,nBags))

    rforest = [None] * nBags

    maxDepth = 10
    nFeatures = 91
    minParent = 8

    for l in range(nBags):          # fill every slot, including bag 0
        print("bags", l)
        Xi,Yi = ml.bootstrapData(X,Y, M)

        rforest[l] = dtree.treeRegress()
        rforest[l].train(Xi,Yi,maxDepth=maxDepth)
        YHat[:,l] = rforest[l].predict(X)[:,0]

    write_to_kaggle(YHat)
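
Every example on this page draws its bags with ml.bootstrapData. If you want to follow along without the mltools package, here is a minimal sketch of the usual semantics (sample n_boot rows with replacement); the real mltools implementation may differ in details:

import numpy as np

def bootstrapData(X, Y, n_boot=None):
    """Minimal stand-in for ml.bootstrapData (hypothetical sketch):
    draw n_boot (row, label) pairs uniformly with replacement."""
    m = X.shape[0]
    if n_boot is None:
        n_boot = m                              # default: same size as the data
    idx = np.random.randint(0, m, size=n_boot)  # indices drawn with replacement
    return X[idx], Y[idx]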
Example #2
def train_from_triples(models, triple_file_name, destination_folder):
    Xdata, Ydata, Xtedata = init()
    Xs, _ = ml.transforms.rescale(Xdata)
    Ys = Ydata
    Xtes, _ = ml.transforms.rescale(Xtedata)
    print('----Training models------')
    #Xi, Yi = Xdata[0:10000], Ydata[0:10000]
    #Xs, Ys = Xs[0:10000], Ys[0:10000]
    Xi, Yi = Xdata, Ydata
    Xs, Ys = Xs, Ydata
    with open(triple_file_name, 'r') as f:
        triples = f.readlines()

    for triple in triples:
        nf, d, l = (int(v.strip()) for v in triple.split(','))
        print('Now Training (nf,d,ml):', nf, d, l)
        #dt = ml.dtree.treeClassify(Xi, Yi, maxDepth=d, nFeatures=nf, minLeaf=l)
        #models.append(dt)
        #Ypred = dt.predict(Xi)
        #print 'Training error with triple on unscaled: ', triple.strip(), 'is', computeError(Ypred[:,np.newaxis], Yi)
        Xi, Yi = ml.bootstrapData(Xs, Ys, Xs.shape[0])
        dt = ml.dtree.treeClassify(Xi, Yi, maxDepth=d, nFeatures=nf, minLeaf=l)
        Ypred = dt.predict(Xs)
        print('Training error with triple on scaled:', triple.strip(),
              'is', computeError(Ypred[:, np.newaxis], Ys))
        models.append(dt)

    #save_models(models, destination_folder)
    print('-----Predicting the scores------')
    kaggle_predict(models, True)
Example #3
def test_run(xTrainFile, yTrainFile):
    X = np.genfromtxt(xTrainFile,delimiter=",")
    Y = np.genfromtxt(yTrainFile,delimiter=",")
    TEST = np.genfromtxt("/home/john/Downloads/kaggle.X1.test.txt",delimiter=",") 
    M = X.shape[0]

    #maxDepth
    ########################

    nBags = 125

    rforest = [None] * nBags

    maxDepth = 40
    nFeatures = 100
    minParent = 8

    for l in range(nBags):
        print "bags", l
        Xi,Yi = ml.bootstrapData(X,Y, M)

        rforest[l] = dtree.treeRegress()
        rforest[l].train(Xi,Yi,maxDepth=maxDepth)

    mTest = TEST.shape[0]
    predict = np.zeros( (mTest, nBags) )
    for i in range(nBags):
        predict[:,i] = rforest[i].predict(TEST).T[0]
 
    predict = predict.mean(axis=1)   # average the bags; taking only column 0 would discard the ensemble
    _write_to_kaggle("treebag.csv",predict)
Example #4
def train(models, lower, upper, destination_folder):
    Xdata, Ydata, Xtedata = init()
    #X, Y = Xdata[0:10], Ydata[0:10]
    X, Y = Xdata, Ydata
    X, _ = ml.transforms.rescale(X)
    nFolds = 5
    trError = []
    testError = []
    thresholdError = 0.7
    #leaves = [5, 7, 10, 13, 15, 18, 21, 24, 27, 30, 33, 36, 40]
    for nFeatures in range(lower, upper):
        for depth in [10, 15, 16, 17, 19, 21, 30, 45, 50]:
            for minLeaf in [
                    5, 7, 10, 13, 20, 30, 64, 128, 150, 200, 250, 500, 1000,
                    1250
            ]:
                #print('depth', depth)
                print('Features, Depth, minLeaf, modelIndex:',
                      (nFeatures, depth, minLeaf, len(models)))

                start = time.time()
                Xi, Yi = ml.bootstrapData(X, Y, X.shape[0])
                errTr, errTe, m = train_fold(Xi, Yi, nFolds, nFeatures, depth,
                                             minLeaf)
                end = time.time()
                #models.extend(m)

                trError.append(errTr)
                testError.append(errTe)

                print('Average training error', trError[-1])
                print('Average test error', testError[-1])
                print('Total time for model', end - start,
                      'Time per split:', (end - start) / float(nFolds))

                if testError[-1] < 0.29 or trError[-1] < 0.20:
                    print(':::LOW ERR::: (f,d,ml,len_model,teE,trE)',
                          (nFeatures, depth, minLeaf, len(models), testError[-1], trError[-1]))

        # TODO: if the error is less than the threshold, add the model to the models array

    # plt.plot(range(0, len(trError)), trError, 'b-')
    # plt.plot(range(0, len(testError)), testError, 'g-')
    # plt.show()

    f = open('training_error' + str(lower) + '_' + str(upper), 'wb')  # binary mode for pickle
    p.dump(trError, f)
    f.close()
    f = open('test_error' + str(lower) + '_' + str(upper), 'wb')
    p.dump(testError, f)
    f.close()

    save_models(models, destination_folder)
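
save_models is a helper that this example never shows. A plausible minimal sketch, assuming pickle is imported as p (as the dumps above suggest) and that writing one file per model is acceptable, is:

import os

def save_models(models, destination_folder):
    # Hypothetical sketch: pickle each model to its own file in the folder.
    for i, model in enumerate(models):
        with open(os.path.join(destination_folder, 'model_%d.pkl' % i), 'wb') as f:
            p.dump(model, f)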
Example #5
    def __init__(self, X, Y, nFeatures, maxDepth, minLeaf, number_of_learner):
        (N, D) = X.shape
        self.number_of_learner = number_of_learner
        self.learners = [0] * self.number_of_learner
        for i in range(self.number_of_learner):
            (bstrp_x, bstrp_y) = ml.bootstrapData(X, Y)
            self.learners[i] = ml.dtree.treeClassify(bstrp_x,
                                                     bstrp_y,
                                                     nFeatures=nFeatures,
                                                     maxDepth=maxDepth,
                                                     minLeaf=minLeaf)
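
The snippet above only shows the constructor, so the class cannot make predictions yet. A minimal companion method, assuming each learner exposes the mltools predictSoft API (a hypothetical addition mirroring the BaggedTree class in Example #11), would average the soft predictions:

    def predictSoft(self, X):
        # Hypothetical companion method, not part of the original snippet:
        # average the class-probability estimates of all bagged learners.
        return np.mean([learner.predictSoft(X) for learner in self.learners], axis=0)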
Example #6
    def __init__(self, X, Y, Nbags=80, maxDepth=20, nFeatures=20):
        self.bags = []
        for i in range(Nbags):
            Xi, Yi = ml.bootstrapData(X, Y, X.shape[0])
            tree = ml.dtree.treeClassify(Xi,
                                         Yi,
                                         maxDepth=maxDepth,
                                         nFeatures=nFeatures)
            self.bags.append(tree)
        self.bt = BaggedTree(self.bags)
        self.bt.classes = np.unique(Y)
Example #7
def TrainEnsemble():
    Xtr, Ytr = X[:10000, :], Y[:10000]
    Xval, Yval = X[10000:20000, :], Y[10000:20000]

    nFeatures = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    for nFeature in nFeatures:
        print "=" * 50
        print 'Training Decision Trees with ', str(nFeature), ' features'
        bags = [1, 5, 10, 25, 45, 60, 75]
        bagTrainError = []
        bagValidationError = []
        ensembles = []
        for bag in bags:
            print('Training', bag, 'decision tree(s)')
            decisionTrees = [None] * bag
            trainingError = []
            for i in range(bag):
                # Drawing a random training sample every single time
                Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=10000)
                decisionTrees[i] = DecisionTreeClassifier(
                    max_features=nFeature)
                decisionTrees[i] = decisionTrees[i].fit(Xi, Yi)
                # decisionTrees[i] = ml.dtree.treeClassify(Xi, Yi, maxDepth=16, minLeaf=256, nFeatures=nFeature)

            YHatValidation = np.zeros((Xval.shape[0], bag))
            YHatTraining = np.zeros((Xtr.shape[0], bag))
            for i in range(len(decisionTrees)):
                decisionTree = decisionTrees[i]
                YHatValidation[:, i] = decisionTree.predict(Xval)
                YHatTraining[:, i] = decisionTree.predict(Xtr)

            # YHatValidation = np.sum(YHatValidation, axis=1)/float(bag)
            YHatValidation = np.mean(YHatValidation, axis=1)
            YHatValidation[YHatValidation > 0.5] = 1
            YHatValidation[YHatValidation <= 0.5] = 0

            # YHatTraining = np.sum(YHatTraining, axis=1)/float(bag)
            YHatTraining = np.mean(YHatTraining, axis=1)
            YHatTraining[YHatTraining > 0.5] = 1
            YHatTraining[YHatTraining <= 0.5] = 0

            bagValidationError.append(np.mean(YHatValidation != Yval))
            bagTrainError.append(np.mean(YHatTraining != Ytr))

            ensembles.append(decisionTrees)

        index = np.argmin(bagValidationError)
        print('Minimum Validation Error =', bagValidationError[index])
        print('Number of learners in Bag =', bags[index])
Example #8
def setup_code(xTrainFile, yTrainFile):
    X1 = np.genfromtxt(xTrainFile,delimiter=",")
    Y = np.genfromtxt(yTrainFile,delimiter=",")
    Xtr,Xte,Ytr,Yte = ml.splitData(X1,Y,0.80)
    
    M = Xtr.shape[0]
    Mv= Xte.shape[0]

    #maxDepth
    ########################

    nBags = 6000
    YtHat = np.zeros((M,nBags))
    YvHat = np.zeros((Mv,nBags))
    rforest = [None] * nBags

    maxDepth = 40
    lowestMaxDepth = LowestMSE()
    nFeatures = 60
    minParent = 8

    for l in range(nBags):          # fill every slot, including bag 0
        print("bags", l)
        Xi,Yi = ml.bootstrapData(Xtr,Ytr, M)

        rforest[l] = dtree.treeRegress()
        rforest[l].train(Xi,Yi,maxDepth=maxDepth)
        YtHat[:,l] = rforest[l].predict(Xtr)[:,0] # predict on training data
        YvHat[:,l] = rforest[l].predict(Xte)[:,0]
        mseT = ((Ytr - YtHat[:,0:l+1].mean(axis=1))**2).mean()  # MSE of the first l+1 bags
        mseV = ((Yte - YvHat[:,0:l+1].mean(axis=1))**2).mean()
        lowestMaxDepth.set(mseV, l, maxDepth, minParent, l)
    

    print "Lowest"
    print lowestMaxDepth
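
setup_code relies on a LowestMSE helper that the example never defines. A minimal sketch of what it appears to do, remembering the smallest validation MSE together with the settings that produced it, might look like this (the attribute names are guesses):

class LowestMSE:
    """Hypothetical stand-in for the undefined helper: keeps the lowest
    MSE seen so far and the settings that produced it."""
    def __init__(self):
        self.mse = float('inf')
        self.settings = None

    def set(self, mse, *settings):
        if mse < self.mse:          # keep only the best configuration
            self.mse, self.settings = mse, settings

    def __str__(self):
        return "mse=%s settings=%s" % (self.mse, self.settings)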
Example #9
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)

Xtr, Ytr = X[:180000, :], Y[:180000]
Xval, Yval = X[180000:, :], Y[180000:]

bags = [1, 5, 10, 25, 45, 60, 75]
bagTrainError = []
bagValidationError = []
ensembles = []
for bag in bags:
    print('Training', bag, 'decision trees')
    decisionTrees = [None] * bag
    trainingError = []
    for i in range(bag):
        # Drawing a random training sample every single time
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=180000)
        decisionTrees[i] = ml.dtree.treeClassify(Xi,
                                                 Yi,
                                                 maxDepth=16,
                                                 minLeaf=256,
                                                 nFeatures=9)

    YHatValidation = np.zeros((Xval.shape[0], bag))
    YHatTraining = np.zeros((Xtr.shape[0], bag))
    for i in range(len(decisionTrees)):
        decisionTree = decisionTrees[i]
        YHatValidation[:, i] = decisionTree.predict(Xval)
        YHatTraining[:, i] = decisionTree.predict(Xtr)

    # YHatValidation = np.sum(YHatValidation, axis=1)/float(bag)
    YHatValidation = np.mean(YHatValidation, axis=1)
Example #10
# Note: file is comma-delimited
X = np.genfromtxt("data/trainX.txt", delimiter=',')
Y = np.genfromtxt("data/trainY.txt", delimiter=',')
# also load features of the test data (to be predicted)
Xe1 = np.genfromtxt("data/devX.txt", delimiter=',')
Ye1 = np.genfromtxt("data/devY.txt", delimiter=',')
print(X.shape)
print(Y.shape)

nBag = 10

m, n = X.shape
classifiers = [None] * nBag  # Allocate space for learners

for i in range(nBag):
    Xi, Yi = ml.bootstrapData(X, Y)
    classifiers[i] = ml.dtree.treeRegress(
        Xi, Yi, maxDepth=20, minParent=1024,
        nFeatures=60)  # Train a model on data Xi, Yi

# training errors
trainingErrors = np.zeros(nBag)  # Allocate space for each model's training RMSE
for i in range(nBag):
    temp = np.sqrt(classifiers[i].mse(X, Y))  # RMSE of each model on the training data
    trainingErrors[i] = temp
# Combine the models' errors by averaging (regression, so average rather than vote)
#tE = np.mean(trainingErrors)

# test on data Xtest
predict = np.zeros(nBag)  # Allocate space for predictions from each model
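
The example stops after allocating space for the test predictions. A hypothetical completion, assuming a test feature array Xtest and the same predict API as above, would average the bagged regressors:

# Hypothetical completion: average each regressor's prediction on Xtest.
predict = np.zeros((Xtest.shape[0], nBag))
for i in range(nBag):
    predict[:, i] = classifiers[i].predict(Xtest).ravel()  # one column per model
Yhat = predict.mean(axis=1)  # bagged estimate = mean of the individual trees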
Example #11
class BaggedTree(ml.base.classifier):   # class statement missing from the snippet; base class assumed
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners

    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)


n_bags = 7
bags = []  # self.learners
for l in range(n_bags):
    # Each boosted data is the size of the original data.
    Xi, Yi = ml.bootstrapData(Xtr, Ytr, Xtr.shape[0])

    # Train the model on that draw
    tree = ml.dtree.treeClassify(Xi,
                                 Yi,
                                 minParent=2**6,
                                 maxDepth=100,
                                 nFeatures=6)

    bags.append(tree)

bt = BaggedTree(bags)
bt.classes = np.unique(Y)

print("{0:>15}: {1:.4f}".format('Train AUC', bt.auc(Xtr, Ytr)))
print("{0:>15}: {1:.4f}".format('Validation AUC', bt.auc(Xva, Yva)))
Example #12
# # (g)
# learner.train(Xtr, Ytr, minParent=4, maxDepth=14, minLeaf=4)
# Ypred = learner.predictSoft(Xte)
# print(Ypred.shape)
# # Now output a file with two columns, a row ID and a confidence in class 1:
# np.savetxt('data/Yhat_dtree.txt', np.vstack((np.arange(len(Ypred)), Ypred[:, 1])).T, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

# #
# problem 3
#
# (a)
ensemble = [None] * 50

for i in range(0, 50):
    print(i)
    Xtri, Ytri = ml.bootstrapData(Xtr, Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xtri,
                                        Ytri,
                                        minParent=8,
                                        maxDepth=14,
                                        minLeaf=4,
                                        nFeatures=8)

sizeArray = [1, 5, 10, 25, 50]

Yhat_va = np.zeros(10000)
Yhat_tr = np.zeros(90000)
valid_err = []
train_err = []

for size in sizeArray:
Example #13
#
## produce the actual learner
svr_rbf = SVR(kernel="rbf", C=10, gamma=0.1)
svr_learner = svr_rbf.fit(Xtr, Ytr)
ensemble.add(svr_learner)



### Andrew Fischer's learners
tree_learner = Ensemble()

# Find results from bagged tree
for i in range(0, 50):
    time_start = time.time()
    x,y = ml.bootstrapData(X, Y, len(X))
    tl = tree.DecisionTreeRegressor(max_features=50)
    tl.fit(x, y)
    tree_learner.add(tl)
    time_end = time.time()
    print("Iteration=" + str(i) + ", seconds=" + str(time_end - time_start))

ensemble.add(tree_learner)



#--------------------------------- Ensemble ---------------------------------#
# store the learners we have in to a list, index them and then make predictions
Yhat = ensemble.predict(Xe1)
Example #14
    dt = ml.dtree.treeClassify(Xt,Yt,minLeaf=8, minParent = 16, maxDepth = d)
    err_depth_tr[d] = dt.err(Xt,Yt)
    err_depth_v[d] = dt.err(Xv,Yv)
xs = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
plt.plot(xs, err_depth_tr, '-r', xs, err_depth_v, '-b')
plt.legend(('Training Error Rate', 'Validation Error Rate'), loc='upper right')
plt.xlabel('Maximum Depth')
plt.ylabel('error rate')
plt.show()

rf = [None]*25
Yt_hat = np.zeros((Yt.shape[0],25))
Yv_hat = np.zeros((Yv.shape[0],25))

for i in range(25):   # cover all 25 slots (0..24)
    [Xi, Yi] = ml.bootstrapData(Xt, Yt, Xt.shape[0])
    rf[i] = ml.dtree.treeClassify(Xi, Yi, minLeaf=8, minParent = 512, maxDepth = 7, nFeatures = 14)
    Yt_hat[:, i] = rf[i].predict(Xt)
    Yv_hat[:, i] = rf[i].predict(Xv)

err_e_t = [None]*6
err_e_v = [None]*6

Yt_hat_e = Yt_hat[:, 0]
err_e_t[0] = np.mean(Yt_hat_e.reshape(Yt.shape) != Yt)
Yv_hat_e = Yv_hat[:, 0]
err_e_v[0] = np.mean(Yv_hat_e.reshape(Yv.shape) != Yv)

j=1
for i in [5, 10, 15, 20, 25]:
    Yt_hat_e = (np.mean(Yt_hat[:,0:i], axis=1)>0.5)
Example #15
# RANDOM FORESTS

# 3 (A)



#Random Forest of size 25
# Load data set X, Y for training the ensemble…
m,n = Xtr.shape
ensemble = [ None ] * 25 # Allocate space for learners
n=7
for i in range(25):
    #ind = np.floor( m * np.random.rand(n) ).astype(int)
    #Xb, Yb = Xtr[ind,:],Ytr[ind]
    Xb,Yb=ml.bootstrapData(Xtr,Ytr)
    ensemble[i]=ml.dtree.treeClassify(Xb,Yb, maxDepth=5,minLeaf=256,nFeatures=n) 




# test on data Xva
mTest = Xva.shape[0]
predictTe = np.zeros( (mTest, 25) ) # Allocate space for predictions from each model
predictTr = np.zeros( (Xtr.shape[0], 25) )  # training predictions need Xtr's row count, not mTest
for i in range(25):
    predictTe[:,i] = ensemble[i].predict(Xva) # Apply each classifier
    predictTr[:,i] = ensemble[i].predict(Xtr)
    
predictTest = np.mean(predictTe, axis=1)
Example #16
#X = VarianceThreshold(threshold=(.8*.2)).fit_transform(X)
Xtr,Xte,Ytr,Yte = ml.splitData(X,Y,0.8)

#testdat = open('testdat.csv','w')

netbags = []


for iter in range(100):
    for moment in [0.3]:
        for learnRate in [0.05]:
            for epochs in [30]:
                for depth in [3]:
                    for hidw in [8]:

                        Xboot, Yboot = ml.bootstrapData(Xtr,np.array([Ytr]).T,Xtr.shape[0]//50)
                        print(Xboot.shape)
                        Yboot = Yboot.T
                        print(Yboot)

                        net = FeedForwardNetwork()

                        w = X.shape[1]
                        hw = hidw  # 8

                        inl = TanhLayer(w)
                        net.addInputModule(inl)

                        last = inl

                        for i in range(3):
Example #17
print(X.shape)
print(Y.shape)

nBag = 101
learners = np.array([2, 5, 10, 20, 25, 50])

classifiers = [None] * nBag  # Allocate space for learners

errT = np.zeros((len(learners), ))

nFolds = 10
errX = np.zeros((len(learners), nFolds))
for iFold in range(nFolds):
    [Xti, Xvi, Yti, Yvi] = ml.crossValidate(X, Y, nFolds, iFold)
    for i in range(nBag):
        Xi, Yi = ml.bootstrapData(Xti, Yti)
        classifiers[i] = ml.dtree.treeRegress(
            Xi, Yi, maxDepth=20, minParent=1024,
            nFeatures=60)  # Train a model on data Xi, Yi
    for i in range(len(learners)):
        learnerNum = learners[i]
        predict = np.zeros(learnerNum)  # Allocate space for each model's validation RMSE
        for j in range(learnerNum):
            predict[j] = np.sqrt(classifiers[j].mse(
                Xvi, Yvi))  # Apply each classifier, calculate RMSE
        errX[i, iFold] = np.mean(predict)

errX = np.mean(errX, axis=1)
print(errX.shape)
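
Note that errX averages each tree's individual RMSE, so it scores single trees rather than the bagged predictor. If the intent was to evaluate the average of the first learnerNum trees, a hypothetical alternative under the same API would be:

# Hypothetical alternative: RMSE of the averaged (bagged) prediction of the
# first learnerNum trees, instead of the mean of the individual RMSEs.
preds = np.column_stack([classifiers[j].predict(Xvi).ravel()
                         for j in range(learnerNum)])
rmseBagged = np.sqrt(np.mean((preds.mean(axis=1) - Yvi) ** 2))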
Example #18
Xv = np.genfromtxt("X_train.txt",
                   delimiter=None)[10001:20000]  # load the text file
Yv = np.genfromtxt("Y_train.txt",
                   delimiter=None)[10001:20000]  # load the text file

#test data
Xte = np.genfromtxt("X_test.txt", delimiter=None)  # load the text file

ensemble = []
trainError = []
validError = []
predicts = []
aucs = []

for x in range(14, 21):
    ind = ml.bootstrapData(Xt, Yt, n_boot=x)
    ensem = ml.dtree.treeClassify(ind[0], ind[1], minLeaf=4)  ## 0.55670,
    ensemble.append(ensem)
    trainError.append(ensem.err(Xt, Yt))
    validError.append(ensem.err(Xv, Yv))
    predicts.append(ensem.predict(Xte))
    aucs.append(ensem.auc(Xv, Yv))

aucMean = sum(aucs) / len(aucs)   # average validation AUC across the ensemble
print(aucMean)

Ypred = [[x] for x in predicts[0]]
for x in Ypred:
Example #19
                                  ,minParent=512)
Ypred = learnerTR.predictSoft(nXte)
np.savetxt('Yhat_dtree.txt',
           np.vstack((np.arange(len(Ypred)), Ypred[:, 1])).T,
           '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')
# Problem 3: Random Forests

# Part A
ensemble = [0]*25
Ytrhat = np.zeros((np.size(Ytr),25))
Ytehat = np.zeros((np.size(Yte),25))

# Evaluate for up to 25 learners.
for i in range(25):
    Xb,Yb = ml.bootstrapData(Xtr,Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xb, Yb, maxDepth=15,
                                        minLeaf=4, nFeatures=60)
    Ytrhat[:,i] = ensemble[i].predict(Xtr)
    Ytehat[:,i] = ensemble[i].predict(Xte)

# Write down mseTR and mseTE for learners [1,5,10,25]
mseTR = []
mseTE = []
for index, value in enumerate([1,5,10,25]):    
    mseTR.append(np.mean( (Ytr-np.mean(Ytrhat[:,0:value],1))**2 ))     
    mseTE.append(np.mean( (Yte-np.mean(Ytehat[:,0:value],1))**2 ))
    print(str(value)+" Ensemble Members: mseTR = "
          +str(mseTR[index])+" | mseTE = "+str(mseTE[index]))
    
_,axis = plt.subplots()