Example #1
def train_fold(X, Y, nFolds, nFeatures, depth, minLeaf):
    m = []
    errTr = []
    errTe = []
    print "NFOLD: ", X.shape, Y.shape
    for iFold in range(nFolds):
        Xtri, Xtei, Ytri, Ytei = ml.crossValidate(X, Y, nFolds, iFold)
        print(Xtri.shape, Ytri.shape, Xtei.shape, Ytei.shape)
        dt = ml.dtree.treeClassify(Xtri,
                                   Ytri,
                                   minLeaf=minLeaf,
                                   maxDepth=depth,
                                   nFeatures=nFeatures)
        m.append(dt)  # keep the model trained on this fold (returned below)

        Yteihat = dt.predict(Xtei)
        Ytrihat = dt.predict(Xtri)

        errTr.append(computeError(Ytri, Ytrihat))
        errTe.append(computeError(Ytei, Yteihat))

        if errTr[-1] > 0.32 and errTe[-1] > 0.32:
            print("High error:", (nFeatures, depth, minLeaf),
                  "- stopping nFold training on these parameters")
            break

    return (np.mean(errTr), np.mean(errTe), m)
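# Usage sketch (hedged): sweep a small hyperparameter grid with train_fold.
# Assumes X, Y, computeError, and the mltools-style "ml" module are already
# defined as in the example above; the grid values here are illustrative.
best = None
for nFeatures in (5, 10):
    for depth in (5, 10, 15):
        for minLeaf in (4, 16):
            trE, teE, _ = train_fold(X, Y, 5, nFeatures, depth, minLeaf)
            if best is None or teE < best[0]:
                best = (teE, nFeatures, depth, minLeaf)
print("Best (testErr, nFeatures, depth, minLeaf):", best)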
Example #2
def DegreeCrossValidation(nFolds, degree, Xtr, Ytr):
    J = dict()
    XtrP = ml.transforms.fpoly(Xtr, degree, bias=False)
    XtrP, params = ml.transforms.rescale(XtrP)
    for iFold in range(nFolds):
        Xti, Xvi, Yti, Yvi = ml.crossValidate(XtrP, Ytr, nFolds, iFold)
        learner = ml.linear.linearRegress(Xti, Yti)
        J[iFold] = MSE(Yvi, learner.predict(Xvi))
    return sum(x[0] for x in J.values()) / nFolds  # average validation MSE
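# Usage sketch (hedged): compare several polynomial degrees with
# DegreeCrossValidation. Assumes Xtr, Ytr and the MSE helper used above
# already exist; the degree list is illustrative.
for degree in (1, 3, 5, 7):
    print("degree", degree, "-> CV MSE:", DegreeCrossValidation(5, degree, Xtr, Ytr))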
Example #3
def hists():
    X, Y, Xte = r.init()
    Xtri, Xtei, Ytri, Ytei = ml.crossValidate(X, Y, 5, 0)
    # for i in range(0, 4):
    #     plt.subplot(1, 4, i+1)
    #     plt.hist(Xtei[:,i])
    # plt.show()
    for i in range(Xtei.shape[1]):  # one histogram per feature column
        plt.hist(Xtei[:,i])
        plt.show()
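# Hedged variant of hists(): draw all feature histograms in a single subplot
# grid instead of one window per feature (built from the commented-out idea
# above; assumes the same r.init() and ml.crossValidate helpers).
def hists_grid():
    X, Y, Xte = r.init()
    Xtri, Xtei, Ytri, Ytei = ml.crossValidate(X, Y, 5, 0)
    nFeat = Xtei.shape[1]
    for i in range(nFeat):
        plt.subplot(1, nFeat, i + 1)  # one panel per feature
        plt.hist(Xtei[:, i])
    plt.show()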
Example #4
def scatter():
    X, Y, Xte = r.init()
    #for x in combinations(range(14), 2):
    #    #plt.subplot(1, 14, x[1])
    #    plt.scatter(X[:,x[0]], X[:,x[1]], c=Y)
    #    plt.show()

    # everything wrt feature 1
    const_feature = 0
    Xtri, Xtei, Ytri, Ytei = ml.crossValidate(X, Y, 5, 0)
    for i in range(1, 14):
        if i != const_feature:
            plt.scatter(Xtei[:,const_feature], Xtei[:,i], c=Ytei)
            plt.show()
Example #5
from sklearn.tree import DecisionTreeClassifier

def predict2():
    X, Y, Xte = r.init()
    X, _ = ml.transforms.rescale(X)
    nFolds = 5
    errTr = []
    errTe = []
    leafCounts = [
        2, 4, 6, 8, 10, 16, 32, 64, 100, 128, 150, 256, 328, 400, 512, 768,
        800, 1024, 1568, 2048
    ]
    for nLeaves in leafCounts:
        dtc = DecisionTreeClassifier(max_leaf_nodes=nLeaves)
        tre = 0
        tee = 0
        for iFold in range(nFolds):
            print('Training for max_leaf_nodes', nLeaves, 'fold', iFold)
            Xtri, Xtei, Ytri, Ytei = ml.crossValidate(X, Y, nFolds, iFold)
            Ytri, Ytei = Ytri[:, np.newaxis], Ytei[:, np.newaxis]
            #print Xtri.shape, Xtei.shape, Ytri.shape, Ytei.shape
            dtc.fit(Xtri, Ytri)
            e1 = r.computeError(dtc.predict(Xtri)[:, np.newaxis], Ytri)
            tre += e1
            print('Training Error', e1)
            e1 = r.computeError(dtc.predict(Xtei)[:, np.newaxis], Ytei)
            print('Test Error', e1)
            tee += e1
        errTr.append(tre / nFolds)
        errTe.append(tee / nFolds)

        print('===== max_leaf_nodes:', nLeaves, 'Training:', errTr[-1],
              'Test:', errTe[-1])
    print('Training Error', errTr)
    print('Test Error', errTe)

    plt.plot(leafCounts, errTr, 'r.')
    plt.plot(leafCounts, errTe, 'b.')
    plt.show()
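# Usage sketch (hedged): if predict2() is refactored to return
# (leafCounts, errTr, errTe) -- an assumption, not part of the original --
# the best complexity can be read directly off the validation curve:
def best_leaf_count(leafCounts, errTe):
    return leafCounts[int(np.argmin(errTe))]  # leaf count with lowest CV test error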
Example #6
# %%
import numpy as np
from sklearn.neural_network import MLPClassifier

# solver='adam' is a good default for larger data sets

nTest = X_test.shape[0]
neural_net = np.zeros((nTest, 2))  # accumulator for per-class probabilities

train = []
average = []

nFolds = 10

for iFold in range(nFolds):

    Xtr, Xva, Ytr, Yva = ml.crossValidate(X_data, Y_data, nFolds, iFold)

    neural_network = MLPClassifier(solver='adam', random_state=0)

    neural_network.fit(Xtr, Ytr)

    neural_net += neural_network.predict_proba(X_test)  # accumulate fold probabilities

    train.append(np.mean(neural_network.predict(Xtr) == Ytr))    # training accuracy

    average.append(np.mean(neural_network.predict(Xva) == Yva))  # validation accuracy

    #print(Yhat)

print("training error: {}".format(np.mean(train)))
Example #7
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X = data[:, 0]  # First column is feature
X = X[:, np.newaxis]  # code expects shape (M,N), so make X 2-dimensional
Y = data[:, 1]  # Second column is the result
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)  # split data set 75/25

nFolds = 5
degrees = [1, 3, 5, 7, 10, 18]
validationMSEs = []
for degree in degrees:
    J = []
    for iFold in range(nFolds):
        # ith block as validation
        Xti, Xvi, Yti, Yvi = ml.crossValidate(Xtr, Ytr, nFolds, iFold)
        Yvi = Yvi[:, np.newaxis]
        XtiP = ml.transforms.fpoly(Xti, degree, bias=False)
        XtiP, params = ml.transforms.rescale(XtiP)
        learner = ml.linear.linearRegress(XtiP, Yti)
        XviP, _ = ml.transforms.rescale(
            ml.transforms.fpoly(Xvi, degree, False), params)

        # Calculating error in test and training data
        YValPredP = learner.predict(XviP)
        valError = np.mean((YValPredP - Yvi)**2)
        J.append(valError)
    validationMSEs.append(np.mean(J))
plt.semilogy(degrees, validationMSEs, c='red')
plt.xticks(degrees, degrees)
plt.show()
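# Usage sketch (hedged): report the degree with the lowest cross-validated MSE.
best_degree = degrees[int(np.argmin(validationMSEs))]
print("best polynomial degree by 5-fold CV:", best_degree)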
Example #8
test_error = []
cross_error = []
cross_fold = 5
degrees = range(1, 20, 3)
#degrees = (1, 3, 5, 7, 10, 18)

plt.figure(1, (17, 7))
plt.subplot(1, 2, 1)
plt.scatter(train_features, train_targets, color='b', label='training data')
plt.scatter(test_features, test_targets, color='r', label='test data')

for degree in degrees:
    # cross validate
    c_error_d = np.array([])
    for iFold in range(cross_fold):
        Xt, Xv, Yt, Yv = ml.crossValidate(train_features, train_targets,
                                          cross_fold, iFold)
        pXt, params = ml.transforms.rescale(ml.transforms.fpoly(Xt, degree, 0))
        pXv = ml.transforms.rescale(ml.transforms.fpoly(Xv, degree, 0),
                                    params)[0]
        learner = ml.linear.linearRegress(pXt, Yt)
        predicted_Yv = learner.predict(pXv).flatten()
        c_error_d = np.append(
            c_error_d,
            np.sum(np.power(predicted_Yv - Yv, 2)) / float(Yv.shape[0]))
    cross_error.append(np.mean(c_error_d))

    # prepare data
    poly_train_features, params = ml.transforms.rescale(
        ml.transforms.fpoly(train_features, degree, 0))
    poly_test_features = ml.transforms.rescale(
        ml.transforms.fpoly(test_features, degree, 0), params)[0]
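    # Hedged completion: the original loop is truncated here. A natural next
    # step, matching the test_error list declared above, would be:
    learner = ml.linear.linearRegress(poly_train_features, train_targets)
    predicted_test = learner.predict(poly_test_features).flatten()
    test_error.append(np.mean((predicted_test - test_targets) ** 2))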
Example #9
plt.semilogy([1, 3, 5, 7, 10, 18], mse_te, 'g-', linewidth=2)
plt.xlabel('Degree')
plt.ylabel('MSE')
plt.show()
'''
Problem 2
'''

mse_cv = []
nFolds = 5
for degree in [1, 3, 5, 7, 10, 18]:
    params = (None, None)
    # Define a function "Phi(X)" that outputs the expanded and scaled feature
    # matrix; "degree" and "params" are captured by the closure:
    Phi = lambda X: ml.transforms.rescale(
        ml.transforms.fpoly(X, degree, False), params)[0]
    J = np.zeros(nFolds)
    for iFold in range(nFolds):
        Xti, Xvi, Yti, Yvi = ml.crossValidate(
            Xtr, Ytr, nFolds, iFold)  # take ith data block as validation
        learner = ml.linear.linearRegress(
            Phi(Xti), Yti)  # train on Xti, Yti , the data for this fold
        J[iFold] = learner.mse(
            Phi(Xvi), Yvi)  # now compute the MSE on Xvi, Yvi and save it
    mse_cv = np.append(mse_cv, np.mean(J))
plt.semilogy([1, 3, 5, 7, 10, 18], mse_cv, 'b-', linewidth=2)
plt.semilogy([1, 3, 5, 7, 10, 18], mse_te, 'g-', linewidth=2)
plt.xlabel('Degree')
plt.ylabel('MSE Cross Validation')
plt.show()
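# Usage sketch (hedged): report the degree minimizing the cross-validation MSE.
degrees = [1, 3, 5, 7, 10, 18]
print("degree with lowest CV MSE:", degrees[int(np.argmin(mse_cv))])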
Example #10
Y = np.genfromtxt("data/trainY.txt", delimiter=',')
# also load features of the test data (to be predicted)
print(X.shape)
print(Y.shape)

nBag = 101
learners = np.array([2, 5, 10, 20, 25, 50])  # ensemble sizes to evaluate

classifiers = [None] * nBag  # Allocate space for learners

errT = np.zeros((len(learners), ))

nFolds = 10
errX = np.zeros((len(learners), nFolds))
for iFold in range(nFolds):
    [Xti, Xvi, Yti, Yvi] = ml.crossValidate(X, Y, nFolds, iFold)
    for i in range(nBag):
        Xi, Yi = ml.bootstrapData(Xti, Yti)
        classifiers[i] = ml.dtree.treeRegress(
            Xi, Yi, maxDepth=20, minParent=1024,
            nFeatures=60)  # Train a model on data Xi, Yi
    for i in range(len(learners)):
        learnerNum = learners[i]
        predict = np.zeros(learnerNum)  # space for one RMSE per model
        for j in range(learnerNum):
            predict[j] = np.sqrt(classifiers[j].mse(Xvi, Yvi))  # RMSE on validation fold
        errX[i, iFold] = np.mean(predict)

errX = np.mean(errX, axis=1)
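# Usage sketch (hedged): plot mean validation RMSE against ensemble size.
# Note that errX[i] averages single-tree RMSEs over the first learners[i]
# trees, per the loop above, rather than the RMSE of the combined ensemble.
plt.plot(learners, errX, 'b-')
plt.xlabel('number of bagged trees')
plt.ylabel('mean validation RMSE')
plt.show()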
Example #11
# constants
train_dp = features_train.shape[0]  # number of training data points
test_dp = features_test.shape[0]  # number of data points of the testing data
cv_k = 5  # k value for k-fold cross validate
degrees = range(2, 15, 3)  # polynomial degrees to train and test

# train and test

error_cv = []  # MSE per polynomial degree from k-fold cross validation
error_test = []  # MSE on different polynomial degrees obtained by testing
print_report('training starts...')
for degree in degrees:
    print_report('polynomial degree %d' % degree)
    # k-fold cross validation
    c_error_d = []  # MSE on each fold
    for k in range(0, cv_k):
        print_report('k-fold cross validation, fold %d' % k)
        x_train, x_test, y_train, y_test = ml.crossValidate(
            features_train, targets_train, cv_k, k)
        x_train_, params = ml.transforms.rescale(
            ml.transforms.fpoly(x_train, degree, 0))
        x_test = ml.transforms.rescale(ml.transforms.fpoly(x_test, degree, 0),
                                       params)[0]
        learner = ml.linear.linearRegress(x_train_, y_train)  # fit on scaled poly features
        y_predicted = learner.predict(x_test)
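        # Hedged completion: the snippet is truncated here; the natural next
        # step, matching c_error_d declared above, is to record this fold's MSE:
        c_error_d.append(np.mean((y_predicted.flatten() - y_test) ** 2))
    error_cv.append(np.mean(c_error_d))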