def __init__(
        self,
        X=None,
        Y=None,
        params=None,  # avoid a mutable default argument
        split_percentage=0.75,
        output_file='predictions.csv'
    ):
        if X is None:
            X = np.genfromtxt('data/kaggle.X1.train.fragment.txt', delimiter=',')
        if Y is None:
            Y = np.genfromtxt('data/kaggle.Y.train.fragment.txt', delimiter=',')

        self.Xtr, self.Xte, self.Ytr, self.Yte = splitData(X, Y, split_percentage)
        self.params = params if params is not None else {}
        self.output_file = output_file
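# The class statement is not shown in this excerpt. A minimal usage sketch, assuming
# the constructor belongs to a predictor class (hypothetically named KagglePredictor
# here) and that numpy is imported as np and splitData comes from mltools:
predictor = KagglePredictor(
    params={'maxDepth': 10},       # illustrative hyperparameter dict
    split_percentage=0.75,
    output_file='predictions.csv',
)  # X and Y default to the kaggle fragment files loaded via np.genfromtxt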
Example #2
# Assumes module-level imports: numpy as np, math, mltools as ml,
# sklearn.metrics.mean_squared_error, and sklearn.linear_model.SGDRegressor.
def predictLinearRegress(attributeList, starTargetList):

    print("\nLinear Regression")

    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    lr = ml.linear.linearRegress(Xtrain, Ytrain)

    yHatInitial = lr.predict(Xtest)
    print("MSE test: ", mean_squared_error(yHatInitial, Ytest))
    print("RMSE test: ", math.sqrt(mean_squared_error(yHatInitial, Ytest)))


    incorrect = 0
    total = 0
    for i, value in enumerate(yHatInitial):
        if abs(yHatInitial[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1

    ratioIncorrect = incorrect / total
    print("Ratio incorrect: " + str(ratioIncorrect))


    # prepend a constant (bias) column; note SGDRegressor also fits its own intercept
    onesCol = np.ones((len(Xtrain), 1))
    Xtrain = np.concatenate((onesCol, Xtrain), 1)
    onesCol = np.ones((len(Xtest), 1))
    Xtest = np.concatenate((onesCol, Xtest), 1)
    m, n = np.shape(Xtrain)  # (unused in the excerpt shown)

    clf = SGDRegressor(loss="squared_error")  # named "squared_loss" in scikit-learn < 1.0
    clf.fit(Xtrain, Ytrain)
    yHat = clf.predict(Xtest)

    print("MSE after GD: ", mean_squared_error(yHat, Ytest))
    print("RMSE after GD: ", math.sqrt(mean_squared_error(yHat, Ytest)))

    incorrect = 0
    total = 0
    for i, value in enumerate(yHat):
        if abs(yHat[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1

    ratioIncorrect = incorrect / total
    print("Ratio incorrect: " + str(ratioIncorrect))
Example #3
# Assumes module-level imports: numpy as np, math, mltools as ml,
# sklearn.metrics.mean_squared_error, and sklearn.ensemble.RandomForestRegressor.
def predictRandomForests(attributeList, starTargetList):

    print("\nRandom Forests")

    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    RFModel = RandomForestRegressor(n_estimators=200)
    RFModel.fit(Xtrain, Ytrain)
    yHat = RFModel.predict(Xtest)

    total = 0
    numIncorrect = 0
    for i, value in enumerate(Ytest):
        if abs(Ytest[i] - yHat[i]) > 0.5:
            numIncorrect += 1
        total += 1

    print("MSE Test: ", mean_squared_error(yHat, Ytest))
    print("RMSE Test: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    print("Ratio Incorrect: " + str(float(numIncorrect / total)))
Example #4
# Assumes module-level imports: numpy as np, math, mltools as ml, xgboost,
# and sklearn.metrics.mean_squared_error.
def predictXGBoosting(attributeList, starTargetList):

    print("\nExtreme Gradient Boosting")

    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    xgb_model = xgboost.XGBRegressor(missing=np.nan, max_depth=11, n_estimators=400,
                                     learning_rate=0.03, nthread=4, subsample=0.85,
                                     colsample_bytree=0.75, seed=4242)
    xgb_model.fit(Xtrain, Ytrain, early_stopping_rounds=20, eval_metric="rmse", eval_set=[(Xtest, Ytest)])

    yHat = xgb_model.predict(Xtest)

    total = 0
    numIncorrect = 0
    for i, value in enumerate(Ytest):
        if abs(Ytest[i] - yHat[i]) > 0.5:
            numIncorrect += 1
        total += 1

    print("MSE Test: ", mean_squared_error(yHat, Ytest))
    print("RMSE Test: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    print("Ratio Incorrect: " + str(float(numIncorrect / total)))
def setup_code(xTrainFile, yTrainFile):
    X1 = np.genfromtxt(xTrainFile,delimiter=",")
    Y = np.genfromtxt(yTrainFile,delimiter=",")
    Xtr,Xte,Ytr,Yte = ml.splitData(X1,Y,0.80)
    
    M = Xtr.shape[0]
    Mv = Xte.shape[0]

    #maxDepth
    ########################

    nBags = 6000
    YtHat = np.zeros((M,nBags))
    YvHat = np.zeros((Mv,nBags))
    rforest = [None] * nBags

    maxDepth = 40
    lowestMaxDepth = LowestMSE()
    nFeatures = 60
    minParent = 8

    for l in range(nBags):
        print("bags", l)
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, M)

        rforest[l] = dtree.treeRegress()
        rforest[l].train(Xi, Yi, maxDepth=maxDepth)
        YtHat[:, l] = rforest[l].predict(Xtr)[:, 0]  # predict on training data
        YvHat[:, l] = rforest[l].predict(Xte)[:, 0]
        # running-average MSE over the bags trained so far (columns 0..l)
        mseT = ((Ytr - YtHat[:, :l + 1].mean(axis=1)) ** 2).mean()
        mseV = ((Yte - YvHat[:, :l + 1].mean(axis=1)) ** 2).mean()
        lowestMaxDepth.set(mseV, l, maxDepth, minParent, l)
    

    print "Lowest"
    print lowestMaxDepth
Example #6
def predictKNN(attributeList, starTargetList):

    print("\nKNN")

    K = [1]  # e.g. extend to [1, 20, 50, 100, 500, 1000, 1500, 2000]
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    for k in K:
        knn = ml.knn.knnClassify()
        knn.train(Xtrain, Ytrain, k)
        YtestHat = knn.predict(Xtest)

        total = 0
        numIncorrect = 0
        for i, value in enumerate(Ytest):
            if abs(Ytest[i] - YtestHat[i]) > 0.5:
                numIncorrect += 1
            total += 1


        print("MSE test: ", mean_squared_error(YtestHat, Ytest))
        print("RMSE test: ", math.sqrt(mean_squared_error(YtestHat, Ytest)))
        print("Ratio Incorrect: " + str(float(numIncorrect / total)))
Example #7
    return result


def calc_error(Y_val, Y_hat_val):
    mistakes = 0
    for i in range(Y_val.size):
        if Y_val[i] != Y_hat_val[i]:
            mistakes += 1
    error_rate = mistakes / Y_val.size
    return error_rate
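# The same error rate as one vectorized NumPy expression (a sketch; assumes the
# inputs are equal-length 1-D arrays and numpy is imported as np):
def calc_error_vectorized(Y_val, Y_hat_val):
    return float(np.mean(Y_val != Y_hat_val))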


# split data into 80/20 train/validation
AOne_tr, AOne_va, TOne_tr, TOne_va = ml.splitData(Activity_One, Target_One,
                                                  0.8)
ATwo_tr, ATwo_va, TTwo_tr, TTwo_va = ml.splitData(Activity_Two, Target_Two,
                                                  0.8)
AThree_tr, AThree_va, TThree_tr, TThree_va = ml.splitData(
    Activity_Three, Target_Three, 0.8)
AFour_tr, AFour_va, TFour_tr, TFour_va = ml.splitData(Activity_Four,
                                                      Target_Four, 0.8)
#AFive_tr, AFive_va, TFive_tr, TFive_va = ml.splitData(Activity_Five, Target_Five, 0.8)

# transform target arrays into target matrices with 1 column
TOne_tr = np.reshape(TOne_tr, [TOne_tr.size, 1])
TOne_va = np.reshape(TOne_va, [TOne_va.size, 1])
TTwo_tr = np.reshape(TTwo_tr, [TTwo_tr.size, 1])
TTwo_va = np.reshape(TTwo_va, [TTwo_va.size, 1])
TThree_tr = np.reshape(TThree_tr, [TThree_tr.size, 1])
TThree_va = np.reshape(TThree_va, [TThree_va.size, 1])
Example #8
import warnings
warnings.filterwarnings('ignore')

# Assumes additional module-level imports: numpy as np, mltools as ml,
# sklearn.linear_model.LogisticRegression, and the sklearn ensemble regressors used below.


np.random.seed(0)

# Data loading
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
Xtest = np.genfromtxt("data/X_test.txt", delimiter=None)  # test data (loaded once; the original read this file twice)

# Train and Validation splits
Xtr, Xval, Ytr, Yval = ml.splitData(X, Y, 0.75)

# Taking a subsample of the data so that training runs faster. You should train on the whole data for homework and Kaggle.
Xt, Yt = Xtr[:4000], Ytr[:4000]

# flatten y into a 1-D array
Ytf = np.ravel(Yt)

# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(Xt, Ytf)

# check the accuracy on the training set
print(model.score(Xt, Ytf))

# predict class labels for the test set

# ====================== Training level-0 learners  ===================
n_trees = 50
# level 0 learners:
clfs = [
    ExtraTreesRegressor(n_estimators=n_trees * 2),
    RandomForestRegressor(n_estimators=n_trees),
    GradientBoostingRegressor(n_estimators=n_trees),
]

# split data into (X1, Y1) and (X2, Y2):
# (X1, Y1) are used to train the 3 level-0 learners;
# X2 is used to generate temp_train from the three level-0 learners;
# (temp_train, Y2) are used to train the level-1 learner
X1,X2,Y1,Y2 = ml.splitData(X,Y,0.75)


# temp_train holds the intermediate training data, i.e. the outputs of the 3 level-0
# learners, which become the inputs of the level-1 learner
temp_train = np.zeros((len(Y2), len(clfs)))
temp_test = np.zeros((Xtest.shape[0], len(clfs)))
for i, clf in enumerate(clfs):
    clf.fit(X1, Y1)                       # train each level-0 learner
    temp_train[:, i] = clf.predict(X2)    # intermediate level-1 inputs from X2
    temp_test[:, i] = clf.predict(Xtest)  # intermediate level-1 inputs from the test set
    

# ====================== Training the level-1 learner  ===================
# level-1 learner
# cv = 5: 5-fold cross-validation
alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0]
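# The excerpt is truncated here. A minimal sketch of the level-1 step it sets up,
# assuming the intent was a cross-validated ridge regression over `alphas`
# (RidgeCV is an assumption; the original level-1 learner is not shown):
from sklearn.linear_model import RidgeCV

level1 = RidgeCV(alphas=alphas, cv=5)     # cv=5: 5-fold cross-validation
level1.fit(temp_train, Y2)                # train the level-1 learner on the intermediate data
final_pred = level1.predict(temp_test)    # stacked predictions for the test set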
Example #10
def main() :
    iris = np.genfromtxt("data/iris.txt", delimiter=None)
    Y = iris[:,-1]
    X = iris[:, 0:-1]
    print(X.shape)
    # Part 2
    # for f in X.T:
    #     plt.hist(f)
    #     plt.show()
    # Part 3
    for f in X.T:
        print("Mean:", np.mean(f))
        print("Standard deviation:", np.std(f))
    # Part 4
    # pairs = [[0, 1, 4], [0, 2, 4], [0, 3, 4]]
    # colors = ['r*', 'g*', 'b*']
    # for p in pairs:
    #     for feature in iris[:, p]:
    #         plt.plot(feature[0], feature[1], colors[int(feature[2])])
    # plt.show()
    # Question 2
    # Part 1)
    # XX = X[:, [0, 1]]
    # np.random.seed(1)
    # XX, Y = ml.shuffleData(XX, Y)
    # np.random.seed(1)
    # XXtr, XXva, Ytr, Yva = ml.splitData(XX, Y, 0.75)
    # K = [1, 5, 10, 50];
    # for k in K:
    #     knn = ml.knn.knnClassify()
    #     knn.train(XXtr, Ytr, k)
    #     ml.plotClassify2D(knn, XXtr, Ytr, axis=plt)
    #     plt.title("K = ", k)
    #     plt.show()
    # Part 2
    np.random.seed(1)
    X, Y = ml.shuffleData(X, Y)
    np.random.seed(1)
    Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
    XXtr = Xtr[:, [0,1]]
    XXva = Xva[:, [0,1]]
    K = [1, 2, 5, 10, 50, 100, 200]
    trainErr  =[]
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(XXtr, Ytr, k)
        YHat = knn.predict(XXtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(XXva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print("K =", k, ": error rate on training data =", trainErr[i],
              ", on validation data =", validErr[i])
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()

    trainErr = []
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(Xtr, Ytr, k)
        YHat = knn.predict(Xtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(Xva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print("K =", k, ": error rate on training data =", trainErr[i],
              ", on validation data =", validErr[i])
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()
    print("OK, I'm done.")
Example #11
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

# (a) Loading the data from the curve80.txt file, and splitting to
#       75-25, training and test data
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X = data[:, 0]  # First column is feature
X = X[:, np.newaxis]  # code expects shape (M,N) so make sure it's 2-dimensional
Y = data[:, 1]  # Second column is the result
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)  # split data set 75/25
Ytr = Ytr[:, np.newaxis]
Yte = Yte[:, np.newaxis]

# (b) Plotting the linear regression prediction function, and training data,
#       finding the regression coefficients, finding MSE of train and test data
lr = ml.linear.linearRegress(Xtr, Ytr)  # create and train model
xs = np.linspace(0, 10, 200)  # densely sample possible x-values
xs = xs[:, np.newaxis]  # force "xs" to be an Mx1 matrix
ys = lr.predict(xs)  # make predictions at xs
plt.scatter(Xtr, Ytr, c='red')  # Plotting the training data points
plt.plot(xs, ys, c='black')  # Plotting the predictor line
plt.title('Regression Function')
plt.show()
print('Regression Coefficients\t=\t', lr.theta)
YTrainPred = lr.predict(Xtr)
YTestPred = lr.predict(Xte)
mseTrain = np.mean((YTrainPred - Ytr)**2)
mseTest = np.mean((YTestPred - Yte)**2)
print('Mean Square Error on Training Data\t=\t', mseTrain)
print('Mean Square Error on Test Data\t=\t', mseTest)
Example #12
Y = iris[:,-1]
X = iris[:,0:-3]


# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that one value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.
import mltools as ml
# We'll use some data manipulation routines in the provided class code

X,Y = ml.shuffleData(X,Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)

Xtr,Xte,Ytr,Yte = ml.splitData(X,Y, 0.75)  # split data into 75/25 train/test

"""
K = 50 #for nearest neighbor prediction

knn = ml.knn.knnClassify() # create the object and train it
knn.train(Xtr, Ytr, K) # where K is an integer, e.g. 1 for nearest neighbor prediction
YteHat = knn.predict(Xte) # get estimates of y for each data point in Xte
ml.plotClassify2D( knn, Xtr, Ytr ); # make 2D classification plot with data (Xtr,Ytr)
"""
errTrain = [0] * 7

K = [1, 2, 5, 10, 50, 100, 200]
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)  # create and train the model
    Yhat = learner.predict(Xtr)  # predict results on training data
Example #13
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml


def po_re(x, d, p):
    return ml.transforms.rescale(ml.transforms.fpoly(x, d, False), p)[0]


# 1.(a)
np.random.seed(0)
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X = data[:, 0]
X = X[:, np.newaxis]  # code expects shape (M,N) so make sure it's 2-dimensional
Y = data[:, 1]  # doesn't matter for Y
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split data set 75/25

# 1.(b)
lr = ml.linear.linearRegress(Xtr, Ytr)  # create and train model
xs = np.linspace(0, 10, 200)  # densely sample possible x-values
xs = xs[:, np.newaxis]  # force "xs" to be an Mx1 matrix
ys = lr.predict(xs)  # make predictions at xs
print('Theta for linear regression:', lr.theta)

plt.scatter(Xtr, Ytr, label='training data')
plt.scatter(Xva, Yva, label='validation data')
plt.plot(xs, ys, c='g', label='degree=1')
plt.legend(loc='lower right')
ax = plt.axis()
plt.show()
# plt.savefig('figure/figure_1_b')
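# po_re is defined above but never called in the excerpt shown. A sketch of how it
# would typically be used for a degree-d polynomial fit, assuming rescale() returns
# (data, params) as its use inside po_re implies:
d = 5
XtrP, params = ml.transforms.rescale(ml.transforms.fpoly(Xtr, d, False))
lrP = ml.linear.linearRegress(XtrP, Ytr)  # fit on the transformed features
ysP = lrP.predict(po_re(xs, d, params))   # transform xs the same way, then predict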
Example #14
import numpy as np
import mltools as ml
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import VarianceThreshold
import pickle

from pybrain.structure import FeedForwardNetwork
from pybrain.structure import LinearLayer, SigmoidLayer
from pybrain.structure import FullConnection
from pybrain.structure import TanhLayer

X = np.genfromtxt("data/kaggle.X1.train.txt",delimiter=',')
Y = np.genfromtxt("data/kaggle.Y.train.txt",delimiter=',')
Xtest = np.genfromtxt("data/kaggle.X1.test.txt",delimiter=',')

#X = SelectKBest(f_classif, k=35).fit_transform(X, Y)
#X = VarianceThreshold(threshold=(.8*.2)).fit_transform(X)
Xtr,Xte,Ytr,Yte = ml.splitData(X,Y,0.8)

#testdat = open('testdat.csv','w')

netbags = []


for it in range(100):  # renamed from "iter", which shadows a builtin
    for moment in [0.3]:
        for learnRate in [0.05]:
            for epochs in [30]:
                for depth in [3]:
                    for hidw in [8]:

                        Xboot, Yboot = ml.bootstrapData(Xtr,np.array([Ytr]).T,Xtr.shape[0]//50)
                        print(Xboot.shape)
Example #15
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
Xt, Xv, Yt, Yv = ml.splitData(X, Y, 0.01)

err_k_t = []
err_k_v = []

for i in range(1, 15):
    Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:i], Y, 0.01)
    knn = ml.knn.knnClassify()
    knn.train(Xt, Yt)
    knn.K = 3
    print(i)
    err_k_t.append(knn.err(Xt, Yt))
    err_k_v.append(knn.err(Xv, Yv))
print(err_k_t, err_k_v)
plt.plot(range(1, 15), err_k_t, 'g-', range(1, 15), err_k_v, 'r')
plt.legend(('Training Error Rate', 'Validation Error Rate'), loc='upper right')
#plt.xlabel('number of neighbor K')
plt.xlabel('number of feature')
plt.ylabel('error rate')
plt.show()
"""knn = ml.knn.knnClassify()
knn.train(Xt, Yt)
knn.K = 3
Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
Ypred = knn.predictSoft(Xte)
"""
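Example #16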
#!/usr/bin/env python
"""2016W-CS178: Homework 1, Problem2"""

import numpy
import matplotlib.pyplot as plt
import mltools

iris = numpy.genfromtxt("data/iris.txt")
Y = iris[:, -1]
X = iris[:, 0:2]  # feature 1 & 2
X, Y = mltools.shuffleData(X, Y)
trainX, testX, trainY, testY = mltools.splitData(X, Y, 0.75)

# problem 2(a)
plt.figure(1, (12, 9))

for i, k in enumerate([1, 5, 10, 50]):
    learner = mltools.knn.knnClassify()
    learner.train(trainX, trainY, k)
    plt.subplot(2, 2, i + 1)
    mltools.plotClassify2D(learner, trainX, trainY)
    plt.grid(True)
    plt.xlabel('feature 1')
    plt.ylabel('feature 2')
    plt.title('Iris KNN: Feature 1 & 2, K = %d' % k)

plt.show()
plt.close(1)

# problem 2(b)
K = [1, 2, 5, 10, 50, 100, 200]
Example #17
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)

Xtr, Xv, Ytr, Yv = ml.splitData(X, Y, 0.8)
err_t = []
err_v = []
for i in range(1, 10):
    Xtr, Xv, Ytr, Yv = ml.splitData(X[:, 0:i], Y, 0.8)
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=[14] * i,
                        random_state=1)
    clf.fit(Xtr, Ytr)
    Yet = clf.predict(Xtr)
    err_t.append(np.mean(Yet != Ytr))
    Yev = clf.predict(Xv)
    err_v.append(np.mean(Yev != Yv))
print(err_t, err_v)
plt.plot(range(1, 10), err_t, 'g-', range(1, 10), err_v, 'r-')
plt.legend(('Training Error Rate', 'Validation Error Rate'), loc='upper left')
plt.xlabel('Hidden Layer Size')
#plt.xlabel('Number of Features')
plt.ylabel('error rate')
plt.show()
"""Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(14,14,14,14,14), random_state=1)
clf.fit(Xtr, Ytr)
"""
Example #18
def split_numpy(data, train_fraction=0):
    """
    - Split Numpy Array Into Features & Target Values
    - Verify Shape Of Features & Target Values

    Parameters
    ----------
    data: Numpy Array
    train_fraction : Fraction that specifies the train/test split.
        If 0, no split occurs and the function returns only (X, Y).

    Returns
    -------
    X: Features
    Y: Target Values
    (or X_train, X_test, Y_train, Y_test when train_fraction is nonzero)

    Examples
    --------
    >>> X, Y = split_numpy(data, train_fraction=0)

    """

    # Split Into Features & Target Values
    X = data[:,0:-1]                     # N-1 Columns Of Data ( Scalar Features -> X Values)
    Y = data[:,-1]                       # Last Column of Data ( Target Values -> Y values)

    # Assert Shape
    if len(X.shape) != 2:
        X = X[:,np.newaxis]             # Code expects shape (M,N) so make sure it's 2-dimensional
    if len(Y.shape) != 2:
        Y = Y[:,np.newaxis]             # Code expects shape (M,N) so make sure it's 2-dimensional

    assert(len(X.shape) == 2 )
    assert(len(Y.shape) == 2 )

    # Split Data into Test & Train
    if train_fraction != 0:

        # Split data into train/test according to train_fraction
        X_train,X_test,Y_train,Y_test = ml.splitData(X,Y, train_fraction)

        # Assert Shape
        if len(X_train.shape) != 2:
            X_train = np.array(X_train[:,np.newaxis])
        if len(X_test.shape) != 2:
            X_test = np.array(X_test[:,np.newaxis])
        if len(Y_train.shape) != 2:
            Y_train = np.array(Y_train[:,np.newaxis])
        if len(Y_test.shape) != 2:
            Y_test = np.array(Y_test[:,np.newaxis])

        assert (len(X_train.shape) == 2)
        assert (len(X_test.shape) == 2)
        assert (len(Y_train.shape) == 2)
        assert (len(Y_test.shape) == 2)
        #print("Train Shape (x, y): (",X_train.shape,",",Y_train.shape,")")
        #print("Test  Shape (x, y): (",X_test.shape,",",Y_test.shape,")")

        return X_train,X_test,Y_train,Y_test

    # If no test/train split return data
    return X,Y
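# A minimal usage sketch (the file name below is illustrative, not from the original):
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X_train, X_test, Y_train, Y_test = split_numpy(data, train_fraction=0.75)
X_all, Y_all = split_numpy(data)  # train_fraction=0 returns the unsplit (X, Y)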
Example #19
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
import mltools as ml

X = np.loadtxt("data/X_train.txt")
Y = np.loadtxt("data/Y_train.txt")

## shuffle and split
# Xtr,Ytr = ml.shuffleData(X,Y)
Xtr = X[:10000]
Ytr = Y[:10000]
Xtr,Xva,Ytr,Yva = ml.splitData(Xtr,Ytr,0.25)

print(Xtr.shape)
print(Xva.shape)

# Parameters
n_estimators = [1000,1500,2000]
max_features = ['auto', 'log2']
max_depth = [1,4,7]
learning_rate = [1,.1,0.01,0.001]
min_samples_split = [2, 6, 10]
min_samples_leaf = [2, 5, 10]

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
# params = {
Example #20
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.svm import SVR

import numpy as np
import mltools as ml
%matplotlib inline

X = np.genfromtxt("kaggle.X1.train.txt",delimiter=",")
Y = np.genfromtxt("kaggle.Y.train.txt",delimiter=",")
[Xt,Xv,Yt,Yv] = ml.splitData(X,Y,0.75)
svc = SVR(kernel="linear")

# Warning: this will take a very long time; on 60K points of data, roughly 7 hours.
# n_features_to_select = the number of features that will be kept.
# step is the number of features to eliminate on each recursive pass;
# while larger steps shorten the run time somewhat, they also produce more error.

rfe = RFE(estimator=svc, n_features_to_select=70, step=10)
rfe.fit(Xt, Yt)

print(rfe.ranking_.shape)
ranking = rfe.ranking_.reshape((1, 91))

# Plotting the ranking is helpful to see which features, corresponding to their
# indices in the feature vector, are kept.  Features with ranking #1 are kept.

plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
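# Once fitted, RFE can reduce a matrix to the selected features directly; it
# implements transform(), so keeping only the rank-1 features is one call:
Xt_reduced = rfe.transform(Xt)  # columns where rfe.support_ is True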
Example #21
# Created on Tue Mar 14 19:25:08 2017
#
# @author: Malav
import numpy as np

np.random.seed(0)
import mltools as ml
#import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots
from sklearn.ensemble import BaggingClassifier

#%matplotlib inline

X = np.genfromtxt("X_train.txt", delimiter=' ')
Y = np.genfromtxt("Y_train.txt", delimiter=' ')
Xt, Xv, Yt, Yv = ml.splitData(X, Y, 0.90)

Xe = np.genfromtxt('X_test.txt', delimiter=' ')


def auc(soft, Y):
    """Manual AUC function for applying to soft prediction vectors"""
    indices = np.argsort(soft)  # sort data by score value
    Y = Y[indices]
    sorted_soft = soft[indices]

    # compute rank (averaged for ties) of sorted data
    dif = np.hstack(([True], np.diff(sorted_soft) != 0, [True]))
    r1 = np.argwhere(dif).flatten()
    r2 = r1[0:-1] + 0.5 * (r1[1:] - r1[0:-1]) + 0.5
    rnk = r2[np.cumsum(dif[:-1]) - 1]
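    # The original snippet is truncated here. Assuming binary labels with Y > 0
    # marking positives, the usual rank-sum (Mann-Whitney) completion would be:
    n1 = float(np.sum(Y > 0))   # number of positives
    n0 = len(Y) - n1            # number of negatives
    return (np.sum(rnk[Y > 0]) - n1 * (n1 + 1) / 2) / (n1 * n0)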
Example #22
import mltools as ml
# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
# export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
# import sys
# sys.path.append('/path/to/parent/dir/');
# X,Y = ml.shuffleData(X,Y); # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
# Xtr,Xte,Ytr,Yte = ml.splitData(X,Y, 0.75); # split data into 75/25 train/test

# (a)
# Use only first two features of X
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)
# Visualize classification boundary for varying values of K = [1,5,10,50]

for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# (b) Prediction/ error for training set and test set
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(7)
errTest = np.zeros(7)
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
    errTrain[i] = np.sum(Yhat_tr != Ytr) / len(Ytr)
Example #23
    3,
    figsize=(10, 3),
)
for index, (x, y) in enumerate(((1, 2), (1, 3), (1, 4))):
    plots[index].scatter(x=X[:, x - 1], y=X[:, y - 1], c=Y[:])
    plots[index].set_xlabel("Features: ({}, {})".format(x, y))

# %% [markdown]
# # Problem 2:  kNN predictions
# %% [markdown]
# ## 1. Classification boundary for varying values of K = [1, 5, 10, 50] for features (1, 2)
# %%
import mltools as ml
np.random.seed(0)
X, Y = ml.shuffleData(X, Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X[:, :2], Y, 0.75)
knn = ml.knn.knnClassify()
for K in (1, 5, 10, 50):
    knn.train(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# %% [markdown]
# ## 2. The error rate (number of misclassifications) on both the training and validation data as a function of K = [1, 2, 5, 10, 50, 100, 200] for features (1, 2).

# %%
Xtr, Xva, Ytr, Yva = ml.splitData(X[:, :2], Y, 0.75)
K_values = [1, 2, 5, 10, 50, 100, 200]
errTrain = [0] * len(K_values)
errVal = [0] * len(K_values)
for i, K in enumerate(K_values):
    learner = ml.knn.knnClassify(Xtr, Ytr, K)
Example #24

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.metrics import mean_squared_error

dataXtr = np.genfromtxt("/Users/dharshanbj/Desktop/X_train.txt",delimiter=None)
dataYtr = np.genfromtxt("/Users/dharshanbj/Desktop/Y_train.txt",delimiter=None)
dataXte = np.genfromtxt("/Users/dharshanbj/Desktop/X_test.txt",delimiter=None)
X=dataXtr[:20000]
Y=dataYtr[:20000]
Xtest=dataXte[:,:]


Xtr,Xva,Ytr,Yva = ml.splitData(X,Y,0.5)  # first 10000 rows become training data, the next 10000 validation

Xtr = np.array(Xtr)  # the learner takes arrays as input
Ytr = np.array(Ytr)
Xtest = np.array(Xtest)


# 2(B)

# In[5]:


#train the learner
learner = ml.dtree.treeClassify(Xtr,Ytr, maxDepth=50)

#predict values of y for training data
Example #25
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('/path/to/parent/dir/')

iris = np.genfromtxt("data/iris.txt", delimiter=None)  # load the data
Y = iris[:, -1]
X = iris[:, 0:2]
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.
import mltools as ml
# We'll use some data manipulation routines in the provided class code.
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
# export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:

X, Y = ml.shuffleData(X, Y)
# shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
# split data into 75/25 train/validation

knn = ml.knn.knnClassify()  # create the object and train it
knn.train(Xtr, Ytr, 1)  # where K is an integer, e.g. 1 for nearest neighbor prediction
YvaHat = knn.predict(Xva)  # get estimates of y for each data point in Xva
ml.plotClassify2D(knn, Xtr, Ytr)
# make 2D classification plot with data (Xtr,Ytr)
Example #26
# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
# export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:

# import sys

# sys.path.append('/path/to/parent/dir/');

X,Y = ml.shuffleData(X,Y)  # shuffle data randomly

# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)

Xtr,Xva,Ytr,Yva = ml.splitData(X,Y, 0.75)  # split data into 75/25 train/validation

for K in [1, 5, 10, 50]: ## visualize classification boundary
    knn = ml.knn.knnClassify() # create the object and train it
    knn.train(Xtr, Ytr, K) # where K is an integer, e.g. 1 for nearest neighbor prediction
    YvaHat = knn.predict(Xva) # get estimates of y for each data point in Xva

    ml.plotClassify2D( knn, Xtr, Ytr, axis=plt ) # make 2D classification plot with data (Xtr,Ytr)
    plt.close()

## b ##

K=[1,2,5,10,50,100,200]
errTrain = []
errValidation = []
for i,k in enumerate(K):
Example #27
'''

# %%
import sklearn.neighbors
import sklearn.decomposition
import sklearn.model_selection
import sklearn.metrics
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection replaces it

# create K-nearest neighbor learner

# Different K values for nearest points
nearest = [1, 3, 5, 15, 55, 105]

# Split the data 80/20 into training/validation sets
X_train, X_valid, Y_train, Y_valid = ml.splitData(X_data, Y_data, 0.80)

parameters = {'n_neighbors': nearest}

knearest = sklearn.neighbors.KNeighborsClassifier()

clf = sklearn.model_selection.GridSearchCV(knearest, parameters, cv=10)

clf.fit(X_train, Y_train)

dimensions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

accuracy = []
params = []

Test = X_test.shape[0]
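Example #28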
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
import random
import mltools.logistic2 as lc2
from importlib import reload  # reload() is a builtin only in Python 2
reload(lc2)

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
learner = lc2.logisticClassify2()

Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:14], Y, 0.8)
Xt, Yt = ml.shuffleData(Xt, Yt)
Xt, _ = ml.transforms.rescale(Xt)
learner.classes = np.unique(Yt)
wts = [0.5, 1, -0.25] + [(random.random() - 0.5) * 2 for _ in range(12)]  # 15 weights total
#wts = np.append(wts,[((random.random()-0.5)*2)])
#wts = [0.5 ,1]
learner.theta = wts
lc2.train(learner, Xt, Yt, 0.01, 1e-5, 10000, plot=1, reg=0)
plt.show()
print(learner.err(Xt, Yt))
Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
Example #29
import numpy as np
import pandas as pd
import mltools as ml
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor  # used below but missing from the excerpt

np.random.seed(0)

data = np.array(pd.read_csv('data/white.csv'))
data = np.array(list(set([tuple(t) for t in data])))

X = data[:, 0:11]
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)
Y = data[:, -1]
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)


def calc_mse(y1, y2):
    summation = 0
    n = len(y1)
    for i in range(0, n):
        difference = y1[i] - y2[i]
        squared_difference = difference**2
        summation = summation + squared_difference
    MSE = summation / n
    return MSE


clf = MLPRegressor(activation='logistic',
                   solver='lbfgs',
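Example #30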
#!/usr/bin/env python
"""2016W-CS178: Homework 2, Problem1"""

import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

data = np.genfromtxt("data/curve80.txt")
features = (data[:, 0])[:, np.newaxis]
targets = data[:, 1]
train_features, test_features, train_targets, test_targets = ml.splitData(
    features, targets, 0.75)
train_data_points = train_features.shape[0]
test_data_points = test_features.shape[0]
train_error = []
test_error = []
cross_error = []
cross_fold = 5
degrees = range(1, 20, 3)
#degrees = (1, 3, 5, 7, 10, 18)

plt.figure(1, (17, 7))
plt.subplot(1, 2, 1)
plt.scatter(train_features, train_targets, color='b', label='training data')
plt.scatter(test_features, test_targets, color='r', label='test data')

for degree in degrees:
    # cross validate
    c_error_d = np.array([])
    for iFold in range(cross_fold):
        Xt, Xv, Yt, Yv = ml.crossValidate(train_features, train_targets,
Example #31
# Import all required libraries
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

np.random.seed(0)
get_ipython().magic(u'matplotlib inline')

#import the X and Y training data
X = np.genfromtxt(r'C:\data\X_train.txt', delimiter=None)  # raw strings avoid backslash escapes
Y = np.genfromtxt(r'C:\data\Y_train.txt', delimiter=None)
X, Y = ml.shuffleData(X, Y)

[Xtr, Xva, Ytr, Yva] = ml.splitData(X, Y)  # uses splitData's default train fraction
Xte = np.genfromtxt(r'C:\data\X_test.txt', delimiter=None)


class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners

    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)
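
# A sketch of how BaggedTree would typically be built and used; the bag count, depth,
# and the classes attribute handling are illustrative assumptions, since the original
# training loop is not shown in this excerpt:
n_bags = 25
bags = []
for i in range(n_bags):
    Xi, Yi = ml.bootstrapData(Xtr, Ytr, Xtr.shape[0])        # resample with replacement
    bags.append(ml.dtree.treeClassify(Xi, Yi, maxDepth=15))  # one tree per bootstrap sample
bt = BaggedTree(bags)
bt.classes = np.unique(Y)  # predictSoft columns correspond to these classes
YvaHat = bt.predictSoft(Xva)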

Example #32
    # print(temp.shape)
    fmax.append(np.max(temp))
    fmin.append(np.min(temp))
    fmean.append(np.mean(temp))
    fvar.append(np.var(temp))
    i += stepsize

value = [1]*int(valuestep/5)+[2]*int(valuestep/5)+[3]*int(valuestep/5)+[4]*int(valuestep/5)+[5]*int(valuestep/5-rest)
print(len(value))
# print('datapointsnum:', len(fmax))
# print(fmax)
value = np.array(value).T
dataset = np.array([fmax,fmin,fmean,fvar]).T
print(dataset.shape)
dataset, value = ml.shuffleData(dataset, value)
Xtr, Xva, Ytr, Yva = ml.splitData(dataset, value, 0.75)

learner = svm.SVC(decision_function_shape='ovo')
learner.fit(Xtr,Ytr)
Yhat = learner.predict(Xva)
num_wrong = 0  # renamed from "sum", which shadowed the builtin
for a in range(len(Yhat)):
    num_wrong += (Yhat[a] != Yva[a])

print(num_wrong)
print(num_wrong / len(Yhat) * 100, "%")