def __init__(self, X=None, Y=None, params=None, split_percentage=0.75,
             output_file='predictions.csv'):
    # Avoid a mutable default argument: a shared dict would persist across instances.
    self.params = params if params is not None else {}
    if X is None:
        X = np.genfromtxt('data/kaggle.X1.train.fragment.txt', delimiter=',')
    if Y is None:
        Y = np.genfromtxt('data/kaggle.Y.train.fragment.txt', delimiter=',')
    self.Xtr, self.Xte, self.Ytr, self.Yte = splitData(X, Y, split_percentage)
    self.output_file = output_file
def predictLinearRegress(attributeList, starTargetList):
    print("\nLinear Regression")
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    # Closed-form linear regression baseline
    lr = ml.linear.linearRegress(Xtrain, Ytrain)
    yHatInitial = lr.predict(Xtest)
    print("MSE test: ", mean_squared_error(yHatInitial, Ytest))
    print("RMSE test: ", math.sqrt(mean_squared_error(yHatInitial, Ytest)))
    incorrect = 0
    total = 0
    for i, value in enumerate(yHatInitial):
        if abs(yHatInitial[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / total
    print("Ratio incorrect: " + str(ratioIncorrect))

    # Prepend a constant (bias) column, then refit with stochastic gradient descent
    onesCol = np.ones((len(Xtrain), 1))
    Xtrain = np.concatenate((onesCol, Xtrain), 1)
    onesCol = np.ones((len(Xtest), 1))
    Xtest = np.concatenate((onesCol, Xtest), 1)
    m, n = np.shape(Xtrain)
    clf = SGDRegressor(loss="squared_loss")
    clf.fit(Xtrain, Ytrain)
    yHat = clf.predict(Xtest)
    print("MSE after GD: ", mean_squared_error(yHat, Ytest))
    print("RMSE after GD: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    incorrect = 0
    total = 0
    for i, value in enumerate(yHat):
        if abs(yHat[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / total
    print("Ratio incorrect: " + str(ratioIncorrect))
def predictRandomForests(attributeList, starTargetList):
    print("\nRandom Forests")
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)
    RFModel = RandomForestRegressor(n_estimators=200)
    RFModel.fit(Xtrain, Ytrain)
    yHat = RFModel.predict(Xtest)
    total = 0
    numIncorrect = 0
    for i, value in enumerate(Ytest):
        if abs(Ytest[i] - yHat[i]) > 0.5:
            numIncorrect += 1
        total += 1
    print("MSE Test: ", mean_squared_error(yHat, Ytest))
    print("RMSE Test: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    print("Ratio Incorrect: " + str(float(numIncorrect) / total))
def predictXGBoosting(attributeList, starTargetList):
    print("\nExtreme Gradient Boosting")
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)
    xgb_model = xgboost.XGBRegressor(missing=np.nan, max_depth=11, n_estimators=400,
                                     learning_rate=0.03, nthread=4, subsample=0.85,
                                     colsample_bytree=0.75, seed=4242)
    # Hold out the test split for early stopping on RMSE
    xgb_model.fit(Xtrain, Ytrain, early_stopping_rounds=20, eval_metric="rmse",
                  eval_set=[(Xtest, Ytest)])
    yHat = xgb_model.predict(Xtest)
    total = 0
    numIncorrect = 0
    for i, value in enumerate(Ytest):
        if abs(Ytest[i] - yHat[i]) > 0.5:
            numIncorrect += 1
        total += 1
    print("MSE Test: ", mean_squared_error(yHat, Ytest))
    print("RMSE Test: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    print("Ratio Incorrect: " + str(float(numIncorrect) / total))
def setup_code(xTrainFile, yTrainFile):
    X1 = np.genfromtxt(xTrainFile, delimiter=",")
    Y = np.genfromtxt(yTrainFile, delimiter=",")
    Xtr, Xte, Ytr, Yte = ml.splitData(X1, Y, 0.80)
    M = Xtr.shape[0]
    Mv = Xte.shape[0]

    # maxDepth ########################
    nBags = 6000
    YtHat = np.zeros((M, nBags))
    YvHat = np.zeros((Mv, nBags))
    rforest = [None] * nBags
    maxDepth = 40
    lowestMaxDepth = LowestMSE()
    nFeatures = 60
    minParent = 8
    for l in range(nBags):  # start at 0 so no untrained, all-zero column biases the ensemble mean
        print "bags", l
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, M)
        rforest[l] = dtree.treeRegress()
        rforest[l].train(Xi, Yi, maxDepth=maxDepth)
        YtHat[:, l] = rforest[l].predict(Xtr)[:, 0]  # predict on training data
        YvHat[:, l] = rforest[l].predict(Xte)[:, 0]
        # average the l+1 learners trained so far
        mseT = ((Ytr - YtHat[:, 0:l+1].mean(axis=1))**2).mean()
        mseV = ((Yte - YvHat[:, 0:l+1].mean(axis=1))**2).mean()
        lowestMaxDepth.set(mseV, l, maxDepth, minParent, l)
    print "Lowest"
    print lowestMaxDepth
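# LowestMSE is not defined in this fragment; a minimal sketch of a tracker
# consistent with the calls above (set(...) plus printing the object) might be:
class LowestMSE:
    """Hypothetical helper: remembers the lowest validation MSE seen and its settings."""
    def __init__(self):
        self.mse = float('inf')
        self.settings = None

    def set(self, mse, nBags, maxDepth, minParent, index):
        if mse < self.mse:  # keep only the best configuration seen so far
            self.mse = mse
            self.settings = (nBags, maxDepth, minParent, index)

    def __str__(self):
        return "MSE %f with (nBags, maxDepth, minParent, index) = %s" % (self.mse, self.settings)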
def predictKNN(attributeList, starTargetList):
    print("\nKNN")
    K = [1]  # , 20, 50, 100, 500, 1000, 1500, 2000
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)
    for k in K:
        knn = ml.knn.knnClassify()
        knn.train(Xtrain, Ytrain, k)
        YtestHat = knn.predict(Xtest)
        total = 0
        numIncorrect = 0
        for i, value in enumerate(Ytest):
            if abs(Ytest[i] - YtestHat[i]) > 0.5:
                numIncorrect += 1
            total += 1
        print("MSE test: ", mean_squared_error(YtestHat, Ytest))
        print("RMSE test: ", math.sqrt(mean_squared_error(YtestHat, Ytest)))
        print("Ratio Incorrect: " + str(float(numIncorrect) / total))
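# The rating predictors above (linear regression, random forests, XGBoost, kNN)
# all repeat the same "within half a star" check; a small shared helper
# (a sketch, not in the original code) could replace those loops:
def ratio_incorrect(yHat, yTrue, tol=0.5):
    """Fraction of predictions more than `tol` away from the target."""
    yHat = np.asarray(yHat).ravel()
    yTrue = np.asarray(yTrue).ravel()
    return float(np.sum(np.abs(yHat - yTrue) > tol)) / len(yTrue)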
    return result


def calc_error(Y_val, Y_hat_val):
    i = 0
    mistakes = 0
    while i < Y_val.size:
        if Y_val[i] != Y_hat_val[i]:
            mistakes = mistakes + 1
        i = i + 1
    # float() guards against integer division under Python 2
    error_rate = float(mistakes) / Y_val.size
    return error_rate


# split data into 80/20 train/validation
AOne_tr, AOne_va, TOne_tr, TOne_va = ml.splitData(Activity_One, Target_One, 0.8)
ATwo_tr, ATwo_va, TTwo_tr, TTwo_va = ml.splitData(Activity_Two, Target_Two, 0.8)
AThree_tr, AThree_va, TThree_tr, TThree_va = ml.splitData(
    Activity_Three, Target_Three, 0.8)
AFour_tr, AFour_va, TFour_tr, TFour_va = ml.splitData(Activity_Four, Target_Four, 0.8)
#AFive_tr, AFive_va, TFive_tr, TFive_va = ml.splitData(Activity_Five, Target_Five, 0.8)

# transform target arrays into target matrices with 1 column
TOne_tr = np.reshape(TOne_tr, [TOne_tr.size, 1])
TOne_va = np.reshape(TOne_va, [TOne_va.size, 1])
TTwo_tr = np.reshape(TTwo_tr, [TTwo_tr.size, 1])
TTwo_va = np.reshape(TTwo_va, [TTwo_va.size, 1])
TThree_tr = np.reshape(TThree_tr, [TThree_tr.size, 1])
TThree_va = np.reshape(TThree_va, [TThree_va.size, 1])
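# calc_error above walks the arrays element by element; an equivalent
# vectorized form (a sketch with the same semantics for 1-D arrays) is:
def calc_error_vectorized(Y_val, Y_hat_val):
    return np.mean(Y_val != Y_hat_val)  # fraction of mismatched labels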
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import mltools as ml
from sklearn.linear_model import LogisticRegression

np.random.seed(0)

# Data loading
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
X_test = np.genfromtxt("data/X_test.txt", delimiter=None)  # Test data
Xte = X_test  # same test data under the second name used below

# Train and Validation splits
Xtr, Xval, Ytr, Yval = ml.splitData(X, Y, 0.75)

# Take a subsample of the data so that it trains faster.
# You should train on the whole data for homework and Kaggle.
Xt, Yt = Xtr[:4000], Ytr[:4000]

# flatten y into a 1-D array
Ytf = np.ravel(Yt)

# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(Xt, Ytf)

# check the accuracy on the training set
print(model.score(Xt, Ytf))

# predict class labels for the test set
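# A minimal sketch of the prediction step the comment above points to,
# assuming the model and Xte defined earlier:
Yte_hat = model.predict(Xte)               # hard class labels
Yte_prob = model.predict_proba(Xte)[:, 1]  # P(class 1), useful for AUC-style scoring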
# ====================== Training level-0 learners ===================
n_trees = 50

# level 0 learners:
clfs = [
    ExtraTreesRegressor(n_estimators=n_trees * 2),
    RandomForestRegressor(n_estimators=n_trees),
    GradientBoostingRegressor(n_estimators=n_trees)
]

# split data into (X1, Y1) and (X2, Y2)
# (X1, Y1) are used to train the 3 level-0 learners
# X2 is used to generate temp_train from the three level-0 learners
# (temp_train, Y2) are used to train the level-1 learner
X1, X2, Y1, Y2 = ml.splitData(X, Y, 0.75)

# temp_train holds the intermediate training data, i.e., outputs of the 3 level-0
# learners, which become inputs of the level-1 learner
temp_train = np.zeros((len(Y2), len(clfs)))
temp_test = np.zeros((Xtest.shape[0], len(clfs)))

for i, clf in enumerate(clfs):
    clf.fit(X1, Y1)                       # train each level-0 learner
    temp_train[:, i] = clf.predict(X2)    # intermediate data for the level-1 learner, from X2
    temp_test[:, i] = clf.predict(Xtest)  # intermediate data for the level-1 learner, from Xtest

# ====================== Training the level-1 learner ===================
# level-1 learner
# cv = 5: 5-fold cross validation
alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0,
          10.0, 50.0, 100.0, 500.0, 1000.0]
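# The fragment ends before the level-1 fit; a sketch of the step the comments
# describe (ridge regression cross-validated over the alphas above, cv=5):
from sklearn.linear_model import RidgeCV
level1 = RidgeCV(alphas=alphas, cv=5)         # cross-validate the regularization strength
level1.fit(temp_train, Y2)                    # train on the level-0 outputs
final_prediction = level1.predict(temp_test)  # stacked prediction for the test set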
def main():
    iris = np.genfromtxt("data/iris.txt", delimiter=None)
    Y = iris[:, -1]
    X = iris[:, 0:-1]
    print X.shape

    # Part 2
    # for f in X.T:
    #     plt.hist(f)
    #     plt.show()

    # Part 3
    for f in X.T:
        print "Mean: ", np.mean(f)
        print "Standard deviation: ", np.std(f)

    # Part 4
    # pairs = [[0, 1, 4], [0, 2, 4], [0, 3, 4]]
    # colors = ['r*', 'g*', 'b*']
    # for p in pairs:
    #     for feature in iris[:, p]:
    #         plt.plot(feature[0], feature[1], colors[int(feature[2])])
    #     plt.show()

    # Question 2
    # Part 1)
    # XX = X[:, [0, 1]]
    # np.random.seed(1)
    # XX, Y = ml.shuffleData(XX, Y)
    # np.random.seed(1)
    # XXtr, XXva, Ytr, Yva = ml.splitData(XX, Y, 0.75)
    # K = [1, 5, 10, 50]
    # for k in K:
    #     knn = ml.knn.knnClassify()
    #     knn.train(XXtr, Ytr, k)
    #     ml.plotClassify2D(knn, XXtr, Ytr, axis=plt)
    #     plt.title("K = %d" % k)
    #     plt.show()

    # Part 2
    np.random.seed(1)
    X, Y = ml.shuffleData(X, Y)
    np.random.seed(1)
    Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
    XXtr = Xtr[:, [0, 1]]
    XXva = Xva[:, [0, 1]]
    K = [1, 2, 5, 10, 50, 100, 200]

    # Errors using only the first two features
    trainErr = []
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(XXtr, Ytr, k)
        YHat = knn.predict(XXtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(XXva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print "K = ", k, ": Error rate on training data = ", trainErr[i], ", on validation data = ", validErr[i]
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()

    # Errors using all features
    trainErr = []
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(Xtr, Ytr, k)
        YHat = knn.predict(Xtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(Xva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print "K = ", k, ": Error rate on training data = ", trainErr[i], ", on validation data = ", validErr[i]
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()

    print "OK, I'm done."
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

# (a) Loading the data from the curve80.txt file, and splitting into
# 75/25 training and test data
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X = data[:, 0]        # First column is feature
X = X[:, np.newaxis]  # code expects shape (M,N) so make sure it's 2-dimensional
Y = data[:, 1]        # Second column is the result
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)  # split data set 75/25
Ytr = Ytr[:, np.newaxis]
Yte = Yte[:, np.newaxis]

# (b) Plotting the linear regression prediction function and training data,
# finding the regression coefficients, finding MSE of train and test data
lr = ml.linear.linearRegress(Xtr, Ytr)  # create and train model
xs = np.linspace(0, 10, 200)            # densely sample possible x-values
xs = xs[:, np.newaxis]                  # force "xs" to be an Mx1 matrix
ys = lr.predict(xs)                     # make predictions at xs

plt.scatter(Xtr, Ytr, c='red')  # Plotting the training data points
plt.plot(xs, ys, c='black')     # Plotting the predictor line
plt.title('Regression Function')
plt.show()

print 'Regression Coefficients\t=\t', lr.theta

YTrainPred = lr.predict(Xtr)
YTestPred = lr.predict(Xte)
mseTrain = np.mean((YTrainPred - Ytr)**2)
mseTest = np.mean((YTestPred - Yte)**2)
print 'Mean Square Error on Training Data\t=\t', mseTrain
Y = iris[:, -1]
X = iris[:, 0:-3]  # drop the last three columns (for the 5-column Iris file this keeps the first two features)
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that one value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.

import mltools as ml
# We'll use some data manipulation routines in the provided class code

X, Y = ml.shuffleData(X, Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)  # split data into 75/25 train/test

"""
K = 50  # for nearest neighbor prediction
knn = ml.knn.knnClassify()  # create the object and train it
knn.train(Xtr, Ytr, K)      # where K is an integer, e.g. 1 for nearest neighbor prediction
YteHat = knn.predict(Xte)   # get estimates of y for each data point in Xte
ml.plotClassify2D(knn, Xtr, Ytr)  # make 2D classification plot with data (Xtr,Ytr)
"""

errTrain = [0] * 7
K = [1, 2, 5, 10, 50, 100, 200]
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)  # train a kNN model with this k
    Yhat = learner.predict(Xtr)                # predict results on the training data
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml


def po_re(x, d, p):
    # expand x to a degree-d polynomial (no constant column), then rescale
    # with the precomputed parameters p
    return ml.transforms.rescale(ml.transforms.fpoly(x, d, False), p)[0]


# 1.(a)
np.random.seed(0)
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X = data[:, 0]
X = X[:, np.newaxis]  # code expects shape (M,N) so make sure it's 2-dimensional
Y = data[:, 1]        # doesn't matter for Y
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split data set 75/25

# 1.(b)
lr = ml.linear.linearRegress(Xtr, Ytr)  # create and train model
xs = np.linspace(0, 10, 200)            # densely sample possible x-values
xs = xs[:, np.newaxis]                  # force "xs" to be an Mx1 matrix
ys = lr.predict(xs)                     # make predictions at xs
print('Theta for linear regression:', lr.theta)

plt.scatter(Xtr, Ytr, label='training data')
plt.scatter(Xva, Yva, label='validation data')
plt.plot(xs, ys, c='g', label='degree=1')
plt.legend(loc='lower right')
ax = plt.axis()
plt.show()
# plt.savefig('figure/figure_1_b')
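# po_re needs rescaling parameters computed from the training split; a usage
# sketch (the degree and variable names are illustrative, not from the original):
degree = 3
XtrP, params = ml.transforms.rescale(ml.transforms.fpoly(Xtr, degree, False))
lrP = ml.linear.linearRegress(XtrP, Ytr)  # fit on the transformed features
XvaP = po_re(Xva, degree, params)         # apply the same transform to validation data
mseVa = np.mean((lrP.predict(XvaP) - Yva[:, np.newaxis])**2)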
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import VarianceThreshold
import pickle
from pybrain.structure import FeedForwardNetwork
from pybrain.structure import LinearLayer, SigmoidLayer
from pybrain.structure import FullConnection
from pybrain.structure import TanhLayer

X = np.genfromtxt("data/kaggle.X1.train.txt", delimiter=',')
Y = np.genfromtxt("data/kaggle.Y.train.txt", delimiter=',')
Xtest = np.genfromtxt("data/kaggle.X1.test.txt", delimiter=',')
#X = SelectKBest(f_classif, k=35).fit_transform(X, Y)
#X = VarianceThreshold(threshold=(.8*.2)).fit_transform(X)
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.8)
#testdat = open('testdat.csv','w')

netbags = []
# the single-element lists leave hooks for a wider hyperparameter sweep
for iter in range(100):
    for moment in [0.3]:
        for learnRate in [0.05]:
            for epochs in [30]:
                for depth in [3]:
                    for hidw in [8]:
                        Xboot, Yboot = ml.bootstrapData(Xtr, np.array([Ytr]).T, Xtr.shape[0] // 50)
                        print Xboot.shape
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
Xt, Xv, Yt, Yv = ml.splitData(X, Y, 0.01)

err_k_t = [None] * 15
err_k_v = [None] * 15
for i in range(1, 15, 1):
    # use only the first i features
    Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:i], Y, 0.01)
    knn = ml.knn.knnClassify()
    knn.train(Xt, Yt)
    knn.K = 3
    print i
    err_k_t[i] = knn.err(Xt, Yt)
    err_k_v[i] = knn.err(Xv, Yv)
print err_k_t, err_k_v

plt.plot(err_k_t, 'g-', err_k_v, 'r')
plt.legend(('Training Error Rate', 'Validation Error Rate'), loc='upper right')
#plt.xlabel('number of neighbor K')
plt.xlabel('number of features')
plt.ylabel('error rate')
plt.show()

"""
knn = ml.knn.knnClassify()
knn.train(Xt, Yt)
knn.K = 3
Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
Ypred = knn.predictSoft(Xte)
"""
#!/usr/bin/env python
"""2016W-CS178: Homework 1, Problem2"""
import numpy
import matplotlib.pyplot as plt
import mltools

iris = numpy.genfromtxt("data/iris.txt")
Y = iris[:, -1]
X = iris[:, 0:2]  # feature 1 & 2
X, Y = mltools.shuffleData(X, Y)
trainX, testX, trainY, testY = mltools.splitData(X, Y, 0.75)

# problem 2(a)
plt.figure(1, (12, 9))
for i, k in enumerate([1, 5, 10, 50]):
    learner = mltools.knn.knnClassify()
    learner.train(trainX, trainY, k)
    plt.subplot(2, 2, i + 1)
    mltools.plotClassify2D(learner, trainX, trainY)
    plt.grid(True)
    plt.xlabel('feature 1')
    plt.ylabel('feature 2')
    plt.title('Iris KNN: Feature 1 & 2, K = %d' % k)
plt.show()
plt.close(1)

# problem 2(b)
K = [1, 2, 5, 10, 50, 100, 200]
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
Xtr, Xv, Ytr, Yv = ml.splitData(X, Y, 0.8)

err_t = [None] * 10
err_v = [None] * 10
for i in range(1, 10, 1):
    # use only the first i features, with i hidden layers of width 14
    Xtr, Xv, Ytr, Yv = ml.splitData(X[:, 0:i], Y, 0.8)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=[14] * i, random_state=1)
    clf.fit(Xtr, Ytr)
    Yet = clf.predict(Xtr)
    err_t[i] = np.mean(Yet != Ytr)
    Yev = clf.predict(Xv)
    err_v[i] = np.mean(Yev != Yv)
print err_t, err_v

plt.plot(err_t, 'g-', err_v, 'r-')
plt.legend(('Training Error Rate', 'Validation Error Rate'), loc='upper left')
plt.xlabel('Hidden Layer Size')
#plt.xlabel('Number of Features')
plt.ylabel('error rate')
plt.show()

"""
Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(14, 14, 14, 14, 14), random_state=1)
clf.fit(Xtr, Ytr)
"""
def split_numpy(data, train_fraction=0):
    """
    - Split Numpy Array Into Features & Target Values
    - Verify Shape Of Features & Target Values

    Parameters
    ----------
    data : Numpy Array
    train_fraction : Fraction that specifies the train/test split.
        If 0, no split occurs and the function returns only X, Y.

    Returns
    -------
    X : Features
    Y : Target Values

    Examples
    --------
    >>> split_numpy(data, train_fraction=0)
    tuple(X, Y)
    """
    # Split Into Features & Target Values
    X = data[:, 0:-1]  # N-1 Columns Of Data (Scalar Features -> X Values)
    Y = data[:, -1]    # Last Column of Data (Target Values -> Y Values)

    # Assert Shape
    if len(X.shape) != 2:
        X = X[:, np.newaxis]  # Code expects shape (M,N) so make sure it's 2-dimensional
    if len(Y.shape) != 2:
        Y = Y[:, np.newaxis]  # Code expects shape (M,N) so make sure it's 2-dimensional
    assert len(X.shape) == 2
    assert len(Y.shape) == 2

    # Split Data into Test & Train
    if train_fraction != 0:
        # Split data into train/test by the given fraction
        X_train, X_test, Y_train, Y_test = ml.splitData(X, Y, train_fraction)

        # Assert Shape
        if len(X_train.shape) != 2:
            X_train = np.array(X_train[:, np.newaxis])
        if len(X_test.shape) != 2:
            X_test = np.array(X_test[:, np.newaxis])
        if len(Y_train.shape) != 2:
            Y_train = np.array(Y_train[:, np.newaxis])
        if len(Y_test.shape) != 2:
            Y_test = np.array(Y_test[:, np.newaxis])
        assert len(X_train.shape) == 2
        assert len(X_test.shape) == 2
        assert len(Y_train.shape) == 2
        assert len(Y_test.shape) == 2
        #print("Train Shape (x, y): (", X_train.shape, ",", Y_train.shape, ")")
        #print("Test Shape (x, y): (", X_test.shape, ",", Y_test.shape, ")")
        return X_train, X_test, Y_train, Y_test

    # If no test/train split, return data
    return X, Y
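# A usage sketch for split_numpy (the file name is illustrative):
data = np.genfromtxt("data/curve80.txt", delimiter=None)
X_train, X_test, Y_train, Y_test = split_numpy(data, train_fraction=0.75)
X_all, Y_all = split_numpy(data)  # train_fraction=0: no split, just the X/Y columns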
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
import mltools as ml

X = np.loadtxt("data/X_train.txt")
Y = np.loadtxt("data/Y_train.txt")

## shuffle and split
# Xtr,Ytr = ml.shuffleData(X,Y)
Xtr = X[:10000]
Ytr = Y[:10000]
Xtr, Xva, Ytr, Yva = ml.splitData(Xtr, Ytr, 0.25)
print(Xtr.shape)
print(Xva.shape)

# Parameters
n_estimators = [1000, 1500, 2000]
max_features = ['auto', 'log2']
max_depth = [1, 4, 7]
learning_rate = [1, .1, 0.01, 0.001]
min_samples_split = [2, 6, 10]
min_samples_leaf = [2, 5, 10]

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
# params = {
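# The fragment stops at the params dict; a sketch completing it from the lists
# and comments above (n_iter=100, cv=3, n_jobs=-1 as the comment describes):
params = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
search = RandomizedSearchCV(GradientBoostingClassifier(), param_distributions=params,
                            n_iter=100, cv=3, n_jobs=-1, random_state=0)
search.fit(Xtr, Ytr)
pprint(search.best_params_)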
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.svm import SVR
import numpy as np
import mltools as ml
%matplotlib inline

X = np.genfromtxt("kaggle.X1.train.txt", delimiter=",")
Y = np.genfromtxt("kaggle.Y.train.txt", delimiter=",")
Xt, Xv, Yt, Yv = ml.splitData(X, Y, 0.75)

svc = SVR(kernel="linear")
# this will literally take about forever: on 60K points of data, ~7 hours.
# n_features_to_select = features that will be kept
# step is the number of features to eliminate on each recursive evaluation of
# the function; while it shortens the time somewhat, it also produces more error.
rfe = RFE(estimator=svc, n_features_to_select=70, step=10)
rfe.fit(Xt, Yt)
print rfe.ranking_.shape

ranking = rfe.ranking_.reshape((1, 91))
# Plotting the ranking is helpful to see which features, corresponding to their
# indices in the feature vector, are kept. Features with ranking 1 are kept.
plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
# Created on Tue Mar 14 19:25:08 2017
#
# @author: Malav

import numpy as np
np.random.seed(0)
import mltools as ml
#import matplotlib.pyplot as plt  # use matplotlib for plotting with inline plots
from sklearn.ensemble import BaggingClassifier
#%matplotlib inline

X = np.genfromtxt("X_train.txt", delimiter=' ')
Y = np.genfromtxt("Y_train.txt", delimiter=' ')
Xt, Xv, Yt, Yv = ml.splitData(X, Y, 0.90)
Xe = np.genfromtxt('X_test.txt', delimiter=' ')


def auc(soft, Y):
    """Manual AUC function for applying to soft prediction vectors"""
    indices = np.argsort(soft)  # sort data by score value
    Y = Y[indices]
    sorted_soft = soft[indices]

    # compute rank (averaged for ties) of sorted data
    dif = np.hstack(([True], np.diff(sorted_soft) != 0, [True]))
    r1 = np.argwhere(dif).flatten()
    r2 = r1[0:-1] + 0.5 * (r1[1:] - r1[0:-1]) + 0.5
    rnk = r2[np.cumsum(dif[:-1]) - 1]
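    # The fragment cuts off here; a sketch of the usual final step, applying the
    # rank-sum (Mann-Whitney) identity to the averaged ranks computed above:
    n0 = float(np.sum(Y == np.min(Y)))  # number of negative examples
    n1 = float(np.sum(Y == np.max(Y)))  # number of positive examples
    # AUC = (sum of positive-class ranks - n1*(n1+1)/2) / (n1 * n0)
    return (np.sum(rnk[Y == np.max(Y)]) - n1 * (n1 + 1) / 2.0) / (n1 * n0)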
import mltools as ml
# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
#   export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
#   import sys
#   sys.path.append('/path/to/parent/dir/')

# X,Y = ml.shuffleData(X,Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
# Xtr,Xte,Ytr,Yte = ml.splitData(X,Y, 0.75)  # split data into 75/25 train/test

# (a)
# Use only the first two features of X
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)

# Visualize classification boundary for varying values of K = [1,5,10,50]
for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# (b) Prediction / error for training set and test set
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(7)
errTest = np.zeros(7)
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
    errTrain[i] = np.sum(Yhat_tr != Ytr) / float(len(Ytr))
# assuming this continues a subplots call along the lines of:
fig, plots = plt.subplots(
    1,
    3,
    figsize=(10, 3),
)
for index, (x, y) in enumerate(((1, 2), (1, 3), (1, 4))):
    plots[index].scatter(x=X[:, x - 1], y=X[:, y - 1], c=Y[:])
    plots[index].set_xlabel("Features: ({}, {})".format(x, y))

# %% [markdown]
# # Problem 2: kNN predictions

# %% [markdown]
# ## 1. Classification boundary for varying values of K = [1, 5, 10, 50] for features (1, 2)

# %%
import mltools as ml

np.random.seed(0)
X, Y = ml.shuffleData(X, Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X[:, :2], Y, 0.75)
knn = ml.knn.knnClassify()
for K in (1, 5, 10, 50):
    knn.train(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# %% [markdown]
# ## 2. The error rate (number of misclassifications) on both the training and validation data as a function of K = [1, 2, 5, 10, 50, 100, 200] for features (1, 2).

# %%
Xtr, Xva, Ytr, Yva = ml.splitData(X[:, :2], Y, 0.75)
K_values = [1, 2, 5, 10, 50, 100, 200]
errTrain = [0] * len(K_values)
errVal = [0] * len(K_values)
for i, K in enumerate(K_values):
    learner = ml.knn.knnClassify(Xtr, Ytr, K)
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.metrics import mean_squared_error

dataXtr = np.genfromtxt("/Users/dharshanbj/Desktop/X_train.txt", delimiter=None)
dataYtr = np.genfromtxt("/Users/dharshanbj/Desktop/Y_train.txt", delimiter=None)
dataXte = np.genfromtxt("/Users/dharshanbj/Desktop/X_test.txt", delimiter=None)

X = dataXtr[:20000]
Y = dataYtr[:20000]
Xtest = dataXte[:, :]
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.5)  # first 10000 rows for training, next 10000 for validation

Xtr = np.array(Xtr)  # learner takes an array as input
Ytr = np.array(Ytr)
Xtest = np.array(Xtest)

# 2(B)
# train the learner
learner = ml.dtree.treeClassify(Xtr, Ytr, maxDepth=50)
# predict values of y for training data
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('/path/to/parent/dir/')

iris = np.genfromtxt("data/iris.txt", delimiter=None)  # load the data
Y = iris[:, -1]
X = iris[:, 0:2]
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.

import mltools as ml
# We'll use some data manipulation routines in the provided class code.
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
#   export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python, as above.

X, Y = ml.shuffleData(X, Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split data into 75/25 train/validation

knn = ml.knn.knnClassify()  # create the object and train it
knn.train(Xtr, Ytr, 1)      # where K is an integer, e.g. 1 for nearest neighbor prediction
YvaHat = knn.predict(Xva)   # get estimates of y for each data point in Xva
ml.plotClassify2D(knn, Xtr, Ytr)  # make 2D classification plot with data (Xtr,Ytr)
# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
#   export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
#   import sys
#   sys.path.append('/path/to/parent/dir/')

X, Y = ml.shuffleData(X, Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split data into 75/25 train/validation

## a ##
for K in [1, 5, 10, 50]:  # visualize classification boundary
    knn = ml.knn.knnClassify()  # create the object and train it
    knn.train(Xtr, Ytr, K)      # where K is an integer, e.g. 1 for nearest neighbor prediction
    YvaHat = knn.predict(Xva)   # get estimates of y for each data point in Xva
    ml.plotClassify2D(knn, Xtr, Ytr, axis=plt)  # make 2D classification plot with data (Xtr,Ytr)
    plt.close()

## b ##
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = []
errValidation = []
for i, k in enumerate(K):
'''

# %%
import sklearn.neighbors
import sklearn.decomposition
import sklearn.model_selection
import sklearn.metrics

# create K-nearest neighbor learner
# Different K values for nearest points
nearest = [1, 3, 5, 15, 55, 105]

# Subsampling a smaller part of the data
X_train, X_valid, Y_train, Y_valid = ml.splitData(X_data, Y_data, 0.80)

parameters = {'n_neighbors': nearest}
knearest = sklearn.neighbors.KNeighborsClassifier()
clf = sklearn.model_selection.GridSearchCV(knearest, parameters, cv=10)
clf.fit(X_train, Y_train)

dimensions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
accuracy = []
params = []
Test = X_test.shape[0]
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
import random
import mltools.logistic2 as lc2
reload(lc2)

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)

learner = lc2.logisticClassify2()
Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:14], Y, 0.8)
Xt, Yt = ml.shuffleData(Xt, Yt)
Xt, _ = ml.transforms.rescale(Xt)
learner.classes = np.unique(Yt)

# bias + 14 feature weights: the first three fixed, the rest random in [-1, 1)
wts = [0.5, 1, -0.25] + [(random.random() - 0.5) * 2 for _ in range(12)]
#wts = np.append(wts,[((random.random()-0.5)*2)])
#wts = [0.5, 1]
learner.theta = wts

lc2.train(learner, Xt, Yt, 0.01, 1e-5, 10000, plot=1, reg=0)
plt.show()
print learner.err(Xt, Yt)

Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
import numpy as np
import pandas as pd
import mltools as ml
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

np.random.seed(0)
data = np.array(pd.read_csv('data/white.csv'))
data = np.array(list(set([tuple(t) for t in data])))  # drop duplicate rows

X = data[:, 0:11]
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # min-max rescale each feature
Y = data[:, -1]
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)


def calc_mse(y1, y2):
    summation = 0
    n = len(y1)
    for i in range(0, n):
        difference = y1[i] - y2[i]
        squared_difference = difference**2
        summation = summation + squared_difference
    MSE = summation / n
    return MSE


clf = MLPRegressor(activation='logistic', solver='lbfgs',
#!/usr/bin/env python
"""2016W-CS178: Homework 2, Problem1"""
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

data = np.genfromtxt("data/curve80.txt")
features = (data[:, 0])[:, np.newaxis]
targets = data[:, 1]
train_features, test_features, train_targets, test_targets = ml.splitData(
    features, targets, 0.75)
train_data_points = train_features.shape[0]
test_data_points = test_features.shape[0]

train_error = []
test_error = []
cross_error = []
cross_fold = 5
degrees = range(1, 20, 3)
#degrees = (1, 3, 5, 7, 10, 18)

plt.figure(1, (17, 7))
plt.subplot(1, 2, 1)
plt.scatter(train_features, train_targets, color='b', label='training data')
plt.scatter(test_features, test_targets, color='r', label='test data')

for degree in degrees:
    # cross validate
    c_error_d = np.array([])
    for iFold in range(cross_fold):
        Xt, Xv, Yt, Yv = ml.crossValidate(train_features, train_targets,
# Import all required libraries
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
np.random.seed(0)
get_ipython().magic(u'matplotlib inline')

# import the X and Y training data (raw strings keep the Windows backslashes literal)
X = np.genfromtxt(r'C:\data\X_train.txt', delimiter=None)
Y = np.genfromtxt(r'C:\data\Y_train.txt', delimiter=None)
X, Y = ml.shuffleData(X, Y)
[Xtr, Xva, Ytr, Yva] = ml.splitData(X, Y)
Xte = np.genfromtxt(r'C:\data\X_test.txt', delimiter=None)


class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners."""
        self.learners = learners

    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner
        and averages over the results."""
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)
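# A usage sketch for BaggedTree (the bag count and depth are illustrative,
# not from the original fragment):
n_bags = 25
bags = []
for b in range(n_bags):
    Xi, Yi = ml.bootstrapData(Xtr, Ytr, Xtr.shape[0])        # resample with replacement
    bags.append(ml.dtree.treeClassify(Xi, Yi, maxDepth=15))  # one tree per bootstrap sample
bt = BaggedTree(bags)
bt.classes = np.unique(Y)
print(bt.err(Xva, Yva))  # validation error of the averaged ensemble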
    # print(temp.shape)
    fmax.append(np.max(temp))
    fmin.append(np.min(temp))
    fmean.append(np.mean(temp))
    fvar.append(np.var(temp))
    i += stepsize

value = ([1] * int(valuestep / 5) + [2] * int(valuestep / 5) + [3] * int(valuestep / 5)
         + [4] * int(valuestep / 5) + [5] * int(valuestep / 5 - rest))
print(len(value))
# print('datapointsnum:', len(fmax))
# print(fmax)
value = np.array(value).T

# one row per window: max, min, mean, variance
dataset = np.array([fmax, fmin, fmean, fvar]).T
print(dataset.shape)

dataset, value = ml.shuffleData(dataset, value)
Xtr, Xva, Ytr, Yva = ml.splitData(dataset, value, 0.75)

learner = svm.SVC(decision_function_shape='ovo')
learner.fit(Xtr, Ytr)
Yhat = learner.predict(Xva)

numWrong = 0  # renamed from `sum` to avoid shadowing the built-in
for a in range(len(Yhat)):
    numWrong += (Yhat[a] != Yva[a])
print(numWrong)
print(numWrong / len(Yhat) * 100, "%")