import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

X = np.genfromtxt('data/X_train.txt')
Y = np.genfromtxt('data/Y_train.txt')
Xte = np.genfromtxt('data/X_test.txt')
X, Y = ml.shuffleData(X, Y)

# Standardize features; the scaler is fit on the training data and then
# applied to both train and test.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
Xte = scaler.transform(Xte)

# Manual train/holdout split (rows 120000-130000 are left out entirely).
Xtr = X[0:120000]
Ytr = Y[0:120000]
Xtest = X[130000:]
Ytest = Y[130000:]

# Sweep the L2 regularization strength of a single-hidden-layer MLP.
alpha = [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.2]
test_err = []
train_err = []
for i, k in enumerate(alpha):
    gbm0 = MLPClassifier(activation='tanh', alpha=k, hidden_layer_sizes=(80,))
    gbm0.fit(Xtr, Ytr)
    Yhat = gbm0.predict_proba(Xtr)
    temp = roc_auc_score(Ytr, Yhat[:, 1])
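# -- Completion sketch (an assumption; the snippet cuts off mid-loop): record
# the train AUC computed above plus the holdout AUC, then plot both against
# alpha to pick a regularization strength.
    train_err.append(temp)
    Yhat_te = gbm0.predict_proba(Xtest)
    test_err.append(roc_auc_score(Ytest, Yhat_te[:, 1]))

plt.semilogx(alpha, train_err, label='train AUC')
plt.semilogx(alpha, test_err, label='holdout AUC')
plt.xlabel('alpha')
plt.ylabel('ROC AUC')
plt.legend()
plt.show()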
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml  # data manipulation routines from the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
#   export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
#   import sys
#   sys.path.append('/path/to/parent/dir/')
# X, Y = ml.shuffleData(X, Y)   # shuffle data randomly
#   (This is a good idea in case your data are ordered in some pathological
#   way, as the Iris data are.)
# Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)  # split 75/25 train/test

# (X and Y are assumed loaded earlier, e.g. from the Iris data.)

# (a) Use only the first two features of X
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)

# Visualize the classification boundary for varying K = [1, 5, 10, 50]
for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# (b) Prediction / error on the training set and test set
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(7)
errTest = np.zeros(7)
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
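# -- Completion sketch (an assumption; the loop above stops before filling in
# the error arrays): record the misclassification rates and compare them on a
# log-scale plot of K.
    errTrain[i] = np.mean(Yhat_tr != Ytr)
    errTest[i] = np.mean(Yhat_te != Yte)

plt.semilogx(K, errTrain, 'r-', label='training error')
plt.semilogx(K, errTest, 'g-', label='test error')
plt.legend()
plt.show()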
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import mltools.logistic2 as lc2

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)       # reorder randomly (important later)
X, params = ml.rescale(X)         # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]    # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]    # get class 1 vs 2

# (a) Scatter plot of the two classes to exhibit separability
plt.title('Linearly Separable Data')
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.show()
plt.title('Linearly Non-separable Data')
plt.scatter(XB[:, 0], XB[:, 1], c=YB)
plt.show()

# (b) Plot a boundary with the class data points, by modifying plotBoundary()
learner = lc2.logisticClassify2()  # initialize the logistic classifier
learner.classes = np.unique(YA)    # use the unique values as the class labels
wts = np.zeros(shape=(1, 3))
wts[0, :] = [0.5, 1, -0.25]        # assign weights
learner.theta = wts
learner.plotBoundary(XA, YA)       # plot the decision boundary

# Perform the above actions for the XB/YB split of the data
learner = lc2.logisticClassify2()
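# -- Completion sketch (an assumption; the snippet cuts off here): mirror the
# XA/YA steps above for the non-separable class 1 vs 2 split, reusing the same
# fixed weights.
learner.classes = np.unique(YB)
learner.theta = wts
learner.plotBoundary(XB, YB)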
# (continuation: the A*/T* train/validation arrays, n, and calc_error() are
# defined earlier in this file; numpy as np, mltools as ml, and sklearn's svm
# are assumed imported above)
TThree_va_extr = TThree_va[:-n]
TFour_tr_extr = TFour_tr[:-n]
TFour_va_extr = TFour_va[:-n]
# TFive_tr_extr = TFive_tr[:-n]
# TFive_va_extr = TFive_va[:-n]

X_train = np.concatenate(
    [AOne_tr_extr, ATwo_tr_extr, AThree_tr_extr, AFour_tr_extr], axis=0)
Y_train = np.concatenate(
    [TOne_tr_extr, TTwo_tr_extr, TThree_tr_extr, TFour_tr_extr], axis=0)
X_val = np.concatenate(
    [AOne_va_extr, ATwo_va_extr, AThree_va_extr, AFour_va_extr], axis=0)
Y_val = np.concatenate(
    [TOne_va_extr, TTwo_va_extr, TThree_va_extr, TFour_va_extr], axis=0)
X_train, Y_train = ml.shuffleData(X_train, Y_train)
X_val, Y_val = ml.shuffleData(X_val, Y_val)

# Train on the features + perform holdout validation
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(X_train, Y_train)
Y_hat_val = clf.predict(X_val)
error_rate = calc_error(Y_val, Y_hat_val)
print("Holdout validation error rate:")
print(error_rate)
np.savetxt('y_val.csv', Y_val, delimiter=',')
np.savetxt('y_hat_val.csv', Y_hat_val, delimiter=',')
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
import random
import mltools.logistic2 as lc2
from importlib import reload  # reload() is a builtin only in Python 2
reload(lc2)

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
learner = lc2.logisticClassify2()
Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:14], Y, 0.8)
Xt, Yt = ml.shuffleData(Xt, Yt)
Xt, params = ml.transforms.rescale(Xt)  # keep params to rescale test data too
learner.classes = np.unique(Yt)

# Fixed first three weights, then one random weight in (-1, 1) for each of the
# remaining twelve features.
wts = [0.5, 1, -0.25] + [(random.random() - 0.5) * 2 for _ in range(12)]
# wts = np.append(wts, [(random.random() - 0.5) * 2])
# wts = [0.5, 1]
learner.theta = wts

lc2.train(learner, Xt, Yt, 0.01, 1e-5, 10000, plot=1, reg=0)
plt.show()
print(learner.err(Xt, Yt))

Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
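# -- Usage sketch (an assumption, not in the original snippet): once trained,
# produce soft predictions on the test set and save them in an ID,Prob1
# format, e.g. for a Kaggle submission. predictSoft() is assumed to exist on
# the learner, as it does for most mltools classifiers, and the file name and
# column layout here are placeholders.
Xte14, _ = ml.transforms.rescale(Xte[:, 0:14], params)
Yte_soft = learner.predictSoft(Xte14)
np.savetxt('Y_submit.txt',
           np.vstack((np.arange(len(Yte_soft)), Yte_soft[:, 1])).T,
           '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')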
## Problem 1 ##
## part a ##
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)       # reorder randomly (important later)
X, _ = ml.rescale(X)              # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]    # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]    # get class 1 vs 2
X0, Y0 = X[Y == 0, :], Y[Y == 0]  # class 0
X1, Y1 = X[Y == 1, :], Y[Y == 1]  # class 1
X2, Y2 = X[Y == 2, :], Y[Y == 2]  # class 2

# Scatter class 0 vs class 1, then class 1 vs class 2
plt.scatter(X0[:, 0], X0[:, 1], c='Blue')
plt.scatter(X1[:, 0], X1[:, 1], c='Red')
plt.close()
plt.scatter(X1[:, 0], X1[:, 1], c='Blue')
plt.scatter(X2[:, 0], X2[:, 1], c='Red')
plt.close()

## part b ##
from logisticClassify2 import *
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that one value
# (here, columns); indexing rows/columns with a range ("1:-1") extracts any
# row/column in that range.
import mltools as ml  # data manipulation routines from the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
#   export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
#   import sys
#   sys.path.append('/path/to/parent/dir/')

# (X and Y are loaded earlier; numpy and matplotlib.pyplot as plt are assumed
# imported above.)
X, Y = ml.shuffleData(X, Y)  # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are.)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split 75/25 train/validation

for K in [1, 5, 10, 50]:        # visualize the classification boundary
    knn = ml.knn.knnClassify()  # create the object and train it
    knn.train(Xtr, Ytr, K)      # K is an integer, e.g. 1 for nearest neighbor
    YvaHat = knn.predict(Xva)   # estimates of y for each data point in Xva
    ml.plotClassify2D(knn, Xtr, Ytr, axis=plt)  # 2D plot with data (Xtr, Ytr)
    plt.close()

## b ##
#!/usr/bin/env python
"""2016W-CS178: Homework 3, Problem 1"""
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

iris = np.genfromtxt("data/iris.txt")
features = iris[:, 0:2]
targets = iris[:, -1]
features, targets = ml.shuffleData(features, targets)
features, _ = ml.transforms.rescale(features)

# sub1: class 0 and class 1
features_sub1 = features[targets < 2, :]
targets_sub1 = targets[targets < 2]
# sub2: class 1 and class 2
features_sub2 = features[targets > 0, :]
targets_sub2 = targets[targets > 0]

learner = ml.logistic2.logisticClassify2(features_sub1, targets_sub1, plot=1)
learner2 = ml.logistic2.logisticClassify2(features_sub2, targets_sub2, plot=2)

plt.figure(3, figsize=(15, 7))
plt.subplot(121)
learner.plotBoundary(features_sub1, targets_sub1)
plt.legend()
plt.subplot(122)
learner2.plotBoundary(features_sub2, targets_sub2)
plt.legend()
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

##### PROBLEM 2 #####
iris = np.genfromtxt("data/iris.txt", delimiter=None)
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that one value
# (here, columns); indexing rows/columns with a range ("1:-1") extracts any
# row/column in that range.
Y = iris[:, -1]   # last column is the class label
X = iris[:, 0:2]  # keep the first 2 of the 5 columns as features
print(Y)
print(X)

X, Y = ml.shuffleData(X, Y)  # shuffles the ordered Iris data
# Xtr gets 75% of the rows, Xva the remaining 25%
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split 75/25 train/validation

## knn = ml.knn.knnClassify()  # create object and train it
## knn.train(Xtr, Ytr, 1)      # K is an integer, e.g. 1 for nearest neighbor
## YvaHat = knn.predict(Xva)   # get estimates of y for each data point in Xva
##
## ml.plotClassify2D(knn, Xtr, Ytr)  # 2D classification plot with (Xtr, Ytr)
## plt.title("K = 1")
## plt.show()
import numpy as np
import mltools as ml
from tensorflow.python.client import device_lib
from sklearn.model_selection import GridSearchCV

print(device_lib.list_local_devices())
np.random.seed(0)

# Data loading
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)  # the test data

Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xtr, Ytr = ml.shuffleData(Xtr, Ytr)

# Take a subsample of the data so that training runs faster.
Xt, Yt = Xtr[:10000], Ytr[:10000]
XtS, params = ml.rescale(Xt)
XvS, _ = ml.rescale(Xva, params)
XteS, _ = ml.rescale(Xte, params)

# Settled on some initial hyperparameters such as epochs=700, batch_size=1000,
# loss='binary_crossentropy', optimizer='adam', metrics='accuracy', and
# activation='relu' via references found online & trial/error.
scores = []
num_hidden_layers = [1, 5, 10, 30, 50, 100]
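# -- Sweep sketch (an assumption; the original is cut off before the model
# definition). One plausible Keras version of the depth sweep the variables
# above set up, using the hyperparameters named in the comment; the layer
# width of 50 units is a made-up placeholder.
from tensorflow import keras

for n_layers in num_hidden_layers:
    model = keras.Sequential()
    # first hidden layer fixes the input dimensionality
    model.add(keras.layers.Dense(50, activation='relu',
                                 input_shape=(XtS.shape[1],)))
    for _ in range(n_layers - 1):
        model.add(keras.layers.Dense(50, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(XtS, Yt, epochs=700, batch_size=1000, verbose=0)
    scores.append(model.evaluate(XvS, Yva, verbose=0)[1])  # validation accuracy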
#!/usr/bin/env python
"""2016W-CS178: Homework 1, Problem 2"""
import numpy
import matplotlib.pyplot as plt
import mltools

iris = numpy.genfromtxt("data/iris.txt")
Y = iris[:, -1]
X = iris[:, 0:2]  # features 1 & 2
X, Y = mltools.shuffleData(X, Y)
trainX, testX, trainY, testY = mltools.splitData(X, Y, 0.75)

# problem 2(a)
plt.figure(1, (12, 9))
for i, k in enumerate([1, 5, 10, 50]):
    learner = mltools.knn.knnClassify()
    learner.train(trainX, trainY, k)
    plt.subplot(2, 2, i + 1)
    mltools.plotClassify2D(learner, trainX, trainY)
    plt.grid(1)
    plt.xlabel('feature 1')
    plt.ylabel('feature 2')
    plt.title('Iris KNN: Feature 1 & 2, K = %d' % k)
plt.show()
plt.close(1)

# problem 2(b)
K = [1, 2, 5, 10, 50, 100, 200]
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml


def main():
    iris = np.genfromtxt("data/iris.txt", delimiter=None)
    Y = iris[:, -1]
    X = iris[:, 0:-1]
    print(X.shape)

    # Part 2
    # for f in X.T:
    #     plt.hist(f)
    #     plt.show()

    # Part 3
    for f in X.T:
        print("Mean: ", np.mean(f))
        print("Standard deviation: ", np.std(f))

    # Part 4
    # pairs = [[0, 1, 4], [0, 2, 4], [0, 3, 4]]
    # colors = ['r*', 'g*', 'b*']
    # for p in pairs:
    #     for feature in iris[:, p]:
    #         plt.plot(feature[0], feature[1], colors[int(feature[2])])
    #     plt.show()

    # Question 2, Part 1
    # XX = X[:, [0, 1]]
    # np.random.seed(1)
    # XX, Y = ml.shuffleData(XX, Y)
    # np.random.seed(1)
    # XXtr, XXva, Ytr, Yva = ml.splitData(XX, Y, 0.75)
    # K = [1, 5, 10, 50]
    # for k in K:
    #     knn = ml.knn.knnClassify()
    #     knn.train(XXtr, Ytr, k)
    #     ml.plotClassify2D(knn, XXtr, Ytr, axis=plt)
    #     plt.title("K = %d" % k)
    #     plt.show()

    # Part 2
    np.random.seed(1)
    X, Y = ml.shuffleData(X, Y)
    np.random.seed(1)
    Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
    XXtr = Xtr[:, [0, 1]]
    XXva = Xva[:, [0, 1]]
    K = [1, 2, 5, 10, 50, 100, 200]

    # First sweep: only the first two features
    trainErr = []
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(XXtr, Ytr, k)
        YHat = knn.predict(XXtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(XXva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print("K = ", k, ": Error rate on training data = ", trainErr[i],
              ", on validation data = ", validErr[i])
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()

    # Second sweep: all features
    trainErr = []
    validErr = []
    for i, k in enumerate(K):
        knn = ml.knn.knnClassify()
        knn.train(Xtr, Ytr, k)
        YHat = knn.predict(Xtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(Xva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        print("K = ", k, ": Error rate on training data = ", trainErr[i],
              ", on validation data = ", validErr[i])
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    plt.legend()
    plt.show()

    print("OK, I'm done.")
# (continuation: the enclosing loop header, d, windowsize, stepsize,
# valuestep, and rest are defined earlier in this file; numpy as np,
# mltools as ml, and sklearn's svm are assumed imported above)
    temp = d[i:i + windowsize]  # current window of the raw signal
    # print(temp.shape)
    fmax.append(np.max(temp))   # per-window summary features
    fmin.append(np.min(temp))
    fmean.append(np.mean(temp))
    fvar.append(np.var(temp))
    i += stepsize

# Class labels: five activities with valuestep/5 windows each (the last class
# loses the final `rest` windows).
value = ([1] * int(valuestep / 5) + [2] * int(valuestep / 5) +
         [3] * int(valuestep / 5) + [4] * int(valuestep / 5) +
         [5] * int(valuestep / 5 - rest))
print(len(value))
# print('datapointsnum:', len(fmax))
# print(fmax)

value = np.array(value).T
dataset = np.array([fmax, fmin, fmean, fvar]).T  # one row per window
print(dataset.shape)

dataset, value = ml.shuffleData(dataset, value)
Xtr, Xva, Ytr, Yva = ml.splitData(dataset, value, 0.75)

learner = svm.SVC(decision_function_shape='ovo')  # one-vs-one multiclass SVM
learner.fit(Xtr, Ytr)
Yhat = learner.predict(Xva)

n_wrong = 0  # count validation mispredictions ("sum" would shadow the builtin)
for a in range(len(Yhat)):
    n_wrong += (Yhat[a] != Yva[a])
print(n_wrong)
print(n_wrong / len(Yhat) * 100, "%")
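# -- Equivalent one-liner (a usage note, not in the original): the error rate
# falls out of a single vectorized comparison.
print(np.mean(Yhat != Yva) * 100, "%")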
# (continuation: data, test, and the feature-extraction helper func() are
# defined earlier in this file; numpy as np, mltools as ml, and sklearn's svm
# are assumed imported above)
Yte = test[:, 1]
Z = data[:, 2]
Zte = test[:, 2]
# print("Xte", Xte.shape)

datatr, valuetr = func(X, Y, Z)        # features/labels for the training signal
# print(np.mean(datatr[0:49, 3]))
# print(np.mean(datatr[49:98, 3]))
datate, valuete = func(Xte, Yte, Zte)  # and for the test signal
print("te", valuete)

# datatr = datatr[:, 1:3]
# datate = datate[:, 1:3]
print(datatr.shape, " ", len(valuetr))
# print("valuetr", valuetr.shape)

datatr, valuetr = ml.shuffleData(datatr, valuetr)
# Xva, Xtr, Yva, Ytr = ml.splitData(datatr, valuetr, 0.0625)
# Xtr, Xva, Ytr, Yva = ml.splitData(datatr, valuetr, 0.8)
# Xtr, Ytr = ml.shuffleData(datatr, valuetr)

learner = svm.SVC(decision_function_shape='ovo')
learner.fit(datatr, valuetr)
Yhat = learner.predict(datate)
print("yhat", Yhat)

n_wrong = 0  # count how many test windows are not predicted as class 4
for a in range(len(Yhat)):
    n_wrong += (Yhat[a] != 4)
    # print("Yhat", Yhat[a], "v", valuete[a])
print(n_wrong)
# (entropy_y and ent_4_1 are computed earlier in this file; math, numpy as np,
# and mltools as ml are assumed imported above)
ent_4_0 = (2.0/3) * math.log(3.0/2, 2) + (1.0/3) * math.log(3.0, 2)
information_gain_4 = ((7.0/10) * (entropy_y - ent_4_1) +
                      (3.0/10) * (entropy_y - ent_4_0))
print('Information gain for feature 4: %0.4f' % information_gain_4)

# x5 information gain
ent_5_1 = (1.0/3) * math.log(3.0, 2) + (2.0/3) * math.log(3.0/2, 2)
ent_5_0 = (3.0/7) * math.log(7.0/3, 2) + (4.0/7) * math.log(7.0/4, 2)
information_gain_5 = ((3.0/10) * (entropy_y - ent_5_1) +
                      (7.0/10) * (entropy_y - ent_5_0))
print('Information gain for feature 5: %0.4f' % information_gain_5)

# question 2.1
xt = np.genfromtxt('data/X_train.txt', delimiter=None)
yt = np.genfromtxt('data/Y_train.txt', delimiter=None)
xt, yt = ml.shuffleData(xt, yt)
for i in range(xt.shape[1]):  # per-feature statistics
    print('minimum of x%d: %0.4f' % (i, min(xt[:, i])))
    print('maximum of x%d: %0.4f' % (i, max(xt[:, i])))
    print('mean of x%d: %0.4f' % (i, np.mean(xt[:, i])))
    print('variance of x%d: %0.4f' % (i, np.var(xt[:, i])))
    print()

# question 2.2
xt_0_10000 = xt[0:10000]
yt_0_10000 = yt[0:10000]
xv_10000_20000 = xt[10000:20000]
yv_10000_20000 = yt[10000:20000]
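# -- Generic helper (an illustrative sketch, not part of the original): since
# the per-branch weights sum to 1, the hand-computed gains above are exactly
# IG = H(y) - sum_v p(x=v) * H(y | x=v); a small function makes the same
# computation reusable for any feature.
def information_gain(entropy_y, p_values, cond_entropies):
    """IG = H(y) - sum over values v of p(x=v) * H(y | x=v)."""
    return entropy_y - sum(p * h for p, h in zip(p_values, cond_entropies))

# e.g. feature 4: p(x4=1) = 7/10 with H(y|x4=1) = ent_4_1, and
#      p(x4=0) = 3/10 with H(y|x4=0) = ent_4_0:
# information_gain(entropy_y, [7.0/10, 3.0/10], [ent_4_1, ent_4_0])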