예제 #1
0
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Load the training features/labels and the unlabeled test features.
X = np.genfromtxt('data/X_train.txt')
Y = np.genfromtxt('data/Y_train.txt')
Xte = np.genfromtxt('data/X_test.txt')

# Shuffle the training data, then standardize. The scaler is fit on all of X
# (before the split below) and the same transform is applied to Xte.
X, Y = ml.shuffleData(X, Y)
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
Xte = scaler.transform(Xte)

# The first 120000 rows are used for fitting; rows from 130000 onward are
# held out.
# NOTE(review): rows 120000-129999 belong to neither slice -- confirm that
# this gap is intentional.
Xtr = X[0:120000]
Ytr = Y[0:120000]
Xtest = X[130000:]
Ytest = Y[130000:]

# Values passed as MLPClassifier's `alpha` argument, one model per value.
alpha = [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.2]
test_err = []
train_err = []

for i, k in enumerate(alpha):
    # MLP with a single hidden layer of 80 tanh units and alpha = k.
    gbm0 = MLPClassifier(activation='tanh', alpha=k, hidden_layer_sizes=(80, ))
    gbm0.fit(Xtr, Ytr)
    # ROC AUC on the training slice, scored from probability column 1.
    Yhat = gbm0.predict_proba(Xtr)
    temp = roc_auc_score(Ytr, Yhat[:, 1])
예제 #2
0
'''
import mltools as ml
# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
# export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:
# import sys
# sys.path.append('/path/to/parent/dir/');
# X,Y = ml.shuffleData(X,Y); # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
# Xtr,Xte,Ytr,Yte = ml.splitData(X,Y, 0.75); # split data into 75/25 train/test

# (a)
# Keep only the first two feature columns of X, shuffle, and split 75/25
# into training and test sets.
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)
# Visualize the classification boundary for K = 1, 5, 10, 50.

for K in [1, 5, 10, 50]:
    # Build a kNN classifier from the training split with this K and draw the
    # 2D classification plot together with the training points.
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# (b) Prediction/ error for training set and test set
# Candidate neighbor counts. The error arrays are sized from K itself so that
# editing the list cannot leave them out of step with the values iterated.
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(len(K))
errTest = np.zeros(len(K))
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import mltools.logistic2 as lc2

# Load the Iris data and build two binary sub-problems from it.
iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, params = ml.rescale(X)  # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

# (a) Scatter plots of the two subsets to show how separable each one is
# NOTE(review): the title strings below contain typos ('Seperable',
# 'Non-seprabale'); they are runtime text and are left unchanged here.
plt.title('Linearly Seperable Data')
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.show()

plt.title('Linearly Non-seprabale Data')
plt.scatter(XB[:, 0], XB[:, 1], c=YB)
plt.show()

# (b) Plot a decision boundary together with the class data points, using the modified plotBoundary()
learner = lc2.logisticClassify2()  # Initialize the logistic classifier
learner.classes = np.unique(YA)  # Use the unique label values as the classes
wts = np.zeros(shape=(1, 3))
wts[0, :] = [0.5, 1, -0.25]  # Fixed, hand-assigned weights (nothing is trained here)
learner.theta = wts
learner.plotBoundary(XA, YA)  # Plot the decision boundary over the XA/YA data

# Repeat the steps above for the XB/YB subset of the data
learner = lc2.logisticClassify2()
예제 #4
0
# Drop the last n entries from each per-class array.
# NOTE(review): if n is 0, `[:-n]` yields an empty slice rather than the full
# array -- confirm n is always positive.
TThree_va_extr = TThree_va[:-n]
TFour_tr_extr = TFour_tr[:-n]
TFour_va_extr = TFour_va[:-n]
#TFive_tr_extr = TFive_tr[:-n]
#TFive_va_extr = TFive_va[:-n]

# Stack the four per-class training/validation arrays along axis 0; the A*
# arrays become the inputs and the T* arrays the targets.
X_train = np.concatenate(
    [AOne_tr_extr, ATwo_tr_extr, AThree_tr_extr, AFour_tr_extr], axis=0)
Y_train = np.concatenate(
    [TOne_tr_extr, TTwo_tr_extr, TThree_tr_extr, TFour_tr_extr], axis=0)
X_val = np.concatenate(
    [AOne_va_extr, ATwo_va_extr, AThree_va_extr, AFour_va_extr], axis=0)
Y_val = np.concatenate(
    [TOne_va_extr, TTwo_va_extr, TThree_va_extr, TFour_va_extr], axis=0)

# Shuffle each set, keeping inputs and targets paired.
X_train, Y_train = ml.shuffleData(X_train, Y_train)
X_val, Y_val = ml.shuffleData(X_val, Y_val)

# Train on the features and perform hold-out validation

clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(X_train, Y_train)

# Predict the held-out set and report the error computed by calc_error
# (defined outside this view).
Y_hat_val = clf.predict(X_val)
error_rate = calc_error(Y_val, Y_hat_val)
print("Hold out validation error rate:")
print(error_rate)

# Save the validation targets and predictions for later inspection.
np.savetxt('y_val.csv', Y_val, delimiter=',')
np.savetxt('y_hat_val.csv', Y_hat_val, delimiter=',')
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
import random
import mltools.logistic2 as lc2
reload(lc2)

# Load the training features and labels and create an untrained classifier.
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
learner = lc2.logisticClassify2()

# Use only the first 14 feature columns and split with fraction 0.8
# (Xt/Yt are the part used for fitting below, Xv/Yv the remainder).
Xt, Xv, Yt, Yv = ml.splitData(X[:, 0:14], Y, 0.8)
Xt, Yt = ml.shuffleData(Xt, Yt)
# NOTE(review): the rescaling parameters are discarded here, so Xv cannot be
# rescaled with the same transform -- confirm before evaluating on Xv.
Xt, _ = ml.transforms.rescale(Xt)
learner.classes = np.unique(Yt)
# Initial weights: three fixed values followed by twelve independent draws,
# each computed as (random.random() - 0.5) * 2, for 15 entries in total.
wts = [0.5, 1, -0.25] + [(random.random() - 0.5) * 2 for _ in range(12)]
#wts = np.append(wts,[((random.random()-0.5)*2)])
#wts = [0.5 ,1]
# Start training from the hand-built initial weights.
learner.theta = wts
# NOTE(review): the positional arguments 0.01, 1e-5 and 10000 are presumably
# the step size, stopping tolerance and iteration limit -- confirm against
# lc2.train's signature.
lc2.train(learner, Xt, Yt, 0.01, 1e-5, 10000, plot=1, reg=0)
plt.show()
# Parenthesized single-argument form prints the same text under the Python 2
# print statement and the Python 3 print function.
print(learner.err(Xt, Yt))
Xte = np.genfromtxt("data/X_test.txt", delimiter=None)
예제 #6
0
## Problem 1 ##

## part a ##

import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

# Load the Iris data, keep the first two features, shuffle and rescale.
iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, _ = ml.rescale(X)  # works much better on rescaled data

XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

# Per-class subsets used for the scatter plots below.
X0, Y0 = X[Y == 0, :], Y[Y == 0]  # class 0
X1, Y1 = X[Y == 1, :], Y[Y == 1]  # class 1
X2, Y2 = X[Y == 2, :], Y[Y == 2]  # class 2

# Class 0 (blue) against class 1 (red).
# NOTE(review): plt.close() follows with no plt.show() or savefig in view, so
# these lines do not display the figure -- confirm this is intended.
plt.scatter(X0[:, 0], X0[:, 1], c='Blue')
plt.scatter(X1[:, 0], X1[:, 1], c="Red")
plt.close()

# Class 1 (blue) against class 2 (red); closed the same way.
plt.scatter(X1[:, 0], X1[:, 1], c='Blue')
plt.scatter(X2[:, 0], X2[:, 1], c="Red")
plt.close()

## part b ##

from logisticClassify2 import *
예제 #7
0
# Note: indexing with ":" indicates all values (in this case, all rows);
# indexing with a value ("0", "1", "-1", etc.) extracts only that one value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.

import mltools as ml

# We'll use some data manipulation routines in the provided class code
# Make sure the "mltools" directory is in a directory on your Python path, e.g.,
# export PYTHONPATH=${PYTHONPATH}:/path/to/parent/dir
# or add it to your path inside Python:

# import sys

# sys.path.append('/path/to/parent/dir/');

X,Y = ml.shuffleData(X,Y); # shuffle data randomly

# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)

Xtr,Xva,Ytr,Yva = ml.splitData(X,Y, 0.75); # split data into 75/25 train/validation

for K in [1, 5, 10, 50]: ## visualize classification boundary
    knn = ml.knn.knnClassify() # create the object and train it
    knn.train(Xtr, Ytr, K) # where K is an integer, e.g. 1 for nearest neighbor prediction
    # NOTE(review): YvaHat is not used anywhere in the visible code.
    YvaHat = knn.predict(Xva) # get estimates of y for each data point in Xva

    ml.plotClassify2D( knn, Xtr, Ytr, axis=plt ) # make 2D classification plot with data (Xtr,Ytr)
    # The figure is closed right after plotting; no plt.show() appears in view.
    plt.close()

## b ##
예제 #8
0
#!/usr/bin/env python

"""2016W-CS178: Homework 3, Problem1"""


import numpy as np
import matplotlib.pyplot as plt
import mltools as ml


# Load the Iris data: first two columns are the features, last is the target.
iris = np.genfromtxt("data/iris.txt")
features = iris[:, 0:2]
targets = iris[:, -1]
# Shuffle features and targets together, then rescale the features (the
# rescaling parameters are discarded).
features, targets = ml.shuffleData(features, targets)
features, _ = ml.transforms.rescale(features)
# sub1: class 0 and class 1
features_sub1 = features[targets < 2, :]
targets_sub1 = targets[targets < 2]
# sub2: class 1 and class 2
features_sub2 = features[targets > 0, :]
targets_sub2 = targets[targets > 0]

# One classifier per subset, constructed directly from that subset's data.
# NOTE(review): plot=1 / plot=2 are presumably figure numbers used while
# fitting -- confirm against logisticClassify2.
learner = ml.logistic2.logisticClassify2(features_sub1, targets_sub1, plot=1)
learner2 = ml.logistic2.logisticClassify2(features_sub2, targets_sub2, plot=2)
# Figure 3 shows both boundaries side by side: sub1 left, sub2 right.
plt.figure(3, figsize=(15, 7))
plt.subplot(121)
learner.plotBoundary(features_sub1, targets_sub1)
plt.legend()
plt.subplot(122)
learner2.plotBoundary(features_sub2, targets_sub2)
plt.legend()
예제 #9
0
import mltools as ml

##### PROBLEM 2 #####
iris = np.genfromtxt("data/iris.txt", delimiter=None)

# Note: indexing with ":" indicates all values (in this case, all rows)

# indexing with a value ("0", "1", "-1", etc.) extracts only that one value (here, columns);
# indexing rows/columns with a range ("1:-1") extracts any row/column in that range.
Y = iris[:, -1]  # last column (index -1) is the target
X = iris[:, 0:2]  # first two columns are the features

# Dump the raw targets and features for inspection.
print(Y)
print(X)

X, Y = ml.shuffleData(X, Y)  # Shuffles the ordered Iris data

# Xtr/Ytr receive 75% of the shuffled rows, Xva/Yva the remaining 25%
# (train/validation split).
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
# split it into 75/25 train/validation

##
##knn = ml.knn.knnClassify() #create object and train it
##knn.train(Xtr, Ytr, 1)     #where K is an integer, e.g. 1 for nearest neighbor prediction
##YvaHat = knn.predict(Xva)  #get estimates of y for each data point in Xva
##
##ml.plotClassify2D( knn, Xtr, Ytr ) # make 2D classification plot with data (Xtr,Ytr)
##plt.title("K = 1")
##plt.show()
##
예제 #10
0
from tensorflow.python.client import device_lib
from sklearn.model_selection import GridSearchCV

# List the devices TensorFlow can see.
print(device_lib.list_local_devices())

# Seed numpy's RNG before any shuffling below.
np.random.seed(0)

# Data Loading
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)

# The test data
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

# Train/validation split using splitData's default fraction, then shuffle the
# training part.
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xtr, Ytr = ml.shuffleData(Xtr, Ytr)

# Take a 10000-row subsample of the training data so that training is faster.
Xt, Yt = Xtr[:10000], Ytr[:10000]

# Rescale the subsample and reuse its parameters for the validation and test
# features so all three share one transform.
XtS, params = ml.rescale(Xt)
XvS, _ = ml.rescale(Xva, params)
XteS, _ = ml.rescale(Xte, params)

# Settled on some initial settings such as epochs=700, batch_size=1000,
# loss = 'binary_crossentropy', optimizer = 'adam', metrics = 'accuracy'
# and activation = 'relu' via references found online and trial and error.

# Result accumulator and the candidate values to try; the code that uses them
# lies outside this view.
scores = []
num_hidden_layers = [1, 5, 10, 30, 50, 100]
#!/usr/bin/env python
"""2016W-CS178: Homework 1, Problem2"""

import numpy
import matplotlib.pyplot as plt
import mltools

# Load the Iris data: last column is the target, first two are the features.
iris = numpy.genfromtxt("data/iris.txt")
Y = iris[:, -1]
X = iris[:, 0:2]  # feature 1 & 2
X, Y = mltools.shuffleData(X, Y)
trainX, testX, trainY, testY = mltools.splitData(X, Y, 0.75)

# problem 2(a)
# One 12x9 figure holding a 2x2 grid, one panel per value of k.
plt.figure(1, (12, 9))

for i, k in enumerate([1, 5, 10, 50]):
    learner = mltools.knn.knnClassify()
    learner.train(trainX, trainY, k)
    plt.subplot(2, 2, i + 1)  # subplot indices are 1-based
    mltools.plot_classify_2d(learner, trainX, trainY)
    plt.grid(1)
    plt.xlabel('feature 1')
    plt.ylabel('feature 2')
    plt.title('Iris KNN: Feature 1 & 2, K = %d' % k)

# Display the figure, then release it.
plt.show()
plt.close(1)

# problem 2(b)
K = [1, 2, 5, 10, 50, 100, 200]
예제 #12
0
def _knn_error_rates(Xtr, Ytr, Xva, Yva, K):
    """Fit one kNN classifier per value in K and measure its error rates.

    Returns two lists aligned with K: the misclassification rate on the
    training data and on the validation data. Each pair is also printed.
    """
    trainErr = []
    validErr = []
    for k in K:
        knn = ml.knn.knnClassify()
        knn.train(Xtr, Ytr, k)
        YHat = knn.predict(Xtr)
        trainErr.append(np.sum(YHat != Ytr) * 1.0 / len(YHat))
        YHat = knn.predict(Xva)
        validErr.append(np.sum(YHat != Yva) * 1.0 / len(YHat))
        # One pre-formatted argument, so the line prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("K =  %s : Error rate on training data =  %s , on validation data =  %s"
              % (k, trainErr[-1], validErr[-1]))
    return trainErr, validErr


def _plot_error_curves(K, trainErr, validErr):
    """Plot training and validation error against K on a log-scaled x axis."""
    plt.semilogx(K, trainErr, color="r", label="Error on Training Data")
    plt.semilogx(K, validErr, color="g", label="Error on Validation Data")
    # Without a legend the labels above are never shown.
    plt.legend()
    plt.show()


def main():
    """Run the Iris exercise: feature statistics, then kNN error curves.

    Prints the shape of the feature matrix and each feature's mean and
    standard deviation, then plots training/validation error against K twice:
    once using only the first two features and once using all features.
    """
    iris = np.genfromtxt("data/iris.txt", delimiter=None)
    Y = iris[:,-1]
    X = iris[:, 0:-1]
    print(X.shape)
    # Part 2
    # for f in X.T:
    #     plt.hist(f)
    #     plt.show()
    # Part 3
    for f in X.T:
        print("Mean:  %s" % np.mean(f))
        print("Standard deviation:  %s" % np.std(f))
    # Part 4
    # pairs = [[0, 1, 4], [0, 2, 4], [0, 3, 4]]
    # colors = ['r*', 'g*', 'b*']
    # for p in pairs:
    #     for feature in iris[:, p]:
    #         plt.plot(feature[0], feature[1], colors[int(feature[2])])
    # plt.show()
    # Question 2
    # Part 1)
    # XX = X[:, [0, 1]]
    # np.random.seed(1)
    # XX, Y = ml.shuffleData(XX, Y)
    # np.random.seed(1)
    # XXtr, XXva, Ytr, Yva = ml.splitData(XX, Y, 0.75)
    # K = [1, 5, 10, 50];
    # for k in K:
    #     knn = ml.knn.knnClassify()
    #     knn.train(XXtr, Ytr, k)
    #     ml.plotClassify2D(knn, XXtr, Ytr, axis=plt)
    #     plt.title("K = ", k)
    #     plt.show()
    # Part 2
    # The RNG is re-seeded before the shuffle and again before the split.
    np.random.seed(1)
    X, Y = ml.shuffleData(X, Y)
    np.random.seed(1)
    Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)
    XXtr = Xtr[:, [0,1]]
    XXva = Xva[:, [0,1]]
    K = [1, 2, 5, 10, 50, 100, 200]

    # First pass: only the first two features.
    trainErr, validErr = _knn_error_rates(XXtr, Ytr, XXva, Yva, K)
    _plot_error_curves(K, trainErr, validErr)

    # Second pass: all features.
    trainErr, validErr = _knn_error_rates(Xtr, Ytr, Xva, Yva, K)
    _plot_error_curves(K, trainErr, validErr)
    print("OK, I'm done.")
예제 #13
0
    temp = d[i:i+windowsize]
    # print(temp.shape)
    fmax.append(np.max(temp))
    fmin.append(np.min(temp))
    fmean.append(np.mean(temp))
    fvar.append(np.var(temp))
    i += stepsize

# Label vector: classes 1-4 each repeated int(valuestep/5) times, and class 5
# repeated int(valuestep/5-rest) times.
value = [1]*int(valuestep/5)+[2]*int(valuestep/5)+[3]*int(valuestep/5)+[4]*int(valuestep/5)+[5]*int(valuestep/5-rest)
print(len(value))
# print('datapointsnum:', len(fmax))
# print(fmax)
value = np.array(value).T
# One row per window, with columns max, min, mean and variance.
dataset = np.array([fmax,fmin,fmean,fvar]).T
print(dataset.shape)
# Shuffle rows and labels together, then split with fraction 0.75.
dataset, value = ml.shuffleData(dataset, value)
Xtr, Xva, Ytr, Yva = ml.splitData(dataset, value, 0.75);

# Fit an SVM (one-vs-one decision function shape) and predict the held-out part.
learner = svm.SVC(decision_function_shape='ovo')
learner.fit(Xtr,Ytr)
Yhat = learner.predict(Xva)
# Count held-out misclassifications with one vectorized comparison. The result
# is stored under its own name so the builtin `sum` is no longer shadowed.
num_errors = np.count_nonzero(Yhat != Yva)

print(num_errors)
# Error rate as a percentage of the held-out predictions.
print(num_errors/len(Yhat)*100,"%")



예제 #14
0
# Column 1 of the test array, and column 2 of both the data and test arrays.
Yte = test[:,1]
Z = data[:,2]
Zte = test[:,2]

# print("Xte", Xte.shape)
# func (defined outside this view) turns the three columns into a feature
# matrix and a label vector.
datatr, valuetr = func(X,Y,Z)
# print(np.mean(datatr[0:49,3]))
# print(np.mean(datatr[49:98,3]))

datate, valuete = func(Xte,Yte,Zte)
print("te",valuete)
# datatr = datatr[:,1:3];
# datate = datate[:,1:3];
print(datatr.shape, "  ", len(valuetr))
# print("valuetr", valuetr.shape)
datatr, valuetr = ml.shuffleData(datatr, valuetr)

# Xva, Xtr, Yva, Ytr = ml.splitData(datatr, valuetr, 0.0625)
# Xtr, Xva, Ytr, Yva = ml.splitData(datatr, valuetr, 0.8)
# Xtr, Ytr = ml.shuffleData(datatr,valuetr)
# Fit on the whole shuffled training set and predict the test features.
learner = svm.SVC(decision_function_shape='ovo')
learner.fit(datatr,valuetr)
Yhat = learner.predict(datate)

print("yhat",Yhat)
# Count predictions that differ from the constant 4.
# NOTE(review): this compares against a hard-coded class rather than valuete;
# confirm every test sample is expected to be class 4. The name `sum` also
# shadows the builtin for the rest of the script.
sum=0
for a in range(len(Yhat)):
    sum += (Yhat[a]!= 4)
    # print("Yhat", Yhat[a], "v", valuete[a])

print(sum)
예제 #15
0
# Entropy (base 2) of a (2/3, 1/3) split, used for the feature-4 == 0 branch.
ent_4_0 = (2.0/3)*math.log(3.0/2,2) + (1.0/3)*math.log(3.0,2)
# Weighted entropy reduction over the two branches (weights 7/10 and 3/10).
information_gain_4 = (7.0/10)*(entropy_y - ent_4_1) + (3.0/10)*(entropy_y - ent_4_0)
print('Information gain for feature 4:, %0.4f' %(information_gain_4))

# x5 information gain
# Branch entropies: (1/3, 2/3) for x5 == 1 and (3/7, 4/7) for x5 == 0,
# weighted by 3/10 and 7/10 respectively.
ent_5_1 = (1.0/3)*math.log(3.0,2) + (2.0/3)*math.log(3.0/2,2)
ent_5_0 = (3.0/7)*math.log(7.0/3,2) + (4.0/7)*math.log(7.0/4,2)
information_gain_5 = (3.0/10)*(entropy_y - ent_5_1) + (7.0/10)*(entropy_y - ent_5_0)
print('Information gain for feature 5:, %0.4f' %(information_gain_5))


#question 2.1

xt = np.genfromtxt('data/X_train.txt', delimiter=None)
yt = np.genfromtxt('data/Y_train.txt', delimiter=None)
xt,yt = ml.shuffleData(xt,yt)
# Report min, max, mean and variance for every feature column. Each statistic
# is computed on column i (not always column 0), and the variance line is
# labelled as variance.
for i in range(xt.shape[1]):
    print('minimum of x%d:, %0.4f' %(i,np.min(xt[:,i])))
    print('maximum of x%d:, %0.4f' %(i,np.max(xt[:,i])))
    print('mean of x%d:, %0.4f' %(i,np.mean(xt[:,i])))
    print('variance of x%d:, %0.4f' %(i,np.var(xt[:,i])))
    print()

#question 2.2

# Rows 0-9999 of the shuffled data.
# NOTE(review): presumably the training subset -- confirm against later use.
xt_0_10000 = xt[0:10000]
yt_0_10000 = yt[0:10000]

# Rows 10000-19999 of the shuffled data.
# NOTE(review): presumably the validation subset -- confirm against later use.
xv_10000_20000 = xt[10000:20000]
yv_10000_20000 = yt[10000:20000]