def MLP_test():
    """Fit a scikit-learn MLP on the preprocessed dataset and print
    validation-set diagnostics: class probabilities, cross entropy,
    'mse' empirical risk, and accuracy."""
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)
    test, validate, train = partition_data(X, y)
    print('fitting model... ')
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=(1000, 2000, 1000, 100, 50),
                          verbose=False)
    model.fit(train[0], train[1])
    probs = model.predict_proba(validate[0])
    # Peek at the first few predictions next to the true labels.
    print(probs[:5])
    print(validate[1][:5])
    from cross_entropy import cross_entropy
    print(probs.shape, validate[1].shape)
    print('cross entropy:', cross_entropy(validate[1], probs))
    from risk import empirical_risk
    print('mse:', empirical_risk('mse', probs, validate[1]))
    from sklearn.metrics import accuracy_score
    print('accuracy', accuracy_score(validate[1], model.predict(validate[0])))
def cross_validate():
    """Compare several FFNN hidden-layer architectures by cross-validation,
    then retrain the best one on an 80/20 split and evaluate it.

    BUG FIX: the single-layer entries were written ``(100)`` and ``(1000)``,
    which are plain ints, not 1-tuples -- inconsistent with every other
    entry, which is a tuple of layer sizes.  Trailing commas added.
    """
    net_hidden_layers = [
        (100,),    # was (100): an int, not a 1-tuple of layer sizes
        (1000,),   # was (1000): same problem
        (100, 100),
        (1000, 1000),
        (1000, 100),
        (1000, 100, 100),
        (1000, 1000, 100),
        (1000, 1000, 100, 100),
        (1000, 2000, 100, 500, 100),
        (2000, 1000, 500, 100, 50),
    ]
    models = [FFNN(h) for h in net_hidden_layers]
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)
    from cross_validation import cross_validation
    r = cross_validation(X, y, models)
    print(r)
    # Lowest empirical risk wins.
    i = np.argmin(r)
    print('best model...', net_hidden_layers[i])
    model = FFNN(net_hidden_layers[i])
    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])
    train = partitioned_data[1]  # 80% partition
    valid = partitioned_data[0]  # 20% partition
    model.fit(train[0], train[1])
    p = model.predict(valid[0])
    from evaluate import evaluate
    print(valid[1].shape, p.shape)
    evaluate(valid[1], p)
def cross_validation(X, y, models):
    """Score each model on its own held-out fold.

    The data are split into ``len(models)`` equal folds; model ``i`` is
    trained on every fold except fold ``i`` and scored with the 'mse'
    empirical risk on fold ``i``.  Returns a list of scores, one per model.

    NOTE(review): unlike classic k-fold CV, each model is evaluated on a
    single (different) fold rather than averaged over all folds --
    presumably intentional as a cheap architecture comparison; confirm.
    """
    k = len(models)
    partitioned_data = preprocess.partition_data(
        X, y, partitions=[1 / k for _ in range(k)])
    r = []
    for i, model in enumerate(models):
        valid = partitioned_data[i]
        try:
            # Build the training set with one concatenation instead of the
            # original repeated np.append (which re-copied the accumulated
            # arrays on every iteration).  Kept inside the try so any
            # failure still falls through to the sentinel score below.
            train_X = np.concatenate(
                [partitioned_data[j][0] for j in range(k) if j != i], axis=0)
            train_y = np.concatenate(
                [partitioned_data[j][1] for j in range(k) if j != i], axis=0)
            model.fit(train_X, train_y)
            p = model.predict(valid[0])
            r.append(risk.empirical_risk('mse', p, valid[1]))
        except Exception as e:
            # Deliberate best-effort: a model that fails to fit gets a huge
            # risk so np.argmin never selects it.
            print(e)
            r.append(100000)  # big number because it didn't work
    return r
# NOTE(review): the next three statements reference `valid` and `p`, which are
# not defined at module scope in this chunk -- this looks like the orphaned
# tail of a function defined earlier in the file; confirm before running.
from evaluate import evaluate
print(valid[1].shape, p.shape)
evaluate(valid[1], p)

if __name__ == '__main__':
    # Script entry point: preprocess the data, split 20/80, train a large
    # FFNN, then report its fit on both the training and held-out splits.
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)
    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])
    train = partitioned_data[1]  # 80% partition, used for fitting
    valid = partitioned_data[0]  # 20% partition, held out
    #model = FFNN((1000, 100, 100), num_iterations=500)
    model = FFNN((1000, 10000, 1000, 100, 50), num_iterations=500)
    model.fit(train[0], train[1])
    from evaluate import evaluate
    # Evaluate on train first (fit quality), then on the held-out split.
    evaluate(train[1], model.predict(train[0]))
    evaluate(valid[1], model.predict(valid[0]))
    # NOTE(review): imported but not used anywhere in this chunk.
    from sklearn.metrics import roc_curve
print('processing data...')
# Hand-picked column indices: 0-25, 87-97, 161, 163, 219-227, and 279.
# list(range(...)) replaces the original [i for i in range(...)]
# comprehensions -- identical values, idiomatic form.
usecols = (list(range(0, 26)) + list(range(87, 98)) + [161, 163]
           + list(range(219, 228)) + [279])
X, y = preprocess.process_data(usecols=usecols, collapse=True, normalize=True,
                               encode=False, predict_missing=True, k_predict=3)
print('performing cross-validation...')
# Try k = 1..9 neighbours; keep whichever minimises the CV risk.
models = [kNN(i) for i in range(1, 10)]
r = cross_validation(X, y, models)
print(r)
k = np.argmin(r) + 1  # models[j] corresponds to kNN(j + 1)
print('evaluated best model (k:', k, ')...')
partitioned_data = preprocess.partition_data(X, y, partitions=[0.2, 0.8])
train = partitioned_data[1]  # 80% partition
valid = partitioned_data[0]  # 20% partition
model = kNN(k)
model.fit(train[0], train[1])
p = model.predict(valid[0])
from evaluate import evaluate
print('evaluating valid, train for presence')
evaluate(valid[1], p)
evaluate(train[1], model.predict(train[0]))
# now, we'll have a kNN for if it's arrhythmia or not. Here's an idea: have a *different* predictor exclusively for
# classes!
A = sigmoid(np.dot(w.T, X) + b) for i in range(A.shape[0]): Y_prediction[0,i] = 1 if A[0,i] > 0.5 else 0 assert(Y_prediction.shape == (m, 1)) return Y_prediction def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5): print(X_train.shape) w, b = initialize_with_zeros(X_train.shape[1]) # gradient descent w, b, dw, db, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate) Y_prediction_test = predict(w, b, X_test) Y_prediction_train = predict(w, b, X_train) print('train: {} %'.format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100)) print('train: {} %'.format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100)) if __name__ == "__main__": import preprocess data = preprocess.process_data() [test, train] = preprocess.partition_data(data, [0.2,0.8]) model(train[:,0:-1],train[:,-1],test[:,0:-1],test[:,-1], 2000, 0.005)