def __init__(self, model_name, w=None, learning_param=None, debug=True):
    # Set weights
    self.w = w
    # Set debug object
    if debug:
        self.dbg = debugger.Debugger(['loss', 'w'])
    else:
        self.dbg = None
    # Depending on the chosen model, choose the appropriate output,
    # loss, prediction, and learning functions.
    if model_name == 'logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_ce
        self.predict_output = misc.map_prediction
        max_iters = learning_param['max_iters']
        gamma = learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.logistic_regression(
            y, x, w, max_iters, gamma, dbg)
    elif model_name == 'reg_logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_reg_ce
        self.predict_output = misc.map_prediction
        max_iters = learning_param['max_iters']
        gamma = learning_param['gamma']
        lambda_ = learning_param['lambda_']
        self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(
            y, x, lambda_, w, max_iters, gamma, dbg)
    elif model_name == 'least_squares_GD':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        max_iters = learning_param['max_iters']
        gamma = learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.least_squares_GD(
            y, x, w, max_iters, gamma, dbg)
    elif model_name == 'ridge_regression':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        lambda_ = learning_param['lambda_']
        self.learn = lambda y, x, w, dbg: impl.ridge_regression(y, x, lambda_)
    elif model_name == 'least_squares':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)
    else:
        raise ValueError('Unknown model name: {}'.format(model_name))
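# Usage sketch (illustrative, not from the original code): the class name `Model`
# is hypothetical; what is grounded is the learning_param dict contract that
# __init__ reads ('max_iters', 'gamma', and, for the regularized variants, 'lambda_').
learning_param = {'max_iters': 1000, 'gamma': 0.01, 'lambda_': 1e-6}
model = Model('reg_logistic_regression', w=None, learning_param=learning_param)
# model.learn(y, x, model.w, model.dbg) then dispatches to impl.reg_logistic_regression.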
def cross_validation(y, x, degree, k, k_indices, method, error,
                     feature_augmentation, hyperparams):
    """Run one fold of k-fold cross validation for the chosen method."""
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import ridge_regression, least_squares, least_squares_GD, \
        least_squares_SGD, logistic_regression, reg_logistic_regression

    # get k'th subgroup as test set, the others as training set
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]

    x_tr, y_tr, median = feature_processing(x_tr, y_tr, 'mean', replace_feature=True,
                                            suppr_outliers=hyperparams[-1], threshold=3,
                                            ref_median=[])
    x_te, y_te, _ = feature_processing(x_te, y_te, 'mean', replace_feature=True,
                                       suppr_outliers=False, threshold=3,
                                       ref_median=median)

    tx_tr_aug = []
    tx_te_aug = []
    if feature_augmentation:
        tx_tr_aug, index = feat_augmentation(x_tr, 0.003)
        tx_te_aug, _ = feat_augmentation(x_te, 0.003, False, index)

    # form data with polynomial degree
    tx_tr = build_poly(x_tr, degree, feature_augmentation, tx_tr_aug)
    tx_te = build_poly(x_te, degree, feature_augmentation, tx_te_aug)

    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    if method == 'rr':       # ridge regression
        w, _ = ridge_regression(y_tr, tx_tr, hyperparams[0])
    elif method == 'ls':     # least squares
        w, _ = least_squares(y_tr, tx_tr)
    elif method == 'lsGD':   # gradient descent
        w, _ = least_squares_GD(y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
    elif method == 'lsSGD':  # stochastic gradient descent
        w, _ = least_squares_SGD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                 hyperparams[2], hyperparams[3])
    elif method == 'log':    # logistic regression
        w, _ = logistic_regression(y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
    elif method == 'rlog':   # regularised logistic regression
        w, _ = reg_logistic_regression(y_tr, tx_tr, hyperparams[3],
                                       np.zeros(tx_tr.shape[1]), hyperparams[1], hyperparams[2])
    else:
        raise NotImplementedError

    # calculate the loss for train and test data
    if method == 'log':
        loss_tr = cal_loglike(y_tr, tx_tr, w)
        loss_te = cal_loglike(y_te, tx_te, w)
    elif method == 'rlog':
        loss_tr = cal_loglike_r(y_tr, tx_tr, w, hyperparams[3])
        loss_te = cal_loglike_r(y_te, tx_te, w, hyperparams[3])
    else:
        loss_tr = compute_loss(y_tr, tx_tr, w, error)
        loss_te = compute_loss(y_te, tx_te, w, error)

    y_pred = predict_labels(np.array(w).T, tx_te)
    acc = accuracy(y_te, y_pred)
    return loss_tr, loss_te, w, acc
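# Usage sketch (hypothetical driver; assumes a build_k_indices helper that returns a
# (k, N/k) array of shuffled row indices, and illustrative hyperparams where index 0
# is lambda for 'rr' and index -1 is the suppr_outliers flag read above):
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)
losses_te = []
for k in range(k_fold):
    _, loss_te, w, acc = cross_validation(y, x, degree=7, k=k, k_indices=k_indices,
                                          method='rr', error='mse',
                                          feature_augmentation=True,
                                          hyperparams=[1e-4, False])
    losses_te.append(loss_te)
print('mean test loss:', np.mean(losses_te))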
def pipeline(tx_train, y_train, tx_val, y_val, degrees, gamma, lambda_, epochs, verbose):
    """Run model training and evaluation with the given parameters."""
    # Perform data cleaning (missing values, constant features, outliers, standardization)
    data_cleaner = DataCleaning()
    tx_train = data_cleaner.fit_transform(tx_train)
    tx_val = data_cleaner.transform(tx_val)

    # Perform feature engineering
    feature_generator = FeatureEngineering()
    x_train = feature_generator.fit_transform(tx=tx_train, degree=degrees)
    x_val = feature_generator.transform(tx=tx_val)

    # Initialize weights
    initial_w = np.zeros(x_train.shape[1])

    # Train model
    w, _ = reg_logistic_regression(y_train, x_train, lambda_, initial_w, epochs, gamma, verbose)

    # Perform inference on the validation set
    pred = predict_labels(weights=w, data=x_val, logistic=True)
    evaluator = Evaluation(y_val, pred)
    return evaluator.get_f1(), evaluator.get_accuracy()
def cross_validation(y, tx, mlfunction, split_number=5, lambda_=1e-6, gamma=0.001):
    """Perform cross validation of the given ml function on the training set,
    using `split_number` folds (5 by default).
    """
    # define empty lists to store train/test losses and accuracies
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    # get k_indices
    k_indices = build_k_indices(len(y), split_number)
    for ki in range(len(k_indices)):
        # use the k'th indices as test set and the others as training set
        test_idx = np.asarray(k_indices[ki])
        train_idx = np.delete(np.arange(len(y)), test_idx)
        train_tX = tx[train_idx]
        train_y = y[train_idx]
        test_tX = tx[test_idx]
        test_y = y[test_idx]

        if mlfunction == 'ridge_regression':
            w, loss = impl.ridge_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares':
            w, loss = impl.least_squares(train_y, train_tX)
        elif mlfunction == 'logistic_regression':
            w, loss = impl.logistic_regression(train_y, train_tX)
        elif mlfunction == 'reg_logistic_regression':
            w, loss = impl.reg_logistic_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares_sgd':
            w, loss = impl.least_squares_SGD(train_y, train_tX, gamma)
        elif mlfunction == 'least_squares_gd':
            w, loss = impl.least_squares_GD(train_y, train_tX, gamma)
        else:
            print('ERROR: ml_function not recognized. Valid options are:')
            print('least_squares, least_squares_gd, least_squares_sgd, '
                  'logistic_regression, reg_logistic_regression, ridge_regression')
            return None

        # Compute losses and accuracies for this fold
        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))
        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return (np.mean(train_loss_), np.mean(test_loss_),
            np.mean(train_accuracy_), np.mean(test_accuracy_))
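# Usage sketch (hypothetical data; assumes the project's `impl` module and the
# build_k_indices helper are importable):
tr_loss, te_loss, tr_acc, te_acc = cross_validation(y, tx, 'ridge_regression',
                                                    split_number=5, lambda_=1e-4)
print('test accuracy: {:.3f}'.format(te_acc))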
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """
    Returns the learned weights 'w' (last weight vector) for the given model.

    Parameters
    ----------
    model: string
        The model name
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        The initial weights
    max_iters: integer
        The number of steps to run
    gamma: float
        The step size
    lambda_: float
        The regularization parameter
    batch_size: integer
        The batch size

    Returns
    -------
    ndarray
        The learned weights
    """
    if model == "MSE_GD":
        w, _ = least_squares_GD(y, tx, initial_w, max_iters, gamma)
    elif model == "MSE_SGD":
        w, _ = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
    elif model == "MSE_OPT":
        w, _ = least_squares(y, tx)
    elif model == "MSE_OPT_REG":
        w, _ = ridge_regression(y, tx, lambda_)
    elif model == "LOG_GD":
        w, _ = logistic_regression(y, tx, initial_w, max_iters, gamma)
    elif model == "LOG_REG_GD":
        w, _ = reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma)
    elif model == "LOG_REG_L1":
        w, _ = reg_logistic_regression_L1(y, tx, lambda_, initial_w, max_iters, gamma)
    elif model == "MSE_GD_L1":
        w, _ = least_squares_GD_L1(y, tx, lambda_, initial_w, max_iters, gamma)
    else:
        raise UnknownModel
    return w
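# Usage sketch (illustrative hyperparameter values): dispatching to regularized
# logistic regression through the single entry point above.
initial_w = np.zeros(tx.shape[1])
w = get_model("LOG_REG_GD", y, tx, initial_w, max_iters=1000, gamma=1e-5,
              lambda_=1e-6, batch_size=1)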
def cross_validation_lrr(y, x, k_indices, k, lambda_, gamma, max_iters, w_initial):
    """Train and test a regularized logistic regression model on one cross-validation fold."""
    x_test = x[k_indices[k]]
    x_train = np.delete(x, k_indices[k], axis=0)
    y_test = y[k_indices[k]]
    y_train = np.delete(y, k_indices[k], axis=0)
    opt_w, _ = imp.reg_logistic_regression(y_train, x_train, lambda_, w_initial,
                                           max_iters, gamma)
    loss_te = imp.compute_loss_lrr(y_test, x_test, opt_w)
    return loss_te, opt_w
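# Usage sketch (hypothetical hyperparameter search): pick the lambda with the
# lowest mean test loss across the k folds, using the function above.
lambdas = [1e-6, 1e-4, 1e-2]
mean_losses = []
for lambda_ in lambdas:
    fold_losses = [cross_validation_lrr(y, x, k_indices, k, lambda_, gamma=1e-5,
                                        max_iters=1000, w_initial=np.zeros(x.shape[1]))[0]
                   for k in range(len(k_indices))]
    mean_losses.append(np.mean(fold_losses))
best_lambda = lambdas[int(np.argmin(mean_losses))]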
def test_reg_logistic_regression(y_train, tx_train, y_test, tx_test):
    """
    Tests the reg_logistic_regression method on the split data set and reports
    the percentage of correct predictions.

    Args:
        y_train: training labels after the split
        tx_train: training features after the split
        y_test: test labels after the split
        tx_test: test features after the split
    """
    print('\nTesting reg_logistic_regression...')
    w, _ = reg_logistic_regression(y_train, tx_train, 0.1,
                                   np.zeros(tx_train.shape[1]), 3000, 1e-06)
    report_prediction_accuracy_logistic(y_test, tx_test, w)
    print('... testing completed.')
def best_model_predictions(data_obj, jet, degrees):
    """
    Splits the data based on the jet value, trains the model, and gets the
    predictions on the test dataset.
    :param data_obj: DataLoader obj
    :param jet: int, the jet value
    :param degrees: int, the polynomial degree
    :return: pred: np.array with the predicted labels
             ids: np.array with the row index
    """
    print('Training for Jet {jet}'.format(jet=jet))
    # Split data based on jet value for train and val datasets
    y, tx = get_jet_data_split(data_obj.y, data_obj.tx, jet)
    ids_test, tx_test = get_jet_data_split(data_obj.ids_test, data_obj.test, jet)

    # Perform data cleaning (missing values, constant features, outliers, standardization)
    data_cleaner = DataCleaning()
    tx = data_cleaner.fit_transform(tx)
    tx_test = data_cleaner.transform(tx_test)

    # Perform feature engineering
    feature_generator = FeatureEngineering()
    tx = feature_generator.fit_transform(tx, degrees)
    tx_test = feature_generator.transform(tx_test)

    # Initialize values
    initial_w = np.zeros(tx.shape[1])
    lambda_ = 1e-06
    gamma = 1e-06
    max_iter = 1000

    # Train model
    w, loss = reg_logistic_regression(y, tx, lambda_, initial_w, max_iter, gamma, verbose=True)

    # Perform inference on the test set
    pred = predict_labels(w, tx_test, True)
    return ids_test, pred
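# Usage sketch (hypothetical submission loop over the four jet values; `data_obj`
# is the DataLoader instance named in the docstring above):
ids_all, preds_all = [], []
for jet in range(4):
    ids_test, pred = best_model_predictions(data_obj, jet, degrees=2)
    ids_all.append(ids_test)
    preds_all.append(pred)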
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters, log_gamma,
                   decreasing_gamma, log_regulator, ridge_lambda):
    """
    Find the optimal weights by training on the data set.

    Parameters
    ----------
    tX: array
        The feature matrix
    y: array
        The output
    implementation: integer
        Selects the method: 0 = least squares, 1 = ridge regression,
        2 = regularized logistic regression
    log_initial_w: array
        initial weights used to perform GD or SGD
    log_max_iters: integer
        number of iterations to perform GD or SGD
    log_gamma: float
        gamma parameter to perform GD or SGD
    decreasing_gamma: bool
        whether gamma decreases over the iterations
    log_regulator: float
        lambda to perform regularized logistic regression
    ridge_lambda: float
        lambda to perform ridge regression

    Return
    ------
    optimal_w: array
        Optimal weights.
    """
    optimal_w = None
    if implementation == 0:
        optimal_w, _ = impl.least_squares(y, tX)
    elif implementation == 1:
        optimal_w, _ = impl.ridge_regression(y, tX, ridge_lambda)
    elif implementation == 2:
        optimal_w, _ = impl.reg_logistic_regression(y, tX, log_regulator, log_initial_w,
                                                    log_max_iters, log_gamma, decreasing_gamma)
    return optimal_w
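# Usage sketch (illustrative values): implementation 2 selects regularized logistic
# regression; the GD-related arguments are ignored by the closed-form methods.
w = find_optimal_w(tX, y, implementation=2,
                   log_initial_w=np.zeros(tX.shape[1]), log_max_iters=2000,
                   log_gamma=1e-6, decreasing_gamma=True,
                   log_regulator=1e-5, ridge_lambda=1e-8)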
def cross_validation(x, y, k, mode, gamma=None, lambda_=None, max_iters=None, initial_w=None):
    """
    INPUT:
    @x : input data, dimensions (NxD)
    @y : target labels, (Nx1) array
    @k : number of folds
    @mode : string selecting the training method
    OUTPUT:
    acc, tpr, fpr, losses : per-fold accuracy, true/false positive rates, and losses
    """
    # split the data into k folds (the data could be randomly permuted first)
    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    # initialize weights and metrics
    weights = list()
    acc = list()
    tpr = list()
    fpr = list()
    losses = list()

    # loop over folds
    for fold in range(k):
        x_train = [x_split[i] for i in range(k) if i != fold]
        y_train = [y_split[i] for i in range(k) if i != fold]
        x_train = np.concatenate(x_train, axis=0)
        y_train = np.concatenate(y_train, axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        # train with the chosen method and predict on the validation fold
        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
            predictions = np.dot(x_val, update)
            pr_bool = predictions >= np.mean(predictions)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
            predictions = np.dot(x_val, update)
            pr_bool = predictions >= np.mean(predictions)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w, max_iters, gamma)
            predictions = np.dot(x_val, update)
            pr_bool = predictions >= np.mean(predictions)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w, max_iters, gamma)
            predictions = np.dot(x_val, update)
            pr_bool = predictions >= np.mean(predictions)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train, initial_w, max_iters, gamma)
            predicted_prob = H.sigmoid(np.dot(x_val, update))
            pr_bool = predicted_prob > 0.5
        elif mode == 'reg_logistic_regression':
            update, loss = imp.reg_logistic_regression(y_train, x_train, lambda_, initial_w,
                                                       max_iters, gamma)
            predicted_prob = H.sigmoid(np.dot(x_val, update))
            pr_bool = predicted_prob > 0.5

        weights.append(update)
        losses.append(loss)

        # per-fold metrics: accuracy, true positive rate, false positive rate
        y_bool = y_val == 1
        correct = pr_bool == y_bool
        tp = np.logical_and(correct, y_bool)
        fp = np.logical_and(np.logical_not(correct), pr_bool)
        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))

    return acc, tpr, fpr, losses
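# Usage sketch (hypothetical call): aggregate the per-fold metrics returned above.
acc, tpr, fpr, losses = cross_validation(x, y, k=5, mode='reg_logistic_regression',
                                         gamma=1e-6, lambda_=1e-4, max_iters=1000,
                                         initial_w=np.zeros(x.shape[1]))
print('accuracy: {:.3f} +/- {:.3f}'.format(np.mean(acc), np.std(acc)))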
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Cross validation for the chosen method.
    @param gamma: learning rate
    @return: the average accuracy over the k-fold validations, and the per-fold accuracies
    """
    N, D = tX.shape

    # Logistic regression parameters
    max_iters = 100
    batch_size = N // 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)
    N_fold = N * (k_fold - 1) // k_fold
    N_test = N / k_fold

    acc = []
    for k in range(k_fold):
        yTr = np.array([])
        xTr = np.zeros((0, D))
        for i in range(k_fold):
            if i == k:
                yTe = y[k_indices[i]]
                xTe = tX[k_indices[i]]
            else:
                yTr = np.append(yTr, y[k_indices[i]], axis=0)
                xTr = np.append(xTr, tX[k_indices[i]], axis=0)

        initial_w = np.zeros(tX.shape[1])
        if method == 'logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            w, loss = logistic_regression(yTr, xTr, initial_w, max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'reg_logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            lambda_ = 0.1
            w, loss = reg_logistic_regression(yTr, xTr, lambda_, initial_w, max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'least_squares_GD':
            w, loss = least_squares_GD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, loss = least_squares_SGD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, loss = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, loss = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise Exception('Invalid method')

        corr = [i == yTe[ind] for ind, i in enumerate(y_label)]
        acc.append(sum(corr) / N_test)

    return (sum(acc) / k_fold), acc
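# Usage sketch (hypothetical gamma sweep) built on the function above:
for gamma in [1e-6, 1e-5, 1e-4]:
    mean_acc, _ = cross_validation(y, tX, gamma, method='logistic_regression')
    print('gamma={:g}: mean accuracy {:.3f}'.format(gamma, mean_acc))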
train_data_log_svm = preprocessing_pipeline(train_data_split, degree=deg, norm_first=False)
train_set_folds = k_fold_cross_split_data(train_classes_split, train_data_log_svm, k_indices)

for j, lambda_ in enumerate(POSSIBLE_LAMBDA_LOG):
    folds_train_accuracy = []
    folds_validation_accuracy = []
    # Train a regularized logistic regression model on each fold
    for x_train, y_train, x_test, y_test in train_set_folds:
        initial_w = np.zeros((x_train.shape[1],))
        try:
            w, train_loss = reg_logistic_regression(
                y_train, x_train, lambda_, initial_w, 350, 3e-1, 1)
            folds_train_accuracy.append(
                compute_accuracy(predict_labels(w, x_train), y_train))
            folds_validation_accuracy.append(
                compute_accuracy(predict_labels(w, x_test), y_test))
        except Exception:
            # skip folds where training fails
            pass
    train_accuracy_matrix[jet_num, 1, i, j] = \
        (np.mean(folds_train_accuracy), np.std(folds_train_accuracy))
    validation_accuracy_matrix[jet_num, 1, i, j] = \
        (np.mean(folds_validation_accuracy), np.std(folds_validation_accuracy))

for j, lambda_ in enumerate(POSSIBLE_LAMBDA_SVM):
    folds_train_accuracy = []
# change [-1, 1] labels to [0, 1]
y = y / 2 + 0.5

N, d = tX.shape

# initial weights, randomly generated
w0 = 10 * np.random.rand(d + 1, 1)

# replace -999 values with the mean of the other ones
tX = replace_data(tX)
# normalize data to unit std and zero mean
tX = normalize_data(tX)

w, L = reg_logistic_regression(y, tX, lambda_=0.001, initial_w=w0, max_iters=10, gamma=5e-7)

y_pred = predict_labels(w, tX, 0.5)
N = y_pred.size

# accuracy check on the train set for sanity
n_err = 0
for i in range(N):
    if y_pred[i] != y[i]:
        n_err = n_err + 1
print("train accuracy:", 1 - n_err / N)
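# Quick check of the label mapping used above (plain arithmetic, not project code):
# y/2 + 0.5 sends -1 -> 0 and 1 -> 1, as required by the logistic loss.
assert np.array_equal(np.array([-1.0, 1.0]) / 2 + 0.5, np.array([0.0, 1.0]))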
split_data_by_categorical_column(test_classes, test_data, test_ids, PRI_JET_NUM_INDEX)

# We achieved our best results using Regularized Logistic Regression,
# so we load only those previously computed optimal params to generate the submission
logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True)
logistic_best_models = []
for (lambda_, deg, gamma), train_classes_split, train_data_split in \
        zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits):
    data_split, columns_to_remove, mean, std = preprocessing_pipeline(
        train_data_split, degree=int(deg), cross_term=True, norm_first=False)
    initial_w = np.zeros((data_split.shape[1],))
    w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_,
                                      initial_w, 500, gamma, 1)
    print(f'Loss: {loss:.3f} Accuracy: '
          f'{compute_accuracy(predict_labels(w, data_split), train_classes_split)}')
    logistic_best_models.append((w, loss, columns_to_remove, mean, std))

# Calculate the predictions for each of the 4 subsets using the weights, then combine them
results = None
for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \
        zip(logistic_best_models, logistic_best_params, test_classes_jet_num_splits,
            test_data_jet_num_splits, test_ids_jet_num_splits):
    test_data_split, _, _, _ = preprocessing_pipeline(
        test_data_split, degree=int(deg), columns_to_remove=col_to_rm,
        cross_term=True, norm_first=False, mean=mean, std=std)
    pred = predict_labels(w, test_data_split)
    out = np.stack((test_ids_split, pred), axis=-1)
    results = out if results is None else np.vstack((results, out))
# Logistic regression
print("Logistic Regression \n --------------")
X, y = datasets.load_breast_cancer(return_X_y=True)
X, _, _ = implementations.standardize_numpy(X)
tx = np.c_[np.ones(X.shape[0]), X]
initial_w = np.zeros(tx.shape[1])

w_log_gd, loss_log_gd = implementations.logistic_regression(
    y, tx, initial_w, max_iters, gamma, verbose=False)
y_pred_log_gd = implementations.logistic_prediction(tx, w_log_gd)

w_log_gd_reg, loss_log_gd_reg = implementations.reg_logistic_regression(
    y, tx, lambda_, 2, initial_w, max_iters, gamma,
    verbose=False, early_stopping=True, tol=0.0001, patience=5)
y_pred_log_gd_reg = implementations.logistic_prediction(tx, w_log_gd_reg)

print(f"Logistic regression gd : {implementations.accuracy(y, y_pred_log_gd)}")
print(f"Logistic regression reg: {implementations.accuracy(y, y_pred_log_gd_reg)}")

y_pred_log_sk = LogisticRegression().fit(X, y).predict(X)
print(f"Sklearn logistic regression    : {implementations.accuracy(y, y_pred_log_sk)}")
y_pred_log_reg = LogisticRegression(C=1/lambda_, max_iter=1000).fit(X, y).predict(X)
print(f"Sklearn reg logistic regression: {implementations.accuracy(y, y_pred_log_reg)}")
g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1)

#%% Testing functions
# np.random.seed(42)
gamma = 0.2
lambda_ = 4e-5

w, loss = least_squares(y=y_train, tx=X_train)
# w, loss = least_squares_SGD(y=y_train, tx=X_train,
#                             initial_w=np.random.random(size=num_features)*0.01,
#                             max_iters=200000, gamma=gamma)
# w, loss = ridge_regression(y=y_train, tx=X_train, lambda_=lambda_)
# w, loss = logistic_regression(y=y_train, tx=X_train,
#                               initial_w=np.random.random(size=num_features)*10,
#                               max_iters=125000, gamma=gamma)
# w, loss = reg_logistic_regression(y=y_train, tx=X_train, lambda_=lambda_,
#                                   initial_w=np.random.random(size=num_features)*0.01,
#                                   max_iters=200000, gamma=gamma)
plt.plot(w)

#%% Predictive step
y_test = X_test @ w
plt.hist(y_test, bins=200)
y_pred = predict_labels(w, X_test)

#%% Create submission
create_csv_submission(test.Id, y_pred, 'submission.csv')
gamma = 5e-3
lambda_ = 6e-7

# w, loss = least_squares(y_train, tX_train)
# w, loss = least_squares_GD(y_train, tX_train, initial_w=w0, max_iters=max_iters, gamma=gamma)
# w, loss = least_squares_SGD(y_train, tX_train, w0, max_iters=max_iters, gamma=gamma)
# w, loss = ridge_regression(y_train, tX_train, lambda_)
# w, loss = logistic_regression(y_train, tX_train, initial_w=w0, max_iters=max_iters, gamma=gamma)
w, loss = reg_logistic_regression(y_train, tX_train, lambda_=lambda_, initial_w=w0,
                                  max_iters=max_iters, gamma=gamma)

##########################################################################
#### Calculate the train accuracy
##########################################################################
N = y_train.size

# train accuracy check for sanity
n_err = len(np.where(y_train !=
                     predict_01_labels(w, tX_train, 0.5).reshape(y_train.shape))[0])
print("train accuracy:", 1 - n_err / N)
def train_test(data_list, test_interval, val_num, test_list, whitening=True, method='ls',
               name_list=['A', 'B', 'AB', 'BC', 'ABC', 'D'], max_iters=1000, gamma=0.01,
               lambda_=0.001, epsilon=1e-9, fan_out_list=[25, 10], out_dim=2, lr=0.001,
               lam=0.0005, batch_size=100, num_epoch=100):
    """
    Train the models and test their accuracy. Note that there are 6 models to be
    trained, one for each of the 6 types of data.

    Returns:
        accuracy_list: list of accuracies for the models
        loss_list: collection of final losses (for the neural network, all losses)
        recall_list: collection of recalls (only for the neural net)
        precision_list: collection of precisions (only for the neural net)

    Parameters:
        data_list      collects the different types of data in a list
        test_interval  collects indices which indicate the range of data being trained
        val_num        the index of the data subset in k-fold; note 0 <= val_num <= k - 1
        test_list      collects indices of redundant features of all data types
        whitening      a boolean value for data whitening
        method         a string that is either 'log', 'ls' or 'dl'
                       - 'log': logistic regression
                       - 'ls': least squares (or ridge regression if lambda_ > 0)
                       - 'dl': deep learning method (neural network)
        name_list      collects all names of the data types
        max_iters      maximum iterations for logistic regression
        gamma          step size for each iteration in logistic regression
        lambda_        parameter for l2 regularization in least squares and logistic regression
        epsilon        parameter for ZCA data whitening; should be a small positive number
        fan_out_list   the numbers of neurons in the hidden layer of the neural network
        out_dim        output dimension of the last layer in the neural network (before softmax)
        lr             learning rate when optimizing the neural network
        lam            parameter for weight decay (l2 regularization) in the neural network
        batch_size     batch size for stochastic gradient descent in the neural network
        num_epoch      number of epochs when optimizing the neural network
    """
    # collect all parameters such as data-whitening transforms and weights
    W_collection = []
    b_collection = []
    M_list = []
    mean_list = []
    accuracy_list = []
    w_list = []
    loss_list = []
    precision_list = []
    recall_list = []

    # iterate through all data types; train and test each model with the chosen method
    for i in range(len(name_list)):
        print('Training for data ', name_list[i])
        x, y, dim = extract_train_data(test_list[i], data_list[i])

        # only use part of the data for training; the rest is for testing
        i1 = test_interval[i][0]
        i2 = test_interval[i][1]
        index = list(range(0, i1)) + list(range(i2, len(y)))
        x_tr = x[index, :]
        y_tr = y[index]
        x_tst = x[i1:i2, :]
        y_tst = y[i1:i2]

        # we use the training data to obtain the whitening transformation
        if whitening:
            M, mean = data_whitening(x, epsilon)
            x_tr = np.dot(x_tr - mean, M)
            x_tst = np.dot(x_tst - mean, M)
            M_list.append(M)
            mean_list.append(mean)

        x_tr = build_poly(x_tr)
        x_tst = build_poly(x_tst)
        print('Length of data point: ', x_tr.shape[1])

        if method == 'ls':
            # least squares / ridge regression
            w, loss = imp.ridge_regression(y_tr, x_tr, lambda_)
            accuracy = imp.evaluate(w, x_tst, y_tst)
            accuracy_list.append(accuracy)
            w_list.append(w)
        elif method == 'log':
            # logistic regression
            initial_w = np.random.rand(dim + 1)
            w, loss = imp.reg_logistic_regression(y_tr, x_tr, lambda_, initial_w,
                                                  max_iters, gamma)
            accuracy = imp.evaluate(w, x_tst, y_tst)
            accuracy_list.append(accuracy)
            w_list.append(w)
        elif method == 'dl':
            # deep learning method: map labels from {-1, 1} to {0, 1} and train the net
            fan_out = fan_out_list[i]
            y_tr = y_tr.astype(np.int8)
            y_tr[y_tr == -1] = 0
            y_tst = y_tst.astype(np.int8)
            y_tst[y_tst == -1] = 0
            inst = sim.SimNet(fan_out, x_tr[:, 1:].T, y_tr, out_dim, lr, lam,
                              batch_size, num_epoch)
            loss = inst.optimize()
            accuracy, precision, recall = inst.test(x_tst[:, 1:].T, y_tst)
            recall_list.append(recall)
            precision_list.append(precision)
            W_collection.append(inst.W_list)
            b_collection.append(inst.b_list)
            accuracy_list.append(accuracy)
        else:
            raise ValueError

        loss_list.append(loss)
        print('For data ', name_list[i], ', the average accuracy is: ', accuracy, '\n')

    # Save all parameters
    if whitening:
        np.save('./parameters/data_whitening/mean_list_val' + str(val_num), np.array(mean_list))
        np.save('./parameters/data_whitening/M_list_val' + str(val_num), np.array(M_list))
    if method == 'dl':
        np.save('./parameters/neural_net/W_collection_dl_val' + str(val_num), np.array(W_collection))
        np.save('./parameters/neural_net/b_collection_dl_val' + str(val_num), np.array(b_collection))
    elif method == 'ls':
        np.save('./parameters/ridge/w_' + method + '_val' + str(val_num), np.array(w_list))
    elif method == 'log':
        np.save('./parameters/logistic/w_' + method + '_val' + str(val_num), np.array(w_list))

    return accuracy_list, precision_list, recall_list, loss_list
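# Usage sketch (hypothetical k-fold driver over the function above; in real use,
# test_interval would be chosen per val_num): run one validation split per fold
# and average the reported accuracies.
all_acc = []
for val_num in range(5):
    acc_list, _, _, _ = train_test(data_list, test_interval, val_num, test_list,
                                   whitening=True, method='ls')
    all_acc.append(np.mean(acc_list))
print('mean accuracy over folds:', np.mean(all_acc))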