# Generate a random dataset of the Franke function with noise
FrankeDS = dataset(0)
FrankeDS.generate_franke(150, 0.05)

# Normalize dataset
FrankeDS.normalize_dataset()

# Divide into train and test sets
if CV:
    FrankeDS.sort_in_k_batches(k)
else:
    FrankeDS.sort_train_test(ratio=0.2, random=False)

# Make model
FrankeModel = fit(FrankeDS)

# Create polynomial design matrix for train and test sets
X_train = FrankeModel.create_design_matrix(deg=deg)

if CV:
    # Run k-fold CV algorithm and fit models.
    sample = sampling(FrankeDS)
    sample.kfold_cross_validation(method,
                                  deg=deg,
                                  lambd=lambd,
                                  Niterations=Niterations)

    # Print metrics
    print("Cross-validation batches: k = ", k)
    print('Best train mse is in arg ', np.argmin(sample.mse_train), ' : ',
          np.min(sample.mse_train))
DF = dataset.df
DF = DF.drop(columns=[
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"
])
DF = DF.drop(columns=[
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5",
    "BILL_AMT6"
])
DF = DF.drop(columns=["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"])

# Print and plot some info about the data.
statistics.print_info_dataframe(dataset.df, DF)
statistics.print_info_input_output(dataset.XTrain, dataset.yTrain)
plot_traits(DF, show=showplot, save=saveplot)

model = fit(dataset)
model.fit_logistic_regression(delta=0.0001, iterations=iterations)
model.test_logistic_regression(data="test")

plt.title("Evolution of the accuracy score.")
plt.plot(np.linspace(1, iterations, iterations), model.training_score)
plt.show()

print("Score, own model: ",
      statistics.calc_accuracy(pred=model.prediction_test,
                               target=model.y_test_target))

model.fit_logistic_regression_sklearn()

"""
## Neural network - not done yet.
# Normal decision trees with pruning, or XGBoost?
XGBoost = True

# Load dataset (use `infile` so the built-in `input` is not shadowed)
with open('datasets.pkl', 'rb') as infile:
    Datasets = pickle.load(infile)
AME12 = Datasets[0]
AME16 = Datasets[1]
testset = Datasets[2]

# Divide into train and test sets
AME16.sort_train_test(AME12, useAME12=False)

# Make model
AME16Fit = fit(AME16)

# Create polynomial design matrix for train and test sets
X_train = AME16Fit.create_design_matrix(deg=0)
X_test = AME16Fit.create_design_matrix(x=AME16.test_x_1d, deg=0)

# Initialize inputs for the neural network
y_train = AME16.y_1d[:, np.newaxis]
y_test = AME16.test_y_1d[:, np.newaxis]
n_samples = X_train.shape[0]

###### grid search #######
# Initialize vectors for saving values
depth_vals = np.linspace(1, 10, 10)
lmbd_vals = np.hstack((np.array([0.0]), np.logspace(-6, -1, 6)))
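
# A minimal sketch of the grid search these vectors feed, assuming the
# standard xgboost API (XGBRegressor and its max_depth/reg_lambda parameters
# are xgboost's names; the mse_grid bookkeeping below is illustrative, not
# the project's own code):
import xgboost as xgb

mse_grid = np.zeros((len(depth_vals), len(lmbd_vals)))
for i, depth in enumerate(depth_vals):
    for j, lmbd in enumerate(lmbd_vals):
        booster = xgb.XGBRegressor(max_depth=int(depth), reg_lambda=lmbd)
        booster.fit(X_train, y_train.ravel())
        y_pred = booster.predict(X_test)
        mse_grid[i, j] = np.mean((y_test.ravel() - y_pred)**2)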
def kfold_cross_validation(self,
                           method,
                           descent_method='SGD-skl',
                           deg=0,
                           Niterations=100,
                           lambd=0.01,
                           eta=0.000005,
                           m=5,
                           verbose=False):
    """Implements the k-fold cross-validation algorithm.

    The `method` argument selects the model: 'OLS' for ordinary least
    squares, 'Ridge' for ridge regression, 'LASSO' for lasso, and
    'logreg' for logistic regression."""
    inst = self.inst
    lowest_mse = 1e5

    self.mse = []
    self.R2 = []
    self.mse_train = []
    self.R2_train = []
    self.bias = []
    self.variance = []
    self.accuracy = []
    self.design_matrix = fit(inst)
    self.rocaucs = []
    self.area_ratios = []

    #whole_DM = self.design_matrix.create_design_matrix(deg=deg).copy()  # design matrix for the whole dataset
    #whole_y = inst.y_1d.copy()  # save the whole output

    for i in range(self.inst.k):
        # Pick the i-th batch as the test set
        inst.sort_training_test_kfold(i)
        inst.fill_array_test_training()

        # Create the design matrix for the training set, and fit
        self.design_matrix.create_design_matrix(deg=deg)
        if method == 'OLS':
            y_train, beta_train = self.design_matrix.fit_design_matrix_numpy()
        elif method == "Ridge":
            y_train, beta_train = self.design_matrix.fit_design_matrix_ridge(
                lambd)
        elif method == "LASSO":
            y_train, beta_train = self.design_matrix.fit_design_matrix_lasso(
                lambd, maxiter=Niterations)
        elif method == 'logreg':
            y_train, beta_train = self.design_matrix.fit_design_matrix_logistic_regression(
                descent_method=descent_method,
                eta=eta,
                Niteration=Niterations,
                m=m,
                verbose=verbose)
        else:
            sys.exit("Wrongly designated method: " + str(method) +
                     " not found")

        # Find out which values get predicted from the training fit
        X_test = self.design_matrix.create_design_matrix(x=inst.test_x_1d,
                                                         N=inst.N_testing,
                                                         deg=deg)
        y_pred = self.design_matrix.test_design_matrix(beta_train, X=X_test)

        # Take the real target values from the test dataset for comparison
        # (and also a rescaled set)
        y_test = inst.test_y_1d
        _, y_test_rescaled = inst.rescale_back(x=inst.test_x_1d,
                                               y=inst.test_y_1d,
                                               split=True)
        target = y_test_rescaled.astype(int)

        # Calculate the prediction for the whole dataset
        #whole_y_pred = self.design_matrix.test_design_matrix(beta_train, X=whole_DM)

        if method == 'logreg':
            # Statistically evaluate the test set against the prediction.
            y_pred_onehot = np.column_stack((1 - y_pred, y_pred))
            accuracy_batch = statistics.calc_accuracy(target, y_pred)
            rocaucs_batch = statistics.calc_rocauc(target, y_pred)
            max_area_test = statistics.calc_cumulative_auc(
                target, make_onehot(target))
            area_ratio_batch = (statistics.calc_cumulative_auc(
                target, y_pred_onehot) - 0.5) / (max_area_test - 0.5)
            self.accuracy.append(accuracy_batch)
            self.rocaucs.append(rocaucs_batch)
            self.area_ratios.append(area_ratio_batch)
        else:
            # Statistically evaluate the test set against the prediction.
            mse, calc_r2 = statistics.calc_statistics(y_test, y_pred)
            # Statistically evaluate the training set against itself
            mse_train, calc_r2_train = statistics.calc_statistics(
                inst.y_1d, y_train)
            # Get the values for the bias and the variance
            bias, variance = statistics.calc_bias_variance(y_test, y_pred)
            self.mse.append(mse)
            self.R2.append(calc_r2)
            self.mse_train.append(mse_train)
            self.R2_train.append(calc_r2_train)
            self.bias.append(bias)
            self.variance.append(variance)

            # Keep the beta with the lowest test MSE
            if abs(mse) < lowest_mse:
                lowest_mse = abs(mse)
                self.best_predicting_beta = beta_train
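
# Example usage (assuming a dataset instance already sorted into k batches
# with sort_in_k_batches(k); mirrors the credit-card driver below):
#
#     sample = sampling(CDds)
#     sample.kfold_cross_validation('logreg', descent_method='SGD-skl',
#                                   eta=5e-6, Niterations=100, m=5)
#     print("Mean accuracy: ", np.mean(sample.accuracy))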
CDds = credit_card_dataset(filename)

# Polish the dataset, and divide it into data and targets
CDds.CreditCardPolish()

# Normalize dataset
CDds.normalize_dataset()

# Divide into train and test sets
if CV:
    CDds.sort_in_k_batches(k)
else:
    CDds.sort_train_test(ratio=0.2, random=False)

# Make model
model = fit(CDds)

# Fit model
model.create_simple_design_matrix()

if CV:
    # Run k-fold CV algorithm and fit models.
    sample = sampling(CDds)
    sample.kfold_cross_validation(method,
                                  deg=deg,
                                  descent_method=desc_method,
                                  eta=input_eta,
                                  Niterations=Niterations,
                                  m=m)

    # Print metrics
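    # A minimal sketch of the metric printing, assuming the lists that
    # kfold_cross_validation fills for the 'logreg' method (accuracy,
    # rocaucs, area_ratios); the exact formatting is illustrative:
    print("Cross-validation batches: k = ", k)
    print("Mean accuracy:   ", np.mean(sample.accuracy))
    print("Mean ROC-AUC:    ", np.mean(sample.rocaucs))
    print("Mean area ratio: ", np.mean(sample.area_ratios))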
# Normalize the dataset and divide it into samples
dataset.normalize_dataset()
dataset.sort_in_k_batches(k)

# Run k-fold algorithm and fit models.
sample = sampling(dataset)
sample.kfold_cross_validation(k, method, deg=deg)

# Calculate statistics
print("Batches: k = ", k)
statistics.print_mse(sample.mse)
statistics.print_R2(sample.R2)

# Plot the best fit, i.e. the one with the lowest MSE.
dataset.reload_data()
fitted = fit(dataset)
fitted.create_design_matrix(deg=deg)
z_model_norm = fitted.test_design_matrix(sample.best_predicting_beta)
rescaled_dataset = dataset.rescale_back(z=z_model_norm)
z_model = rescaled_dataset[2]

# Generate the analytical solution for plotting purposes
analytical = data_generate()
analytical.generate_franke(n, noise=0)

# Plot
plot_3d(rescaled_dataset[0], rescaled_dataset[1], z_model, analytical.x_mesh,
        analytical.y_mesh, analytical.z_mesh, ["surface", "scatter"])

# Clean up the backup file if it exists
try:
    os.remove("backup_data.npz")
except FileNotFoundError:
    pass
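
# For reference, the statistics printed above follow the usual definitions
# (a sketch, assuming calc_statistics implements the standard formulas):
#
#     MSE = np.mean((z_test - z_pred)**2)
#     R2  = 1.0 - np.sum((z_test - z_pred)**2) / np.sum((z_test - np.mean(z_test))**2)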
def kfold_cross_validation(self, k, method, deg=5, lambd=1):
    """Implements the k-fold cross-validation algorithm.

    The `method` argument selects the model: if "least squares", an
    ordinary OLS fit is evaluated; if "ridge", the ridge method is used,
    and correspondingly for "lasso"."""
    inst = self.inst
    lowest_mse = 1e5

    self.mse = []
    self.R2 = []
    self.mse_train = []
    self.R2_train = []
    self.bias = []
    self.variance = []

    design_matrix = fit(inst)
    whole_DM = design_matrix.create_design_matrix(
        deg=deg).copy()  # design matrix for the whole dataset
    whole_z = inst.z_1d.copy()  # save the whole output

    for i in range(self.inst.k):
        # Pick the i-th batch as the test set
        inst.sort_training_test_kfold(i)
        inst.fill_array_test_training()

        # Create the design matrix for the training set, and fit
        design_matrix.create_design_matrix(deg=deg)
        if method == "least squares":
            z_train, beta_train = design_matrix.fit_design_matrix_numpy()
        elif method == "ridge":
            z_train, beta_train = design_matrix.fit_design_matrix_ridge(lambd)
        elif method == "lasso":
            z_train, beta_train = design_matrix.fit_design_matrix_lasso(lambd)
        else:
            sys.exit("Wrongly designated method: " + str(method) +
                     " not found")

        # Find out which values get predicted from the training fit
        X_test = design_matrix.create_design_matrix(x=inst.test_x_1d,
                                                    y=inst.test_y_1d,
                                                    z=inst.test_z_1d,
                                                    N=inst.N_testing,
                                                    deg=deg)
        z_pred = design_matrix.test_design_matrix(beta_train, X=X_test)

        # Take the real values from the dataset for comparison
        z_test = inst.test_z_1d

        # Calculate the prediction for the whole dataset
        whole_z_pred = design_matrix.test_design_matrix(beta_train,
                                                        X=whole_DM)

        # Statistically evaluate the test set against the prediction.
        mse, calc_r2 = statistics.calc_statistics(z_test, z_pred)
        # Statistically evaluate the training set against itself
        mse_train, calc_r2_train = statistics.calc_statistics(
            inst.z_1d, z_train)
        # Get the values for the bias and the variance
        bias, variance = statistics.calc_bias_variance(z_test, z_pred)

        self.mse.append(mse)
        self.R2.append(calc_r2)
        self.mse_train.append(mse_train)
        self.R2_train.append(calc_r2_train)
        self.bias.append(bias)
        self.variance.append(variance)

        # Keep the beta with the lowest test MSE
        if abs(mse) < lowest_mse:
            lowest_mse = abs(mse)
            self.best_predicting_beta = beta_train
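
# Example usage (mirrors the Franke-function driver above; assumes a
# data_generate instance that has already been sorted into k batches):
#
#     sample = sampling(dataset)
#     sample.kfold_cross_validation(k, "ridge", deg=5, lambd=0.01)
#     statistics.print_mse(sample.mse)
#     statistics.print_R2(sample.R2)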
fifth order. Also adding MSE and R^2 score."""

# Load data from a previously saved file
deg = 5
dataset = data_generate()
dataset.load_data()

# Or you can generate directly:
#dataset = data_generate()
#dataset.generate_franke(n=100, noise=0.2)

# Normalize the dataset
dataset.normalize_dataset()

# Fit design matrix
fitted_model = fit(dataset)

# Ordinary least squares fitting
fitted_model.create_design_matrix(deg)
z_model_norm, beta = fitted_model.fit_design_matrix_numpy()

# Statistical evaluation
mse, calc_r2 = statistics.calc_statistics(dataset.z_1d, z_model_norm)
print("Mean square error: ", mse, "\n", "R2 score: ", calc_r2)

# Scale the dataset back
rescaled_dataset = dataset.rescale_back(z=z_model_norm)
#x_model = rescaled_dataset[0]
#y_model = rescaled_dataset[1]
z_model = rescaled_dataset[2]
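
# For reference, a minimal sketch of the least-squares solve that
# fit_design_matrix_numpy presumably performs (X stands for the design
# matrix built above; the names are illustrative, and beta solves the
# normal equations via the pseudo-inverse):
#
#     beta = np.linalg.pinv(X.T @ X) @ X.T @ dataset.z_1d
#     z_model_norm = X @ beta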