def sgdm(m, degrees, n_epochs, b, eta, noise=0, gamma=0):
    """Stochastic gradient descent with momentum for OLS regression on the Franke function."""
    np.random.seed(1337)
    x = np.random.rand(m, degrees)  # +1?
    y = np.random.rand(m, degrees)  # +1?
    X_mesh, Y_mesh = np.meshgrid(x, y)
    z = f.FrankeFunction(X_mesh, Y_mesh) \
        + noise * np.random.randn(X_mesh.shape[0], Y_mesh.shape[0])
    z = np.ravel(z)
    X = f.X_make(X_mesh, Y_mesh, degrees)

    # SPLIT AND SCALE
    X_tr, X_te, z_tr, z_te = train_test_split(X, z, test_size=0.3)
    scaler = StandardScaler()  # removes the mean and scales each feature/variable to unit variance
    # X_tr = scaler.fit(X_tr).transform(X_tr)
    # z_tr = scaler.transform(z_tr.reshape(-1,1))
    # z_te = scaler.fit(z_te).transform(z_te)
    scaler.fit(X_tr)               # compute the mean and std to be used for later scaling
    X_tr = scaler.transform(X_tr)  # perform standardization by centering and scaling
    X_te = scaler.transform(X_te)
    z_tr = z_tr.reshape(-1, 1)
    z_te = z_te.reshape(-1, 1)
    scaler.fit(z_tr)
    z_tr = scaler.transform(z_tr)
    z_te = scaler.transform(z_te)

    l = int((degrees + 1) * (degrees + 2) / 2)  # length of a design matrix row
    beta = np.random.randn(l, 1)                # initial guess for the coefficients

    n_tr = X_tr.shape[0]    # number of training samples
    batch_num = n_tr // b   # number of minibatches per epoch
    if n_tr % b:
        print('warning: batch size does not divide the training set evenly')

    v = 0
    mse_eval = np.zeros(n_epochs)
    index_array = np.arange(n_tr)  # indices of training rows

    for epoch in range(n_epochs):
        np.random.shuffle(index_array)
        for i in range(batch_num):
            idx = index_array[i * b:(i + 1) * b]         # b shuffled rows for this minibatch
            xi = X_tr[idx]
            zi = z_tr[idx]
            gradients = 2 / b * xi.T @ (xi @ beta - zi)  # gradient of the OLS cost on the minibatch
            # eta = 0.001  # learning_rate(epoch*m + i)
            v = gamma * v + eta * gradients              # momentum update
            beta = beta - v
        z_eval = X_te.dot(beta)
        mse_eval[epoch] = f.MSE(z_te, z_eval)            # test MSE after each epoch

    beta_ols = f.OLS(X_tr, z_tr)   # analytic OLS solution for comparison
    z_ols = X_te.dot(beta_ols)
    mse_beta = f.MSE(z_te, z_ols)
    return beta, mse_eval, mse_beta
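# Example usage (a minimal sketch, not part of the original code): assumes the same
# numpy/scikit-learn imports and helper module `f` as sgdm() above; the parameter
# values are illustrative, not tuned.
beta, mse_per_epoch, mse_ols = sgdm(m=20, degrees=5, n_epochs=100, b=10,
                                    eta=0.01, noise=0.1, gamma=0.9)
plt.plot(mse_per_epoch, label='SGD with momentum')    # test MSE per epoch
plt.axhline(mse_ols, ls='--', label='analytic OLS')   # closed-form baseline
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.legend()
plt.show()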
def bootstrap(x, y, max_deg, boots_num):
    """
    Applies the bootstrap algorithm.

    Args:
        x, y (np.array): initial datapoints
        max_deg (int): maximum polynomial degree
        boots_num (int): number of bootstraps
    """
    np.random.seed(130)
    x, y = np.meshgrid(x, y)
    z = np.ravel(
        f.FrankeFunction(x, y) +
        0.5 * np.random.randn(np.shape(x)[0], np.shape(y)[1]))

    MSE_degree_values = np.zeros(max_deg)
    MSE_test_degree_values = np.zeros(max_deg)
    MSE_train_values = np.zeros(boots_num)
    MSE_test_values = np.zeros(boots_num)

    for k, deg in enumerate(range(1, max_deg)):  # loop over polynomial degrees
        X_design = f.X_make(x, y, deg)
        scaler = StandardScaler()
        X_tr, X_te, z_tr, z_te = train_test_split(X_design, z, test_size=0.2)
        # Scale AFTER the train/test split, otherwise the test data
        # gets affected by the training data
        scaler.fit(X_tr)
        X_train = scaler.transform(X_tr)
        X_test = scaler.transform(X_te)

        index_array = np.arange(0, len(z_tr), 1)
        for i in range(boots_num):
            indx = resample(index_array, random_state=i)  # draw a new bootstrap sample each iteration
            z_bootstrap = z_tr[indx]
            z_test = X_test.dot(f.OLS(X_train[indx, :], z_bootstrap))
            z_train = X_train.dot(f.OLS(X_train[indx, :], z_bootstrap))
            MSE_train_values[i] = f.MSE(z_tr, z_train)
            MSE_test_values[i] = f.MSE(z_te, z_test)

        MSE_degree_values[k] = np.sum(MSE_train_values) / boots_num
        MSE_test_degree_values[k] = np.sum(MSE_test_values) / boots_num

    return MSE_degree_values, MSE_test_degree_values
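# Example usage (illustrative sketch, not from the original code): assumes the same
# imports and helper module `f` as bootstrap() above. Entries 0..max_deg-2 of the
# returned arrays correspond to degrees 1..max_deg-1; the last entry is unused.
x = np.linspace(0, 1, 40)
y = np.linspace(0, 1, 40)
mse_train, mse_test = bootstrap(x, y, max_deg=12, boots_num=100)
degrees = np.arange(1, 12)
plt.plot(degrees, mse_train[:-1], label='train')
plt.plot(degrees, mse_test[:-1], label='test')
plt.xlabel('polynomial degree')
plt.ylabel('MSE')
plt.legend()
plt.show()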
def SGD(self, n_epochs, batch_size, gamma=0.9, lmbda=0):
    """Stochastic gradient descent.

    Keyword arguments:
    n_epochs -- number of epochs
    batch_size -- size of minibatch
    gamma -- momentum parameter (default = 0.9)
    lmbda -- regularization parameter (default = 0)

    Exception:
    Exception raised when batch size does not result in an equal
    division of the training data.
    """
    n = self.X_train.shape[0]
    if n % batch_size:
        raise Exception("Batch number and dataset not compatible")
    n_batches = int(n / batch_size)

    beta = np.random.randn(self.X_train.shape[1], 1)  # initialize beta
    v = 0
    self.mse_epochs = np.zeros(n_epochs)
    index_array = np.arange(n)

    for epoch in range(n_epochs):
        np.random.shuffle(index_array)
        X_minibatches = np.split(self.X_train[index_array], n_batches)
        z_minibatches = np.split(self.z_train[index_array], n_batches)

        i = 0
        for X_batch, z_batch in zip(X_minibatches, z_minibatches):
            # Calculate mean gradient of minibatch
            gradient = self.grad_cost_function(X_batch, z_batch, beta,
                                               batch_size, lmbda)
            # Update beta with momentum
            eta = self.learning_rate(epoch*n + i)
            v = gamma*v + eta*gradient
            beta = beta - v
            i += 1

        z_tilde = self.X_test @ beta
        self.mse_epochs[epoch] = f.MSE(self.z_test, z_tilde)
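# Example usage (hypothetical sketch): the class name `LinearRegressor` and its
# constructor are placeholders -- only the SGD() signature above is taken from the
# code. Assumes the instance exposes X_train/z_train/X_test/z_test, learning_rate()
# and grad_cost_function() as used inside the method.
model = LinearRegressor(X_train, z_train, X_test, z_test)  # hypothetical class
model.SGD(n_epochs=200, batch_size=20, gamma=0.9, lmbda=1e-4)
plt.plot(model.mse_epochs)  # test MSE recorded after every epoch
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.show()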
def test_NN_MSE():
    test_loss = fx.MSE(pred.ravel(), Y_test.T)
    test_loss_sk = mean_squared_error(Y_test.ravel(), pred_sk)
    assert (abs(test_loss_sk - test_loss) < 1e-1)
def cross_validation(n, maxdegree, noise, n_folds, method=f.OLS, seed=130,
                     lmbda=0, datatype='Franke', filename='SRTM_data_Minneapolis'):
    """
    cross_validation
    Input:
        n - number of datapoints before meshgrid
        maxdegree - max degree to iterate over
        noise - amount of noise
        n_folds - number of folds in cross validation
        method - regression method (OLS, Ridge, Lasso)
        seed - seed to random number generator
        lmbda - lambda value to use in Ridge and Lasso
        datatype - datatype to fit (Franke, Terrain)
        filename - file with terrain data
    Output:
        polydegree - array with model complexity
        MSE_mean - array with mean MSE from each cross validation
        MSE_best - MSE for the best fit
        R2Score_skl - array with R2Score for Scikit Learn cross validation
        R2Score_mean - array with mean R2Score from each cross validation
    """
    if n % n_folds != 0:
        raise Exception("Can't divide data set in n_folds equally sized folds")

    polydegree = np.zeros(maxdegree)
    MSE_mean = np.zeros(maxdegree)
    MSE_mean_sklearn = np.zeros(maxdegree)
    R2Score_mean = np.zeros(maxdegree)
    R2Score_skl = np.zeros(maxdegree)

    # Make data
    np.random.seed(int(seed))
    if datatype == 'Franke':
        x_train, x_test, y_train, y_test, z_train, z_test = f.FrankeData(
            n, noise, test_size=0.3)
    elif datatype == 'Terrain':
        x_train, x_test, y_train, y_test, z_train, z_test = f.TerrainData(
            n, filename)

    for degree in range(0, maxdegree):
        polydegree[degree] = degree

        # Create design matrix
        X_train = f.design_matrix(x_train, y_train, degree)

        # Shuffle data to get random folds
        index = np.arange(0, np.shape(X_train)[0], 1)
        np.random.seed(int(seed))
        np.random.shuffle(index)
        X_train_random = X_train[index, :]
        z_train_random = z_train[index]

        # Split data in n_folds folds
        X_folds = np.array(np.array_split(X_train_random, n_folds))
        z_folds = np.array(np.array_split(z_train_random, n_folds))

        if method == f.OLS:
            clf = skl.LinearRegression()
            scores = cross_val_score(clf, X_train, z_train, cv=n_folds,
                                     scoring='neg_mean_squared_error')
            MSE_mean_sklearn[degree] = np.abs(np.mean(scores))
            best_degree_sklearn = np.argmin(MSE_mean_sklearn)

            # Make fit to held-out test data
            X_train_best = f.design_matrix(x_train, y_train, best_degree_sklearn)
            scaler = StandardScaler()
            scaler.fit(X_train_best)
            X_train_best_scaled = scaler.transform(X_train_best)
            X_test_best = f.design_matrix(x_test, y_test, best_degree_sklearn)
            X_test_best_scaled = scaler.transform(X_test_best)
            X_train_best_scaled[:, 0] = 1
            X_test_best_scaled[:, 0] = 1
            scaler.fit(z_train.reshape(-1, 1))
            z_train_scaled = scaler.transform(z_train.reshape(-1, 1))
            z_test_scaled = scaler.transform(z_test.reshape(-1, 1))
            beta_best_sklearn = f.OLS(X_train_best_scaled, z_train_scaled)

        elif method == f.Ridge:
            clf = skl.Ridge()
            scores = cross_val_score(clf, X_train, z_train, cv=n_folds,
                                     scoring='neg_mean_squared_error')
            MSE_mean_sklearn[degree] = np.abs(np.mean(scores))
            best_degree_sklearn = np.argmin(MSE_mean_sklearn)

            # Make fit to held-out test data
            X_train_best = f.design_matrix(x_train, y_train, best_degree_sklearn)
            scaler = StandardScaler()
            scaler.fit(X_train_best)
            X_train_best_scaled = scaler.transform(X_train_best)
            X_test_best = f.design_matrix(x_test, y_test, best_degree_sklearn)
            X_test_best_scaled = scaler.transform(X_test_best)
            X_train_best_scaled[:, 0] = 1
            X_test_best_scaled[:, 0] = 1
            scaler.fit(z_train.reshape(-1, 1))
            z_train_scaled = scaler.transform(z_train.reshape(-1, 1))
            z_test_scaled = scaler.transform(z_test.reshape(-1, 1))
            beta_best_sklearn = f.OLS(X_train_best_scaled, z_train_scaled)

        elif method == 'Lasso':
            clf_lasso = skl.Lasso(alpha=lmbda, fit_intercept=False)
            scores = cross_val_score(clf_lasso, X_train, z_train, cv=n_folds,
                                     scoring='neg_mean_squared_error')
            MSE_mean_sklearn[degree] = np.abs(np.mean(scores))
            best_degree_sklearn = np.argmin(MSE_mean_sklearn)

            # Make fit to held-out test data
            X_train_best = f.design_matrix(x_train, y_train, best_degree_sklearn)
            scaler = StandardScaler()
            scaler.fit(X_train_best)
            X_train_best_scaled = scaler.transform(X_train_best)
            X_test_best = f.design_matrix(x_test, y_test, best_degree_sklearn)
            X_test_best_scaled = scaler.transform(X_test_best)
            X_train_best_scaled[:, 0] = 1
            X_test_best_scaled[:, 0] = 1
            scaler.fit(z_train.reshape(-1, 1))
            z_train_scaled = scaler.transform(z_train.reshape(-1, 1))
            z_test_scaled = scaler.transform(z_test.reshape(-1, 1))
            beta_best_sklearn = f.OLS(X_train_best_scaled, z_train_scaled)

        # Cross validation
        for k in range(n_folds):
            # Validation data
            X_val = X_folds[k]
            z_val = np.reshape(z_folds[k], (-1, 1))

            # Training data
            idx = np.ones(n_folds, dtype=bool)
            idx[k] = False
            X_train_fold = X_folds[idx]

            # Combine folds
            X_train_fold = np.reshape(
                X_train_fold,
                (X_train_fold.shape[0] * X_train_fold.shape[1],
                 X_train_fold.shape[2]))
            z_train_fold = np.reshape(np.ravel(z_folds[idx]), (-1, 1))

            # Scaling data
            scaler = StandardScaler()    # removes the mean and scales each feature/variable to unit variance
            scaler.fit(X_train_fold)     # compute the mean and std to be used for later scaling
            X_train_fold_scaled = scaler.transform(X_train_fold)  # perform standardization by centering and scaling
            X_val_scaled = scaler.transform(X_val)

            # Set first column to one as StandardScaler sets it to zero
            X_train_fold_scaled[:, 0] = 1
            X_val_scaled[:, 0] = 1

            # scaler.fit(z_train_fold)
            # z_train_fold_scaled = scaler.transform(z_train_fold)
            # z_val_scaled = scaler.transform(z_val)
            z_train_fold_scaled = z_train_fold
            z_val_scaled = z_val

            # Choose method for calculating coefficients beta
            if method == f.OLS:
                beta_fold = method(X_train_fold_scaled, z_train_fold_scaled)
                z_tilde_fold = X_val_scaled @ beta_fold
                # z_tilde_fold_train = X_train_
            elif method == f.Ridge:
                beta_fold = method(X_train_fold_scaled, z_train_fold_scaled,
                                   lmbda, degree)
                z_tilde_fold = X_val_scaled @ beta_fold
            elif method == 'Lasso':
                clf_lasso = skl.Lasso(alpha=lmbda, fit_intercept=False).fit(
                    X_train_fold_scaled, z_train_fold_scaled)
                z_tilde_fold = clf_lasso.predict(X_val_scaled)

            MSE_mean[degree] += f.MSE(z_val_scaled, z_tilde_fold)
            R2Score_mean[degree] += f.R2Score(z_val_scaled, z_tilde_fold)

        MSE_mean[degree] /= n_folds
        R2Score_mean[degree] /= n_folds

        # # Cross-validation using Scikit-Learn
        # clf = skl.LinearRegression()
        # R2Score_skl[degree] = np.mean(cross_val_score(clf, X_train, z_train, scoring='r2', cv=n_folds))

    # Find the degree with smallest MSE
    best_degree = np.argmin(MSE_mean)
    print(best_degree)

    # Make fit to held-out test data
    X_train_best = f.design_matrix(x_train, y_train, best_degree)
    scaler.fit(X_train_best)
    X_train_best_scaled = scaler.transform(X_train_best)
    X_test_best = f.design_matrix(x_test, y_test, best_degree)
    X_test_best_scaled = scaler.transform(X_test_best)
    X_train_best_scaled[:, 0] = 1
    X_test_best_scaled[:, 0] = 1
    scaler.fit(z_train.reshape(-1, 1))
    z_train_scaled = scaler.transform(z_train.reshape(-1, 1))
    z_test_scaled = scaler.transform(z_test.reshape(-1, 1))
    beta_best = f.OLS(X_train_best_scaled, z_train_scaled)
    z_tilde_best = X_test_best_scaled @ beta_best
    MSE_best = f.MSE(z_test_scaled, z_tilde_best)
    print(MSE_best)

    return (polydegree, MSE_mean, MSE_best, R2Score_skl, R2Score_mean,
            beta_best, best_degree, MSE_mean_sklearn, best_degree_sklearn,
            beta_best_sklearn)
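# Example usage (illustrative sketch, not from the original code): assumes the
# helper module `f` and the scikit-learn imports used by cross_validation() above;
# the arguments are arbitrary but satisfy n % n_folds == 0.
(polydegree, MSE_mean, MSE_best, R2Score_skl, R2Score_mean, beta_best,
 best_degree, MSE_mean_sklearn, best_degree_sklearn,
 beta_best_sklearn) = cross_validation(n=20, maxdegree=10, noise=0.1,
                                       n_folds=5, method=f.OLS,
                                       datatype='Franke')
plt.plot(polydegree, MSE_mean, label='own k-fold CV')
plt.plot(polydegree, MSE_mean_sklearn, label='scikit-learn CV')
plt.xlabel('polynomial degree')
plt.ylabel('mean MSE')
plt.legend()
plt.show()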
def no_resampling(n, maxdegree, noise, method=f.OLS, lmbda=0, seed=7053):
    # Arrays for plotting of error
    polydegree = np.zeros(maxdegree)
    MSE_OLS = np.zeros(maxdegree)
    R2Score_OLS = np.zeros(maxdegree)
    MSE_test = np.zeros(maxdegree)
    MSE_train = np.zeros(maxdegree)
    MSE_train_scaled = np.zeros(maxdegree)
    MSE_test_scaled = np.zeros(maxdegree)
    R2Score_scaled = np.zeros(maxdegree)

    # Make data
    np.random.seed(seed)
    x = np.sort(np.random.uniform(0, 1, n))
    y = np.sort(np.random.uniform(0, 1, n))
    x, y = np.meshgrid(x, y)

    # Franke function
    z = np.ravel(f.FrankeFunction(x, y) + noise * np.random.randn(n, n))

    for degree in range(0, maxdegree):
        polydegree[degree] = degree

        # Create design matrix
        X = f.design_matrix(x, y, degree)

        # Split in training and test data
        X_train, X_test, z_train, z_test = train_test_split(X, z.reshape(-1, 1),
                                                            test_size=0.3)

        # OLS estimate on train/test without scaling
        beta_OLS_train = f.OLS(X_train, z_train)
        ztilde_test = X_test @ beta_OLS_train
        ztilde_train = X_train @ beta_OLS_train
        MSE_train[degree] = f.MSE(z_train, ztilde_train)
        MSE_test[degree] = f.MSE(z_test, ztilde_test)

        # Scale data
        scaler = StandardScaler()    # removes the mean and scales each feature/variable to unit variance
        scaler.fit(X_train)          # compute the mean and std to be used for later scaling
        X_train_scaled = scaler.transform(X_train)  # perform standardization by centering and scaling
        X_test_scaled = scaler.transform(X_test)

        scaler.fit(z_train)
        # z_train_scaled = scaler.transform(z_train)
        # z_test_scaled = scaler.transform(z_test)
        z_train_scaled = z_train
        z_test_scaled = z_test

        # Set the first column to 1 since StandardScaler sets it to 0
        X_train_scaled[:, 0] = 1
        X_test_scaled[:, 0] = 1

        if method == f.OLS:
            beta_train_scaled = method(X_train_scaled, z_train_scaled)
            z_tilde_test_scaled = X_test_scaled @ beta_train_scaled
            z_tilde_train_scaled = X_train_scaled @ beta_train_scaled
        elif method == f.Ridge:
            beta_train_scaled = method(X_train_scaled, z_train_scaled, lmbda, degree)
            z_tilde_test_scaled = X_test_scaled @ beta_train_scaled
            z_tilde_train_scaled = X_train_scaled @ beta_train_scaled
        elif method == 'Lasso':
            clf_lasso = skl.Lasso(alpha=lmbda, fit_intercept=False).fit(
                X_train_scaled, z_train_scaled)
            z_tilde_test_scaled = clf_lasso.predict(X_test_scaled)
            z_tilde_train_scaled = clf_lasso.predict(X_train_scaled)

        MSE_train_scaled[degree] = f.MSE(z_train_scaled, z_tilde_train_scaled)
        MSE_test_scaled[degree] = f.MSE(z_test_scaled, z_tilde_test_scaled)
        R2Score_scaled[degree] = f.R2Score(z_test_scaled, z_tilde_test_scaled)

    return polydegree, MSE_train, MSE_test, MSE_train_scaled, MSE_test_scaled, R2Score_scaled


# Optional surface plot of a fitted model (needs x, y and ztilde from a separate run);
# currently not active:
# # Start figure
# fig = plt.figure()
# ax = fig.gca(projection='3d')
# # Plot the surface
# ztilde_plot = np.reshape(ztilde, (n, n))
# surf = ax.plot_surface(x, y, ztilde_plot, cmap=cm.coolwarm,
#                        linewidth=0, antialiased=False)
# # Customize the z axis
# ax.set_zlim(-0.10, 1.40)
# ax.zaxis.set_major_locator(LinearLocator(10))
# ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
# # Add a color bar which maps values to colors
# fig.colorbar(surf, shrink=0.5, aspect=5)
# plt.show()
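# Example usage (illustrative sketch, not from the original code): assumes the
# helper module `f`, matplotlib as plt and the scikit-learn imports used above.
polydegree, MSE_train, MSE_test, MSE_train_scaled, MSE_test_scaled, R2 = \
    no_resampling(n=30, maxdegree=12, noise=0.1, method=f.OLS)
plt.plot(polydegree, MSE_train_scaled, label='train (scaled X)')
plt.plot(polydegree, MSE_test_scaled, label='test (scaled X)')
plt.xlabel('polynomial degree')
plt.ylabel('MSE')
plt.legend()
plt.show()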
scaler.fit(X)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f" z_train: {np.shape(z_train)}")
print(f" X: {np.shape(X)}")
print(f" X_train: {np.shape(X_train)}")

z_test_scaled = X_test_scaled.dot(
    f.Ridge_func(X_train_scaled, z_train, 1E-5))
z_train_scaled = X_train_scaled.dot(
    f.Ridge_func(X_train_scaled, z_train, 1E-5))

print(f"x: {np.shape(x)}")
print(f"y: {np.shape(y)}")

MSEtrain[i] = f.MSE(z_train, z_train_scaled)
MSEtest[i] = f.MSE(z_test, z_test_scaled)

plt.plot(deg, MSEtest, label="test")
plt.plot(deg, MSEtrain, label="train")
plt.legend()
plt.show()

"""
# optional plotting of surface
z_plot = np.reshape(z_, (25000, 25000))
# print(np.shape(z_))

# Plot the surface.
surf = ax.plot_surface(x, y, z_plot, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
def OLS(self):
    """Calculates the ordinary least squares solution and its mean squared error."""
    beta = f.OLS(self.X_train, self.z_train)
    z_tilde = self.X_test @ beta
    self.MSE = f.MSE(self.z_test, z_tilde)
mse_heatmap = np.zeros((len(array_lambda), len(array_eta)))
index_array = np.arange(len(X_train))

# Grid search over regularization parameter and learning rate
for i, lmbda in enumerate(array_lambda):
    for j, eta in enumerate(array_eta):
        network.create_layers(hidden_act, output_act, seed)
        for k in range(n_epochs):
            np.random.shuffle(index_array)
            X_minibatches = np.split(X_train[index_array], n_batches)
            z_minibatches = np.split(z_train[index_array], n_batches)
            for l in range(n_batches):
                # eta = network.learning_rate(epoch*N + j, 2, 20)
                network.backprop(X_minibatches[l], z_minibatches[l], eta, lmbda)
        network.feedforward(X_test)
        mse_heatmap[i, j] = np.log10(f.MSE(z_test, network.layers[-1].a))

heatmap = sb.heatmap(mse_heatmap, annot=mse_heatmap, cmap='YlGnBu',
                     xticklabels=array_eta, yticklabels=array_lambda,
                     cbar_kws={'label': 'log10(MSE)'})
heatmap.set_xlabel('$\eta$', size=12)
heatmap.set_ylabel('$\lambda$', size=12)
heatmap.invert_xaxis()
heatmap.set_title('RELU + Identity', size=16)
plt.show()
def train(self, epochs, batch_size, x, y, activation, derivative,
          xvalidation, yvalidation, verbose=False):
    '''
    Inputs:
        epochs      = maximum number of epochs
        batch_size  = size of each minibatch
        x, y        = the dataset used for training
        activation  = list of activation functions
        derivative  = list of their derivatives
        xvalidation = validation design matrix used in early stopping
        yvalidation = validation output data used in early stopping
        verbose     = if False, the validation loss is not printed;
                      if True, it is printed each epoch
    Outputs:
        No outputs, but hopefully a well trained network.
    '''
    tmp = int(len(y) / batch_size)
    Niter = min(200, tmp)  # cap the number of minibatches per epoch
    indexes = np.arange(len(y))
    cost = np.empty([epochs])
    self.cost_val = list()
    self.cost_train = list()

    for i in range(epochs):
        for j in range(Niter):
            datapoints = np.random.choice(indexes, size=batch_size, replace=False)
            batch_x = x[datapoints, :]
            batch_y = y[datapoints]
            self.feed(batch_x, activation)
            self.back(batch_x, batch_y, derivative)

        pred_val = self.feed_out(xvalidation, activation)
        pred_train = self.feed_out(batch_x, activation)

        if self.mode == 'regression':
            self.cost_val.append(
                fx.MSE(pred_val.ravel(), yvalidation.ravel()))
            self.cost_train.append(
                fx.MSE(pred_train.ravel(), batch_y.ravel()))
        if self.mode == 'classification':
            self.cost_val.append(
                lrf.cost_log_ols(pred_val.ravel(), yvalidation.T))
            self.cost_train.append(
                lrf.cost_log_ols(pred_train.ravel(), batch_y.T))

        # Early stopping: stop when the validation loss averaged over the last 5
        # epochs is no longer at least early_stop_tol below its average over the
        # last early_stop_nochange epochs
        if i > self.early_stop_nochange:
            avg_indx_full = np.arange(i - self.early_stop_nochange, i)
            avg_indx = np.arange(i - 5, i)
            if -self.early_stop_tol < (np.mean(np.array(self.cost_val)[avg_indx])
                                       - np.mean(np.array(self.cost_val)[avg_indx_full])):
                break

        if verbose:
            print('Epoch', i + 1, 'loss', self.cost_val[i])
scaler.fit(terrain)  # compute the mean and std to be used for later scaling
terrain_scaled = scaler.transform(terrain)  # perform standardization by centering and scaling

# Fixing a set of points
terrain_scaled = terrain_scaled[:n, :n]

# Create mesh of image pixels
x = np.sort(np.linspace(0, 1, terrain_scaled.shape[0]))
y = np.sort(np.linspace(0, 1, terrain_scaled.shape[1]))
x, y = np.meshgrid(x, y)

X = f.design_matrix(x, y, best_degree)
z_tilde = X @ beta_best
z_tilde = z_tilde.reshape(x.shape[0], x.shape[1])
print(f.MSE(terrain_scaled, z_tilde))

X_sklearn = f.design_matrix(x, y, best_degree_sklearn)
z_tilde_sklearn = X_sklearn @ beta_best_sklearn
z_tilde_sklearn = z_tilde_sklearn.reshape(x.shape[0], x.shape[1])
print(f.MSE(terrain_scaled, z_tilde_sklearn))

plt.subplot(131)
plt.imshow(terrain_scaled, cmap='gist_rainbow')
plt.subplot(132)
plt.imshow(z_tilde, cmap='gist_rainbow')
plt.subplot(133)
plt.imshow(z_tilde_sklearn, cmap='gist_rainbow')
plt.show()