def check_layernorm_backward():
    print_formatted('Layernorm backward', 'bold', 'blue')

    np.random.seed(231)
    N, D = 4, 5
    x = 5 * np.random.randn(N, D) + 12
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)

    fx = lambda x: layernorm_forward(x, gamma, beta)[0]
    fg = lambda a: layernorm_forward(x, a, beta)[0]
    fb = lambda b: layernorm_forward(x, gamma, b)[0]

    dx_num = evaluate_numerical_gradient_array(fx, x, dout)
    da_num = evaluate_numerical_gradient_array(fg, gamma.copy(), dout)
    db_num = evaluate_numerical_gradient_array(fb, beta.copy(), dout)

    _, cache = layernorm_forward(x, gamma, beta)
    dx, dgamma, dbeta = layernorm_backward(dout, cache)

    print('(You should expect to see relative errors between 1e-12 and 1e-8)')
    print('dx error: ', relative_error(dx_num, dx))
    print('dgamma error: ', relative_error(da_num, dgamma))
    print('dbeta error: ', relative_error(db_num, dbeta))
    print()
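
# A minimal sketch of what `relative_error` is assumed to compute (the actual
# helper lives elsewhere in this repo): the maximum elementwise relative error,
# with a floor on the denominator to avoid division by zero. The name below is
# hypothetical so it does not shadow the real helper.
def _relative_error_sketch(x, y):
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))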
def check_spatial_groupnorm_backward():
    print_formatted('Spatial groupnorm backward', 'bold', 'blue')

    np.random.seed(231)
    N, C, H, W = 2, 6, 4, 5
    G = 2
    x = 5 * np.random.randn(N, C, H, W) + 12
    gamma = np.random.randn(1, C, 1, 1)
    beta = np.random.randn(1, C, 1, 1)
    dout = np.random.randn(N, C, H, W)
    gn_param = {}

    fx = lambda x: spatial_groupnorm_forward(x, gamma, beta, G, gn_param)[0]
    fg = lambda a: spatial_groupnorm_forward(x, a, beta, G, gn_param)[0]
    fb = lambda b: spatial_groupnorm_forward(x, gamma, b, G, gn_param)[0]

    dx_num = evaluate_numerical_gradient_array(fx, x, dout)
    da_num = evaluate_numerical_gradient_array(fg, gamma, dout)
    db_num = evaluate_numerical_gradient_array(fb, beta, dout)

    _, cache = spatial_groupnorm_forward(x, gamma, beta, G, gn_param)
    dx, dgamma, dbeta = spatial_groupnorm_backward(dout, cache)

    print('(You should expect to see relative errors between 1e-12 and 1e-7)')
    print('dx error: ', relative_error(dx_num, dx))
    print('dgamma error: ', relative_error(da_num, dgamma))
    print('dbeta error: ', relative_error(db_num, dbeta))
    print()
def check_dropout_fc_net():
    print_formatted('Fully connected net with dropout', 'bold', 'blue')

    np.random.seed(231)
    N, D, H1, H2, C = 2, 15, 20, 30, 10
    X = np.random.randn(N, D)
    y = np.random.randint(C, size=(N,))

    print('Relative errors should be around e-6 or less.')
    print('It is fine if for dropout=1 you have W2 error on the order of e-5.')
    print()

    for dropout in [1, 0.75, 0.5]:
        print('Running check with dropout = ', dropout)
        model = FullyConnectedNet(input_dim=D,
                                  hidden_dims=[H1, H2],
                                  num_classes=C,
                                  weight_scale=5e-2,
                                  dropout=dropout,
                                  seed=123)

        loss, grads = model.loss(X, y)
        print('Initial loss: ', loss)

        for name in sorted(grads):
            f = lambda _: model.loss(X, y)[0]
            grad_num = evaluate_numerical_gradient(f,
                                                   model.params[name],
                                                   verbose=False,
                                                   h=1e-5)
            print('%s relative error: %.2e' %
                  (name, relative_error(grad_num, grads[name])))
        print()
def overfit_small_data(plot=False):
    print_formatted('Overfitting small data', 'stage')

    num_train = 50
    small_data = {
        'X_train': X_train[:num_train],
        'y_train': y_train[:num_train],
        'X_val': X_val,
        'y_val': y_val,
    }

    weight_scale = 3e-2
    learning_rate = 1e-3
    update_rule = 'adam'

    model = FullyConnectedNet(input_dim=3072,
                              hidden_dims=[100, 100],
                              num_classes=10,
                              weight_scale=weight_scale)
    solver = Solver(model,
                    small_data,
                    update_rule=update_rule,
                    optim_config={'learning_rate': learning_rate},
                    lr_decay=0.95,
                    num_epochs=20,
                    batch_size=25,
                    print_every=10)
    solver.train()

    if plot:
        plot_stats('loss',
                   solvers={'fc_net': solver},
                   filename='overfitting_loss_history.png')
def check_spatial_batchnorm_forward_train_time():
    print_formatted('Train time spatial batchnorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, C, H, W = 2, 3, 4, 5
    x = 4 * np.random.randn(N, C, H, W) + 10

    print('Before spatial batch normalization:')
    print(' Shape: ', x.shape)
    print(' Means: ', x.mean(axis=(0, 2, 3)))
    print(' Stds: ', x.std(axis=(0, 2, 3)))
    print()

    gamma, beta = np.ones(C), np.zeros(C)
    bn_param = {'mode': 'train'}
    out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
    print('After spatial batch normalization:')
    print('(Means should be close to 0 and stds close to 1)')
    print(' Shape: ', out.shape)
    print(' Means: ', out.mean(axis=(0, 2, 3)))
    print(' Stds: ', out.std(axis=(0, 2, 3)))
    print()

    gamma, beta = np.asarray([3, 4, 5]), np.asarray([6, 7, 8])
    out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
    print('After spatial batch normalization (nontrivial gamma, beta):')
    print('(Means should be close to beta [6, 7, 8] and stds close to gamma [3, 4, 5])')
    print(' Shape: ', out.shape)
    print(' Means: ', out.mean(axis=(0, 2, 3)))
    print(' Stds: ', out.std(axis=(0, 2, 3)))
    print()
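
# Spatial batchnorm is assumed to reduce to vanilla batchnorm by treating every
# spatial location of every image as one sample: move C to the last axis,
# flatten to (N*H*W, C), normalize per channel, and undo the reshape. A sketch
# under that assumption (hypothetical name; assumes the vanilla
# batchnorm_forward is importable here):
def _spatial_batchnorm_forward_sketch(x, gamma, beta, bn_param):
    N, C, H, W = x.shape
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)  # (N*H*W, C)
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return out, cache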
def check_spatial_norms():
    print_formatted('Check spatial norms', 'stage')
    check_spatial_batchnorm_forward_train_time()
    check_spatial_batchnorm_forward_test_time()
    check_spatial_batchnorm_backward()
    check_spatial_groupnorm_forward()
    check_spatial_groupnorm_backward()
def conv_net_overfitting(plot=False):
    print_formatted('Overfitting small data with convnet', 'stage')

    np.random.seed(231)
    num_train = 100
    small_data = {
        'X_train': X_train[:num_train],
        'y_train': y_train[:num_train],
        'X_val': X_val,
        'y_val': y_val,
    }
    # Reshape the flat CIFAR-10 rows to (N, C, H, W) for the convnet.
    small_data['X_train'] = small_data['X_train'].reshape(
        (small_data['X_train'].shape[0], 32, 32, 3)).transpose(0, 3, 1, 2)
    small_data['X_val'] = small_data['X_val'].reshape(
        (small_data['X_val'].shape[0], 32, 32, 3)).transpose(0, 3, 1, 2)

    model = ThreeLayerConvNet(weight_scale=1e-2)
    solver = Solver(model,
                    small_data,
                    num_epochs=15,
                    batch_size=50,
                    update_rule='adam',
                    optim_config={
                        'learning_rate': 1e-3,
                    },
                    print_every=1)
    solver.train()

    if plot:
        plot_stats('loss',
                   'train_val_acc',
                   solvers={'convnet': solver},
                   filename='convnet_overfitting.png')
def train_conv_net():
    print_formatted('Conv net', 'stage')

    data = {
        'X_train': X_train.reshape(
            (X_train.shape[0], 32, 32, 3)).transpose(0, 3, 1, 2),
        'y_train': y_train,
        'X_val': X_val.reshape(
            (X_val.shape[0], 32, 32, 3)).transpose(0, 3, 1, 2),
        'y_val': y_val,
    }

    model = ThreeLayerConvNet(weight_scale=0.001, hidden_dim=500, reg=0.001)
    solver = Solver(model,
                    data,
                    num_epochs=1,
                    batch_size=50,
                    update_rule='adam',
                    optim_config={
                        'learning_rate': 1e-3,
                    },
                    print_every=20,
                    checkpoint_name='convnet')
    solver.train()
def check_batchnorm():
    print_formatted('Batchnorm checks', 'stage')
    check_batchnorm_forward_train_time()
    check_batchnorm_forward_test_time()
    check_batchnorm_backward()
    check_batchnorm_backward_alt()
    check_batchnorm_fc_net()
def check_spatial_groupnorm_forward():
    print_formatted('Spatial groupnorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, C, H, W = 2, 6, 4, 5
    G = 2
    x = 4 * np.random.randn(N, C, H, W) + 10
    x_g = x.reshape((N * G, -1))

    print('Before spatial group normalization:')
    print(' Shape: ', x.shape)
    print(' Means: ', x_g.mean(axis=1))
    print(' Stds: ', x_g.std(axis=1))
    print()

    gamma, beta = np.ones((1, C, 1, 1)), np.zeros((1, C, 1, 1))
    bn_param = {'mode': 'train'}
    out, _ = spatial_groupnorm_forward(x, gamma, beta, G, bn_param)
    out_g = out.reshape((N * G, -1))
    print('After spatial group normalization:')
    print('(Means should be close to 0 and stds close to 1)')
    print(' Shape: ', out.shape)
    print(' Means: ', out_g.mean(axis=1))
    print(' Stds: ', out_g.std(axis=1))
    print()
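
# Group normalization is assumed to split the C channels into G groups and
# normalize each (sample, group) slice over its (C // G) * H * W entries,
# which is exactly what the (N * G, -1) reshape above inspects. A sketch under
# that assumption (hypothetical name; no cache, unlike the real forward):
def _spatial_groupnorm_forward_sketch(x, gamma, beta, G, eps=1e-5):
    N, C, H, W = x.shape
    x_g = x.reshape(N * G, -1)           # one row per (sample, group)
    mu = x_g.mean(axis=1, keepdims=True)
    var = x_g.var(axis=1, keepdims=True)
    x_hat = ((x_g - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    return gamma * x_hat + beta          # gamma, beta broadcast as (1, C, 1, 1)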
def check_batchnorm_forward_test_time():
    print_formatted('Test time batchnorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, D1, D2, D3 = 200, 50, 60, 3
    W1 = np.random.randn(D1, D2)
    W2 = np.random.randn(D2, D3)

    bn_param = {'mode': 'train'}
    gamma = np.ones(D3)
    beta = np.zeros(D3)

    # Warm up the running mean/variance with 50 training-mode passes.
    for t in range(50):
        X = np.random.randn(N, D1)
        a = np.maximum(0, X.dot(W1)).dot(W2)
        batchnorm_forward(a, gamma, beta, bn_param)

    bn_param['mode'] = 'test'
    X = np.random.randn(N, D1)
    a = np.maximum(0, X.dot(W1)).dot(W2)
    a_norm, _ = batchnorm_forward(a, gamma, beta, bn_param)

    print('After batch normalization (test-time):')
    print('(Means should be near 0 and stds near 1)')
    print_mean_std(a_norm, axis=0)
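
# The warm-up loop above matters because train-time batchnorm is assumed to
# maintain exponential running averages of the batch statistics, which the
# test-time branch then uses instead of per-batch statistics. A sketch of the
# assumed update (the momentum value here is an assumption):
def _bn_running_stats_sketch(x, running_mean, running_var, momentum=0.9):
    sample_mean = x.mean(axis=0)
    sample_var = x.var(axis=0)
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var
    # Test time then computes:
    #   gamma * (x - running_mean) / np.sqrt(running_var + eps) + beta
    return running_mean, running_var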
def check_batchnorm_backward():
    print_formatted('Batchnorm backward', 'bold', 'blue')

    np.random.seed(231)
    N, D = 4, 5
    x = 5 * np.random.randn(N, D) + 12
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)

    bn_param = {'mode': 'train'}
    fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]
    fg = lambda a: batchnorm_forward(x, a, beta, bn_param)[0]
    fb = lambda b: batchnorm_forward(x, gamma, b, bn_param)[0]

    dx_num = evaluate_numerical_gradient_array(fx, x, dout)
    da_num = evaluate_numerical_gradient_array(fg, gamma, dout)
    db_num = evaluate_numerical_gradient_array(fb, beta, dout)

    _, cache = batchnorm_forward(x, gamma, beta, bn_param)
    dx, dgamma, dbeta = batchnorm_backward(dout, cache)

    print('(You should expect to see relative errors between 1e-13 and 1e-8)')
    print('dx error: ', relative_error(dx_num, dx))
    print('dgamma error: ', relative_error(da_num, dgamma))
    print('dbeta error: ', relative_error(db_num, dbeta))
    print()
def check_batchnorm_forward_train_time():
    print_formatted('Train time batchnorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, D1, D2, D3 = 200, 50, 60, 3
    X = np.random.randn(N, D1)
    W1 = np.random.randn(D1, D2)
    W2 = np.random.randn(D2, D3)
    a = np.maximum(0, X.dot(W1)).dot(W2)

    print('Before batch normalization:')
    print_mean_std(a, axis=0)

    gamma = np.ones((D3,))
    beta = np.zeros((D3,))
    a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})
    print('After batch normalization (gamma=1, beta=0)')
    print('(Means should be close to 0 and stds close to 1)')
    print_mean_std(a_norm, axis=0)

    gamma = np.asarray([1.0, 2.0, 3.0])
    beta = np.asarray([11.0, 12.0, 13.0])
    a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})
    print('After batch normalization (gamma=', gamma, ', beta=', beta, ')')
    print('(Now means should be close to beta and stds close to gamma)')
    print_mean_std(a_norm, axis=0)
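
# `print_mean_std` is another helper defined elsewhere in the repo; it is
# assumed to just report statistics along the requested axis, roughly:
def _print_mean_std_sketch(x, axis=0):
    print(' Means: ', x.mean(axis=axis))
    print(' Stds: ', x.std(axis=axis))
    print()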
def check_layernorm_forward():
    print_formatted('Layernorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, D1, D2, D3 = 4, 50, 60, 3
    X = np.random.randn(N, D1)
    W1 = np.random.randn(D1, D2)
    W2 = np.random.randn(D2, D3)
    a = np.maximum(0, X.dot(W1)).dot(W2)

    print('Before layer normalization:')
    print_mean_std(a, axis=1)

    gamma = np.ones(D3)
    beta = np.zeros(D3)
    print('After layer normalization (gamma=1, beta=0)')
    print('(Means should be close to 0 and stds close to 1)')
    a_norm, _ = layernorm_forward(a, gamma, beta)
    print_mean_std(a_norm, axis=1)

    gamma = np.asarray([3.0, 3.0, 3.0])
    beta = np.asarray([5.0, 5.0, 5.0])
    print('After layer normalization (gamma=', gamma, ', beta=', beta, ')')
    print('(Now means should be close to beta and stds close to gamma)')
    a_norm, _ = layernorm_forward(a, gamma, beta)
    print_mean_std(a_norm, axis=1)
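
# Unlike batchnorm, layernorm is assumed to normalize each sample over its own
# features (axis=1), so the statistics above hold per row and are independent
# of the rest of the batch. A sketch (hypothetical name; no cache, unlike the
# real forward):
def _layernorm_forward_sketch(x, gamma, beta, eps=1e-5):
    mu = x.mean(axis=1, keepdims=True)
    var = x.var(axis=1, keepdims=True)
    return gamma * (x - mu) / np.sqrt(var + eps) + beta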
def check_batchnorm_backward_alt():
    print_formatted('Batchnorm backward alt', 'bold', 'blue')

    np.random.seed(231)
    N, D = 100, 500
    x = 5 * np.random.randn(N, D) + 12
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)

    bn_param = {'mode': 'train'}
    out, cache = batchnorm_forward(x, gamma, beta, bn_param)

    t1 = time.time()
    dx1, dgamma1, dbeta1 = batchnorm_backward(dout, cache)
    t2 = time.time()
    dx2, dgamma2, dbeta2 = batchnorm_backward_alt(dout, cache)
    t3 = time.time()

    print('dx difference: ', relative_error(dx1, dx2))
    print('dgamma difference: ', relative_error(dgamma1, dgamma2))
    print('dbeta difference: ', relative_error(dbeta1, dbeta2))
    print('batchnorm_backward_alt is %.2f times faster than batchnorm_backward'
          % ((t2 - t1) / (t3 - t2)))
    print()
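
# The speedup measured above comes from collapsing the staged computation-graph
# backward pass into the closed-form batchnorm gradient. A sketch of the usual
# simplified expression (the argument layout is an assumption; the repo's cache
# may be organized differently):
def _batchnorm_backward_alt_sketch(dout, x_hat, gamma, std):
    N = dout.shape[0]
    dbeta = dout.sum(axis=0)
    dgamma = (dout * x_hat).sum(axis=0)
    dx = (gamma / (N * std)) * (N * dout - dbeta - x_hat * dgamma)
    return dx, dgamma, dbeta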
def compare_update_rules(plot=False):
    print_formatted('Update rules', 'stage')

    num_train = 4000
    small_data = {
        'X_train': X_train[:num_train],
        'y_train': y_train[:num_train],
        'X_val': X_val,
        'y_val': y_val,
    }

    learning_rates = {
        'sgd': 1e-2,
        'sgd_momentum': 1e-2,
        'nesterov_momentum': 1e-2,
        'adagrad': 1e-4,
        'rmsprop': 1e-4,
        'adam': 1e-3
    }

    solvers = {}
    for update_rule in [
            'sgd', 'sgd_momentum', 'nesterov_momentum', 'adagrad', 'rmsprop',
            'adam'
    ]:
        print_formatted('running with ' + update_rule, 'bold', 'blue')
        model = FullyConnectedNet(input_dim=3072,
                                  hidden_dims=[100] * 5,
                                  num_classes=10,
                                  weight_scale=5e-2)
        solver = Solver(model,
                        small_data,
                        num_epochs=5,
                        batch_size=100,
                        update_rule=update_rule,
                        optim_config={
                            'learning_rate': learning_rates[update_rule],
                        },
                        verbose=True)
        solvers[update_rule] = solver
        solver.train()
        print()

    if plot:
        plot_stats('loss',
                   'train_acc',
                   'val_acc',
                   solvers=solvers,
                   filename='update_rules_comparison.png')
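
# For reference, a sketch of the classical SGD+momentum update that the
# 'sgd_momentum' rule above is assumed to implement (a sketch, not the repo's
# optim code; the default momentum of 0.9 is an assumption):
def _sgd_momentum_sketch(w, dw, config):
    v = config.get('velocity', np.zeros_like(w))
    v = config.get('momentum', 0.9) * v - config['learning_rate'] * dw
    config['velocity'] = v
    return w + v, config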
def check_dropout_backward():
    print_formatted('Dropout backward', 'bold', 'blue')

    np.random.seed(231)
    x = np.random.randn(10, 10) + 10
    dout = np.random.randn(*x.shape)

    dropout_param = {'mode': 'train', 'p': 0.2, 'seed': 123}
    out, cache = dropout_forward(x, dropout_param)
    dx = dropout_backward(dout, cache)
    dx_num = evaluate_numerical_gradient_array(
        lambda xx: dropout_forward(xx, dropout_param)[0], x, dout)

    print('(Relative error should be around e-10 or less)')
    print('dx relative error: ', relative_error(dx, dx_num))
    print()
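
# `evaluate_numerical_gradient_array` is assumed to use centered differences:
# perturb each entry of x by +/- h, rerun the forward pass, and contract the
# output difference with the upstream gradient df. A sketch (hypothetical
# name):
def _numerical_gradient_array_sketch(f, x, df, h=1e-5):
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval  # restore the original value
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad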
def check_dropout_forward():
    print_formatted('Dropout forward', 'bold', 'blue')

    np.random.seed(231)
    x = np.random.randn(500, 500) + 10

    for p in [0.25, 0.4, 0.7]:
        out, _ = dropout_forward(x, {'mode': 'train', 'p': p})
        out_test, _ = dropout_forward(x, {'mode': 'test', 'p': p})

        print('Running tests with p = ', p)
        print('Mean of input: ', x.mean())
        print('Mean of train-time output: ', out.mean())
        print('Mean of test-time output: ', out_test.mean())
        print('Fraction of train-time output set to zero: ', (out == 0).mean())
        print('Fraction of test-time output set to zero: ',
              (out_test == 0).mean())
        print()
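
# The means and zero fractions above are what inverted dropout predicts when p
# is the keep probability: train time zeroes activations with probability
# 1 - p and scales survivors by 1/p, so expectations match the unchanged
# test-time output. A sketch (hypothetical name; the real dropout_forward also
# handles seeding and returns a cache):
def _inverted_dropout_sketch(x, p, mode):
    if mode == 'train':
        mask = (np.random.rand(*x.shape) < p) / p
        return x * mask
    return x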
def visualize_convnet_filters():
    print_formatted('Visualizing convnet filters', 'stage')

    with open('convnet_epoch_1.pkl', 'rb') as f:
        checkpoint = pickle.load(f)
    W1 = checkpoint['model'].params['W1'].transpose(0, 2, 3, 1)

    N, H, W, C = W1.shape
    grid_size = int(ceil(sqrt(N)))
    for i in range(N):
        img = W1[i]
        # Rescale each filter to [0, 255] for display.
        low, high = np.min(img), np.max(img)
        rgb_img = 255 * (img - low) / (high - low)
        plt.subplot(grid_size, grid_size, i + 1)
        plt.imshow(rgb_img.astype('uint8'))
        plt.axis('off')
    plt.gcf().set_size_inches(10, 10)
    plt.savefig('plots/convnet_filters.png')
def check_spatial_batchnorm_forward_test_time():
    print_formatted('Test time spatial batchnorm forward', 'bold', 'blue')

    np.random.seed(231)
    N, C, H, W = 10, 4, 11, 12

    bn_param = {'mode': 'train'}
    gamma = np.ones(C)
    beta = np.zeros(C)
    for t in range(50):
        x = 2.3 * np.random.randn(N, C, H, W) + 13
        spatial_batchnorm_forward(x, gamma, beta, bn_param)

    bn_param['mode'] = 'test'
    x = 2.3 * np.random.randn(N, C, H, W) + 13
    a_norm, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)

    print('After spatial batch normalization (test-time):')
    print('(Means should be near 0 and stds near 1)')
    print(' means: ', a_norm.mean(axis=(0, 2, 3)))
    print(' stds: ', a_norm.std(axis=(0, 2, 3)))
    print()
def train_two_layer(plot=False):
    print_formatted('Two layer net', 'stage')

    model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10)
    data = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val
    }
    solver = Solver(model,
                    data,
                    num_epochs=1,
                    print_every=100,
                    batch_size=100,
                    lr_decay=0.95)
    solver.train()

    if plot:
        plot_stats('loss',
                   'train_val_acc',
                   solvers={'two_layer_net': solver},
                   filename='two_layer_net_stats.png')
def train_with_layernorm(plot=False):
    print_formatted('Layer normalization', 'stage')

    hidden_dims = [100, 100, 100, 100, 100]
    weight_scale = 2e-2

    num_train = 1000
    small_data = {
        'X_train': X_train[:num_train],
        'y_train': y_train[:num_train],
        'X_val': X_val,
        'y_val': y_val,
    }

    print_formatted('without layernorm', 'bold', 'blue')
    model = FullyConnectedNet(input_dim=3072,
                              hidden_dims=hidden_dims,
                              num_classes=10,
                              weight_scale=weight_scale)
    solver = Solver(model,
                    small_data,
                    update_rule='adam',
                    optim_config={
                        'learning_rate': 1e-3,
                    },
                    num_epochs=10,
                    batch_size=50,
                    print_every=20)
    solver.train()
    print()

    print_formatted('with layernorm', 'bold', 'blue')
    ln_model = FullyConnectedNet(input_dim=3072,
                                 hidden_dims=hidden_dims,
                                 num_classes=10,
                                 weight_scale=weight_scale,
                                 normalization='layernorm')
    ln_solver = Solver(ln_model,
                       small_data,
                       update_rule='adam',
                       optim_config={
                           'learning_rate': 1e-3,
                       },
                       num_epochs=10,
                       batch_size=50,
                       print_every=20)
    ln_solver.train()

    if plot:
        plot_stats('loss',
                   'train_acc',
                   'val_acc',
                   solvers={
                       'baseline': solver,
                       'with_norm': ln_solver
                   },
                   filename='layernorm.png')
def check_batchnorm_fc_net():
    print_formatted('Fully connected net with batchnorm', 'bold', 'blue')

    np.random.seed(231)
    N, D, H1, H2, C = 2, 15, 20, 30, 10
    X = np.random.randn(N, D)
    y = np.random.randint(C, size=(N,))

    print('Relative errors for W should be between 1e-10 and 1e-4.')
    print('Relative errors for b should be between 1e-10 and 1e-8.')
    print('Relative errors for gammas and betas should be between 1e-9 and 1e-8.')
    print()

    for reg in [0, 3.14]:
        print('Running check with reg = ', reg)
        model = FullyConnectedNet(input_dim=D,
                                  hidden_dims=[H1, H2],
                                  num_classes=C,
                                  weight_scale=5e-2,
                                  reg=reg,
                                  normalization='batchnorm')

        loss, grads = model.loss(X, y)
        print('Initial loss: ', loss)

        for name in sorted(grads):
            f = lambda _: model.loss(X, y)[0]
            grad_num = evaluate_numerical_gradient(f,
                                                   model.params[name],
                                                   verbose=False,
                                                   h=1e-5)
            print('%s relative error: %.2e' %
                  (name, relative_error(grad_num, grads[name])))
        if reg == 0:
            print()
def train_with_dropout(plot=False):
    print_formatted('Dropout', 'stage')

    np.random.seed(231)
    num_train = 500
    small_data = {
        'X_train': X_train[:num_train],
        'y_train': y_train[:num_train],
        'X_val': X_val,
        'y_val': y_val,
    }

    solvers = {}
    dropout_choices = [1, 0.25]
    for dropout in dropout_choices:
        if dropout == 1:
            print_formatted('without dropout, p = 1', 'bold', 'blue')
        else:
            print_formatted('with dropout, p = %.2f' % dropout, 'bold', 'blue')

        model = FullyConnectedNet(input_dim=3072,
                                  hidden_dims=[500],
                                  num_classes=10,
                                  dropout=dropout)
        solver = Solver(model,
                        small_data,
                        update_rule='adam',
                        optim_config={
                            'learning_rate': 5e-4,
                        },
                        num_epochs=25,
                        batch_size=100,
                        print_every=100)
        solver.train()
        solvers[dropout] = solver

        if dropout == 1:
            print()

    if plot:
        plot_stats('train_acc',
                   'val_acc',
                   solvers={
                       '1.00 dropout': solvers[1],
                       '0.25 dropout': solvers[0.25]
                   },
                   filename='dropout.png')
def train_best_fc_model(plot=False):
    print_formatted('Best fully connected net', 'stage')

    hidden_dims = [100, 100, 100]
    weight_scale = 2e-2
    num_epochs = 10
    dropout = 1

    data = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test,
    }

    print_formatted('training', 'bold', 'blue')
    model = FullyConnectedNet(input_dim=3072,
                              hidden_dims=hidden_dims,
                              num_classes=10,
                              weight_scale=weight_scale,
                              normalization='batchnorm',
                              dropout=dropout)
    solver = Solver(model,
                    data,
                    update_rule='adam',
                    optim_config={
                        'learning_rate': 1e-3,
                    },
                    num_epochs=num_epochs,
                    batch_size=50,
                    print_every=100)
    solver.train()
    print()

    if plot:
        plot_stats('loss', 'train_val_acc', solvers={'best_fc': solver})

    print_formatted('evaluating', 'bold', 'blue')
    y_test_pred = np.argmax(model.loss(data['X_test']), axis=1)
    y_val_pred = np.argmax(model.loss(data['X_val']), axis=1)
    print('Validation set accuracy: ', (y_val_pred == data['y_val']).mean())
    print('Test set accuracy: ', (y_test_pred == data['y_test']).mean())
def check_layernorm():
    print_formatted('Layernorm checks', 'stage')
    check_layernorm_forward()
    check_layernorm_backward()
    check_layernorm_fc_net()
def check_dropout():
    print_formatted('Dropout checks', 'stage')
    check_dropout_forward()
    check_dropout_backward()
    check_dropout_fc_net()
import pickle
from math import ceil, sqrt

import matplotlib.pyplot as plt
import numpy as np

from batchnorm_checks import check_batchnorm
from conv_checks import check_conv
from dropout_checks import check_dropout
from layernorm_checks import check_layernorm
from spatial_norms_checks import check_spatial_norms
# `print_formatted` and `load_CIFAR10_sample` are assumed to live in a local
# utils module; the module name here is an assumption.
from utils import load_CIFAR10_sample, print_formatted

''' Hyperparameters '''

subtract_mean = True
normalize = False

''' Data '''

print_formatted('Load data', 'stage')

X_train, y_train, X_val, y_val, X_test, y_test = load_CIFAR10_sample(
    'datasets/cifar-10-batches-py',
    num_train=49000,
    num_val=1000,
    num_test=10000,
    mean_subtr=subtract_mean,
    norm=normalize)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_val shape:', X_val.shape)
print('y_val shape:', y_val.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

''' Actions '''