def run(n_runs):
    N_iters = N_epochs
    N_meta_iter = n_runs
    parser, loss_fun = make_toy_funs()
    N_weight_types = len(parser.names)
    N_weights = parser.vect.size

    # Hyperparameters optimized by the meta-level SGD: per-iteration step sizes
    # (log scale), momenta (unconstrained via inverse-sigmoid), and initial velocity.
    hyperparams = VectorParser()
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    hyperparams['V0']              = np.full(N_weights, init_V0)

    all_learning_curves = []
    all_param_curves = []
    all_x = []

    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        params_curve = []
        def callback(x, i):
            params_curve.append(x)
            learning_curve.append(loss_fun(x))
        def indexed_loss_fun(w, log_L2_reg, j):
            return loss_fun(w)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = init_params
        V0 = cur_hyperparams['V0']
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = 0.0
        results = sgd3(indexed_loss_fun, loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)

        # Chain rule through the reparameterization of each hyperparameter.
        hypergrads = hyperparams.new_vect(np.zeros(hyperparams.vect.shape))
        hypergrads['V0']             = results['dMd_v'] * 0   # V0 held fixed.
        hypergrads['log_alphas']     = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas'] *
                                        d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        all_param_curves.append(params_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field : [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i, g):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = simple_sgd(hyperloss_grad, hyperparams.vect, meta_callback,
                              N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    meta_results['all_param_curves'] = all_param_curves
    parser.vect = None   # No need to pickle zeros.
    return meta_results, parser
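# The factors applied to 'dMd_alphas' and 'dMd_betas' above push the hypergradients
# back through the exp / sigmoid reparameterizations of the step sizes and momenta.
# A minimal, self-contained sanity check of that chain rule, assuming autograd is
# available; the function below is illustrative and not part of this module.
import autograd.numpy as anp
from autograd import grad as ag_grad

def check_log_reparam():
    # For any smooth f: d/d(log_a) f(exp(log_a)) = f'(exp(log_a)) * exp(log_a),
    # which is why the gradient w.r.t. log_alphas is dMd_alphas * alphas.
    f = lambda a: anp.sin(a) + 0.5 * a**2
    log_a = 0.3
    a = anp.exp(log_a)
    direct  = ag_grad(lambda la: f(anp.exp(la)))(log_a)
    chained = ag_grad(f)(a) * a
    assert anp.allclose(direct, chained)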
def test_simple_sgd():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    (loss_fun, true_argmin) = make_optimization_problem(N_weights)
    x_min = simple_sgd(grad(loss_fun), W0)
    assert np.allclose(x_min, true_argmin, rtol=1e-3, atol=1e-4), \
        "Diffs are: {0}".format(x_min - true_argmin)
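# The test above assumes `make_optimization_problem` returns a loss function
# together with its known minimizer. A hypothetical stand-in consistent with how
# the test calls it (not necessarily the project's own helper), using
# autograd.numpy so `grad(loss_fun)` can differentiate it:
import autograd.numpy as ag_np
import numpy.random as npr_example

def make_optimization_problem(N_weights):
    target = npr_example.randn(N_weights)
    def loss_fun(w):
        return ag_np.sum((w - target)**2)   # Quadratic bowl; unique minimum at `target`.
    return loss_fun, target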
def run():
    N_iters = N_epochs
    parser, loss_fun = make_toy_funs()
    N_weight_types = len(parser.names)
    N_weights = parser.vect.size

    # Hyperparameters optimized by the meta-level SGD.
    hyperparams = VectorParser()
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    hyperparams['V0']              = np.full(N_weights, init_V0)

    all_learning_curves = []
    all_param_curves = []
    all_x = []

    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        params_curve = []
        def callback(x, i):
            params_curve.append(x)
            learning_curve.append(loss_fun(x))
        def indexed_loss_fun(w, log_L2_reg, j):
            return loss_fun(w)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = init_params
        V0 = cur_hyperparams['V0']
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = 0.0
        results = sgd3(indexed_loss_fun, loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)

        # Chain rule through the reparameterization of each hyperparameter.
        hypergrads = hyperparams.copy()
        hypergrads['V0']             = results['dMd_v'] * 0   # V0 held fixed.
        hypergrads['log_alphas']     = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas'] *
                                        d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        all_param_curves.append(params_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field : [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i, g):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = simple_sgd(hyperloss_grad, hyperparams.vect, meta_callback,
                              N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    meta_results['all_param_curves'] = all_param_curves
    parser.vect = None   # No need to pickle zeros.
    return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    # Hyperparameters optimized by the meta-level SGD, one value per weight type
    # (and per iteration for the step sizes and momenta).
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad) /
                (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = simple_sgd(hyperloss_grad, hyperparams.vect, meta_callback,
                              N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None   # No need to pickle zeros.
    return meta_results, parser, parsed_init_hypergrad
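# Experiments in this style typically pickle the returned results for later
# plotting, which is why `parser.vect` is cleared before returning. A minimal
# driver sketch, assuming this file is run as a script; the output filename is
# illustrative only.
import pickle

if __name__ == '__main__':
    results = run()
    with open('results.pkl', 'w') as f:
        pickle.dump(results, f, 1)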