def test_sgd_parser():
    N_weights = 6
    W0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 4
    batch_idxs = BatchList(N_data, batch_size)
    parser = VectorParser()
    parser.add_shape('first', [2,])
    parser.add_shape('second', [1,])
    parser.add_shape('third', [3,])
    N_weight_types = 3
    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, i=None):
        idxs = batch_idxs.all_idxs if i is None else batch_idxs[i % len(batch_idxs)]
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def full_loss(params):
        (W0, alphas, betas, meta) = params
        result = sgd_parsed(grad(loss_fun), kylist(W0, alphas, betas, meta), parser)
        return loss_fun(result, meta)

    d_num = nd(full_loss, (W0, alphas, betas, meta))
    d_an_fun = grad(full_loss)
    d_an = d_an_fun([W0, alphas, betas, meta])
    for i, (an, num) in enumerate(zip(d_an, d_num[0])):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
def make_parabola(d):
    parser = VectorParser()
    parser.add_shape('weights', d)
    dimscale = np.exp(np.linspace(-3, 3, d))
    offset = npr.randn(d)

    def loss(w, X=0.0, T=0.0, L2_reg=0.0):
        return np.dot((w - offset) * dimscale, (w - offset))

    return parser, loss
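# Added note: make_parabola builds an axis-aligned quadratic bowl whose curvature spans
# roughly e^-3 to e^3 across dimensions and whose minimum (loss exactly zero) sits at the
# random `offset`. The unused X, T and L2_reg arguments appear to exist only so this toy
# loss matches the call signature of the neural-net losses used elsewhere in this code.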
def make_toy_funs():
    parser = VectorParser()
    parser.add_shape('weights', 2)

    def rosenbrock(x):
        return sum(100.0 * (x[1:] - x[:-1]**2.0)**2.0 + (1 - x[:-1])**2.0)

    def loss(W_vect, X=0.0, T=0.0, L2_reg=0.0):
        return 500 * logit(rosenbrock(W_vect) / 500)

    return parser, loss
def make_toy_funs():
    parser = VectorParser()
    parser.add_shape('weights', 2)

    def rosenbrock(w):
        x = w[1:]
        y = w[:-1]
        return sum(100.0 * (x - y**2.0)**2.0 + (1 - y)**2.0 + 200.0 * y)

    def loss(W_vect, X=0.0, T=0.0, L2_reg=0.0):
        return 800 * logit(rosenbrock(W_vect) / 500)

    return parser, loss
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)
    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
def make_transform(layer_corr):
    diag = np.eye(N_scripts)
    full = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_parser = VectorParser()
    for i_layer, corr in enumerate(layer_corr):
        transform_parser[i_layer] = (1 - corr) * diag + corr * full
    return transform_parser
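# Added illustration (not part of the original experiments): make_transform interpolates,
# per layer, between an identity matrix (corr = 0, each script keeps its own weights) and
# a uniform averaging matrix (corr = 1, every script gets the mean over scripts). The
# helper below is a self-contained sketch with a hypothetical n_scripts argument,
# included only to show the shape of the matrices involved.
def _example_transform_matrix(corr, n_scripts=3):
    import numpy as np
    diag = np.eye(n_scripts)                                  # no sharing across scripts
    full = np.full((n_scripts, n_scripts), 1.0 / n_scripts)   # complete sharing (averaging)
    return (1 - corr) * diag + corr * full
# _example_transform_matrix(0.0) is the identity; _example_transform_matrix(1.0) has every
# entry equal to 1/3, so each script's weights become the mean of all scripts' weights.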
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive, so thin it out and store in low precision.
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter
            idxs = npr.RandomState(seed).randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper, g):
        print "Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    N_iters = N_epochs
    parser, loss_fun = make_toy_funs()
    N_weight_types = len(parser.names)
    N_weights = parser.vect.size
    hyperparams = VectorParser()
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    hyperparams['V0'] = np.full(N_weights, init_V0)

    all_learning_curves = []
    all_param_curves = []
    all_x = []

    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        params_curve = []
        def callback(x, i):
            params_curve.append(x)
            learning_curve.append(loss_fun(x))

        def indexed_loss_fun(w, log_L2_reg, j):
            return loss_fun(w)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = init_params
        V0 = cur_hyperparams['V0']
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = 0.0
        results = sgd3(indexed_loss_fun, loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['V0'] = results['dMd_v'] * 0
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas']
                                        * d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        all_param_curves.append(params_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i, g):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = simple_sgd(hyperloss_grad, hyperparams.vect, meta_callback,
                              N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    meta_results['all_param_curves'] = all_param_curves
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def make_transform(N_scripts, corr):
    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = (1 - corr) * uncorrelated_mat + corr * fully_correlated_mat
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        transform_parser[i_layer] = transform_mat
    return transform_parser
def build_hypervect(init_log_alphas, init_invlogit_betas, init_log_param_scale):
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    return hyperparams
def run():
    parser, loss_fun = make_parabola(dimension)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def primal_optimizer(hyperparam_vect, i_hyper):
        learning_curve = []
        def callback(x, i_iter):
            learning_curve.append(loss_fun(x))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(hash(i_hyper)).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        callback(W_opt, N_iters)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, gamma=0.0)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    N_iters = N_epochs
    parser, loss_fun = make_toy_funs()
    N_weights = parser.vect.size
    hyperparams = VectorParser()
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    hyperparams['V0'] = np.full(N_weights, init_V0)

    forward_path = []
    forward_learning_curve = []
    def fwd_callback(x, i):
        print type(x[0])
        forward_path.append(x.copy())
        forward_learning_curve.append(loss_fun(x))

    reverse_path = []
    reverse_learning_curve = []
    def reverse_callback(x, i):
        reverse_path.append(x.copy())
        reverse_learning_curve.append(loss_fun(x))

    def indexed_loss_fun(w, log_L2_reg, j):
        return loss_fun(w)

    cur_hyperparams = hyperparams
    W0 = init_params
    V0 = cur_hyperparams['V0']
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    log_L2_reg = 0.0
    sgd3_naive(indexed_loss_fun, W0, V0, alphas, betas, log_L2_reg,
               fwd_callback=fwd_callback, reverse_callback=reverse_callback)
    return forward_path, forward_learning_curve, reverse_path, reverse_learning_curve
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState((seed))
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg) \
        + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = (cur_primal_results['weights'],
                                  cur_primal_results['learning_curve'])
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == 0 or i_iter == N_iters - 1:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter)

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    # Now do a line search along that direction.
    log_stepsizes = np.linspace(-init_log_alphas, init_log_alphas * 4, N_points_in_line_search)
    for log_stepsize in log_stepsizes:  # Loop variable renamed so it doesn't clobber the array returned below.
        hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), log_stepsize)
        meta_callback(hyperparams.vect, 0)  # Use the same random seed every time.

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, log_stepsizes
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_iters, init_log_param_scale)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, gamma=0.0)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = (cur_primal_results['weights'],
                                  cur_primal_results['learning_curve'])
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    hypergrads = np.zeros((N_meta_iter, len(initial_hypergrad)))
    for i in xrange(N_meta_iter):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    avg_hypergrad = np.mean(hypergrads, axis=0)
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)
    parser.vect = None  # No need to pickle zeros
    return parser, parsed_avg_hypergrad
def run(): RS = RandomState((seed, "top_rs")) all_data = omniglot.load_flipped_alphabets() train_data, tests_data = random_partition(all_data, RS, [12, 3]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) transform_parser = make_transform([0] * N_layers) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def likelihood_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([ loss_fun(w[i], **script_data) for i, script_data in enumerate(data) ]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def train_z(data, transform_vect, RS): def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = likelihood_loss(w_vect, data) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss) / N_scripts) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_init_scale) return sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters) def train_sharing(): def hyperloss(transform_vect, i_hyper): RS = RandomState((seed, i_hyper, "hyperloss")) cur_train_data, cur_valid_data = random_partition( train_data, RS, [10, 2]) z_vect_final = train_z(cur_train_data, transform_vect, RS) w_vect_final = transform_weights(z_vect_final, transform_vect) return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts hypergrad = grad(hyperloss) cur_transform_vect = make_transform([init_script_corr] * N_layers).vect for i_hyper in range(N_meta_iter): print "Hyper iter {0}".format(i_hyper) grad_transform = hypergrad(cur_transform_vect, i_hyper) cur_transform_vect = cur_transform_vect - grad_transform * meta_alpha return cur_transform_vect transform_vects, train_losses, tests_losses = {}, {}, {} transform_vects['no_sharing'] = make_transform([0, 0, 0]).vect transform_vects['full_sharing'] = make_transform([1, 0, 0]).vect transform_vects['learned_sharing'] = train_sharing() for name in transform_vects.keys(): RS = RandomState("final_training") tv = transform_vects[name] trained_z = train_z(train_data, tv, RS) trained_w = transform_weights(trained_z, tv) train_losses[name] = likelihood_loss(trained_w, train_data) / N_scripts tests_losses[name] = likelihood_loss(trained_w, tests_data) / N_scripts print "{0} : train: {1}, test: {2}".format(name, train_losses[name], tests_losses[name]) return transform_parser, transform_vects, train_losses, tests_losses
def run(): """Three different parsers: w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script script_parser[i_script] : weights vector for each script transform_parser[i_layer] : transform matrix (scripts x scripts) for each alphabet""" RS = RandomState((seed, "top_rs")) train_data, valid_data, tests_data = omniglot.load_data_split( [11, 2, 2], RS, num_alphabets=N_scripts) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size transform_parser = make_transform(N_scripts, script_corr_init) script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def total_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2_init) results = defaultdict(list) def hyperloss(transform_vect, i_hyper, record_results=True): RS = RandomState((seed, i_hyper, "hyperloss")) def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = total_loss(w_vect, train_data) reg = regularization(z_vect) if VERBOSE and record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format( i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None) w_vect_final = transform_weights(z_vect_final, transform_vect) valid_loss = total_loss(w_vect_final, valid_data) if record_results: results['valid_loss'].append(getval(valid_loss) / N_scripts) results['train_loss'].append(total_loss(w_vect_final, train_data) / N_scripts) results['tests_loss'].append(total_loss(w_vect_final, tests_data) / N_scripts) return valid_loss grad_transform = grad(hyperloss)(transform_parser.vect, 0, record_results=False) for i, d in enumerate(line_search_dists): new_transform_vect = transform_parser.vect - d * grad_transform hyperloss(new_transform_vect, 0, record_results=True) print "Hyper iter {0}".format(i) print "Results", {k : v[-1] for k, v in results.iteritems()} grad_transform_dict = transform_parser.new_vect(grad_transform).as_dict() return results, grad_transform_dict
def run(): (train_images, train_labels),\ (valid_images, valid_labels),\ (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests) batch_idxs = BatchList(N_train, batch_size) N_iters = N_epochs * len(batch_idxs) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) def indexed_loss_fun(w, log_L2_reg, i): idxs = batch_idxs[i % len(batch_idxs)] partial_vects = [ np.full(parser[name].size, np.exp(log_L2_reg[i])) for i, name in enumerate(parser.names) ] L2_reg_vect = np.concatenate(partial_vects, axis=0) return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect) def train_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=train_images, T=train_labels) def valid_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=valid_images, T=valid_labels) def tests_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=tests_images, T=tests_labels) all_learning_curves = [] all_x = [] def hyperloss_grad(hyperparam_vect, i): learning_curve = [] def callback(x, i): if i % len(batch_idxs) == 0: learning_curve.append( loss_fun(x, X=train_images, T=train_labels)) npr.seed(i) N_weights = parser.vect.size V0 = np.zeros(N_weights) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) layer_param_scale = [ np.full(parser[name].size, np.exp(cur_hyperparams['log_param_scale'][i])) for i, name in enumerate(parser.names) ] W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) log_L2_reg = cur_hyperparams['log_L2_reg'] results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0, alphas, betas, log_L2_reg, callback=callback) hypergrads = hyperparams.copy() hypergrads['log_L2_reg'] = results['dMd_meta'] weights_grad = parser.new_vect(W0 * results['dMd_x']) hypergrads['log_param_scale'] = [ np.sum(weights_grad[name]) for name in parser.names ] hypergrads['log_alphas'] = results['dMd_alphas'] * alphas hypergrads['invlogit_betas'] = ( results['dMd_betas'] * d_logit(cur_hyperparams['invlogit_betas'])) all_x.append(results['x_final']) all_learning_curves.append(learning_curve) return hypergrads.vect add_fields = ['train_loss', 'valid_loss', 'tests_loss'] meta_results = {field: [] for field in add_fields + hyperparams.names} def meta_callback(hyperparam_vect, i): print "Meta iter {0}".format(i) x = all_x[-1] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) log_L2_reg = cur_hyperparams['log_L2_reg'] for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(train_loss_fun(x)) meta_results['valid_loss'].append(valid_loss_fun(x)) meta_results['tests_loss'].append(tests_loss_fun(x)) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) meta_results['all_learning_curves'] = all_learning_curves parser.vect = None # No need to pickle zeros return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    # Only uses two different regularization hyperparameters, one for each layer?
    N_weight_types = len(parser.names)  # = 2
    print(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_iters, init_log_param_scale)  # Don't update scale.
    # TODO: remove scale from gradient, then?

    exact_metagrad = VectorParser()
    exact_metagrad['log_L2_reg'] = fill_parser(parser, hyperparams['log_L2_reg'])  # np.zeros(N_weight_types)
    exact_metagrad['log_param_scale'] = fill_parser(parser, fixed_hyperparams['log_param_scale'])  # np.zeros(N_weight_types)
    exact_metagrad['log_alphas'] = np.zeros(N_iters)
    exact_metagrad['invlogit_betas'] = np.zeros(N_iters)

    exact_metagrad2 = VectorParser()
    exact_metagrad2['log_L2_reg'] = np.zeros(N_weight_types)
    exact_metagrad2['log_param_scale'] = np.zeros(N_weight_types)
    exact_metagrad2['log_alphas'] = np.zeros(N_iters)
    exact_metagrad2['invlogit_betas'] = np.zeros(N_iters)

    #exact_metagrad = exact_metagradV.vect
    #print(hyperparams.vect)
    #exact_metagrad = [np.zeros(N_weight_types), np.zeros(N_weight_types),
    #                  np.zeros(N_iters), np.zeros(N_iters)]  # initialize

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:  # N_batches=10 times
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        # TODO: why doesn't the following line work with N_iter=1?
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))  # Don't update scale.
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        # TODO: put on the proper scale; no SGD on log/invlogit scale.
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])  # TODO: check this
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     exact_metagrad, callback)
        #W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    # This is where the chain rule happens: dhyperloss/dW_opt times dW_opt/dhyperparam_vect;
    # the first term comes from the SGD run.
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    #def meta_callback(hyperparam_vect, i_hyper, metagrad):
    def meta_callback(hyperparam_vect, i_hyper, metagrad, exact_metagrad=exact_metagrad):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # These are the unregularized losses below; the default sets L2_reg=0.0.
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['train_err'].append(frac_err(x, **train_data))
        meta_results['valid_err'].append(frac_err(x, **valid_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        print("metagrad", len(metagrad))
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        meta_results['meta_grad_angle'].append(
            np.dot(old_metagrad[0], metagrad)
            / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        # Michael: added comparisons with the exact metagrad here.
        # (2) Angle condition: more strongly, is the cosine of the angle between the two
        #     strictly bounded away from 0?
        # (3) Length: since hypergradient optimization procedures do not necessarily use a
        #     proper line search, it may also be important for the approximate hypergradient
        #     to have a length comparable to the true hypergradient.
        exact_metagrad2['log_L2_reg'] = [sum(exact_metagrad['log_L2_reg'][0:7840]),
                                         sum(exact_metagrad['log_L2_reg'][7840:7850])]
        exact_metagrad2['log_param_scale'] = [sum(exact_metagrad['log_param_scale'][0:7840]),
                                              sum(exact_metagrad['log_param_scale'][7840:7850])]
        exact_metagrad2['log_alphas'] = exact_metagrad['log_alphas']
        exact_metagrad2['invlogit_betas'] = exact_metagrad['invlogit_betas']
        meta_results['exact_meta_grad_magnitude'].append(np.linalg.norm(exact_metagrad2.vect))
        meta_results['DrMAD_exact_angle'].append(
            np.dot(exact_metagrad2.vect, metagrad)
            / (np.linalg.norm(metagrad) * np.linalg.norm(exact_metagrad2.vect)))
        # TODO: do the above for the parameters separately? E.g. check log_alphas separately.
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])  # Michael: train -> tests

    # final_result = adam(hyperloss_grad, hyperparams.vect,
    #                     meta_callback, N_meta_iter, meta_alpha)
    final_result = adam(hyperloss_grad, hyperparams.vect, exact_metagrad, meta_callback,
                        N_meta_iter, meta_alpha)
    # Write a modified adam that ignores the exact hypergrad in sgd4_mad_with_exact.
    #meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
        if callback: callback(x, i, g)
        m = b1t * g + (1 - b1t) * m
        v = b2 * (g**2) + (1 - b2) * v
        mhat = m / (1 - (1 - b1)**(i + 1))
        vhat = v / (1 - (1 - b2)**(i + 1))
        x -= step_size * mhat / (np.sqrt(vhat) + eps)
    return x

# --
# Make NN functions

parser = VectorParser()
for i, shape in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
    parser.add_shape(('weights', i), shape)
    parser.add_shape(('biases', i), (1, shape[1]))

def pred_fun(W_vect, X):
    """Outputs normalized log-probabilities."""
    W = parser.new_vect(W_vect)
    cur_units = X
    N_iter = len(layer_sizes) - 1
    for i in range(N_iter):
        cur_W = W[('weights', i)]
        cur_B = W[('biases', i)]
        cur_units = np.dot(cur_units, cur_W) + cur_B
        if i == (N_iter - 1):
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    def build_hypervect(init_log_alphas, init_invlogit_betas, init_log_param_scale):
        hyperparams = VectorParser()
        hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
        hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
        hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
        return hyperparams

    hyperparams = build_hypervect(init_log_alphas, init_invlogit_betas,
                                  init_log_param_scale)  # Built just for the parser.
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def whetlab_optimize(loss, max_iters, callback):
        for i in xrange(max_iters):
            params = scientist.suggest()
            hyperparams = build_hypervect(**params)
            cur_loss = loss(hyperparams.vect, 0)  # No randomness
            scientist.update(params, -cur_loss)
            if callback:
                callback(hyperparams.vect, 0)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    whetlab_optimize(hyperloss, N_meta_iter, meta_callback)
    best_params = scientist.best()
    print "best params:", best_params
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, best_params
# Helpers

def fill_parser(parser, items):
    partial_vects = [np.full(parser[name].size, items[i])
                     for i, name in enumerate(parser.names)]
    return np.concatenate(partial_vects, axis=0)

# --
# Run

train_data, valid_data, test_data = load_data_dicts(N_train, N_valid, N_tests)
parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
N_weight_types = len(parser.names)

hyperparams = VectorParser()
hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)

fixed_hyperparams = VectorParser()
fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

cur_primal_results = {}

def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)
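# Added illustration (not in the original file): fill_parser broadcasts one scalar per
# named weight block into a full flat weight vector, so a per-block hyperparameter
# (e.g. one L2 penalty per layer) can be applied elementwise to the weights. The parser
# shapes below are invented purely for this example.
#
#   p = VectorParser()
#   p.add_shape('weights', (2, 3))              # 6 entries
#   p.add_shape('biases',  (1, 3))              # 3 entries
#   fill_parser(p, np.array([0.1, 7.0]))
#   # -> array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 7.0, 7.0, 7.0])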
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) init_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(init_hyperparams['log_alphas']) betas = logit(init_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['train_loss'][-1], meta_results['test_err'][-1]) # Average many gradient evaluations at the initial point. hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size)) for i in xrange(N_gradients_in_average): hypergrads[i] = hyperloss_grad(hyperparams.vect, i) print i first_gradient = hypergrads[0] avg_gradient = np.mean(hypergrads, axis=0) # Now do a line search along that direction. 
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = stepsize_search_rescale / np.max(np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect - stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)  # Use the same random seed every time.

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
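# Side note (illustration only, not experiment code): the loop above sweeps a
# symmetric grid of step sizes along the averaged hypergradient direction and
# re-evaluates the meta-objective at each point. The same pattern in isolation,
# with a hypothetical scalar objective `f` standing in for the full meta-objective:
import numpy as np

def line_search_profile(f, x0, direction, scale, n_points):
    # Evaluate f along x0 - s * direction for step sizes s on [-scale, scale].
    stepsizes = np.linspace(-scale, scale, n_points)
    values = np.array([f(x0 - s * direction) for s in stepsizes])
    return stepsizes, values

f = lambda x: np.sum((x - 1.0) ** 2)   # toy quadratic
x0 = np.zeros(3)
direction = 2.0 * (x0 - 1.0)           # gradient of f at x0
steps, vals = line_search_profile(f, x0, direction, scale=1.0, n_points=7)
# The continuous minimizer along this direction is s = 0.5, where
# x0 - s * direction == [1, 1, 1].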
def run(script_corr):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split(
        [11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = (1 - script_corr) * uncorrelated_mat + script_corr * fully_correlated_mat
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        if i_layer == N_layers - 1:
            transform_parser[i_layer] = uncorrelated_mat
        else:
            transform_parser[i_layer] = transform_mat

    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def transform_weights(all_z_vect, transform_vect, i_script_out):
        all_z = script_parser.new_vect(all_z_vect)
        transform = transform_parser.new_vect(transform_vect)
        W = OrderedDict()  # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported.
        for k in w_parser.idxs_and_shapes.keys():
            W[k] = 0.0
        for i_layer in range(N_layers):
            script_weightings = transform[i_layer][i_script_out, :]
            for i_script in range(N_scripts):
                z_i_script = w_parser.new_vect(all_z[i_script])
                script_weighting = script_weightings[i_script]
                W[('biases', i_layer)] += z_i_script[('biases', i_layer)] * script_weighting
                W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting
        return np.concatenate([v.ravel() for v in W.values()])

    def loss_from_latents(z_vect, transform_vect, i_script, data):
        w_vect = transform_weights(z_vect, transform_vect, i_script)
        return loss_fun(w_vect, **data)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)
    def hyperloss(transform_vect, i_hyper, record_results=False):
        def primal_stochastic_loss(z_vect, transform_vect, i_primal):
            RS = RandomState((seed, i_hyper, i_primal))
            loss = 0.0
            for _ in range(N_scripts_per_iter):
                i_script = RS.randint(N_scripts)
                N_train = train_data[i_script]['X'].shape[0]
                idxs = RS.permutation(N_train)[:batch_size]
                minibatch = dictslice(train_data[i_script], idxs)
                loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
            reg = regularization(z_vect)
            if i_primal % 20 == 0:
                print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
                print "Full losses: train: {0}, valid: {1}".format(
                    total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            return loss + reg

        def total_loss(data, z_vect):
            return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                            for i_script in range(N_scripts)])

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_stochastic_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss

    hyperloss(transform_parser.vect, 0, record_results=True)
    return results['train_loss'][-1], results['valid_loss'][-1]
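# Illustration only (not experiment code): the sharing matrix built in run()
# above is a convex combination of the identity (independent scripts) and the
# uniform averaging matrix (fully shared weights). Rows sum to one, so
# transform_weights forms a weighted average of the per-script latent weights.
import numpy as np

def make_transform(n_scripts, script_corr):
    uncorrelated = np.eye(n_scripts)
    fully_correlated = np.full((n_scripts, n_scripts), 1.0 / n_scripts)
    return (1 - script_corr) * uncorrelated + script_corr * fully_correlated

T = make_transform(3, 0.5)
# T == [[0.667, 0.167, 0.167],
#       [0.167, 0.667, 0.167],
#       [0.167, 0.167, 0.667]]
# script_corr = 0 gives the identity (no sharing); script_corr = 1 gives equal
# weight to every script's latents (full sharing).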
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta_vect
            return loss_fun(w, train_data, train_labels, L2_vect)
            # return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                # learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad)
                                                   / (np.linalg.norm(metagrad) *
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
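# Illustration only (not experiment code): the `one_hot` helper above turns
# integer class indices into one-hot rows, and the fake labels cycle through
# the classes so that every class appears. Small example with made-up sizes:
import numpy as np

one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)

labels = np.array(range(6)) % 3     # [0, 1, 2, 0, 1, 2]
encoded = one_hot(labels, 3)
# encoded ==
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [1 0 0]
#  [0 1 0]
#  [0 0 1]]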