def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    bins = np.linspace(-1, 1, N_bins) * np.exp(log_param_scale)
    W_uniform = npr.rand(N_weights)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        W0, dW_dbins = bininvcdf(W_uniform, bins)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      np.exp(log_alphas), betas, record_learning_curve=True)
        dL_dx = results['d_x']
        dL_dbins = np.dot(dL_dx, dW_dbins)
        learning_curve = results['learning_curve']
        output.append((learning_curve, bins))
        bins = bins - dL_dbins * bin_stepsize
        bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize
        bins.sort()  # Sort in place.
    return output
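# `bininvcdf` (a helper assumed above, not shown here) pushes uniform samples
# through a piecewise-linear inverse CDF whose knots are `bins`, returning the
# transformed samples along with their Jacobian w.r.t. the knots, so the initial
# weight distribution itself can be trained by gradient descent. A minimal
# sketch of just the forward map, assuming equally spaced quantiles (the
# Jacobian computation is omitted):
def bininvcdf_forward(uniforms, bins):
    quantiles = np.linspace(0, 1, len(bins))     # Equally spaced CDF knots.
    return np.interp(uniforms, quantiles, bins)  # Piecewise-linear inverse CDF.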
def run(oiter):
    # ----- Variable for this run -----
    log_alpha_0 = all_log_alpha_0[oiter]
    print "Running job {0} on {1}".format(oiter + 1, socket.gethostname())
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    alpha_0 = np.exp(log_alpha_0)
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        npr.seed(1)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))
    return losses, d_losses
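# A minimal sketch of the `BatchList` helper assumed throughout: it partitions
# range(N_total) into contiguous minibatch slices, so len(batch_idxs) gives the
# number of iterations per epoch.
class BatchList(list):
    def __init__(self, N_total, N_batch):
        start = 0
        while start < N_total:
            self.append(slice(start, start + N_batch))
            start += N_batch
        self.all_idxs = slice(0, N_total)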
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    losses = []
    d_losses = []
    for log_alpha_0 in log_stepsizes:
        npr.seed(0)
        V0 = npr.randn(N_weights) * velocity_scale
        alpha_0 = np.exp(log_alpha_0)
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))
    return losses, d_losses
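# `d_log_loss` (assumed helper) converts a gradient w.r.t. a positive quantity
# q = exp(log q) into a gradient w.r.t. its log: dL/d(log q) = q * dL/dq, summed
# when one log-hyperparameter controls many entries. This covers both uses
# above and below (step sizes: alpha_0 * sum_t dL/d(alpha_t); initial scales:
# sum_i W0_i * dL/dW0_i). A one-line sketch:
def d_log_loss(x, d_x):
    return np.sum(x * d_x)  # Chain rule through q = exp(log q).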
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        loss_curve = []
        d_loss_curve = []
        for log_param_scale in all_log_param_scale:
            print "log_param_scale {0}, N_iters {1}".format(log_param_scale, N_iters)
            npr.seed(1)
            W0 = npr.randn(N_weights) * np.exp(log_param_scale)
            results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
            loss_curve.append(results['loss_final'])
            d_loss_curve.append(d_log_loss(W0, results['d_x']))
        losses.append(loss_curve)
        d_losses.append(d_loss_curve)
    with open('results.pkl', 'w') as f:
        pickle.dump((losses, d_losses), f)
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(1)
    V0 = npr.randn(N_weights) * velocity_scale
    W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      np.exp(log_alphas), betas, record_learning_curve=True)
        learning_curve = results['learning_curve']
        d_log_alphas = np.exp(log_alphas) * results['d_alphas']
        output.append((learning_curve, log_alphas, d_log_alphas))
        log_alphas = log_alphas - meta_alpha * d_log_alphas
    return output
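# Note: optimizing log_alphas rather than alphas keeps every step size positive
# without explicit constraints. By the chain rule,
# dL/d(log alpha_t) = alpha_t * dL/d(alpha_t), which is exactly the
# `np.exp(log_alphas) * results['d_alphas']` line in the meta loop above.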
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    fake_data = npr.randn(*(train_images[:N_fake_data, :].shape)) * init_fake_data_scale
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):  # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        output.append((learning_curve, validation_loss, fake_data))
        fake_data -= results['dMd_meta'] * data_stepsize  # Update data with one gradient step.
        print "Meta iteration {0} Validation loss {1}".format(i, validation_loss)
    return output
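# `one_hot` here is assumed to match the inline helper defined in the variant
# further below: one_hot(x, K)[i, j] = 1 iff x[i] == j. For example,
# one_hot(np.array([0, 2]), 3) gives [[1, 0, 0], [0, 0, 1]], so the fake
# dataset starts with exactly one example per class, cycling through labels.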
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)

    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()

        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
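# `RandomState` (capital R, from the experiment utilities) is assumed to differ
# from npr.RandomState by accepting an arbitrary tuple such as
# (seed, i_hyper, "hyperloss") and hashing it to an integer seed, so each stage
# of each meta-iteration gets an independent but reproducible stream. A minimal
# sketch of that idea (the name `make_seeded_rs` is hypothetical):
import hashlib

def make_seeded_rs(key_tuple):
    digest = hashlib.md5(repr(key_tuple)).hexdigest()  # Hash the tuple's repr.
    return npr.RandomState(int(digest[:8], 16))        # 32-bit seed from the hash.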
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
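# Two helpers assumed above. `fill_parser` broadcasts one scalar per named
# weight block (e.g. a per-layer L2 penalty or init scale) into a full-length
# weight vector, and `logit` is, in this codebase's naming, the logistic
# sigmoid that maps the unconstrained invlogit_betas onto momenta in (0, 1).
# Minimal sketches:
def fill_parser(parser, items):
    partial_vects = [np.full(parser[name].size, items[i])
                     for i, name in enumerate(parser.names)]
    return np.concatenate(partial_vects, axis=0)

def logit(x):
    return 1.0 / (1.0 + np.exp(-x))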
def run():
    (train_images, train_labels), (val_images, val_labels), (test_images, test_labels) = \
        load_data_subset(N_train_data, N_val_data, N_test_data)
    batch_idxs = BatchList(N_train_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = parser.N

    hyperparser = WeightsParser()
    hyperparser.add_weights("log_L2_reg", (N_weights,))
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N

    npr.seed(0)
    hyperparser.set(metas, "log_L2_reg", log_L2_reg_scale + np.ones(N_weights))

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        return loss_fun(x, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        log_prior = -meta_L2_reg * np.dot(L2_reg.ravel(), L2_reg.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results["learning_curve"]
        validation_loss = results["M_final"]
        test_loss = test_loss_fun(results["x_final"])
        output.append((learning_curve, validation_loss, test_loss,
                       parser.get(results["x_final"], ("weights", 0)),
                       parser.get(np.exp(hyperparser.get(metas, "log_L2_reg")), ("weights", 0))))
        metas -= results["dMd_meta"] * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2}".format(
            i, validation_loss, test_loss)
    return output
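# `WeightsParser` is assumed to pack named parameter arrays into one flat
# vector, with get/set views by name; a minimal sketch consistent with the
# add_weights/get/set calls above:
class WeightsParser(object):
    def __init__(self):
        self.idxs_and_shapes = {}
        self.N = 0

    def add_weights(self, name, shape):
        start = self.N
        self.N += np.prod(shape)
        self.idxs_and_shapes[name] = (slice(start, self.N), shape)

    def get(self, vect, name):
        idxs, shape = self.idxs_and_shapes[name]
        return np.reshape(vect[idxs], shape)

    def set(self, vect, name, val):
        idxs, _ = self.idxs_and_shapes[name]
        vect[idxs] = np.ravel(val)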
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)
    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    bindict = {k: np.linspace(-1, 1, N_bins) * np.exp(log_param_scale)  # Different cdf per layer.
               for k, v in parser.idxs_and_shapes.iteritems()}
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        #X0, dX_dbins = bininvcdf(W_uniform, bins)
        X_uniform = npr.rand(N_weights)  # Weights are uniforms passed through an inverse cdf.
        X0 = np.zeros(N_weights)
        dX_dbins = {}
        for k, cur_bins in bindict.iteritems():
            cur_slice, cur_shape = parser.idxs_and_shapes[k]
            cur_xs = X_uniform[cur_slice]
            cur_X0, cur_dX_dbins = bininvcdf(cur_xs, cur_bins)
            X0[cur_slice] = cur_X0
            dX_dbins[k] = cur_dX_dbins
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, X0, V0,
                      np.exp(log_alphas), betas, record_learning_curve=True)
        dL_dx = results['d_x']
        learning_curve = results['learning_curve']
        output.append((learning_curve, bindict))
        # Update bins with one gradient step.
        for k, bins in bindict.iteritems():
            dL_dbins = np.dot(parser.get(dL_dx, k).flatten(), dX_dbins[k])
            bins = bins - dL_dbins * bin_stepsize
            bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize
            bindict[k] = np.sort(bins)
        bindict = bindict.copy()
    return output
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    # fake_data = npr.randn(*(train_images[:N_fake_data, :].shape))
    fake_data = np.zeros(train_images[:N_fake_data, :].shape)
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(0, 10)), 10)  # One of each label.

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):  # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results["learning_curve"]
        output.append((learning_curve, fake_data))
        fake_data -= results["dMd_meta"] * data_stepsize  # Update data with one gradient step.
    return output
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams["log_param_scale"] = np.full(N_weight_types, init_log_param_scale)
    hyperparams["log_alphas"] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams["invlogit_betas"] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    for name in parser.names:
        hyperparams[("rescale", name)] = np.full(N_iters, init_rescales)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams["log_L2_reg"] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
                learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
                learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
                learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams["log_alphas"])
        betas = logit(cur_hyperparams["invlogit_betas"])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results["train_loss"].append(loss_fun(x, **train_data))
        meta_results["valid_loss"].append(loss_fun(x, **valid_data))
        meta_results["tests_loss"].append(loss_fun(x, **tests_data))
        meta_results["test_err"].append(frac_err(x, **tests_data))
        meta_results["learning_curves"].append(learning_curve_dict)
        if metagrad is not None:
            meta_results["meta_grad_magnitude"].append(np.linalg.norm(metagrad))
            meta_results["meta_grad_angle"].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results["train_loss"][-1], meta_results["valid_loss"][-1],
                  meta_results["tests_loss"][-1], meta_results["test_err"][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    RS = RandomState((seed, "top_rs"))
    data = loadData.loadMnist()
    train_data, tests_data = loadData.load_data_as_dict(data, classNum)
    train_data = random_partition(train_data, RS, [N_train])[0]
    tests_data = random_partition(tests_data, RS, [N_tests])[0]
    print "training samples {0}: testing samples: {1}".format(N_train, N_tests)

    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases', i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))

    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean([np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units.
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0,))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]
        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                print "Cur_reg", cur_reg
                # print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            # print("calculate hypergradients")
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            # print "constrained_grad", constrained_grad
            print "\n"
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha
            # cur_reg -= np.sign(constrained_grad) * meta_alpha
        return cur_reg

    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        # TODO: isn't this a scale transformation?
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):  # Don't regularize biases.
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            # One regularization hyperparameter for all weights.
            # TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            # One regularization hyperparameter for each layer.
            # TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why???
            # DrMAD is already constrained?
            #print t_vect.shape
            for i in range(N_layers):
                #print "diff after constraining" + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape  # 44860; this is correct
            #for i in range(N_layers):
            #    print "weights " + str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            #for i in range(N_layers):  # TODO: This was the same as layer-wise.
            #    all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units.
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]  # TODO: equivalent regularization weights.
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)  # TODO: this is a scale transformation, not regularization!
            loss = loss_fun(w_vect, **minibatch)  # Use new scale for prediction.
            reg = regularization(z_vect)  # Regularize the original scale.
            # TODO: should be equivalent: w = z*e^transform, so
            # f(z*e^transform) + e^lambda*||z||^2 = f(w) + e^lambda*||z||^2
            #                                     = f(w) + e^lambda*||e^(-transform) w||^2
            # (see process_transform).
            #if record_results and i_primal % N_thin == 0:
            #    print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd_meta_only_mad(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    def train_z_exact(data, z_vect_0, transform, meta_iteration=0):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            #if record_results and i_primal % N_thin == 0:
            #    print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd_meta_only(grad(primal_loss), transform, z_vect_0, alpha, beta,
                             N_iters, meta_iteration=meta_iteration)

    (all_transforms, all_train_loss, all_valid_loss, all_tests_loss,
     all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
     hypergrad_angles, hypergrad_signs_angles, hypergrad_norms,
     exact_hypergrad_norms) = [], [], [], [], [], [], [], [], [], [], [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            # TODO: initial scale AND regularization.
            train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
            print "Training loss (unregularized) = " + str(train_loss)
            all_train_loss.append(train_loss)
            valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
            print "Validation loss = " + str(valid_loss)
            all_valid_loss.append(valid_loss)
            tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
            print "Test loss = " + str(tests_loss)
            all_tests_loss.append(tests_loss)

            plt.plot(all_train_loss, label="training loss (unregularized)")
            plt.plot(all_valid_loss, label="validation loss")
            plt.plot(all_tests_loss, label="test loss")
            plt.title("loss vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("loss")
            plt.legend()
            plt.savefig("loss2000_corrected.png")
            plt.clf()

            train_rate = getval(frac_err(w_vect_final, **cur_train_data))
            print "Training error rate = " + str(train_rate)
            all_train_rates.append(train_rate)
            valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
            print "Validation error rate = " + str(valid_rate)
            all_valid_rates.append(valid_rate)
            tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
            print "Test error rate = " + str(tests_rate)
            all_tests_rates.append(tests_rate)

            plt.plot(all_train_rates, label="training error rate")
            plt.plot(all_valid_rates, label="validation error rate")
            plt.plot(all_tests_rates, label="test error rate")
            plt.title("error rate vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("error rate")
            plt.legend()
            plt.savefig("error2000_corrected.png")
            plt.clf()
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)  # No chain rule here.

        def hyperloss_exact(transform, i_hyper, cur_train_data, cur_valid_data,
                            cur_tests_data, meta_it=0):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z_exact(cur_train_data, z_vect_0, transform, meta_iteration=meta_it)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad_exact = grad(hyperloss_exact)  # No chain rule here.

        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)  # TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0  # Initial regularization, besides the regularization() function.
        for i_hyper in range(N_meta_iter):
            print "Hyper iter " + str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])  # cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS,
                                                              [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data)
            raw_grad_exact = hypergrad_exact(cur_reg, i_hyper, cur_train_data, cur_valid_data,
                                             tests_data, meta_it=i_hyper)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            constrained_grad_exact = constrain_reg(raw_grad_exact, constraint)
            print(np.linalg.norm(raw_grad))
            # TODO: Exploding DrMAD gradient; ~10^10x larger than exact gradient
            # with N_safe_sampling = N_iters.
            print(np.linalg.norm(raw_grad_exact))

            # TODO: sometimes negative???
            hypergrad_angle = np.dot(constrained_grad, constrained_grad_exact) / \
                (np.linalg.norm(constrained_grad) * np.linalg.norm(constrained_grad_exact))
            hypergrad_angles.append(hypergrad_angle)
            print("cosine of angle between DrMAD and exact = " + str(hypergrad_angle))
            hypergrad_signs_angle = np.dot(np.sign(constrained_grad),
                                           np.sign(constrained_grad_exact)) / len(constrained_grad)
            hypergrad_signs_angles.append(hypergrad_signs_angle)
            print("cosine of angle between signs of DrMAD and exact = " + str(hypergrad_signs_angle))

            plt.plot(hypergrad_angles, label="between exact and DrMAD hypergradients")
            plt.plot(hypergrad_signs_angles, label="between signs of DrMAD and exact")
            plt.title("Cosine of angle vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("cosine of angle")
            plt.legend()
            plt.savefig("angle2000_corrected.png")
            plt.clf()

            hypergrad_norm = np.linalg.norm(constrained_grad)
            hypergrad_norms.append(hypergrad_norm)
            print("DrMAD norm = " + str(hypergrad_norm))
            exact_hypergrad_norm = np.linalg.norm(constrained_grad_exact)
            exact_hypergrad_norms.append(exact_hypergrad_norm)
            print("Exact norm = " + str(exact_hypergrad_norm))

            plt.plot(hypergrad_norms, label="DrMAD hypergradient")
            plt.plot(exact_hypergrad_norms, label="Exact hypergradient")
            plt.title("Norms of hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("norm")
            plt.legend()
            plt.savefig("norms2000_corrected.png")
            plt.clf()

            cur_reg -= np.sign(constrained_grad) * meta_alpha  # TODO: signs of gradient...
            # TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights) + 0.2  # TODO: initial -log regularization; not in log scale?
    constraints = ['universal', 'layers', 'units']
    # TODO: uses multiple kinds of hyperparameter sharing, but in order.
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    #return all_L2_regs, all_tests_rates, all_avg_regs
    return (all_L2_regs, all_train_loss, all_valid_loss, all_tests_loss,
            all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
            hypergrad_angles, hypergrad_signs_angles, hypergrad_norms,
            exact_hypergrad_norms)
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl

    def plot_filters(ax, parser, lims, N_cols=10, L_img=28, padding=2):
        bg_val = 0
        filters = parser[('weights', 0)]
        output_weights = parser[('weights', 1)]
        N_outputs = output_weights.shape[1]
        N_filters = filters.shape[1]
        N_rows = ceil_div(N_filters, N_cols)
        L_extra = ceil_div(N_outputs, L_img)
        output_weights_padded = np.full((N_filters, L_img * L_extra), bg_val)
        output_weights_padded[:, :N_outputs] = output_weights
        output_weights_padded = output_weights_padded.reshape((N_filters, L_extra, L_img))
        filters = filters.reshape((L_img, L_img, N_filters))
        row_height = L_img + L_extra + padding * 2
        col_width = L_img + padding
        image = np.full((row_height * N_rows, col_width * N_cols), bg_val)

        def pix_range_x(i):
            offset = i * col_width
            return slice(offset, offset + L_img)

        def pix_range_y(i):
            offset = i * row_height
            return slice(offset, offset + L_img + L_extra + padding)

        for i_x, i_y in it.product(range(N_rows), range(N_cols)):
            i_filter = i_x + i_y * N_cols
            if i_filter < N_filters:
                cur_frame = np.concatenate((filters[:, :, i_filter],
                                            np.full((padding, L_img), bg_val),
                                            output_weights_padded[i_filter, :, :]), axis=0)
                image[pix_range_y(i_y), pix_range_x(i_x)] = cur_frame
        img_min, img_max = lims
        image = (image - img_min) / (img_max - img_min)
        image = np.minimum(np.maximum(image, 0.0), 1.0)
        ax.imshow(image, cmap=mpl.cm.binary)
        ax.set_xticks([])
        ax.set_yticks([])

    with open('results.pkl') as f:
        results = pickle.load(f)

    fig = plt.figure(0)
    fig.set_size_inches((6, 4))
    ax = fig.add_subplot(111)
    ax.set_title('Meta learning curves')
    losses = ['train_loss', 'valid_loss', 'tests_loss']
    for loss_type in losses:
        ax.plot(results[loss_type], 'o-', label=loss_type)
    ax.set_xlabel('Meta iter number')
    ax.set_ylabel('Negative log prob')
    ax.legend(loc=1, frameon=False)
    plt.savefig('learning_curves.png')

    fig.clf()
    fig.set_size_inches((6, 8))
    ax = fig.add_subplot(211)
    ax.set_title('Parameter scale')
    for i, log_scale in enumerate(results['log_scale']):
        ax.plot(np.sort(log_scale), label="Meta iter {0}".format(i * N_hyper_thin))
    ax.legend(loc=2, frameon=False)
    ax = fig.add_subplot(212)
    ax.set_title('Parameter offset')
    for i, offset in enumerate(results['offset']):
        ax.plot(np.sort(offset), label="Meta iter {0}".format(i * N_hyper_thin))
    plt.savefig('Learned regularization.png')

    w_parser, _, _, _ = make_nn_funs(layer_sizes)
    log_scales = w_parser.new_vect(np.exp(results['log_scale'])[-1])
    offset = w_parser.new_vect(results['offset'][-1])

    fig.clf()
    fig.set_size_inches((6, 6))
    ax = fig.add_subplot(111)
    plot_filters(ax, log_scales, [5, 10])
    ax.set_title("Scales")
    plt.savefig("L2_scale_filters.png")
    plt.savefig("L2_scale_filters.pdf")

    fig.clf()
    fig.set_size_inches((6, 6))
    ax = fig.add_subplot(111)
    plot_filters(ax, offset, [-1, 1])
    ax.set_title("Offsets")
    plt.savefig("L2_mean_filters.png")
    plt.savefig("L2_mean_filters.pdf")
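# `ceil_div` above is assumed to be integer ceiling division, used to compute
# how many grid rows the filters need; a one-line sketch:
def ceil_div(a, b):
    return -(-a // b)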
def run():
    (train_images, train_labels), \
    (valid_images, valid_labels), \
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                         for i, name in enumerate(parser.names)]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss(hyperparam_vect, i):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size,
                                     np.exp(cur_hyperparams['log_param_scale'][i]))
                             for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        W_opt = sgd5(grad(indexed_loss_fun), kylist(W0, alphas, betas, log_L2_reg), callback)
        all_x.append(getval(W_opt))
        all_learning_curves.append(learning_curve)
        return valid_loss_fun(W_opt)
    hyperloss_grad = grad(hyperloss)

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        log_L2_reg = cur_hyperparams['log_L2_reg']
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
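# `kylist` and `getval` come from the FunkyYak autodiff package used in these
# experiments: kylist is assumed to bundle several arrays into a list node
# that grad can differentiate through, and getval strips the autodiff wrapper
# from a node so its value can be recorded (as with all_x above) without
# extending the computation graph.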
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)
    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}
    loss_meta_parser = VectorParser()  # Unused.
    # loss_meta_parser['']  # Incomplete statement; would raise at runtime.

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                # learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    (train_images, train_labels), \
    (valid_images, valid_labels), \
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        def indexed_loss_fun(w, log_L2_reg, j):
            # idxs = batch_idxs[i % len(batch_idxs)]
            npr.seed(1000 * ii + j)
            idxs = npr.randint(N_train, size=len(batch_idxs))
            partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                             for i, name in enumerate(parser.names)]
            L2_reg_vect = np.concatenate(partial_vects, axis=0)
            return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect)

        npr.seed(ii)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size,
                                     np.exp(cur_hyperparams['log_param_scale'][i]))
                             for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0,
                       alphas, betas, log_L2_reg, callback=callback)

        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg'] = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [np.sum(weights_grad[name]) for name in parser.names]
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas']
                                        * d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            log_L2_reg = cur_hyperparams['log_L2_reg']
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(train_loss_fun(x))
            meta_results['valid_loss'].append(valid_loss_fun(x))
            meta_results['tests_loss'].append(tests_loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
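# `d_logit` above is assumed to be the derivative of the logistic sigmoid
# `logit` used throughout, needed to chain dMd_betas back onto the
# unconstrained invlogit_betas parameterization:
def d_logit(x):
    s = logit(x)
    return s * (1 - s)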
def run():
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)

    def record_results(hyperparam_vect, i_hyper, g):
        print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))

        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, RS.int32(), alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))
        results['tests_loss'].append(loss_fun(tests_alphabets, report_train_loss=False))
        print "Train:", results['train_loss']
        print "Valid:", results['valid_loss']
        print "Tests:", results['tests_loss']

    train_hyperloss = partial(hyperloss, alphabets=train_alphabets)
    rms_prop(grad(train_hyperloss), hyperparams_0.vect, record_results,
             N_meta_iter, meta_alpha, gamma=0)
    return results
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def plot(): import matplotlib.pyplot as plt import matplotlib as mpl def plot_filters(ax, parser, lims, N_cols=10, L_img=28, padding=2): bg_val = 0 filters = parser[('weights', 0)] output_weights = parser[('weights', 1)] N_outputs = output_weights.shape[1] N_filters = filters.shape[1] N_rows = ceil_div(N_filters, N_cols) L_extra = ceil_div(N_outputs, L_img) output_weights_padded = np.full((N_filters, L_img * L_extra), bg_val) output_weights_padded[:, :N_outputs] = output_weights output_weights_padded = output_weights_padded.reshape((N_filters, L_extra, L_img)) filters = filters.reshape((L_img, L_img, N_filters)) row_height = L_img + L_extra + padding * 2 col_width = L_img + padding image = np.full((row_height * N_rows, col_width * N_cols), bg_val) def pix_range_x(i): offset = i * col_width return slice(offset, offset + L_img) def pix_range_y(i): offset = i * row_height return slice(offset, offset + L_img + L_extra + padding) for i_x, i_y in it.product(range(N_rows), range(N_cols)): i_filter = i_x + i_y * N_cols if i_filter < N_filters: cur_frame = np.concatenate((filters[:, :, i_filter], np.full((padding, L_img), bg_val), output_weights_padded[i_filter, :, :]), axis=0) image[pix_range_y(i_y), pix_range_x(i_x)] = cur_frame img_min, img_max = lims image = (image - img_min) / (img_max - img_min) image = np.minimum(np.maximum(image, 0.0), 1.0) ax.imshow(image, cmap=mpl.cm.binary) ax.set_xticks([]) ax.set_yticks([]) with open('results.pkl') as f: results = pickle.load(f) fig = plt.figure(0) fig.set_size_inches((6,4)) ax = fig.add_subplot(111) ax.set_title('Meta learning curves') losses = ['train_loss', 'valid_loss', 'tests_loss'] for loss_type in losses: ax.plot(results[loss_type], 'o-', label=loss_type) ax.set_xlabel('Meta iter number') ax.set_ylabel('Negative log prob') ax.legend(loc=1, frameon=False) plt.savefig('learning_curves.png') fig.clf() fig.set_size_inches((6,8)) ax = fig.add_subplot(211) ax.set_title('Parameter scale') for i, log_scale in enumerate(results['log_scale']): ax.plot(np.sort(log_scale), label="Meta iter {0}".format(i * N_hyper_thin)) ax.legend(loc=2, frameon=False) ax = fig.add_subplot(212) ax.set_title('Parameter offset') for i, offset in enumerate(results['offset']): ax.plot(np.sort(offset), label="Meta iter {0}".format(i * N_hyper_thin)) plt.savefig('Learned regularization.png') w_parser, _, _, _ = make_nn_funs(layer_sizes) log_scales = w_parser.new_vect(np.exp(results['log_scale'])[-1]) offset = w_parser.new_vect(results['offset'][-1]) fig.clf() fig.set_size_inches((6,6)) ax = fig.add_subplot(111) plot_filters(ax, log_scales, [5, 10]) ax.set_title("Scales") plt.savefig("L2_scale_filters.png") plt.savefig("L2_scale_filters.pdf") fig.clf() fig.set_size_inches((6,6)) ax = fig.add_subplot(111) plot_filters(ax, offset, [-1, 1]) ax.set_title("Offsets") plt.savefig("L2_mean_filters.png") plt.savefig("L2_mean_filters.pdf")
def run(): (train_images, train_labels),\ (valid_images, valid_labels),\ (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests) batch_idxs = BatchList(N_train, batch_size) N_iters = N_epochs * len(batch_idxs) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) def indexed_loss_fun(w, log_L2_reg, i): idxs = batch_idxs[i % len(batch_idxs)] partial_vects = [ np.full(parser[name].size, np.exp(log_L2_reg[i])) for i, name in enumerate(parser.names) ] L2_reg_vect = np.concatenate(partial_vects, axis=0) return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect) def train_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=train_images, T=train_labels) def valid_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=valid_images, T=valid_labels) def tests_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=tests_images, T=tests_labels) all_learning_curves = [] all_x = [] def hyperloss(hyperparam_vect, i): learning_curve = [] def callback(x, i): if i % len(batch_idxs) == 0: learning_curve.append( loss_fun(x, X=train_images, T=train_labels)) npr.seed(i) N_weights = parser.vect.size V0 = np.zeros(N_weights) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) layer_param_scale = [ np.full(parser[name].size, np.exp(cur_hyperparams['log_param_scale'][i])) for i, name in enumerate(parser.names) ] W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) log_L2_reg = cur_hyperparams['log_L2_reg'] W_opt = sgd5(grad(indexed_loss_fun), kylist(W0, alphas, betas, log_L2_reg), callback) all_x.append(getval(W_opt)) all_learning_curves.append(learning_curve) return valid_loss_fun(W_opt) hyperloss_grad = grad(hyperloss) add_fields = ['train_loss', 'valid_loss', 'tests_loss'] meta_results = {field: [] for field in add_fields + hyperparams.names} def meta_callback(hyperparam_vect, i): x = all_x[-1] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) log_L2_reg = cur_hyperparams['log_L2_reg'] for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(train_loss_fun(x)) meta_results['valid_loss'].append(valid_loss_fun(x)) meta_results['tests_loss'].append(tests_loss_fun(x)) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) meta_results['all_learning_curves'] = all_learning_curves parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): val_images, val_labels, test_images, test_labels, _ = load_data(normalize=True) val_images = val_images[:N_val_data, :] val_labels = val_labels[:N_val_data, :] true_data_scale = np.std(val_images) test_images = test_images[:N_test_data, :] test_labels = test_labels[:N_test_data, :] batch_idxs = BatchList(N_fake_data, batch_size) parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = len(parser.vect) npr.seed(0) init_fake_data = npr.randn(*(val_images[:N_fake_data, :].shape)) * init_fake_data_scale one_hot = lambda x, K : np.array(x[:,None] == np.arange(K)[None, :], dtype=int) fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes) # One of each. hyperparser = WeightsParser() hyperparser.add_weights('log_L2_reg', (1,)) hyperparser.add_weights('fake_data', init_fake_data.shape) metas = np.zeros(hyperparser.N) print "Number of hyperparameters to be trained:", hyperparser.N hyperparser.set(metas, 'log_L2_reg', init_log_L2_reg) hyperparser.set(metas, 'fake_data', init_fake_data) def indexed_loss_fun(x, meta_params, idxs): # To be optimized by SGD. L2_reg=np.exp(hyperparser.get(meta_params, 'log_L2_reg')[0]) fake_data=hyperparser.get(meta_params, 'fake_data') return loss_fun(x, X=fake_data[idxs], T=fake_labels[idxs], L2_reg=L2_reg) def meta_loss_fun(x, meta_params): # To be optimized in the outer loop. fake_data=hyperparser.get(meta_params, 'fake_data') log_prior = -fake_data_L2_reg * np.dot(fake_data.ravel(), fake_data.ravel()) return loss_fun(x, X=val_images, T=val_labels) - log_prior def test_loss_fun(x): # To measure actual performance. return loss_fun(x, X=test_images, T=test_labels) log_alphas = np.full(N_iters, log_alpha_0) betas = np.full(N_iters, beta_0) output = [] for i in range(N_meta_iter): print "L2 reg is ", np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), "| ", npr.seed(0) v0 = npr.randn(N_weights) * velocity_scale x0 = npr.randn(N_weights) * np.exp(log_param_scale) results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters, x0, v0, np.exp(log_alphas), betas, metas) learning_curve = results['learning_curve'] validation_loss = results['M_final'] test_err = frac_err(results['x_final'], test_images, test_labels) fake_data_scale = np.std(hyperparser.get(metas, 'fake_data')) / true_data_scale test_loss = test_loss_fun(results['x_final']) output.append((learning_curve, validation_loss, test_loss, fake_data_scale, np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), test_err)) metas -= results['dMd_meta'] * meta_stepsize print "Meta iteration {0} Validation loss {1} Test loss {2} Test err {3}"\ .format(i, validation_loss, test_loss, test_err) return output, hyperparser.get(metas, 'fake_data')
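The fake-label construction above deserves a second look: labels are assigned cyclically so every class appears equally often, then one-hot encoded. The same lambda in isolation:

import numpy as np

one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
fake_labels = one_hot(np.arange(6) % 3, 3)  # classes cycle 0,1,2,0,1,2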
def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_param_scale'] = np.full(N_iters, init_log_param_scale) # TODO: memoize def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = npr.RandomState( npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000)) seed = i_hyper * 10**6 + i_iter # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append( loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg'])) W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback) #callback(W_opt, N_iters) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) #Michael: train->tests final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) #meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): RS = RandomState((seed, "top_rs")) data = loadData.loadMnist() train_data_subclass = [] train_data, tests_data = loadData.load_data_as_dict(data, classNum) train_data = random_partition(train_data, RS, [N_train_Full])[0] tests_data = random_partition(tests_data, RS, [N_tests])[0] train_data_subclass = loadData.loadSubsetData(train_data, RS, N_train, clientNum) print "training samples {0}: testing samples: {1}".format(N_train, N_tests) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size def transform_weights(z_vect, transform): return z_vect * np.exp(transform) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def constrain_reg(t_vect, name): all_t = w_parser.new_vect(t_vect) for i in range(N_layers): all_t[('biases', i)] = 0.0 if name == 'universal': t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)]) for i in range(N_layers): all_t[('weights', i)] = t_mean elif name == 'layers': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)]) elif name == 'units': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True) else: raise Exception return all_t.vect def process_transform(t_vect): # Remove the redundancy due to sharing transformations within units all_t = w_parser.new_vect(t_vect) new_t = np.zeros((0,)) for i in range(N_layers): layer = all_t[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_t = log_L2 - 2 * layer[:, 0] new_t = np.concatenate((new_t, cur_t)) return new_t def train_z(data, z_vect_0, transform): N_data = data['X'].shape[0] def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters) all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top): def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) def error_rate(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return frac_err(w_vect_final, **cur_valid_data) cur_reg = reg_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data) all_tests_rates.append(test_rate) all_transforms.append(cur_reg.copy()) all_avg_regs.append(np.mean(cur_reg)) print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1]) print "Cur_transform", np.mean(cur_reg) tempConstrained_grad = np.zeros(N_weights) for client_i in range(clientNum): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data_subclass[client_i], RS, [N_train - N_valid, N_valid]) print "calculate hypergradients" raw_grad = hypergrad(cur_reg, i_hyper, *cur_split) print "calculate hypergradients end" constrained_grad = constrain_reg(raw_grad, constraint) tempConstrained_grad += constrained_grad / clientNum cur_reg -= np.sign(tempConstrained_grad) * meta_alpha return cur_reg reg = np.zeros(N_weights) + 0.2 constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) reg = train_reg(reg, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_transform, all_transforms))) return all_L2_regs, all_tests_rates, all_avg_regs
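The client loop above is a federated-style update: each client's hypergradient is constrained, the constrained gradients are averaged, and the regularizer takes a fixed-size step in the sign direction of that average. Stripped to its arithmetic, with toy values standing in for clientNum and meta_alpha:

import numpy as np

client_grads = [np.array([0.3, -0.2]), np.array([-0.1, -0.4])]
avg_grad = sum(client_grads) / float(len(client_grads))
cur_reg = np.zeros(2)
cur_reg -= np.sign(avg_grad) * 0.04   # sign step: per-coordinate, magnitude-free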
def run(): RS = RandomState((seed, "top_rs")) all_data = mnist.load_data_as_dict() train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size def transform_weights(z_vect, transform): return z_vect * np.exp(transform) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def constrain_reg(t_vect, name): all_t = w_parser.new_vect(t_vect) for i in range(N_layers): all_t[('biases', i)] = 0.0 if name == 'universal': t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)]) for i in range(N_layers): all_t[('weights', i)] = t_mean elif name == 'layers': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)]) elif name == 'units': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True) else: raise Exception return all_t.vect def process_transform(t_vect): # Remove the redundancy due to sharing transformations within units all_t = w_parser.new_vect(t_vect) new_t = np.zeros((0,)) for i in range(N_layers): layer = all_t[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_t = log_L2 - 2 * layer[:, 0] new_t = np.concatenate((new_t, cur_t)) return new_t def train_z(data, z_vect_0, transform): N_data = data['X'].shape[0] def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters) all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top): def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) def error_rate(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return frac_err(w_vect_final, **cur_valid_data) cur_reg = reg_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data) all_tests_rates.append(test_rate) all_transforms.append(cur_reg.copy()) all_avg_regs.append(np.mean(cur_reg)) print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1]) print "Cur_transform", np.mean(cur_reg) RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_reg, i_hyper, *cur_split) constrained_grad = constrain_reg(raw_grad, constraint) cur_reg -= np.sign(constrained_grad) * meta_alpha return cur_reg reg = np.zeros(N_weights) + 0.2 constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) reg = train_reg(reg, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_transform, all_transforms))) return all_L2_regs, all_tests_rates, all_avg_regs
def run(): (train_images, train_labels),\ (valid_images, valid_labels),\ (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests) batch_idxs = BatchList(N_train, batch_size) N_iters = N_epochs * len(batch_idxs) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) def train_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=train_images, T=train_labels) def valid_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=valid_images, T=valid_labels) def tests_loss_fun(w, log_L2_reg=0.0): return loss_fun(w, X=tests_images, T=tests_labels) all_learning_curves = [] all_x = [] def hyperloss_grad(hyperparam_vect, ii): learning_curve = [] def callback(x, i): if i % len(batch_idxs) == 0: learning_curve.append(loss_fun(x, X=train_images, T=train_labels)) def indexed_loss_fun(w, log_L2_reg, j): # idxs = batch_idxs[i % len(batch_idxs)] npr.seed(1000 * ii + j) idxs = npr.randint(N_train, size=len(batch_idxs)) partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i])) for i, name in enumerate(parser.names)] L2_reg_vect = np.concatenate(partial_vects, axis=0) return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect) npr.seed(ii) N_weights = parser.vect.size V0 = np.zeros(N_weights) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) layer_param_scale = [np.full(parser[name].size, np.exp(cur_hyperparams['log_param_scale'][i])) for i, name in enumerate(parser.names)] W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) log_L2_reg = cur_hyperparams['log_L2_reg'] results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0, alphas, betas, log_L2_reg, callback=callback) hypergrads = hyperparams.copy() hypergrads['log_L2_reg'] = results['dMd_meta'] weights_grad = parser.new_vect(W0 * results['dMd_x']) hypergrads['log_param_scale'] = [np.sum(weights_grad[name]) for name in parser.names] hypergrads['log_alphas'] = results['dMd_alphas'] * alphas hypergrads['invlogit_betas'] = (results['dMd_betas'] * d_logit(cur_hyperparams['invlogit_betas'])) all_x.append(results['x_final']) all_learning_curves.append(learning_curve) return hypergrads.vect add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num'] meta_results = {field : [] for field in add_fields + hyperparams.names} def meta_callback(hyperparam_vect, i): if i % N_meta_thin == 0: print "Meta iter {0}".format(i) x = all_x[-1] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) log_L2_reg = cur_hyperparams['log_L2_reg'] for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(train_loss_fun(x)) meta_results['valid_loss'].append(valid_loss_fun(x)) meta_results['tests_loss'].append(tests_loss_fun(x)) meta_results['iter_num'].append(i) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha, meta_gamma) meta_results['all_learning_curves'] = all_learning_curves parser.vect = None # No need to pickle zeros return meta_results, parser
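The preceding runs squash an unconstrained hyperparameter through `logit` to obtain a momentum in (0, 1), and `d_logit` supplies the chain-rule factor when mapping `dMd_betas` back into `invlogit_betas` space. Note that in this codebase `logit` appears to name the logistic sigmoid rather than its inverse; a sketch under that assumption:

import numpy as np

def logit(x):          # the codebase's name for the logistic sigmoid
    return 1.0 / (1.0 + np.exp(-x))

def d_logit(x):        # its derivative, used to rescale hypergradients
    s = logit(x)
    return s * (1.0 - s)

betas = logit(np.full(3, 2.0))   # approx 0.88: a plausible momentum value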
def run(): RS = RandomState((seed, "top_rs")) all_data = omniglot.load_rotated_alphabets(RS) train_data, tests_data = random_partition(all_data, RS, [12, 3]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) transform_parser = make_transform([0] * N_layers) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def likelihood_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([ loss_fun(w[i], **script_data) for i, script_data in enumerate(data) ]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def train_z(data, transform_vect, RS): def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = likelihood_loss(w_vect, data) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss) / N_scripts) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_init_scale) return sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters) def train_sharing(): def hyperloss(transform_vect, i_hyper): RS = RandomState((seed, i_hyper, "hyperloss")) cur_train_data, cur_valid_data = random_partition( train_data, RS, [10, 2]) z_vect_final = train_z(cur_train_data, transform_vect, RS) w_vect_final = transform_weights(z_vect_final, transform_vect) return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts hypergrad = grad(hyperloss) cur_transform_vect = make_transform([init_script_corr] * N_layers).vect for i_hyper in range(N_meta_iter): print "Hyper iter {0}".format(i_hyper) grad_transform = hypergrad(cur_transform_vect, i_hyper) cur_transform_vect = cur_transform_vect - grad_transform * meta_alpha return cur_transform_vect transform_vects, train_losses, tests_losses = {}, {}, {} transform_vects['no_sharing'] = make_transform([0, 0, 0]).vect transform_vects['full_sharing'] = make_transform([1, 0, 0]).vect transform_vects['learned_sharing'] = train_sharing() for name in transform_vects.keys(): RS = RandomState("final_training") tv = transform_vects[name] trained_z = train_z(train_data, tv, RS) trained_w = transform_weights(trained_z, tv) train_losses[name] = likelihood_loss(trained_w, train_data) / N_scripts tests_losses[name] = likelihood_loss(trained_w, tests_data) / N_scripts print "{0} : train: {1}, test: {2}".format(name, train_losses[name], tests_losses[name]) return transform_parser, transform_vects, train_losses, tests_losses
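make_transform itself is not shown, but the no_sharing/full_sharing vectors above and the explicit interpolation in the final run(script_corr) at the end of this section pin down its behavior: per layer, it blends an identity matrix (independent scripts) with a uniform averaging matrix (one shared net). A sketch consistent with that:

import numpy as np

# Sketch of make_transform: one scripts-x-scripts mixing matrix per layer,
# blending identity (no sharing) with uniform averaging (full sharing).
def make_transform_sketch(N_scripts, layer_corrs):
    eye = np.eye(N_scripts)
    avg = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    return [(1.0 - c) * eye + c * avg for c in layer_corrs]

mats = make_transform_sketch(3, [1.0, 0.0, 0.0])  # share only the first layer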
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) def build_hypervect(init_log_alphas, init_invlogit_betas, init_log_param_scale): hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) return hyperparams hyperparams = build_hypervect(init_log_alphas, init_invlogit_betas, init_log_param_scale) # Build just for parser. fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def whetlab_optimize(loss, max_iters, callback): for i in xrange(max_iters): params = scientist.suggest() hyperparams = build_hypervect(**params) cur_loss = loss(hyperparams.vect, i) scientist.update(params, -cur_loss) if callback: callback(hyperparams.vect, i) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) whetlab_optimize(hyperloss, N_meta_iter, meta_callback) best_params = scientist.best() print "best params:", best_params parser.vect = None # No need to pickle zeros return meta_results, parser, best_params
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) learning_curve_dict['iteration'].append(i_iter + 1) print "iteration", i_iter cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['example_weights'] = x if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) meta_callback(hyperparams.vect, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(params): medianLayer0 = params['ml1'][0] medianLayer1 = params['ml2'][0] medianLayer2 = params['ml3'][0] medianLayer3 = params['ml4'][0] # medianLayer0 = 0.3 # medianLayer1 = 1.3 # medianLayer2 = 2.3 # medianLayer3 = 3.3 RS = RandomState((seed, "top_rs")) data = loadData.loadMnist() train_data_subclass = [] train_data, tests_data = loadData.load_data_as_dict(data, classNum) train_data_subclass = loadSubsetData(train_data, RS, N_train, clientNum) print "training samples {0}: testing samples: {1}".format(N_train, N_tests) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size init_scales = w_parser.new_vect(np.zeros(N_weights)) for i in range(N_layers): init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i]) init_scales[('biases', i)] = 1.0 init_scales = init_scales.vect def process_reg(t_vect): # Remove the redundancy due to sharing regularization within units all_r = w_parser.new_vect(t_vect) new_r = np.zeros((0,)) for i in range(N_layers): layer = all_r[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_r = layer[:, 0] new_r = np.concatenate((new_r, cur_r)) return new_r fraction_error = 0.00 all_regs, all_tests_loss = [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top): def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg) # fraction_error = frac_err(w_vect_final, **cur_valid_data) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) # reg is the list of hyperparameters cur_reg = reg_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data) all_tests_loss.append(tests_loss) all_regs.append(cur_reg.copy()) print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1]) # print "Cur_reg", np.mean(cur_reg) print "Cur_reg", cur_reg for client_i in range(clientNum): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data_subclass[client_i], RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_reg, i_hyper, *cur_split) constrained_grad = constrain_reg(w_parser, raw_grad, constraint) # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha cur_reg -= constrained_grad * meta_alpha / clientNum print "\n" # print "constrained_grad", constrained_grad return cur_reg def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg) return loss_fun(w_vect_final, **cur_valid_data) # t_scale = [-1, 0, 1] # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) # for s in t_scale: # reg = np.ones(N_weights) * log_L2_init + s # loss = new_hyperloss(reg, 0, *cur_split) # print "Results: s= {0}, loss = {1}".format(s, loss) # reg = np.ones(N_weights) * log_L2_init shape0 = layer_sizes[0] shape1 = layer_sizes[1] shape2 = layer_sizes[2] shape3 = layer_sizes[3] l1 = np.ones(shape0 * shape1) * medianLayer0 l2 = np.ones(shape1 * shape2 + shape1) * medianLayer1 l3 = np.ones(shape2 * shape3 + shape2) * medianLayer2 l4 = np.ones(shape3) * medianLayer3 reg = np.concatenate([l1, l2, l3, l4]) constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) reg = train_reg(reg, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_reg, all_regs))) # return all_L2_regs, all_tests_loss return all_tests_loss[-1]
def run(): train_images, train_labels, _, _, _ = load_data(normalize=True) train_images = train_images[:N_data, :] train_labels = train_labels[:N_data, :] batch_idxs = BatchList(N_data, batch_size) iter_per_epoch = len(batch_idxs) parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True) N_weights = parser.N def indexed_loss_fun(w, idxs): return loss_fun(w, X=train_images[idxs], T=train_labels[idxs]) log_alphas = np.full(N_iters, log_alpha_0) betas = np.full(N_iters, beta_0) npr.seed(2) V0 = npr.randn(N_weights) * velocity_scale #W0 = npr.randn(N_weights) * np.exp(log_param_scale) bindict = { k: np.linspace(-1, 1, N_bins) * np.exp(log_param_scale) # Different cdf per layer. for k, v in parser.idxs_and_shapes.iteritems() } output = [] for i in range(N_meta_iter): print "Meta iteration {0}".format(i) #X0, dX_dbins = bininvcdf(W_uniform, bins) X_uniform = npr.rand( N_weights) # Weights are uniform passed through an inverse cdf. X0 = np.zeros(N_weights) dX_dbins = {} for k, cur_bins in bindict.iteritems(): cur_slice, cur_shape = parser.idxs_and_shapes[k] cur_xs = X_uniform[cur_slice] cur_X0, cur_dX_dbins = bininvcdf(cur_xs, cur_bins) X0[cur_slice] = cur_X0 dX_dbins[k] = cur_dX_dbins results = sgd(indexed_loss_fun, batch_idxs, N_iters, X0, V0, np.exp(log_alphas), betas, record_learning_curve=True) dL_dx = results['d_x'] learning_curve = results['learning_curve'] output.append((learning_curve, bindict)) # Update bins with one gradient step. for k, bins in bindict.iteritems(): dL_dbins = np.dot(parser.get(dL_dx, k).flatten(), dX_dbins[k]) bins = bins - dL_dbins * bin_stepsize bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize bindict[k] = np.sort(bins) bindict = bindict.copy() return output
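The initializer above passes fixed uniform draws through a learned piecewise-linear inverse CDF per layer, so the meta-gradient moves the bin edges rather than the samples. bininvcdf's signature is taken from the call above; this toy re-implementation is an assumption and omits the dX_dbins Jacobian the real function also returns:

import numpy as np

def bininvcdf_sketch(u, bins):
    # bins: sorted edges of len(bins)-1 equal-probability intervals.
    n = len(bins) - 1
    idx = np.minimum((u * n).astype(int), n - 1)  # which bin each draw lands in
    frac = u * n - idx                            # position within that bin
    return bins[idx] + frac * (bins[idx + 1] - bins[idx])

samples = bininvcdf_sketch(np.random.rand(10), np.linspace(-1.0, 1.0, 5))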
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) init_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(init_hyperparams['log_alphas']) betas = logit(init_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) # Average many gradient evaluations at the initial point. hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size)) for i in xrange(N_gradients_in_average): hypergrads[i] = hyperloss_grad(hyperparams.vect, i) print i first_gradient = hypergrads[0] avg_gradient = np.mean(hypergrads, axis=0) # Now do a line search along that direction. parsed_avg_grad = hyperparams.new_vect(avg_gradient) stepsize_scale = stepsize_search_rescale / np.max(np.exp(parsed_avg_grad['log_alphas'].ravel())) stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search) for i, stepsize in enumerate(stepsizes): cur_hypervect = hyperparams.vect - stepsize * avg_gradient meta_callback(cur_hypervect, 0) # Use the same random seed every time. parser.vect = None # No need to pickle zeros return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) rs = RandomState((seed)) init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int) fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes) # One of each. hyperparams = VectorParser() hyperparams['fake_data'] = init_fake_data fixed_hyperparams = VectorParser() fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) cur_primal_results = {} def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, meta_vect, i_iter): (train_data, train_labels, L2_vect) = meta_vect return loss_fun(w, train_data, train_labels, L2_vect) #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel())) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: # learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) fake_data = cur_hyperparams['fake_data'] rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(fixed_hyperparams['log_alphas']) betas = logit(fixed_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) meta = kylist(fake_data, fake_labels, L2_reg) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta), parser, callback=callback) cur_primal_results['weights'] = getval(W_opt).copy() cur_primal_results['learning_curve'] = getval(learning_curve_dict) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve'] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels)) meta_results['train_loss'].append(0) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['example_weights'] = x if metagrad is not None: print metagrad meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): RS = RandomState((seed, "top_rs")) all_data = mnist.load_data_as_dict() train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size init_scales = w_parser.new_vect(np.zeros(N_weights)) for i in range(N_layers): init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i]) init_scales[('biases', i)] = 1.0 init_scales = init_scales.vect def regularization(w_vect, reg): return np.dot(w_vect, w_vect * np.exp(reg)) def constrain_reg(t_vect, name): all_r = w_parser.new_vect(t_vect) for i in range(N_layers): all_r[('biases', i)] = 0.0 if name == 'universal': r_mean = np.mean([np.mean(all_r[('weights', i)]) for i in range(N_layers)]) for i in range(N_layers): all_r[('weights', i)] = r_mean elif name == 'layers': for i in range(N_layers): all_r[('weights', i)] = np.mean(all_r[('weights', i)]) elif name == 'units': for i in range(N_layers): all_r[('weights', i)] = np.mean(all_r[('weights', i)], axis=1, keepdims=True) else: raise Exception return all_r.vect def process_reg(t_vect): # Remove the redundancy due to sharing regularization within units all_r = w_parser.new_vect(t_vect) new_r = np.zeros((0,)) for i in range(N_layers): layer = all_r[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_r = layer[:, 0] new_r = np.concatenate((new_r, cur_r)) return new_r def train_z(data, w_vect_0, reg): N_data = data['X'].shape[0] def primal_loss(w_vect, reg, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) loss = loss_fun(w_vect, **minibatch) reg = regularization(w_vect, reg) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters) all_regs, all_tests_loss = [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top): def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(cur_train_data, w_vect_0, reg) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) cur_reg = reg_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data) all_tests_loss.append(tests_loss) all_regs.append(cur_reg.copy()) print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1]) print "Cur_reg", np.mean(cur_reg) RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_reg, i_hyper, *cur_split) constrained_grad = constrain_reg(raw_grad, constraint) print constrained_grad # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha cur_reg -= constrained_grad * meta_alpha return cur_reg def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(cur_train_data, w_vect_0, reg) return loss_fun(w_vect_final, **cur_valid_data) # t_scale = [-1, 0, 1] # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) # for s in t_scale: # reg = np.ones(N_weights) * log_L2_init + s # loss = new_hyperloss(reg, 0, *cur_split) # print "Results: s= {0}, loss = {1}".format(s, loss) reg = np.ones(N_weights) * log_L2_init constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) reg = train_reg(reg, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_reg, all_regs))) return all_L2_regs, all_tests_loss
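The 'units' constraint above relies on NumPy keepdims broadcasting: averaging each row of a layer's weight matrix and writing the (n, 1) result back ties the regularizer across each unit's weights, which is exactly the invariant process_reg later asserts. In isolation:

import numpy as np

layer = np.random.randn(4, 3)                      # 4 rows, one per unit here
tied = np.mean(layer, axis=1, keepdims=True) * np.ones_like(layer)
assert np.all(tied[:, 0] == tied[:, 1])            # columns now identical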
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) init_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(init_hyperparams['log_alphas']) betas = logit(init_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) # Average many gradient evaluations at the initial point. hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size)) for i in xrange(N_gradients_in_average): hypergrads[i] = hyperloss_grad(hyperparams.vect, i) print i first_gradient = hypergrads[0] avg_gradient = np.mean(hypergrads, axis=0) # Now do a line search along that direction. parsed_avg_grad = hyperparams.new_vect(avg_gradient) stepsize_scale = 1000. / np.max(np.exp(parsed_avg_grad['log_alphas'].ravel())) stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search) for i, stepsize in enumerate(stepsizes): cur_hypervect = hyperparams.vect + stepsize * avg_gradient meta_callback(cur_hypervect, 0) # Use the same random seed every time. parser.vect = None # No need to pickle zeros return meta_results, parser, first_gradient, avg_gradient, stepsizes
def run(): RS = RandomState((seed, "top_rs")) all_data = omniglot.load_flipped_alphabets() train_data, tests_data = random_partition(all_data, RS, [12, 3]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) transform_parser = make_transform([0] * N_layers) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def likelihood_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def train_z(data, transform_vect, RS): def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = likelihood_loss(w_vect, data) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss) / N_scripts) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_init_scale) return sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters) def train_sharing(): def hyperloss(transform_vect, i_hyper): RS = RandomState((seed, i_hyper, "hyperloss")) cur_train_data, cur_valid_data = random_partition(train_data, RS, [10, 2]) z_vect_final = train_z(cur_train_data, transform_vect, RS) w_vect_final = transform_weights(z_vect_final, transform_vect) return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts hypergrad = grad(hyperloss) cur_transform_vect = make_transform([init_script_corr] * N_layers).vect for i_hyper in range(N_meta_iter): print "Hyper iter {0}".format(i_hyper) grad_transform = hypergrad(cur_transform_vect, i_hyper) cur_transform_vect = cur_transform_vect - grad_transform * meta_alpha return cur_transform_vect transform_vects, train_losses, tests_losses = {}, {}, {} transform_vects['no_sharing'] = make_transform([0, 0, 0]).vect transform_vects['full_sharing'] = make_transform([1, 0, 0]).vect transform_vects['learned_sharing'] = train_sharing() for name in transform_vects.keys(): RS = RandomState("final_training") tv = transform_vects[name] trained_z = train_z(train_data, tv, RS) trained_w = transform_weights(trained_z, tv) train_losses[name] = likelihood_loss(trained_w, train_data) / N_scripts tests_losses[name] = likelihood_loss(trained_w, tests_data) / N_scripts print "{0} : train: {1}, test: {2}".format(name, train_losses[name], tests_losses[name]) return transform_parser, transform_vects, train_losses, tests_losses
def run(): """Three different parsers: w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script script_parser[i_script] : weights vector for each script transform_parser[i_layer] : transform matrix (scripts x scripts) for each alphabet""" RS = RandomState((seed, "top_rs")) train_data, valid_data, tests_data = omniglot.load_data_split( [11, 2, 2], RS, num_alphabets=N_scripts) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size transform_parser = make_transform(N_scripts, script_corr_init) script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def total_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2_init) results = defaultdict(list) def hyperloss(transform_vect, i_hyper, record_results=True): RS = RandomState((seed, i_hyper, "hyperloss")) def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = total_loss(w_vect, train_data) reg = regularization(z_vect) if VERBOSE and record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format( i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None) w_vect_final = transform_weights(z_vect_final, transform_vect) valid_loss = total_loss(w_vect_final, valid_data) if record_results: results['valid_loss'].append(getval(valid_loss) / N_scripts) results['train_loss'].append(total_loss(w_vect_final, train_data) / N_scripts) results['tests_loss'].append(total_loss(w_vect_final, tests_data) / N_scripts) return valid_loss grad_transform = 0.0 for i_hyper in range(N_grad_averages): grad_transform += grad(hyperloss)(transform_parser.vect, i_hyper, record_results=False) grad_transform /= N_grad_averages i_hyper = N_grad_averages for i, d in enumerate(line_search_dists): new_transform_vect = transform_parser.vect - d * grad_transform hyperloss(new_transform_vect, i_hyper, record_results=True) print "Hyper iter {0}".format(i) print "Results", {k : v[-1] for k, v in results.iteritems()} grad_transform_dict = transform_parser.new_vect(grad_transform).as_dict() return results, grad_transform_dict
def run(script_corr): """Three different parsers: w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script script_parser[i_script] : weights vector for each script transform_parser[i_layer] : transform matrix (scripts x scripts) for each alphabet""" RS = RandomState((seed, "top_rs")) train_data, valid_data, tests_data = omniglot.load_data_split([11, 2, 2], RS, num_alphabets=N_scripts) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size uncorrelated_mat = np.eye(N_scripts) fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts) transform_mat = (1 - script_corr) * uncorrelated_mat + script_corr * fully_correlated_mat transform_parser = VectorParser() for i_layer in range(N_layers): if i_layer > 0: transform_parser[i_layer] = uncorrelated_mat else: transform_parser[i_layer] = transform_mat script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) def transform_weights(all_z_vect, transform_vect, i_script_out): all_z = script_parser.new_vect( all_z_vect) transform = transform_parser.new_vect(transform_vect) W = OrderedDict() # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported for k in w_parser.idxs_and_shapes.keys(): W[k] = 0.0 for i_layer in range(N_layers): script_weightings = transform[i_layer][i_script_out, :] for i_script in range(N_scripts): z_i_script = w_parser.new_vect(all_z[i_script]) script_weighting = script_weightings[i_script] W[('biases', i_layer)] += z_i_script[('biases', i_layer)] * script_weighting W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting return np.concatenate([v.ravel() for v in W.values()]) def loss_from_latents(z_vect, transform_vect, i_script, data): w_vect = transform_weights(z_vect, transform_vect, i_script) return loss_fun(w_vect, **data) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2_init) results = defaultdict(list) def hyperloss(transform_vect, i_hyper, record_results=False): def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script): RS = RandomState((seed, i_hyper, i_primal, i_script)) N_train = train_data[i_script]['X'].shape[0] idxs = RS.permutation(N_train)[:batch_size] minibatch = dictslice(train_data[i_script], idxs) loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch) if i_primal % N_thin == 0 and i_script == 0: print "Iter {0}, full losses: train: {1}, valid: {2}".format( i_primal, total_loss(train_data, getval(z_vect)), total_loss(valid_data, getval(z_vect))) if i_script == 0: # Only add regularization once loss += regularization(z_vect) return loss def total_loss(data, z_vect): return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script]) for i_script in range(N_scripts)]) z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(sub_primal_stochastic_loss), transform_vect, z_vect_0, alpha, beta, N_iters, N_scripts_per_iter, callback=None) valid_loss = total_loss(valid_data, z_vect_final) if record_results: results['valid_loss'].append(valid_loss) results['train_loss'].append(total_loss(train_data, z_vect_final)) # results['tests_loss'].append(total_loss(tests_data, z_vect_final)) return valid_loss hyperloss(transform_parser.vect, 0, record_results=True) return results['train_loss'][-1], results['valid_loss'][-1]
def run(script_corr): """Three different parsers: w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script script_parser[i_script] : weights vector for each script transform_parser[i_layer] : transform matrix (scripts x scripts) for each alphabet""" RS = RandomState((seed, "top_rs")) train_data, valid_data, tests_data = omniglot.load_data_split( [11, 2, 2], RS, num_alphabets=N_scripts) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size uncorrelated_mat = np.eye(N_scripts) fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts) transform_mat = (1 - script_corr ) * uncorrelated_mat + script_corr * fully_correlated_mat transform_parser = VectorParser() for i_layer in range(N_layers): if i_layer > 0: transform_parser[i_layer] = uncorrelated_mat else: transform_parser[i_layer] = transform_mat script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) def transform_weights(all_z_vect, transform_vect, i_script_out): all_z = script_parser.new_vect(all_z_vect) transform = transform_parser.new_vect(transform_vect) W = OrderedDict( ) # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported for k in w_parser.idxs_and_shapes.keys(): W[k] = 0.0 for i_layer in range(N_layers): script_weightings = transform[i_layer][i_script_out, :] for i_script in range(N_scripts): z_i_script = w_parser.new_vect(all_z[i_script]) script_weighting = script_weightings[i_script] W[('biases', i_layer)] += z_i_script[('biases', i_layer)] * script_weighting W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting return np.concatenate([v.ravel() for v in W.values()]) def loss_from_latents(z_vect, transform_vect, i_script, data): w_vect = transform_weights(z_vect, transform_vect, i_script) return loss_fun(w_vect, **data) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2_init) results = defaultdict(list) def hyperloss(transform_vect, i_hyper, record_results=False): def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script): RS = RandomState((seed, i_hyper, i_primal, i_script)) N_train = train_data[i_script]['X'].shape[0] idxs = RS.permutation(N_train)[:batch_size] minibatch = dictslice(train_data[i_script], idxs) loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch) if i_primal % N_thin == 0 and i_script == 0: print "Iter {0}, full losses: train: {1}, valid: {2}".format( i_primal, total_loss(train_data, getval(z_vect)), total_loss(valid_data, getval(z_vect))) if i_script == 0: # Only add regularization once loss += regularization(z_vect) return loss def total_loss(data, z_vect): return np.mean([ loss_from_latents(z_vect, transform_vect, i_script, data[i_script]) for i_script in range(N_scripts) ]) z_vect_0 = RS.randn( script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(sub_primal_stochastic_loss), transform_vect, z_vect_0, alpha, beta, N_iters, N_scripts_per_iter, callback=None) valid_loss = total_loss(valid_data, z_vect_final) if record_results: results['valid_loss'].append(valid_loss) results['train_loss'].append(total_loss(train_data, z_vect_final)) # results['tests_loss'].append(total_loss(tests_data, z_vect_final)) return valid_loss hyperloss(transform_parser.vect, 0, record_results=True) return results['train_loss'][-1], results['valid_loss'][-1]
def run(): RS = RandomState((seed, "top_rs")) all_data = mnist.load_data_as_dict() train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size exact_metagrad = [np.array([0])] # just a placeholder, filled in by sgd def transform_weights(z_vect, transform): return z_vect * np.exp(transform) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def constrain_reg(t_vect, name): all_t = w_parser.new_vect(t_vect) for i in range(N_layers): # Don't regularize biases all_t[('biases', i)] = 0.0 if name == 'universal': # One regularization hyperparameter for all weights # TODO: does computing means of means make sense? Not the same as the mean of all weights. t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)]) for i in range(N_layers): all_t[('weights', i)] = t_mean elif name == 'layers': # One regularization hyperparameter for each layer # TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why? DrMAD is already constrained? print t_vect.shape for i in range(N_layers): print "diff after constraining " + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)]))) all_t[('weights', i)] = np.mean(all_t[('weights', i)]) elif name == 'units': # One regularization hyperparameter per unit print t_vect.shape # 44860; this is correct for i in range(N_layers): # debug: how far the per-unit means sit from the layer mean print "weights " + str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)]))) # for i in range(N_layers): # TODO: This was the same as layer-wise # all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True) else: raise Exception("Unknown constraint: " + name) return all_t.vect def process_transform(t_vect): # Remove the redundancy due to sharing transformations within units all_t = w_parser.new_vect(t_vect) new_t = np.zeros((0,)) for i in range(N_layers): layer = all_t[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_t = log_L2 - 2 * layer[:, 0] new_t = np.concatenate((new_t, cur_t)) return new_t # TODO: make sure the exact_metagrad gets passed by reference def train_z(data, z_vect_0, transform, exact_metagrad): N_data = data['X'].shape[0] def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), transform, z_vect_0, exact_metagrad, alpha, beta, N_iters) all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad): def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad) w_vect_final = transform_weights(z_vect_final, transform) print "Training loss (unregularized) = " + str(getval(loss_fun(w_vect_final, **cur_train_data))) print "Validation loss = " + str(getval(loss_fun(w_vect_final, **cur_valid_data))) print "Test loss = " + str(getval(loss_fun(w_vect_final, **cur_tests_data))) print "Training error = " + str(getval(frac_err(w_vect_final, **cur_train_data))) print "Validation error = " + str(getval(frac_err(w_vect_final, **cur_valid_data))) print "Test error = " + str(getval(frac_err(w_vect_final, **cur_tests_data))) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) # No chain rule here '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) # TODO: recomputing path? w_vect_final = transform_weights(z_vect_final, transform) return frac_err(w_vect_final, **cur_valid_data)''' cur_reg = reg_0 for i_hyper in range(N_meta_iter): print "Hyper iter " + str(i_hyper) """if i_hyper % N_meta_thin == 0: test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data) all_tests_rates.append(test_rate) all_transforms.append(cur_reg.copy()) all_avg_regs.append(np.mean(cur_reg)) print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1]) print "Cur_transform", np.mean(cur_reg)""" RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad) constrained_grad = constrain_reg(raw_grad, constraint) # TODO: can put exact hypergradient here, using constraint # TODO: DrMAD norm matches the exact norm after constraining, but not before. Why? The raw gradient is about 4x larger than the constrained one. print np.linalg.norm(raw_grad) print np.linalg.norm(exact_metagrad[0]) constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint) # TODO: the cosine is sometimes negative print "cosine of angle between DrMAD and exact = " + str(np.dot(constrained_grad, constrained_exact_grad) / (np.linalg.norm(constrained_grad) * np.linalg.norm(constrained_exact_grad))) print "cosine of angle between signs of DrMAD and exact = " + str(np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad)) / len(constrained_grad)) print "DrMAD norm = " + str(np.linalg.norm(constrained_grad)) print "Exact norm = " + str(np.linalg.norm(constrained_exact_grad)) cur_reg -= np.sign(constrained_grad) * meta_alpha # TODO: step on the sign of the gradient only? # TODO: momentum return cur_reg reg = np.zeros(N_weights) + 0.2 constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top), constraint reg = train_reg(reg, constraint, N_meta_iter, i_top, exact_metagrad) all_L2_regs = np.array(zip(*map(process_transform, all_transforms))) return all_L2_regs, all_tests_rates, all_avg_regs
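The block above prints three diagnostics comparing the DrMAD hypergradient against the exact one: the cosine of the angle between them, the agreement between their signs, and their norms. Collected into one hypothetical helper:

import numpy as np

def hypergrad_diagnostics(approx_g, exact_g):
    # Cosine of the angle between the approximate (e.g. DrMAD) and exact hypergradients.
    cos = np.dot(approx_g, exact_g) / (np.linalg.norm(approx_g) * np.linalg.norm(exact_g))
    # Average agreement between the signs of corresponding coordinates, in [-1, 1].
    sign_cos = np.dot(np.sign(approx_g), np.sign(exact_g)) / len(approx_g)
    # Length ratio: an optimizer without a line search also needs comparable magnitudes.
    ratio = np.linalg.norm(approx_g) / np.linalg.norm(exact_g)
    return cos, sign_cos, ratio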
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) # only two regularization hyperparameter types, one per parameter group (weights, biases) N_weight_types = len(parser.names) # = 2 print(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) # don't update the scale # TODO: remove scale from the gradient, then? exact_metagrad = VectorParser() exact_metagrad['log_L2_reg'] = fill_parser(parser, hyperparams['log_L2_reg']) exact_metagrad['log_param_scale'] = fill_parser(parser, fixed_hyperparams['log_param_scale']) exact_metagrad['log_alphas'] = np.zeros(N_iters) exact_metagrad['invlogit_betas'] = np.zeros(N_iters) exact_metagrad2 = VectorParser() exact_metagrad2['log_L2_reg'] = np.zeros(N_weight_types) exact_metagrad2['log_param_scale'] = np.zeros(N_weight_types) exact_metagrad2['log_alphas'] = np.zeros(N_iters) exact_metagrad2['invlogit_betas'] = np.zeros(N_iters) # TODO: memoize def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = npr.RandomState(npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000)) # deterministic seed needed for the backwards pass idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: # record every thin-th iteration learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) # TODO: why doesn't the following line work with N_iters=1?
W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) #don't update scale W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size) # TODO: Put on proper scale; no SGD on log/invlogit scale alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) # TODO: check this L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg'])) W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), exact_metagrad, callback) #W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback) #callback(W_opt, N_iters) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) # TODO: This is where the chain rule happens, dhyperloss/dW_opt x dW_opt/dhyperparam_vect; first term is SGD meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] #def meta_callback(hyperparam_vect, i_hyper, metagrad): def meta_callback(hyperparam_vect, i_hyper, metagrad, exact_metagrad=exact_metagrad): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) # these are the unregularized losses below; default sets L2_reg=0.0 meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['train_err'].append(frac_err(x, **train_data)) meta_results['valid_err'].append(frac_err(x, **valid_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) print("metagrad", len(metagrad)) meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) #Michael: added comparisons with exact metagrad here #(2) Angle condition: More strongly, is the cosine of the angle between the two strictly bounded away from 0? #(3) Length: Since hypergradient optimization procedures do not necessarily use a proper line search, it may also be important for the approximate hypergradient to have a length comparable to the true hypergradient. exact_metagrad2['log_L2_reg'] = [sum(exact_metagrad['log_L2_reg'][0:7840]), sum(exact_metagrad['log_L2_reg'][7840:7850])] exact_metagrad2['log_param_scale'] = [sum(exact_metagrad['log_param_scale'][0:7840]), sum(exact_metagrad['log_param_scale'][7840:7850])] exact_metagrad2['log_alphas'] = exact_metagrad['log_alphas'] exact_metagrad2['invlogit_betas'] = exact_metagrad['invlogit_betas'] meta_results['exact_meta_grad_magnitude'].append(np.linalg.norm(exact_metagrad2.vect)) meta_results['DrMAD_exact_angle'].append(np.dot(exact_metagrad2.vect, metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(exact_metagrad2.vect))) #TODO: do the above for parameters separately? E.g. 
check log_alphas separately old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) #Michael: train->tests # final_result = adam(hyperloss_grad, hyperparams.vect, # meta_callback, N_meta_iter, meta_alpha) final_result = adam(hyperloss_grad, hyperparams.vect, exact_metagrad, meta_callback, N_meta_iter, meta_alpha) #write modified adam to ignore exact hypergrad in sgd4_mad_with_exact #meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
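The hyperparameters in this script are stored in unconstrained form so that plain gradient steps keep them in their valid ranges: stepsizes on a log scale, momentum decays pre-sigmoid. A sketch, assuming (as the name invlogit_betas suggests) that logit() in this codebase denotes the logistic sigmoid:

import numpy as np

def sigmoid(x):
    # Maps an unconstrained value into (0, 1); what these scripts call logit().
    return 1.0 / (1.0 + np.exp(-x))

log_alphas = np.full(10, np.log(0.1))  # learning rates on a log scale: exp(.) is always positive
invlogit_betas = np.full(10, 3.0)      # momentum decays pre-sigmoid: sigmoid(.) is always in (0, 1)
alphas = np.exp(log_alphas)
betas = sigmoid(invlogit_betas)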
def run(): RS = RandomState((seed, "top_rs")) all_data = mnist.load_data_as_dict() train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size def transform_weights(z_vect, transform): return z_vect * np.exp(transform) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def constrain_transform(t_vect, name): all_t = w_parser.new_vect(t_vect) for i in range(N_layers): all_t[("biases", i)] = 0.0 if name == "universal": t_mean = np.mean([np.mean(all_t[("weights", i)]) for i in range(N_layers)]) for i in range(N_layers): all_t[("weights", i)] = t_mean elif name == "layers": for i in range(N_layers): all_t[("weights", i)] = np.mean(all_t[("weights", i)]) elif name == "units": for i in range(N_layers): all_t[("weights", i)] = np.mean(all_t[("weights", i)], axis=1, keepdims=True) else: raise Exception return all_t.vect def process_transform(t_vect): # Remove the redundancy due to sharing transformations within units all_t = w_parser.new_vect(t_vect) new_t = np.zeros((0,)) for i in range(N_layers): layer = all_t[("weights", i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_t = log_L2 - 2 * layer[:, 0] new_t = np.concatenate((new_t, cur_t)) return new_t def train_z(data, z_vect_0, transform): N_data = data["X"].shape[0] def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters) all_transforms, all_tests_loss = [], [] def train_reg(transform_0, constraint, N_meta_iter, i_top): def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) cur_transform = transform_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data) all_tests_loss.append(tests_loss) all_transforms.append(cur_transform.copy()) print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1]) RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_transform, i_hyper, *cur_split) constrained_grad = constrain_transform(raw_grad, constraint) cur_transform -= constrained_grad * meta_alpha return cur_transform transform = np.zeros(N_weights) constraints = ["universal", "layers", "units"] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) transform = train_reg(transform, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_transform, all_transforms))) return all_L2_regs, all_tests_loss
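constrain_transform above ties hyperparameters together by overwriting each tied group with its mean: one value for the whole net ('universal'), one per layer ('layers'), or one per unit ('units'). A standalone sketch on a single layer's matrix, where 'universal' and 'layers' coincide:

import numpy as np

def tie_hyperparams(t, level):
    # t: per-weight hyperparameters for one layer, shape (n_units, n_inputs).
    if level == "universal" or level == "layers":
        return np.full(t.shape, np.mean(t))  # one shared value for the whole layer
    elif level == "units":
        # One shared value per row (unit), broadcast back to the full shape.
        return np.tile(np.mean(t, axis=1, keepdims=True), (1, t.shape[1]))
    raise ValueError("unknown constraint: %s" % level)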
def run(subClassIndexList): RS = RandomState((seed, "top_rs")) data = loadData.loadMnist() train_data, tests_data = loadData.load_data_as_dict(data, classNum, subClassIndexList[0]) train_data_subclass = loadSubsetData(train_data, RS, N_train, clientNum) print "training samples: {0}, testing samples: {1}".format(N_train, N_tests) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size init_scales = w_parser.new_vect(np.zeros(N_weights)) for i in range(N_layers): init_scales[('weights', i)] = 1.0 / np.sqrt(layer_sizes[i]) init_scales[('biases', i)] = 1.0 init_scales = init_scales.vect all_regs, all_tests_loss = [], [] def train_reg(reg_0, constraint, N_meta_iter, i_top): def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) # reg is the vector of hyperparameters cur_reg = reg_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data) all_tests_loss.append(tests_loss) all_regs.append(cur_reg.copy()) print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1]) print "Cur_reg", cur_reg for client_i in range(clientNum): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data_subclass[client_i], RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_reg, i_hyper, *cur_split) constrained_grad = constrain_reg(w_parser, raw_grad, constraint) # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha / clientNum cur_reg -= constrained_grad * meta_alpha / clientNum print "\n" return cur_reg def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_hyper, "hyperloss")) w_vect_0 = RS.randn(N_weights) * init_scales w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg) return loss_fun(w_vect_final, **cur_valid_data) # t_scale = [-1, 0, 1] # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) # for s in t_scale: # reg = np.ones(N_weights) * log_L2_init + s # loss = new_hyperloss(reg, 0, *cur_split) # print "Results: s = {0}, loss = {1}".format(s, loss) reg = np.ones(N_weights) * log_L2_init constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) reg = train_reg(reg, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*[process_reg(w_parser, reg) for reg in all_regs])) # assuming process_reg(parser, reg) mirrors constrain_reg(parser, ...) return all_L2_regs, all_tests_loss
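The inner client loop above accumulates one constrained hypergradient per client split and scales each step by 1/clientNum, so one pass over all clients amounts to an averaged meta-update. Isolated into a hypothetical helper:

import numpy as np

def federated_hyper_step(cur_reg, client_splits, hypergrad, constrain, meta_alpha):
    # One meta-update: average constrained hypergradients across client data splits.
    for i, (train_split, valid_split) in enumerate(client_splits):
        g = constrain(hypergrad(cur_reg, i, train_split, valid_split))
        cur_reg = cur_reg - g * meta_alpha / len(client_splits)
    return cur_reg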
def run(script_corr_init): """Three different parsers: w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script script_parser[i_script] : weights vector for each script transform_parser[i_layer] : transform matrix (scripts x scripts) for each alphabet""" RS = RandomState((seed, "top_rs")) train_data, valid_data, tests_data = omniglot.load_data_split([11, 2, 2], RS, num_alphabets=N_scripts) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size transform_parser = make_transform(N_scripts, script_corr_init) script_parser = VectorParser() for i_script in range(N_scripts): script_parser[i_script] = np.zeros(N_weights) def get_layers(vect): layers = [] for i_layer in range(N_layers): weights_by_scripts = vect.reshape((N_scripts, N_weights)) weights_idxs, _ = w_parser.idxs_and_shapes[("weights", i_layer)] biases_idxs, _ = w_parser.idxs_and_shapes[("biases", i_layer)] assert weights_idxs.stop == biases_idxs.start layer_idxs = slice(weights_idxs.start, biases_idxs.stop) layers.append(weights_by_scripts[:, layer_idxs]) return layers def transform_weights(z_vect, transform_vect): z_layers = get_layers(z_vect) transform = transform_parser.new_vect(transform_vect) w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)] return np.concatenate(w_layers, axis=1).ravel() def total_loss(w_vect, data): w = script_parser.new_vect(w_vect) return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)]) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2_init) results = defaultdict(list) def hyperloss(transform_vect, i_hyper, record_results=True): RS = RandomState((seed, i_hyper, "hyperloss")) def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = total_loss(w_vect, train_data) reg = regularization(z_vect) if VERBOSE and record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None) w_vect_final = transform_weights(z_vect_final, transform_vect) valid_loss = total_loss(w_vect_final, valid_data) if record_results: results["valid_loss"].append(getval(valid_loss) / N_scripts) results["train_loss"].append(total_loss(w_vect_final, train_data) / N_scripts) return valid_loss hyperloss(transform_parser.vect, 0) return results["train_loss"][-1], results["valid_loss"][-1]
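All of these scripts derive their random streams from structured keys such as RandomState((seed, i_hyper, "hyperloss")), so any stage, including the reversed backward pass, can regenerate exactly the same minibatches. The RandomState class imported here accepts such tuples directly; a minimal self-contained equivalent using only numpy and hashlib might look like this:

import hashlib
import numpy as np

def seeded_rng(*key):
    # Deterministic RandomState from a structured key: hash the key's repr to a 32-bit seed.
    digest = hashlib.md5(repr(key).encode()).hexdigest()
    return np.random.RandomState(int(digest[:8], 16))

rng = seeded_rng(0, 3, "hyperloss")  # the same key always yields the same stream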
def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) N_weights = len(parser.vect) hyperparams = VectorParser() rs = RandomState((seed)) hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\ + rs.randn(N_weights) * init_log_L2_reg_noise hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) cur_primal_results = {} def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState( (seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append( loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = np.exp(cur_hyperparams['log_L2_reg']) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) cur_primal_results['weights'] = getval(W_opt).copy() cur_primal_results['learning_curve'] = getval(learning_curve_dict) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) x, learning_curve_dict = cur_primal_results[ 'weights'], cur_primal_results['learning_curve'] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field] = cur_hyperparams[field] meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['example_weights'] = x if metagrad is not None: meta_results['meta_grad_magnitude'].append( np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) initial_hypergrad = hyperloss_grad(hyperparams.vect, 0) parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy()) final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, 
N_meta_iter, meta_alpha) meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser, parsed_init_hypergrad
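Every variant above shares the same outer loop: compute a hypergradient with grad(hyperloss) and feed it to a meta-optimizer such as adam. A minimal self-contained Adam loop over a hypergradient callable (all names hypothetical; hyper_0 is a numpy array):

import numpy as np

def adam_meta_optimize(hypergrad, hyper_0, n_meta_iter=50, meta_alpha=0.05,
                       b1=0.9, b2=0.999, eps=1e-8, callback=None):
    # Standard Adam on the meta-objective: hypergrad(hyper, i) returns the hypergradient.
    hyper = hyper_0.copy()
    m = np.zeros_like(hyper)  # first-moment estimate
    v = np.zeros_like(hyper)  # second-moment estimate
    for i in range(n_meta_iter):
        g = hypergrad(hyper, i)
        m = b1 * m + (1 - b1) * g
        v = b2 * v + (1 - b2) * g ** 2
        mhat = m / (1 - b1 ** (i + 1))  # bias correction
        vhat = v / (1 - b2 ** (i + 1))
        hyper = hyper - meta_alpha * mhat / (np.sqrt(vhat) + eps)
        if callback is not None:
            callback(hyper, i)
    return hyper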