def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_transform(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss = [], []

    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform

    transform = np.zeros(N_weights)
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        transform = train_reg(transform, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_loss
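# The run() variants in this file reference module-level hyperparameters
# (seed, layer_sizes, N_train, N_tests, N_valid, N_layers, batch_size, alpha,
# beta, N_iters, meta_alpha, log_L2, log_init_scale, log_L2_init, N_thin,
# N_meta_thin, all_N_meta_iter) and helpers (RandomState, random_partition,
# dictslice, make_nn_funs, sgd and its *_meta_only* variants, grad, getval,
# mnist) defined elsewhere in the experiment scripts. The sketch below is only
# an illustration of that scaffolding: the numeric values and the commented-out
# import paths are assumptions, not the settings used in the original runs.
import numpy as np

# from hypergrad.util import RandomState, dictslice   # assumed module path
# from hypergrad.nn_utils import make_nn_funs         # assumed module path
# from hypergrad.optimizers import sgd                # assumed module path
# from funkyyak import grad, getval                   # assumed module path

seed = 0
layer_sizes = [784, 50, 50, 10]                # illustrative architecture
N_layers = len(layer_sizes) - 1
N_train, N_valid, N_tests = 10000, 1000, 1000  # illustrative split sizes
batch_size = 250
N_iters = 50                                   # inner (primal) SGD iterations
alpha, beta = 0.1, 0.9                         # inner learning rate and momentum
meta_alpha = 0.04                              # hyperparameter learning rate
log_L2 = -4.0                                  # shared log L2 strength (z-space variants)
log_L2_init = -4.0                             # initial per-weight log L2 (w-space variant)
log_init_scale = -3.0                          # log of the initial weight scale
N_thin, N_meta_thin = 10, 1                    # printing / recording intervals
all_N_meta_iter = [10, 10, 10]                 # meta iterations per sharing constraint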
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):  # TODO: isn't this a scale transformation?
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):  # Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal':  # One regularization hyperparameter for all weights
            # TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':  # One regularization hyperparameter for each layer
            # TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            # print t_vect.shape
            for i in range(N_layers):
                # print "diff after constraining " + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape  # 44860; this is correct
            # for i in range(N_layers):
            #     print "weights " + str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            # for i in range(N_layers):  # TODO: This was the same as layer-wise
            #     all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]  # TODO: equivalent regularization weights
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)  # TODO: this is a scale transformation, not regularization!
            loss = loss_fun(w_vect, **minibatch)  # use new scale for prediction
            reg = regularization(z_vect)  # regularize original scale
            # TODO: should be equivalent: w = z*e^transform, so
            #   f(z*e^transform) + e^\lambda ||z||^2 = f(w) + e^\lambda ||z||^2 = f(w) + e^\lambda ||e^{-transform} w||^2
            # see process_transform
            # if record_results and i_primal % N_thin == 0:
            #     print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd_meta_only_mad(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    def train_z_exact(data, z_vect_0, transform, meta_iteration=0):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            # if record_results and i_primal % N_thin == 0:
            #     print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd_meta_only(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters,
                             meta_iteration=meta_iteration)

    def train_z2(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            return loss + reg

        return sgd_meta_only_mad2(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    (all_transforms, all_train_loss, all_valid_loss, all_tests_loss,
     all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
     hypergrad_angles, hypergrad_angles2, hypergrad_signs_angles, hypergrad_signs_angles2,
     hypergrad_norms, hypergrad_norms2, exact_hypergrad_norms) = (
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], [])

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)  # TODO: initial scale AND regularization

            train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
            print "Training loss (unregularized) = " + str(train_loss)
            all_train_loss.append(train_loss)
            valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
            print "Validation loss = " + str(valid_loss)
            all_valid_loss.append(valid_loss)
            tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
            print "Test loss = " + str(tests_loss)
            all_tests_loss.append(tests_loss)
            """plt.plot(all_train_loss, label="training loss (unregularized)")
            plt.plot(all_valid_loss, label="validation loss")
            plt.plot(all_tests_loss, label="test loss")
            plt.title("loss vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("loss")
            plt.legend()
            plt.savefig("loss" + str(N_iters) + "_corrected.png")
            plt.clf()"""

            train_rate = getval(frac_err(w_vect_final, **cur_train_data))
            print "Training error rate = " + str(train_rate)
            all_train_rates.append(train_rate)
            valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
            print "Validation error rate = " + str(valid_rate)
            all_valid_rates.append(valid_rate)
            tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
            print "Test error rate = " + str(tests_rate)
            all_tests_rates.append(tests_rate)
            """plt.plot(all_train_rates, label="training error rate")
            plt.plot(all_valid_rates, label="validation error rate")
            plt.plot(all_tests_rates, label="test error rate")
            plt.title("error rate vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("error rate")
            plt.legend()
            plt.savefig("error" + str(N_iters) + "_corrected.png")
            plt.clf()"""
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)  # No chain rule here

        def hyperloss_exact(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, meta_it=0):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z_exact(cur_train_data, z_vect_0, transform, meta_iteration=meta_it)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad_exact = grad(hyperloss_exact)  # No chain rule here

        def hyperloss2(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z2(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad2 = grad(hyperloss2)

        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)  # TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0  # initial regularization, besides regularization() function
        for i_hyper in range(N_meta_iter):
            print "Hyper iter " + str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])  # cur_train_data, cur_valid_data
            # raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data)
            raw_grad2 = hypergrad2(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data)
            raw_grad_exact = hypergrad_exact(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data,
                                             meta_it=i_hyper)
            # print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            constrained_grad2 = constrain_reg(raw_grad2, constraint)
            constrained_grad_exact = constrain_reg(raw_grad_exact, constraint)
            print(np.linalg.norm(raw_grad))
            print(np.linalg.norm(raw_grad2))
            # TODO: Exploding DrMAD gradient; ~10^10x larger than exact gradient with N_safe_sampling = N_iters
            print(np.linalg.norm(raw_grad_exact))  # TODO: sometimes negative???

            hypergrad_angle = np.dot(constrained_grad, constrained_grad_exact) / (
                np.linalg.norm(constrained_grad) * np.linalg.norm(constrained_grad_exact))
            hypergrad_angles.append(hypergrad_angle)
            hypergrad_angle2 = np.dot(constrained_grad2, constrained_grad_exact) / (
                np.linalg.norm(constrained_grad2) * np.linalg.norm(constrained_grad_exact))
            hypergrad_angles2.append(hypergrad_angle2)
            print("cosine of angle between DrMAD and exact = " + str(hypergrad_angle))
            print("cosine of angle between DrMAD2 and exact = " + str(hypergrad_angle2))
            hypergrad_signs_angle = np.dot(np.sign(constrained_grad),
                                           np.sign(constrained_grad_exact)) / len(constrained_grad)
            hypergrad_signs_angles.append(hypergrad_signs_angle)
            print("cosine of angle between signs of DrMAD and exact = " + str(hypergrad_signs_angle))
            hypergrad_signs_angle2 = np.dot(np.sign(constrained_grad2),
                                            np.sign(constrained_grad_exact)) / len(constrained_grad2)
            hypergrad_signs_angles2.append(hypergrad_signs_angle2)
            print("cosine of angle between signs of DrMAD2 and exact = " + str(hypergrad_signs_angle2))
            """plt.plot(hypergrad_angles, label="exact vs DrMAD")
            plt.plot(hypergrad_signs_angles, label="signs exact vs signs DrMAD")
            plt.plot(hypergrad_angles2, label="exact vs DrMAD2")
            plt.plot(hypergrad_signs_angles2, label="signs exact vs signs DrMAD2")
            plt.title("Cosine of angle between hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("cosine of angle")
            plt.legend()
            plt.savefig("angle" + str(N_iters) + "_corrected2.png")
            plt.clf()"""

            hypergrad_norm = np.linalg.norm(constrained_grad)
            hypergrad_norms.append(hypergrad_norm)
            print("DrMAD norm = " + str(hypergrad_norm))
            hypergrad_norm2 = np.linalg.norm(constrained_grad2)
            hypergrad_norms2.append(hypergrad_norm2)
            print("DrMAD2 norm = " + str(hypergrad_norm2))
            exact_hypergrad_norm = np.linalg.norm(constrained_grad_exact)
            exact_hypergrad_norms.append(exact_hypergrad_norm)
            print("Exact norm = " + str(exact_hypergrad_norm))
            """plt.plot(hypergrad_norms, label="DrMAD hypergradient")
            plt.plot(hypergrad_norms2, label="DrMAD2 hypergradient")
            plt.plot(exact_hypergrad_norms, label="Exact hypergradient")
            plt.title("Norms of hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("norm")
            plt.legend()
            plt.savefig("norms" + str(N_iters) + "_corrected2.png")
            plt.clf()"""

            cur_reg -= np.sign(constrained_grad) * meta_alpha  # TODO: signs of gradient... TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights) + 0.2  # TODO: initial -log regularization; not in log scale?
    constraints = ['universal', 'layers', 'units']  # TODO: uses multiple kinds of hyperparameter sharing, but in order
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    # return all_L2_regs, all_tests_rates, all_avg_regs
    return (all_L2_regs, all_train_loss, all_valid_loss, all_tests_loss,
            all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
            hypergrad_angles, hypergrad_signs_angles, hypergrad_norms, exact_hypergrad_norms)
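# The TODO inside primal_loss above claims that training the reparameterized
# variables z with the fixed penalty e^{log_L2} * ||z||^2 is the same as an
# ordinary per-weight L2 penalty on w = z * e^{transform} with coefficient
# e^{log_L2 - 2*transform}, which is exactly the quantity process_transform
# reports. The standalone check below verifies that identity numerically; it is
# an illustrative sketch, independent of the training code above.
import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(5)                   # reparameterized weights
t = rng.randn(5)                   # per-weight log-scale transform
log_L2 = -4.0                      # illustrative value

w = z * np.exp(t)                                    # transform_weights
penalty_z = np.exp(log_L2) * np.dot(z, z)            # regularization() in z-space
penalty_w = np.sum(np.exp(log_L2 - 2 * t) * w ** 2)  # equivalent per-weight L2 on w

print(np.allclose(penalty_z, penalty_w))             # True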
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases', i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))

    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean([np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0,))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]

        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            print constrained_grad
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha
        return cur_reg

    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
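# constrain_reg projects the raw hypergradient onto the chosen sharing pattern
# by averaging: one shared value over all weights ('universal', computed as a
# mean of per-layer means, as the TODOs elsewhere in this file point out), one
# value per layer ('layers'), or one value per row of each weight matrix
# ('units'), with bias entries zeroed. The sketch below reproduces just that
# averaging on plain per-layer arrays, without the w_parser machinery; the
# function name and the example shapes are illustrative only.
import numpy as np

def share(layer_grads, name):
    if name == 'universal':      # one value shared across every weight
        mean = np.mean([np.mean(g) for g in layer_grads])  # mean of per-layer means
        return [np.full_like(g, mean) for g in layer_grads]
    elif name == 'layers':       # one value per layer
        return [np.full_like(g, np.mean(g)) for g in layer_grads]
    elif name == 'units':        # one value per row of each weight matrix
        return [np.tile(np.mean(g, axis=1, keepdims=True), (1, g.shape[1]))
                for g in layer_grads]
    else:
        raise ValueError(name)

example_grads = [np.arange(6.0).reshape(2, 3), np.ones((3, 2))]
for mode in ['universal', 'layers', 'units']:
    print(mode)
    for g in share(example_grads, mode):
        print(g)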
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    exact_metagrad = [np.array([0])]  # just a placeholder

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):  # Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal':  # One regularization hyperparameter for all weights
            # TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':  # One regularization hyperparameter for each layer
            # TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            print t_vect.shape
            for i in range(N_layers):
                print "diff after constraining " + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape  # 44860; this is correct
            for i in range(N_layers):
                print "weights " + str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            # for i in range(N_layers):  # TODO: This was the same as layer-wise
            #     all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    # TODO: make sure the exact_metagrad gets passed by reference
    def train_z(data, z_vect_0, transform, exact_metagrad):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            # exact_metagrad=exact_metagrad2, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), transform, z_vect_0, exact_metagrad, alpha, beta, N_iters)

    all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
            w_vect_final = transform_weights(z_vect_final, transform)
            # TODO: print/store losses and error rates here
            print "Training loss (unregularized) = " + str(getval(loss_fun(w_vect_final, **cur_train_data)))
            print "Validation loss = " + str(getval(loss_fun(w_vect_final, **cur_valid_data)))
            print "Test loss = " + str(getval(loss_fun(w_vect_final, **tests_data)))
            print "Training error = " + str(getval(frac_err(w_vect_final, **cur_train_data)))
            print "Validation error = " + str(getval(frac_err(w_vect_final, **cur_valid_data)))
            print "Test error = " + str(getval(frac_err(w_vect_final, **tests_data)))
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)  # No chain rule here

        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)  # TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            print "Hyper iter " + str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])  # cur_train_data, cur_valid_data
            # raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad)
            # print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)  # TODO: can put exact hypergradient here, using constraint
            # print "after constraining grad, before constraining exact"
            # TODO: DrMAD norm matches after constraining, but not exact norm?? Why???
            # This one is about 4x larger than constrained one
            print np.linalg.norm(raw_grad)
            print np.linalg.norm(exact_metagrad[0])
            constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint)
            # print "after constraining exact"
            # TODO: compute statistics
            # TODO: sometimes negative???
            print("cosine of angle between DrMAD and exact = " + str(
                np.dot(constrained_grad, constrained_exact_grad) /
                (np.linalg.norm(constrained_grad) * np.linalg.norm(constrained_exact_grad))))
            print("cosine of angle between signs of DrMAD and exact = " + str(
                np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad)) / len(constrained_grad)))
            print("DrMAD norm = " + str(np.linalg.norm(constrained_grad)))
            print("Exact norm = " + str(np.linalg.norm(constrained_exact_grad)))
            cur_reg -= np.sign(constrained_grad) * meta_alpha  # TODO: signs of gradient... TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights) + 0.2
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top, exact_metagrad)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_rates, all_avg_regs
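# The DrMAD variants above repeat the same inline diagnostics when comparing an
# approximate hypergradient against the exact one: cosine similarity, the mean
# product of gradient signs (printed as "cosine of angle between signs"), and
# the two norms. The helper below packages those computations; the function and
# variable names are illustrative and not part of the original code.
import numpy as np

def hypergrad_diagnostics(approx_grad, exact_grad):
    approx_norm = np.linalg.norm(approx_grad)
    exact_norm = np.linalg.norm(exact_grad)
    cosine = np.dot(approx_grad, exact_grad) / (approx_norm * exact_norm)
    sign_cosine = np.dot(np.sign(approx_grad), np.sign(exact_grad)) / len(approx_grad)
    return {'cosine': cosine,            # alignment of the two hypergradients
            'sign_cosine': sign_cosine,  # +1 if every sign matches, -1 if none do
            'approx_norm': approx_norm,
            'exact_norm': exact_norm}

# Example usage with random stand-ins for the two hypergradients:
rng = np.random.RandomState(0)
print(hypergrad_diagnostics(rng.randn(10), rng.randn(10)))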