def hypergrad(outgrad):
    d_x = outgrad
    global v_current
    v = v_current
    d_alphas, d_gammas = np.zeros(len(alphas)), np.zeros(len(gammas))
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    beta = np.linspace(0.001, 0.999, N_safe_sampling)  # Evenly spaced interpolation coefficients.
    for i, alpha, gamma in iters[::-1]:
        # DrMAD approximation: reconstruct the training trajectory as a linear
        # interpolation between the initial and final weights.
        x = (1 - beta[i]) * x_init + beta[i] * x_final
        x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
        v = np.subtract(x, x_previous) / alpha  # Recover the velocity.
        d_alphas[i] = np.dot(d_x, v)
        g = L_grad(x, meta, i)  # Evaluate gradient.
        # v = (v + (1.0 - gamma) * g) / gamma
        d_v += d_x * alpha
        d_gammas[i] = np.dot(d_v, v + g)
        d_x -= (1.0 - gamma) * L_hvp_x(x, meta, d_v, i)  # Step omitted in the DrMAD paper.
        d_meta -= (1.0 - gamma) * L_hvp_meta(x, meta, d_v, i)
        d_v *= gamma                                     # Step omitted in the DrMAD paper.
    # assert np.all(ExactRep(x0).val == X.val)
    return d_x, d_alphas, d_gammas, d_meta
def hypergrad(outgrad):
    d_x = outgrad
    global v_current
    v = v_current
    d_alphas, d_gammas = np.zeros(alphas.shape), np.zeros(gammas.shape)
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    beta = np.linspace(0.001, 0.999, N_safe_sampling)
    for i, alpha, gamma in iters[::-1]:
        # Build per-parameter alpha and gamma vectors.
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_gamma_vect = fill_parser(parser, gamma)
        # DrMAD approximation: interpolate the trajectory between x_init and x_final.
        x = (1 - beta[i]) * x_init + beta[i] * x_final
        x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
        v = np.subtract(x, x_previous) / cur_alpha_vect  # Recover the velocity.
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_alphas[i, j] = np.dot(d_x[ixs], v[ixs])
        g = L_grad(x, meta, i)  # Evaluate gradient.
        d_v += d_x * cur_alpha_vect
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_gammas[i, j] = np.dot(d_v[ixs], v[ixs] + g[ixs])
        d_x -= L_hvp_x(x, meta, (1.0 - cur_gamma_vect) * d_v, i)
        d_meta -= L_hvp_meta(x, meta, (1.0 - cur_gamma_vect) * d_v, i)
        d_v *= cur_gamma_vect
    # assert np.all(ExactRep(x0).val == X.val)
    return d_x, d_alphas, d_gammas, d_meta
def hypergrad(outgrad):
    d_x = outgrad
    d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)
    L_hvp_meta = grad(grad_proj, 1)
    for i, alpha, beta in iters[::-1]:
        # Build per-parameter alpha and beta vectors.
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_beta_vect = fill_parser(parser, beta)
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])
        # Exactly reverse SGD.
        X.sub(cur_alpha_vect * V.val)
        g = L_grad(X.val, meta, i)
        V.add(g).div(cur_beta_vect)
        d_v += d_x * cur_alpha_vect
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs])
        d_x -= L_hvp_x(X.val, meta, d_v, i)
        d_meta -= L_hvp_meta(X.val, meta, d_v, i)
        d_v *= cur_beta_vect
    assert np.all(ExactRep(x0).val == X.val)
    return d_x, d_alphas, d_betas, d_meta
def hypergrad(outgrad):
    d_x = outgrad
    d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    for i, alpha, beta in iters[::-1]:
        # Build per-parameter alpha and beta vectors.
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_beta_vect = fill_parser(parser, beta)
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])
        X.sub(cur_alpha_vect * V.val)                         # Reverse position update.
        g = L_grad(X.val, meta, i)                            # Evaluate gradient.
        V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)   # Reverse momentum update.
        d_v += d_x * cur_alpha_vect
        for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
            d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])
        d_x -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
        d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
        d_v *= cur_beta_vect
    assert np.all(ExactRep(x0).val == X.val)
    return d_x, d_alphas, d_betas, d_meta
def run():
    print "Running experiment..."
    sgd_optimized_points = []
    ed_optimized_points = []
    for i in xrange(N_samples):
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        sgd_optimized_points.append(
            sgd(grad(nllfunt), x=x0, v=v0, learn_rate=alpha, decay=decay,
                iters=N_iter))
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        ed_optimized_points.append(
            entropic_descent(grad(nllfunt), x=x0, v=v0, learn_rate=alpha,
                             decay=decay, iters=N_iter, theta=theta, rs=rs))
    entropy = np.log(decay) * D * N_iter
    return sgd_optimized_points, ed_optimized_points, entropy
def run():
    print "Running experiment..."
    sgd_optimized_points = []
    ed_optimized_points = []
    aed_optimized_points = []
    asgd_optimized_points = []
    for i in xrange(N_samples):
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        sgd_optimized_points.append(
            sgd(grad(nllfunt), x=x0, v=v0, learn_rate=alpha, decay=decay,
                iters=N_iter))
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        ed_optimized_points.append(
            entropic_descent(grad(nllfunt), x=x0, v=v0, learn_rate=alpha,
                             decay=decay, iters=N_iter, theta=theta, rs=rs))
        entropy = np.log(decay) * D * N_iter
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        aed_optimized_points.append(
            adaptive_entropic_descent(grad(nllfunt), x=x0, v=v0,
                                      init_learn_rate=alpha,
                                      init_log_decay=np.log(decay),
                                      meta_learn_rate=meta_alpha,
                                      meta_decay=meta_decay, iters=N_iter))
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        asgd_optimized_points.append(
            adaptive_sgd(grad(nllfunt), x=x0, v=v0, init_learn_rate=alpha,
                         init_log_decay=np.log(decay),
                         meta_learn_rate=meta_alpha, meta_decay=meta_decay,
                         iters=N_iter))
    return (sgd_optimized_points, ed_optimized_points, aed_optimized_points,
            asgd_optimized_points, entropy)
def test_sub():
    fun = lambda x, y: to_scalar(x - y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    for arg1, arg2 in arg_pairs():
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
def sgd(loss_fun, batches, N_iter, x, v, alphas, betas,
        record_learning_curve=False):
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(
                i / iter_per_epoch, loss_fun(X.val, batches.all_idxs))

    X, V = ExactRep(x), ExactRep(v)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter / len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    loss_grad = grad(loss_fun)
    loss_hvp = grad(lambda x, d, idxs: np.dot(loss_grad(x, idxs), d))
    learning_curve = [loss_fun(x_orig, batches.all_idxs)]

    # Forward pass: SGD with momentum, stored reversibly.
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = loss_grad(X.val, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        if record_learning_curve and (i + 1) % iter_per_epoch == 0:
            learning_curve.append(loss_fun(X.val, batches.all_idxs))
        # print_perf()

    x_final = X.val
    d_x = loss_grad(X.val, batches.all_idxs)
    loss_final = loss_fun(x_final, batches.all_idxs)
    d_v = np.zeros(d_x.shape)
    d_alphas = deque()
    d_betas = deque()
    print_perf()

    # Reverse pass: exactly undo each update and accumulate hypergradients.
    for i, alpha, beta, batch in iters[::-1]:
        print_perf()
        d_v += d_x * alpha
        X.sub(alpha * V.val)
        g = loss_grad(X.val, batch)
        d_alphas.appendleft(np.dot(d_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        d_betas.appendleft(np.dot(d_v, V.val + g))
        d_x = d_x - (1.0 - beta) * loss_hvp(X.val, d_v, batch)
        d_v = d_v * beta

    d_alphas = np.array(d_alphas)
    d_betas = np.array(d_betas)
    assert np.all(x_orig == X.val)
    return {'x_final': x_final,
            'learning_curve': learning_curve,
            'loss_final': loss_final,
            'd_x': d_x,
            'd_v': d_v,
            'd_alphas': d_alphas,
            'd_betas': d_betas}
def test_div():
    fun = lambda x, y: to_scalar(x / y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    make_gap_from_zero = lambda x: np.sqrt(x**2 + 0.5)
    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
def test_pow():
    fun = lambda x, y: to_scalar(x ** y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    make_positive = lambda x: np.abs(x) + 1.1  # Numeric derivatives fail near zero.
    for arg1, arg2 in arg_pairs():
        arg1 = make_positive(arg1)
        arg2 = np.round(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
def test_hess_vector_prod():
    npr.seed(1)
    randv = npr.randn(10)

    def fun(x):
        return np.sin(np.dot(x, randv))

    df = grad(fun)

    def vector_product(x, v):
        return np.sin(np.dot(v, df(x)))

    ddf = grad(vector_product)
    A = npr.randn(10)
    B = npr.randn(10)
    check_grads(fun, A)
    check_grads(vector_product, A, B)
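# Added illustration (not from the original code): the hypergradient routines in
# this file all build Hessian-vector products the same way -- differentiate the
# dot product of a gradient with a fixed vector. A minimal sketch of that trick,
# assuming the imports used throughout (autograd.numpy as np, autograd's grad);
# the toy loss below is made up for illustration only.
def example_hvp_sketch():
    example_loss = lambda x: np.sum(np.sin(x) * x)
    example_loss_grad = grad(example_loss)
    # d/dx [ grad(loss)(x) . v ] = H(x) v, without ever forming the Hessian H.
    example_hvp = grad(lambda x, v: np.dot(example_loss_grad(x), v), 0)
    x = np.linspace(-1.0, 1.0, 5)
    v = np.ones(5)
    return example_hvp(x, v)  # Length-5 vector equal to H(x) v.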
def sgd3(optimizing_loss, secondary_loss, x0, v0, alphas, betas, meta,
         callback=None):
    """Same as sgd2, but simplified: it doesn't bother with gradients of the
    optimizing loss (you can always pass that in as the secondary loss)."""
    X, V = ExactRep(x0), ExactRep(v0)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    iters = zip(range(len(alphas)), alphas, betas)

    # Forward pass: SGD with momentum.
    for i, alpha, beta in iters:
        if callback:
            callback(X.val, i)
        g = L_grad(X.val, meta, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val

    M_grad = grad(secondary_loss, 0)       # Gradient wrt parameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    dMd_x = M_grad(X.val, meta)
    dMd_v = np.zeros(dMd_x.shape)
    dMd_alphas = deque()
    dMd_betas = deque()
    dMd_meta = M_meta_grad(X.val, meta)

    # Reverse pass: exactly undo each update and accumulate hypergradients.
    for i, alpha, beta in iters[::-1]:
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, i)
        V.add((1.0 - beta) * g).div(beta)
        dMd_v += dMd_x * alpha
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dMd_x -= (1.0 - beta) * L_hvp_x(X.val, meta, dMd_v, i)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, i)
        dMd_v *= beta

    assert np.all(ExactRep(x0).val == X.val)
    return {'x_final': x_final,
            'dMd_x': dMd_x,
            'dMd_v': dMd_v,
            'dMd_alphas': dMd_alphas,
            'dMd_betas': dMd_betas,
            'dMd_meta': dMd_meta}
def hypergrad(outgrad):
    d_x = outgrad
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    beta = np.linspace(0.001, 0.999, N_safe_sampling)
    for i in range(N_safe_sampling)[::-1]:
        # DrMAD approximation: interpolate the trajectory between x_init and x_final.
        x_current = (1 - beta[i]) * x_init + beta[i] * x_final
        d_v += d_x * alpha
        d_x -= (1.0 - gamma) * L_hvp_x(x_current, meta, d_v, i)
        d_meta -= (1.0 - gamma) * L_hvp_meta(x_current, meta, d_v, i)
        d_v *= gamma
    # assert np.all(ExactRep(x0).val == X.val)
    return d_meta
def build_lstm(seq_width, state_size, output_size, l2_penalty=0.0):
    parser = VectorParser()
    parser.add_shape('change', (seq_width + state_size + 1, state_size))
    parser.add_shape('gate',   (seq_width + state_size + 1, state_size))
    parser.add_shape('keep',   (seq_width + state_size + 1, state_size))
    parser.add_shape('output', (state_size, output_size))

    def update_lstm(input, state, change_weights, gate_weights, keep_weights):
        """One iteration of an LSTM layer without an output."""
        change = activations(input, state, change_weights)
        gate = activations(input, state, gate_weights)
        keep = activations(input, state, keep_weights)
        return state * keep + gate * change

    def compute_hiddens(weights_vect, seqs):
        """Goes from right to left, updating the state."""
        weights = parser.new_vect(weights_vect)
        num_seqs = seqs.shape[1]
        state = np.zeros((num_seqs, state_size))
        for cur_input in seqs:  # Iterate over time steps.
            state = update_lstm(cur_input, state, weights['change'],
                                weights['gate'], weights['keep'])
        return state

    def predictions(weights_vect, seqs):
        weights = parser.new_vect(weights_vect)
        return np.dot(compute_hiddens(weights_vect, seqs), weights['output'])

    def loss(weights, seqs, targets):
        log_lik = -np.sum((predictions(weights, seqs) - targets)**2)
        log_prior = -l2_penalty * np.dot(weights, weights)
        return (-log_prior - log_lik) / targets.shape[0]

    return loss, grad(loss), predictions, compute_hiddens, parser
def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True,
              report_train_loss=False):
    RS = RandomState((seed, i_hyper, "hyperloss"))
    alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
    N_train = alphabet['X'].shape[0] - N_valid_dpts
    train_data = dictslice(alphabet, slice(None, N_train))
    if report_train_loss:
        valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
    else:
        valid_data = dictslice(alphabet, slice(N_train, None))

    def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
        RS = RandomState((seed, i_hyper, i_primal))
        idxs = RS.permutation(N_train)[:batch_size]
        minibatch = dictslice(train_data, idxs)
        loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
        if verbose and i_primal % 10 == 0:
            print "Iter {0}, loss {1}".format(i_primal, getval(loss))
        return loss

    W0 = RS.randn(N_weights) * initialization_scale
    W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters,
                  callback=None)
    return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)
def run():
    train_data, valid_data, test_data = load_data_subset(N_train, N_valid, N_test)
    kernel = make_sq_exp_kernel(L0)

    def loss_fun(transform, train_data, valid_data):
        train_data = augment_data(train_data, transform)
        return weighted_neighbors_loss(train_data, valid_data, kernel)

    loss_grad = batchwise_function(grad(loss_fun))
    loss_fun = batchwise_function(loss_fun)
    batch_idxs = BatchList(N_valid, batch_size)
    A = np.eye(N_pix)
    valid_losses = [loss_fun(A, train_data, valid_data)]
    test_losses = [loss_fun(A, train_data, test_data)]
    A += A_init_scale * npr.randn(N_pix, N_pix)
    for meta_iter in range(N_meta_iters):
        print "Iter {0} valid {1} test {2}".format(
            meta_iter, valid_losses[-1], test_losses[-1])
        for idxs in batch_idxs:
            valid_batch = [x[idxs] for x in valid_data]
            d_A = loss_grad(A, train_data, valid_batch)
            A -= meta_alpha * (d_A + meta_L1 * np.sign(A))
        valid_losses.append(loss_fun(A, train_data, valid_data))
        test_losses.append(loss_fun(A, train_data, test_data))
    return A, valid_losses, test_losses
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
            learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
            learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
            learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams["log_alphas"])
    betas = logit(cur_hyperparams["invlogit_betas"])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
    W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
    # callback(W_opt, N_iters)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = npr.RandomState(
            npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
        seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:  # N_batches = 10 times.
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    # TODO: why doesn't the following line work with N_iter=1?
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))  # Don't update scale.
    W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
    # TODO: Put on proper scale; no SGD on log/invlogit scale.
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])  # TODO: check this.
    L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
    W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                 exact_metagrad, callback)
    # W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
    # callback(W_opt, N_iters)
    return W_opt, learning_curve_dict
def sgd3_naive(optimizing_loss, x, v, alphas, betas, meta, fwd_callback=None,
               reverse_callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)."""
    x = x.astype(np.float16)
    v = v.astype(np.float16)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    iters = zip(range(len(alphas)), alphas, betas)

    # Forward pass
    for i, alpha, beta in iters:
        if fwd_callback:
            fwd_callback(x, i)
        g = L_grad(x, meta, i)
        v = v * beta
        v = v - ((1.0 - beta) * g)
        x = x + alpha * v
        x = x.astype(np.float16)
        v = v.astype(np.float16)

    # Reverse pass
    for i, alpha, beta in iters[::-1]:
        x = x - alpha * v
        g = L_grad(x, meta, i)
        v = v + (1.0 - beta) * g
        v = v / beta
        if reverse_callback:
            reverse_callback(x, i)
        x = x.astype(np.float16)
        v = v.astype(np.float16)
def hyperloss(hyperparam_vect, i):
    learning_curve = []

    def callback(x, i):
        if i % len(batch_idxs) == 0:
            learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

    npr.seed(i)
    N_weights = parser.vect.size
    V0 = np.zeros(N_weights)
    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    layer_param_scale = [
        np.full(parser[name].size,
                np.exp(cur_hyperparams['log_param_scale'][i]))
        for i, name in enumerate(parser.names)]
    W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    log_L2_reg = cur_hyperparams['log_L2_reg']
    W_opt = sgd5(grad(indexed_loss_fun), kylist(W0, alphas, betas, log_L2_reg),
                 callback)
    all_x.append(getval(W_opt))
    all_learning_curves.append(learning_curve)
    return valid_loss_fun(W_opt)
def hypergrad(outgrad):
    d_x = outgrad
    d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    for i in range(N_iters)[::-1]:
        X.sub(alpha * V.val)               # Reverse position update.
        g = L_grad(X.val, meta, i)         # Evaluate gradient.
        V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update.
        d_v += d_x * alpha
        d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
        d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
        d_v *= beta
    assert np.all(ExactRep(x0).val == X.val)
    return d_meta
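# Added context (hedged sketch, not the original training loop): hypergrad() above
# shows only the reverse pass. The forward pass it exactly inverts is the
# SGD-with-momentum loop used elsewhere in this file (compare sgd3), written here
# with plain arrays instead of ExactRep; the argument names are illustrative.
def forward_sgd_sketch(x, v, alphas, betas, meta, L_grad):
    for i, (alpha, beta) in enumerate(zip(alphas, betas)):
        g = L_grad(x, meta, i)            # Minibatch gradient.
        v = beta * v - (1.0 - beta) * g   # Momentum update; reversed by V.add((1-beta)*g).div(beta).
        x = x + alpha * v                 # Position update; reversed by X.sub(alpha * V.val).
    return x, v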
def train_reg(transform_0, constraint, N_meta_iter, i_top):
    def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return loss_fun(w_vect_final, **cur_valid_data)

    hypergrad = grad(hyperloss)
    cur_transform = transform_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
            all_tests_loss.append(tests_loss)
            all_transforms.append(cur_transform.copy())
            print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
        constrained_grad = constrain_transform(raw_grad, constraint)
        cur_transform -= constrained_grad * meta_alpha
    return cur_transform
def adam(grad, x, callback=None, num_iters=100, step_size=0.1, b1=0.1,
         b2=0.01, eps=10**-4, lam=10**-4):
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in xrange(num_iters):
        b1t = 1 - (1 - b1) * (lam**i)
        g = grad(x, i)
        if callback:
            callback(x, i, g)
        m = b1t * g + (1 - b1t) * m         # First moment estimate.
        v = b2 * (g**2) + (1 - b2) * v      # Second moment estimate.
        mhat = m / (1 - (1 - b1)**(i + 1))  # Bias correction.
        vhat = v / (1 - (1 - b2)**(i + 1))
        x -= step_size * mhat / (np.sqrt(vhat) + eps)
    return x
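# Added usage sketch (illustrative only; the objective and constants are made up):
# adam() above expects a gradient function with signature grad(x, i). Wrapping an
# objective of the form f(x, i) with autograd's grad gives exactly that.
def adam_usage_sketch():
    target = np.array([1.0, -2.0, 0.5])
    objective = lambda x, i: np.sum((x - target)**2)  # i is the iteration index (unused here).
    x0 = np.zeros(3)
    x_opt = adam(grad(objective), x0, num_iters=200, step_size=0.1)
    return x_opt  # Should end up close to target.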
def primal_optimizer(hyperparams_vect, meta_epoch):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, meta_epoch, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    cur_hyperparams = hyperparams.new_vect(hyperparams_vect)
    rs = RandomState((seed, meta_epoch))
    # Randomly initialize weights
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    # Init regularization term
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    # Set step sizes
    alphas = np.exp(cur_hyperparams['log_alphas'])
    # Momentum terms
    betas = logit(cur_hyperparams['invlogit_betas'])
    # Train model
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                       parser)
    cur_primal_results['weights'] = getval(W_opt).copy()
    return W_opt
def test_concatenate_axis_1():
    A = npr.randn(5, 6, 4)
    B = npr.randn(5, 6, 4)

    def fun(x):
        return to_scalar(np.concatenate((B, x, B), axis=1))

    d_fun = lambda x: to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
def run():
    (train_images, train_labels), \
        (tests_images, tests_labels) = load_data_subset(N_train, N_tests)
    parser, pred_fun, nllfun, frac_err = make_nn_funs(layer_sizes, L2_per_dpt)
    N_param = len(parser.vect)
    print "Running experiment..."
    results = defaultdict(list)
    for i in xrange(N_samples):
        x_init_scale = np.full(N_param, init_scale)

        def indexed_loss_fun(w, i_iter):
            rs = RandomState((seed, i, i_iter))
            idxs = rs.randint(N_train, size=batch_size)
            return nllfun(w, train_images[idxs], train_labels[idxs]) * N_train

        gradfun = grad(indexed_loss_fun)

        def callback(x, t, v, entropy):
            results[("entropy", i)].append(entropy / N_train)
            results[("v_norm", i)].append(norm(v) / np.sqrt(N_param))
            results[("minibatch_likelihood", i)].append(-indexed_loss_fun(x, t))
            if t % thin != 0 and t != N_iter and t != 0:
                return
            results[('iterations', i)].append(t)
            results[("train_likelihood", i)].append(-nllfun(x, train_images, train_labels))
            results[("tests_likelihood", i)].append(-nllfun(x, tests_images, tests_labels))
            results[("tests_error", i)].append(frac_err(x, tests_images, tests_labels))
            print "Iteration {0:5} Train likelihood {1:2.4f} Test likelihood {2:2.4f}" \
                  " Test Err {3:2.4f}".format(t,
                                              results[("train_likelihood", i)][-1],
                                              results[("tests_likelihood", i)][-1],
                                              results[("tests_error", i)][-1])

        rs = RandomState((seed, i))
        entropic_descent2(gradfun, callback=callback, x_scale=x_init_scale,
                          epsilon=epsilon, gamma=gamma, alpha=alpha,
                          annealing_schedule=annealing_schedule, rs=rs)
    return results
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                       parser, callback=callback)
    return W_opt, learning_curve_dict
def train_reg(reg_0, constraint, N_meta_iter, i_top):
    def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    hypergrad = grad(hyperloss)
    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
            all_tests_loss.append(tests_loss)
            all_regs.append(cur_reg.copy())
            print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            print "Cur_reg", cur_reg
            # print "Cur_reg", np.mean(cur_reg)
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        # print("calculate hypergradients")
        raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
        constrained_grad = constrain_reg(raw_grad, constraint)
        # print "constrained_grad", constrained_grad
        print "\n"
        # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
        cur_reg -= constrained_grad * meta_alpha
        # cur_reg -= np.sign(constrained_grad) * meta_alpha
    return cur_reg
def train_reg(reg_0, constraint, N_meta_iter, i_top):
    def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return loss_fun(w_vect_final, **cur_valid_data)

    hypergrad = grad(hyperloss)

    def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return frac_err(w_vect_final, **cur_valid_data)

    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
            all_tests_rates.append(test_rate)
            all_transforms.append(cur_reg.copy())
            all_avg_regs.append(np.mean(cur_reg))
            print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
            print "Cur_transform", np.mean(cur_reg)
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
        constrained_grad = constrain_reg(raw_grad, constraint)
        cur_reg -= np.sign(constrained_grad) * meta_alpha
    return cur_reg
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
            learning_curve_dict['iteration'].append(i_iter + 1)
            print "iteration", i_iter

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                       parser, callback=callback)
    return W_opt, learning_curve_dict
def test_adam():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    (loss_fun, true_argmin) = make_optimization_problem(N_weights)
    x_min = adam(grad(loss_fun), W0)
    assert np.allclose(x_min, true_argmin, rtol=1e-3, atol=1e-4), \
        "Diffs are: {0}".format(x_min - true_argmin)
def hyperloss(transform_vect, i_hyper, record_results=False):
    def primal_stochastic_loss(z_vect, transform_vect, i_primal):
        RS = RandomState((seed, i_hyper, i_primal))
        loss = 0.0
        for _ in range(N_scripts_per_iter):
            i_script = RS.randint(N_scripts)
            N_train = train_data[i_script]["X"].shape[0]
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data[i_script], idxs)
            loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
        loss /= N_scripts_per_iter
        reg = regularization(z_vect)
        # if i_primal % 10 == 0:
        #     print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
        #     print "Full losses: train: {0}, valid: {1}".format(
        #         total_loss(train_data, getval(z_vect)),
        #         total_loss(valid_data, getval(z_vect)))
        return loss + reg

    def total_loss(data, z_vect):
        return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                        for i_script in range(N_scripts)])

    z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
    z_vect_final = sgd(grad(primal_stochastic_loss), transform_vect, z_vect_0,
                       alpha, beta, N_iters, callback=None)
    valid_loss = total_loss(valid_data, z_vect_final)
    if record_results:
        results["valid_loss"].append(valid_loss)
        results["train_loss"].append(total_loss(train_data, z_vect_final))
        # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
    return valid_loss
def test_sign():
    fun = lambda x: 3.0 * np.sign(x)
    d_fun = grad(fun)
    check_grads(fun, 1.1)
    check_grads(fun, -1.1)
    check_grads(d_fun, 1.1)
    check_grads(d_fun, -1.1)
def test_abs():
    fun = lambda x: 3.0 * np.abs(x)
    d_fun = grad(fun)
    check_grads(fun, 1.1)
    check_grads(fun, -1.1)
    check_grads(d_fun, 1.1)
    check_grads(d_fun, -1.1)
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
        seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve = []

    def callback(x, i_iter):
        if i_iter % N_batches == 0:
            learning_curve.append(loss_fun(x, **train_data))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
    W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
    callback(W_opt, N_iters)
    return W_opt, learning_curve
def test_sgd_parser():
    N_weights = 6
    W0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 4
    batch_idxs = BatchList(N_data, batch_size)
    parser = VectorParser()
    parser.add_shape('first',  [2])
    parser.add_shape('second', [1])
    parser.add_shape('third',  [3])
    N_weight_types = 3
    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, i=None):
        idxs = batch_idxs.all_idxs if i is None else batch_idxs[i % len(batch_idxs)]
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def full_loss(params):
        (W0, alphas, betas, meta) = params
        result = sgd_parsed(grad(loss_fun), kylist(W0, alphas, betas, meta), parser)
        return loss_fun(result, meta)

    d_num = nd(full_loss, (W0, alphas, betas, meta))
    d_an_fun = grad(full_loss)
    d_an = d_an_fun([W0, alphas, betas, meta])
    for i, (an, num) in enumerate(zip(d_an, d_num[0])):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
def train_reg(reg_0, constraint, N_meta_iter, i_top):
    def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
        # fraction_error = frac_err(w_vect_final, **cur_valid_data)
        return loss_fun(w_vect_final, **cur_valid_data)

    hypergrad = grad(hyperloss)  # reg is the list of hyperparameters.
    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
            all_tests_loss.append(tests_loss)
            all_regs.append(cur_reg.copy())
            print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            # print "Cur_reg", np.mean(cur_reg)
            print "Cur_reg", cur_reg
        for client_i in range(0, clientNum):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data_subclass.__getitem__(client_i),
                                         RS, [N_train - N_valid, N_valid])
            # print("calculate hypergradients")
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(w_parser, raw_grad, constraint)
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            # cur_reg -= constrained_grad * meta_alpha / clientNum
            cur_reg -= np.sign(constrained_grad) * meta_alpha / clientNum
        print "\n"
        # print "constrained_grad", constrained_grad
    return cur_reg
def hyperloss(transform_vect, i_hyper, record_results=False):
    def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script):
        RS = RandomState((seed, i_hyper, i_primal, i_script))
        N_train = train_data[i_script]['X'].shape[0]
        idxs = RS.permutation(N_train)[:batch_size]
        minibatch = dictslice(train_data[i_script], idxs)
        loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch)
        if i_primal % N_thin == 0 and i_script == 0:
            print "Iter {0}, full losses: train: {1}, valid: {2}".format(
                i_primal,
                total_loss(train_data, getval(z_vect)),
                total_loss(valid_data, getval(z_vect)))
        if i_script == 0:  # Only add regularization once.
            loss += regularization(z_vect)
        return loss

    def total_loss(data, z_vect):
        return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                        for i_script in range(N_scripts)])

    z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
    z_vect_final = sgd(grad(sub_primal_stochastic_loss), transform_vect, z_vect_0,
                       alpha, beta, N_iters, N_scripts_per_iter, callback=None)
    valid_loss = total_loss(valid_data, z_vect_final)
    if record_results:
        results['valid_loss'].append(valid_loss)
        results['train_loss'].append(total_loss(train_data, z_vect_final))
        # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
    return valid_loss
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, meta_vect, i_iter):
        (train_data, train_labels, L2_vect) = meta
        return loss_fun(w, train_data, train_labels, L2_vect)
        # return loss_fun(w, train_data['X'], train_data['T'],
        #                 L2_vect + np.sum(fake_data.ravel()))

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            # learning_curve_dict['learning_curve'].append(
            #     loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    fake_data = cur_hyperparams['fake_data']
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(fixed_hyperparams['log_alphas'])
    betas = logit(fixed_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    meta = kylist(fake_data, fake_labels, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                       parser, callback=callback)
    cur_primal_results['weights'] = getval(W_opt).copy()
    cur_primal_results['learning_curve'] = getval(learning_curve_dict)
    return W_opt, learning_curve_dict
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    # fixed_hyperparams = VectorParser()
    # fixed_hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []

        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)
    meta_results = defaultdict(list)

    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None  # No need to pickle zeros.
    return meta_results, parser
def test_concatenate_axis_1_unnamed():
    """Tests whether you can specify the axis without saying "axis=1"."""
    A = npr.randn(5, 6, 4)
    B = npr.randn(5, 6, 4)

    def fun(x):
        return to_scalar(np.concatenate((B, x, B), 1))

    d_fun = lambda x: to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
def test_index_multiple_slices():
    A = npr.randn(7)

    def fun(x):
        y = x[2:6]
        z = y[1:3]
        return to_scalar(z)

    d_fun = lambda x: to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
def test_index_slice_fanout():
    A = npr.randn(5, 6, 4)

    def fun(x):
        y = x[::-1, 2:4, :]
        z = x[::-1, 3:5, :]
        return to_scalar(y + z)

    d_fun = lambda x: to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)