def __init__(self, unit, x, d, n_layers, n_in, n_h, n_y, reg=0.0001):
    """Wire up the symbolic cost, predictions and Adam updates of the tagger.

    :param unit: 'lstm' selects ``lstm.layers``; any other value selects
        ``gru.layers``
    :param x: 1D: batch_size, 2D: n_words, 3D: n_fin
    :param d: 1D: batch_size, 2D: n_words
    :param n_layers: forwarded to the layer builder
    :param n_in: the builder's input width is ``n_in * 7 + 1``
    :param n_h: forwarded to the layer builder
    :param n_y: forwarded to the layer builder
    :param reg: weight of the L2 penalty (halved when added to the cost)
    """
    self.layers = lstm.layers if unit == 'lstm' else gru.layers

    self.x = x
    self.d = d

    feature_width = n_in * 7 + 1
    batch = T.cast(self.d.shape[0], dtype='int32')

    params, o_layer, emit = self.layers(
        x=self.x,
        batch=batch,
        n_fin=feature_width,
        n_h=n_h,
        n_y=n_y,
        n_layers=n_layers,
    )

    # y_prob receives the labels with their two axes swapped.
    swapped_labels = self.d.dimshuffle(1, 0)
    self.p_y = y_prob(o_layer, emit, swapped_labels, batch)
    self.y_pred = vitabi(o_layer, emit, batch)

    # Cost: negated mean of p_y plus half the scaled L2 term.
    self.nll = -T.mean(self.p_y)
    self.L2_sqr = L2_sqr(params)
    self.cost = self.nll + reg * self.L2_sqr / 2.
    self.errors = T.neq(self.y_pred, self.d)

    self.g = T.grad(self.cost, params)
    self.updates = adam(params, self.g)
def update_nn(init_var_params, batch_data, batch_labels):
    """Run 50 Adam steps of black-box variational inference on one batch.

    :param init_var_params: starting variational parameters
    :param batch_data: batch inputs passed to ``logprob``
    :param batch_labels: batch labels passed to ``logprob``
    :return: the variational parameters returned by ``adam``
    """
    def log_posterior(weights, t):
        # ``t`` is accepted for the optimiser's call signature but unused.
        return logprob(weights, batch_data, batch_labels)

    # Build variational objective; only its gradient is needed here.
    _objective, gradient, _unpack_params = black_box_variational_inference(
        log_posterior, num_weights, num_samples=20)

    return adam(gradient, init_var_params, step_size=0.01, num_iters=50)
def train_lstm(inputs, outputs, state_size, batch_size=256, param_scale=0.001,
               num_epochs=5, step_size=0.001):
    """Train an LSTM with Adam on minibatches, printing metrics once per epoch.

    The data is split 80/20 into training and validation parts. Axis 1 of
    the arrays is the one sliced into minibatches; axis 2 gives the input
    and output sizes.

    :param inputs: input array, indexed as ``[:, batch_slice, :]``
    :param outputs: target array, indexed the same way
    :param state_size: forwarded to ``init_lstm_params``
    :param batch_size: number of axis-1 entries per minibatch
    :param param_scale: forwarded to ``init_lstm_params``
    :param num_epochs: number of passes over the training split
    :param step_size: Adam step size
    :return: the parameters returned by ``adam``
    """
    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)

    input_size = tr_inputs.shape[2]
    output_size = tr_outputs.shape[2]
    init_params = init_lstm_params(input_size, state_size, output_size,
                                   param_scale=param_scale,
                                   rs=npr.RandomState(0))

    # float() keeps the ceil meaningful even under Python 2 integer division,
    # so a trailing partial batch is not silently dropped.
    num_batches = int(np.ceil(tr_inputs.shape[1] / float(batch_size)))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -lstm_log_likelihood(
            params, tr_inputs[:, idx, :], tr_outputs[:, idx, :])

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        " Epoch | Train accuracy | Train log-like | Holdout accuracy | Holdout log-like ")

    def print_perf(params, iter, gradient):
        # Full-dataset metrics are only evaluated at epoch boundaries.
        if iter % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        # NOTE(review): these two values are negated log-likelihoods even
        # though the header calls them "log-like" - confirm intended sign.
        train_ll = -lstm_log_likelihood(params, tr_inputs, tr_outputs)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = -lstm_log_likelihood(params, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            iter // num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # One Adam iteration consumes one minibatch, so an epoch is num_batches
    # iterations. (Previously num_iters=num_epochs ran only num_epochs
    # minibatch steps in total.)
    optimized_params = adam(objective_grad, init_params, step_size=step_size,
                            num_iters=num_epochs * num_batches,
                            callback=print_perf)
    return optimized_params
def get_Aopt(inX, iny):
    """Fit the matrix ``A`` by Adam on a summed squared-error objective.

    The data is split with ``ascdata.split_train_test``, a constant column
    of ones is appended to the features, and ``ascdata.split_X_s`` separates
    the features from ``s``. The module-level ``d``, ``A_phi`` and ``b_phi``
    are passed to ``ascdata.generate_phi``.

    :param inX: feature array
    :param iny: target array
    :return: tuple ``(Aopt, X_train_less, y_train, s_train, X_test_less,
        y_test, s_test, LLHs, LLH_xs)`` where ``LLHs``/``LLH_xs`` are the
        objective values and iteration numbers recorded by the callback
    """
    X_train, y_train, X_test, y_test = ascdata.split_train_test(inX, iny)

    # Append a constant column of ones to both splits.
    X_train = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), 1)
    X_test = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), 1)

    X_train_less, s_train = ascdata.split_X_s(X_train)
    X_test_less, s_test = ascdata.split_X_s(X_test)

    s_train_phi = ascdata.generate_phi(s_train, d, A_phi, b_phi)
    # NOTE(review): s_test_phi is never used below; the call is kept in case
    # generate_phi has side effects - confirm and remove if it is pure.
    s_test_phi = ascdata.generate_phi(s_test, d, A_phi, b_phi)

    nfeatures = X_train.shape[1] - 1
    # Dimensions of phi(s)
    nfeatures_phi = d

    def logprob(inA, inX, iny, ins_phi):
        """Return the negated sum of squared prediction errors."""
        squared_error = 0
        for i in range(len(iny)):
            wi = np.dot(inA, inX[i])
            squared_error += (iny[i] - np.dot(wi, ins_phi[i])) ** 2
        return -squared_error

    def objective(inA, t):
        return -logprob(inA, X_train_less, y_train, s_train_phi)

    LLHs = []
    LLH_xs = []

    def callback(params, t, g):
        LLH = -objective(params, t)
        LLHs.append(LLH)
        LLH_xs.append(t)
        print("Iteration {} log likelihood {}".format(t, LLH))

    # Start from a tiny constant matrix.
    init_A = 0.00000000001 * np.ones((nfeatures_phi, nfeatures))

    print("Optimizing network parameters...")
    Aopt = adam(grad(objective), init_A,
                step_size=0.01, num_iters=1000, callback=callback)

    # Formatted call instead of the Python 2 print statement; the double
    # space reproduces what `print "Aopt = ", Aopt` emitted.
    print("Aopt =  {}".format(Aopt))
    return (Aopt, X_train_less, y_train, s_train,
            X_test_less, y_test, s_test, LLHs, LLH_xs)
def update_nn(init_var_params, batch_data, batch_labels, iteration):
    """Run 10 Adam steps of black-box variational inference on one batch.

    :param init_var_params: starting variational parameters
    :param batch_data: batch inputs passed to ``logprob``
    :param batch_labels: batch labels passed to ``logprob``
    :param iteration: accepted but not used by this function
    :return: ``(variational_params, objective)`` - the optimised parameters
        and the variational objective built for this batch
    """
    def log_posterior(weights, t):
        # ``t`` is accepted for the optimiser's call signature but unused.
        return logprob(weights, batch_data, batch_labels)

    # Build variational objective.
    objective, gradient, _unpack_params = black_box_variational_inference(
        log_posterior, num_weights, num_samples=20)

    fitted = adam(gradient, init_var_params, step_size=0.1, num_iters=10)
    return fitted, objective
def train_nn(
        inputs, outputs, num_hiddens,
        batch_size=256, param_scale=0.1,
        num_epochs=5, step_size=0.001, L2_reg=1.0):
    """Train a fully connected network with Adam and log per-epoch metrics.

    :param inputs: input array; axis 0 is sliced into minibatches
    :param outputs: target array, sliced the same way
    :param num_hiddens: list of hidden layer sizes (input and output sizes
        are taken from ``shape[1]`` of the data)
    :param batch_size: rows per minibatch
    :param param_scale: forwarded to ``init_random_params``
    :param num_epochs: number of passes over the training split
    :param step_size: Adam step size
    :param L2_reg: forwarded to ``log_posterior``
    :return: the parameters returned by ``adam``
    """
    # 80/20 split into training and validation parts.
    input_parts, output_parts = ctd.split_data(
        inputs, out_data=outputs, frac=0.80)
    tr_inputs, va_inputs = input_parts
    tr_outputs, va_outputs = output_parts

    layer_sizes = [tr_inputs.shape[1]] + num_hiddens + [tr_outputs.shape[1]]
    init_params = init_random_params(param_scale, layer_sizes)

    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    def batch_indices(step):
        start = (step % num_batches) * batch_size
        return slice(start, start + batch_size)

    def objective(params, step):
        # Negative log-posterior of the minibatch selected by ``step``.
        rows = batch_indices(step)
        return -log_posterior(params, tr_inputs[rows], tr_outputs[rows], L2_reg)

    # Gradient of the objective via autograd.
    objective_grad = grad(objective)

    print(
        " Epoch | Train accuracy | Train log-like | Holdout accuracy | Holdout log-like ")

    def print_perf(params, step, gradient):
        # Report only on the first iteration of each epoch.
        if step % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        train_ll = log_posterior(params, tr_inputs, tr_outputs, L2_reg)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = log_posterior(params, va_inputs, va_outputs, L2_reg)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            step // num_batches, train_acc, train_ll, valid_acc, valid_ll))

    total_steps = num_epochs * num_batches
    return adam(
        objective_grad, init_params,
        step_size=step_size, num_iters=total_steps, callback=print_perf)
def _create_optimizer(self):
    """Instantiate the optimizer selected by ``self.optimizer_type``.

    Sets ``self.optimizer``. For every type except L-BFGS it also sets
    ``self.train_op`` to ``self.optimizer.minimize(self.loss)``.

    :raises ValueError: if ``self.optimizer_type`` is not one of the
        supported ``optimizers`` constants
    """
    if self.optimizer_type == optimizers.L_BFGS:
        # Built directly from the loss and iteration budget; this branch
        # creates no train_op.
        self.optimizer = l_bfgs(self.loss, self.iterations)
    elif self.optimizer_type == optimizers.ADAM:
        self.optimizer = adam(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
    elif self.optimizer_type == optimizers.ADAGRAD:
        self.optimizer = adagrad(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
    elif self.optimizer_type == optimizers.GRADIENT_DESCENT:
        self.optimizer = gradient_descent(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
    elif self.optimizer_type == optimizers.RMSPROP:
        self.optimizer = rmsprop(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
    elif self.optimizer_type == optimizers.ADADELTA:
        self.optimizer = adadelta(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
    else:
        # Raising a bare string is itself a TypeError; raise a real
        # exception that names the offending value.
        raise ValueError(
            "Unsupported optimizer: {!r}".format(self.optimizer_type))
def find_gamma(initial_gamma, loss, n_steps: int):
    """Return gamma after ``n_steps`` Adam steps minimizing ``loss``.

    :param initial_gamma: Starting point for gradient optimization
    :param loss: Loss function to optimize (object containing a taylor
        method). The taylor method has to return two values:
        a) function at point
        b) gradient at point
    :param n_steps: number of steps for optimizer
    :return: first element of the last point produced by the optimizer
        (the starting value itself when ``n_steps`` is 0)
    """
    # GammaLoss expects an array, so wrap the scalar starting point.
    start = np.array([initial_gamma])

    stepper = adam(f=loss,
                   starting_point=start,
                   learning_rate=.1,
                   beta1=0.9,
                   beta2=0.999,
                   epsilon=1e-8)

    current = start
    for _step in range(n_steps):
        # Each step yields a 3-tuple; only its first item (the point) is kept.
        current, _second, _third = next(stepper)
    return current[0]
training_text = one_hot_to_string(train_inputs[:,t,:]) predicted_text = one_hot_to_string(logprobs[:,t,:]) print(training_text.replace('\n', ' ') + "|" + predicted_text.replace('\n', ' ')) def training_loss(params, iter): return -rnn_log_likelihood(params, train_inputs, train_inputs) def callback(weights, iter, gradient): if iter % 10 == 0: print("Iteration", iter, "Train loss:", training_loss(weights, 0)) print_training_prediction(weights) # Build gradient of loss function using autograd. training_loss_grad = grad(training_loss) print("Training RNN...") trained_params = adam(training_loss_grad, init_params, step_size=0.1, num_iters=1000, callback=callback) print() print("Generating text from RNN...") num_letters = 30 for t in range(20): text = "" for i in range(num_letters): seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :] logprobs = rnn_predict(trained_params, seqs)[-1].ravel() text += chr(npr.choice(len(logprobs), p=np.exp(logprobs))) print(text)
def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
    """
    Build the symbolic network, its cost, Adam updates and result counters.

    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
    :param y     : 1D: batch
    :param init_emb: initial embedding values; when None, weights of shape
        (n_vocab, dim_w) are sampled instead
    :param n_vocab: number of embedding rows (used only when sampling)
    :param dim_w: embedding width
    :param dim_d: size of the distance weight vector W_d
    :param dim_h: hidden size (the first hidden layer is dim_h * 3 wide)
    :param L2_reg: weight of the L2 penalty (halved when added to the cost)
    """
    self.input = [x_span, x_word, x_ctx, x_dist, y]
    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.y = y

    # 2 averaged span halves + 4 words + 20 context words, plus 1 distance.
    dim_x = dim_w * (2 + 4 + 20) + 1
    batch = y.shape[0]

    # ---- Params ----
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab, dim_w))
    else:
        self.emb = theano.shared(init_emb)

    self.W_d = theano.shared(sample_weights(dim_d))
    self.W_i = theano.shared(sample_weights(dim_x, dim_h*3))
    self.W_h = theano.shared(sample_weights(dim_h*3, dim_h))
    self.W_o = theano.shared(sample_weights(dim_h))
    # NOTE(review): self.emb is not listed here, so it receives neither
    # gradient updates nor the L2 penalty - confirm this is intended.
    self.params = [self.W_d, self.W_i, self.W_h, self.W_o]

    # ---- Input Layer ----
    x_s = self.emb[x_span]    # 1D: batch, 2D: limit * 2, 3D: dim_w
    x_w = self.emb[x_word]    # 1D: batch, 2D: 4, 3D: dim_w
    x_c = self.emb[x_ctx]     # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
    x_d = self.W_d[x_dist]    # 1D: batch

    # Floor division keeps the slice bound an integer; plain `/` yields a
    # float under true division and cannot be used as an index.
    half = x_s.shape[1] // 2
    x_s_avg = T.concatenate([T.mean(x_s[:, :half], 1), T.mean(x_s[:, half:], 1)], 1)
    x = T.concatenate([x_s_avg, x_w.reshape((batch, -1)), x_c.reshape((batch, -1)), x_d.reshape((batch, 1))], 1)

    # ---- Intermediate Layers ----
    h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h * 3
    h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

    # ---- Output Layer ----
    p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

    # ---- Predicts ----
    self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], dtype=theano.config.floatX))
    self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
    self.y_hat_index = T.argmax(p_y)
    self.p_y_hat = p_y[self.y_hat_index]

    # ---- Cost Function ----
    # Binary cross-entropy summed over the batch.
    self.nll = - T.sum(y * T.log(p_y) + (1. - y) * T.log((1. - p_y)))  # TODO: ranking criterion
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    # ---- Update ----
    self.grad = T.grad(self.cost, self.params)
    self.updates = adam(self.params, self.grad)

    # ---- Check Results ----
    self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
    self.total_p = T.sum(self.y_hat, 0)
    self.total_r = T.sum(y, keepdims=True)
    self.correct = T.sum(self.result, 0)
    self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab,
             dim_w, dim_d, dim_h, L2_reg):
    """
    Build the symbolic network, its cost, Adam updates and result counters.

    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
    :param y     : 1D: batch
    :param init_emb: initial embedding values; when None, weights of shape
        (n_vocab, dim_w) are sampled instead
    :param n_vocab: number of embedding rows (used only when sampling)
    :param dim_w: embedding width
    :param dim_d: size of the distance weight vector W_d
    :param dim_h: hidden size (the first hidden layer is dim_h * 3 wide)
    :param L2_reg: weight of the L2 penalty (halved when added to the cost)
    """
    self.input = [x_span, x_word, x_ctx, x_dist, y]
    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.y = y

    # 2 averaged span halves + 4 words + 20 context words, plus 1 distance.
    dim_x = dim_w * (2 + 4 + 20) + 1
    batch = y.shape[0]

    # ---- Params ----
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab, dim_w))
    else:
        self.emb = theano.shared(init_emb)

    self.W_d = theano.shared(sample_weights(dim_d))
    self.W_i = theano.shared(sample_weights(dim_x, dim_h * 3))
    self.W_h = theano.shared(sample_weights(dim_h * 3, dim_h))
    self.W_o = theano.shared(sample_weights(dim_h))
    # NOTE(review): self.emb is not listed here, so it receives neither
    # gradient updates nor the L2 penalty - confirm this is intended.
    self.params = [self.W_d, self.W_i, self.W_h, self.W_o]

    # ---- Input Layer ----
    x_s = self.emb[x_span]    # 1D: batch, 2D: limit * 2, 3D: dim_w
    x_w = self.emb[x_word]    # 1D: batch, 2D: 4, 3D: dim_w
    x_c = self.emb[x_ctx]     # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
    x_d = self.W_d[x_dist]    # 1D: batch

    # Floor division keeps the slice bound an integer; plain `/` yields a
    # float under true division and cannot be used as an index.
    half = x_s.shape[1] // 2
    x_s_avg = T.concatenate([
        T.mean(x_s[:, :half], 1),
        T.mean(x_s[:, half:], 1)
    ], 1)
    x = T.concatenate([
        x_s_avg,
        x_w.reshape((batch, -1)),
        x_c.reshape((batch, -1)),
        x_d.reshape((batch, 1))
    ], 1)

    # ---- Intermediate Layers ----
    h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h * 3
    h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

    # ---- Output Layer ----
    p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

    # ---- Predicts ----
    self.thresholds = theano.shared(
        np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                   dtype=theano.config.floatX))
    self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
    self.y_hat_index = T.argmax(p_y)
    self.p_y_hat = p_y[self.y_hat_index]

    # ---- Cost Function ----
    # Binary cross-entropy summed over the batch.
    self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(
        (1. - p_y)))  # TODO: ranking criterion
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    # ---- Update ----
    self.grad = T.grad(self.cost, self.params)
    self.updates = adam(self.params, self.grad)

    # ---- Check Results ----
    self.result = T.eq(self.y_hat, y.reshape(
        (y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
    self.total_p = T.sum(self.y_hat, 0)
    self.total_r = T.sum(y, keepdims=True)
    self.correct = T.sum(self.result, 0)
    self.correct_t, self.correct_f = correct_tf(self.result,
                                                y.reshape((y.shape[0], 1)))
x = tf.placeholder(tf.float32, [None, 784]) t = tf.placeholder(tf.float32, [None, 10]) layers = [ Dense(784, 256, tf.nn.relu), Dense(256, 256, tf.nn.relu), Dense(256, 10, tf.nn.softmax) ] y, params = f_props(layers, x) cost = -tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1)) # Choose an optimizer from sgd, sgd_clip, momentum, nesterov_momentum, adagrad, adadelta, rmsprop, adam, adamax, smorms3 updates = adam(cost, params) train = tf.group(*updates) test = tf.argmax(y, axis=1) n_epochs = 10 batch_size = 100 n_batches = train_X.shape[0] // batch_size with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for epoch in range(n_epochs): train_X, train_y = shuffle(train_X, train_y, random_state=random_state) for i in range(n_batches): start = i * batch_size end = start + batch_size
num_epochs = 5
step_size = 0.001

print("Loading training data...")
N, train_images, train_labels, test_images, test_labels = load_mnist()

init_params = init_random_params(param_scale)

num_batches = int(np.ceil(len(train_images) / batch_size))


def batch_indices(iter):
    """Return the slice of training rows used at iteration ``iter``."""
    start = (iter % num_batches) * batch_size
    return slice(start, start + batch_size)


def objective(params, iter):
    """Negative log-posterior of the minibatch for iteration ``iter``."""
    rows = batch_indices(iter)
    return -log_posterior(params, train_images[rows], train_labels[rows], L2_reg)


# Gradient of the training objective, obtained with autograd.
objective_grad = grad(objective)

print(" Epoch | Train accuracy | Test accuracy ")


def print_perf(params, iter, gradient):
    """Print train/test accuracy on the first iteration of each epoch."""
    if iter % num_batches != 0:
        return
    train_acc = accuracy(params, train_images, train_labels)
    test_acc = accuracy(params, test_images, test_labels)
    print("{:15}|{:20}|{:20}".format(iter//num_batches, train_acc, test_acc))


# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(
    objective_grad,
    init_params,
    step_size=step_size,
    num_iters=num_epochs * num_batches,
    callback=print_perf,
)
print(training_text.replace('\n', ' ') + "|" + predicted_text.replace('\n', ' ')) def training_loss(params, iter): return -rnn_log_likelihood(params, train_inputs, train_inputs) def callback(weights, iter, gradient): if iter % 10 == 0: print("Iteration", iter, "Train loss:", training_loss(weights, 0)) print_training_prediction(weights) # Build gradient of loss function using autograd. training_loss_grad = grad(training_loss) print("loss", training_loss(init_params,0)) print("grad", training_loss_grad(init_params,0)) print("Training RNN...") trained_params = adam(training_loss_grad, init_params, step_size=0.1, num_iters=1000, callback=callback) print() print("Generating text from RNN...") num_letters = 30 for t in range(20): text = "" for i in range(num_letters): seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :] logprobs = rnn_predict(trained_params, seqs)[-1].ravel() text += chr(npr.choice(len(logprobs), p=np.exp(logprobs))) print(text)
def train_mlp(inputs, outputs, layer_sizes,  # don't include inputs and outputs
              batch_size=256, init_weights=None, param_scale=0.1,
              num_epochs=5, step_size=0.001, l1_lambda=0, l2_lambda=0,
              nonlinearity=util.tanh):
    """Train an MLP with Adam, printing metrics after every iteration.

    :param inputs: input array; axis 0 is sliced into minibatches
    :param outputs: target array, sliced the same way
    :param layer_sizes: hidden layer sizes only; the input and output sizes
        are taken from the last axis of the data
    :param batch_size: rows per minibatch
    :param init_weights: starting weight vector; sampled as
        ``param_scale * randn(num_weights)`` when None
    :param param_scale: scale of the sampled initial weights
    :param num_epochs: number of passes over the training split
    :param step_size: Adam step size
    :param l1_lambda: weight of the L1 penalty
    :param l2_lambda: weight of the L2 penalty
    :param nonlinearity: forwarded to ``build``
    :return: ``(predictions, logprob, trained_weights)``
    """
    # 80/20 split into training and validation parts.
    input_parts, output_parts = util.split_data(
        inputs, out_data=outputs, frac=0.80)
    tr_inputs, va_inputs = input_parts
    tr_outputs, va_outputs = output_parts

    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    # Full architecture: data-derived input/output sizes around the hidden ones.
    layer_sizes = [tr_inputs.shape[-1]] + layer_sizes + [tr_outputs.shape[-1]]
    predictions, logprob, num_weights = build(layer_sizes=layer_sizes,
                                              nonlinearity=nonlinearity)

    def batch_indices(step):
        start = (step % num_batches) * batch_size
        return slice(start, start + batch_size)

    def objective(weights, step):
        # Minibatch negative log-probability plus the L2 and L1 penalties.
        rows = batch_indices(step)
        data_term = -logprob(weights, tr_inputs[rows], tr_outputs[rows])
        l2_term = l2_lambda * np.sum(np.power(weights, 2))
        l1_term = l1_lambda * np.sum(np.abs(weights))
        return data_term + l2_term + l1_term

    # Gradient of the objective via autograd.
    objective_grad = grad(objective)

    print(
        " Epoch | Train cosine | Train log-like | Holdout cosine | Holdout log-like ")

    def print_perf(weights, step, gradient):
        tr_preds = predictions(weights, tr_inputs)
        va_preds = predictions(weights, va_inputs)

        train_acc = cms(tr_preds, tr_outputs)
        valid_acc = cms(va_preds, va_outputs)

        train_ll = -logprob(weights, tr_inputs, tr_outputs)
        valid_ll = -logprob(weights, va_inputs, va_outputs)

        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            step // num_batches, train_acc, train_ll, valid_acc, valid_ll))

    if init_weights is None:
        init_weights = param_scale * np.random.randn(num_weights)

    trained_weights = adam(objective_grad, init_weights,
                           step_size=step_size,
                           num_iters=num_epochs * num_batches,
                           callback=print_perf)

    return predictions, logprob, trained_weights
# Train batch by batch: 10 Adam iterations on the variational parameters
# per batch.
batch_idxs = make_batches(train_images.shape[0], batch_size)
num_batches = len(batch_idxs)
cur_dir = np.zeros(num_weights * 2)

for epoch in range(num_epochs):
    batch_counter = 0
    for idxs in batch_idxs:

        def log_posterior(weights):
            return logprob(weights, train_images[idxs], train_labels[idxs])

        objective, gradient, unpack_params = black_box_variational_inference(
            log_posterior, num_weights, num_samples)

        # Disabled manual momentum update, kept for reference:
        #   grad_w = gradient(variational_params)
        #   cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_w
        #   variational_params -= learning_rate * cur_dir

        variational_params = adam(
            gradient,
            variational_params,
            num_batches=num_batches,
            batch_id=batch_counter,
            step_size=0.01,
            num_iters=10,
        )

        weights = extract_weights(variational_params, num_weights)
        print_perf(epoch, weights)
        batch_counter += 1
def train_mlp(inputs, outputs, init_weights=None, num_epochs=100,
              step_size=0.001, batch_size=128, param_scale=0.01,
              l1_lambda=0, l2_lambda=0, nonlinearity=identity):
    """Train the model from ``build`` with Adam, printing metrics each iteration.

    :param inputs: input array; axis 0 is sliced into minibatches
    :param outputs: target array, sliced the same way
    :param init_weights: starting weight vector; sampled as
        ``randn(num_weights) * param_scale`` when None
    :param num_epochs: number of passes over the training split
    :param step_size: Adam step size
    :param batch_size: rows per minibatch
    :param param_scale: scale of the sampled initial weights
    :param l1_lambda: weight of the L1 penalty
    :param l2_lambda: weight of the L2 penalty
    :param nonlinearity: forwarded to ``build``
    :return: ``(pred_fun, loglike_fun, trained_weights)``
    """
    # 80/20 split into training and validation parts.
    input_parts, output_parts = util.split_data(
        inputs, out_data=outputs, frac=0.80)
    tr_inputs, va_inputs = input_parts
    tr_outputs, va_outputs = output_parts

    num_batches = int(np.ceil(tr_inputs.shape[0] / float(batch_size)))

    pred_fun, loglike_fun, num_weights = build(
        tr_inputs.shape[-1], tr_outputs.shape[-1], nonlinearity=nonlinearity)

    if init_weights is None:
        init_weights = np.random.randn(num_weights) * param_scale

    def batch_indices(step):
        start = (step % num_batches) * batch_size
        return slice(start, start + batch_size)

    def loss(weights, x, y):
        # Negative log-likelihood plus the L1 and L2 penalties.
        data_term = -loglike_fun(weights, x, y)
        l1_term = l1_lambda * np.sum(np.abs(weights))
        l2_term = l2_lambda * np.sum(np.power(weights, 2))
        return data_term + l1_term + l2_term

    def batch_loss(weights, step):
        rows = batch_indices(step)
        return loss(weights, tr_inputs[rows, :], tr_outputs[rows, :])

    print(
        " Epoch | Train cosine | Train log-like | Holdout cosine | Holdout log-like "
    )

    def print_perf(weights, step, gradient):
        tr_preds = pred_fun(weights, tr_inputs)
        va_preds = pred_fun(weights, va_inputs)

        train_acc = cms(tr_preds, tr_outputs)
        valid_acc = cms(va_preds, va_outputs)

        train_ll = -loglike_fun(weights, tr_inputs, tr_outputs)
        valid_ll = -loglike_fun(weights, va_inputs, va_outputs)

        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            step // num_batches, train_acc, train_ll, valid_acc, valid_ll))

    grad_fun = grad(batch_loss)
    trained_weights = adam(grad_fun, init_weights,
                           step_size=step_size,
                           callback=print_perf,
                           num_iters=num_epochs * num_batches)

    return pred_fun, loglike_fun, trained_weights
testcostlist = [] testcostlist_dropout = [] traincostlist = [] iterlist = [] def print_function_dropout(params, iter, gradient): return def print_function(params, iter, gradient): use_dropout = False testcost = np.sum((neural_net_predict(params, testinputs, use_dropout) - testtargets)**2) testcostlist.append(testcost) # The optimizers provided can optimize lists, tuples, or dicts of parameters. optimized_params = adam(objective_grad, init_params, step_size=step_size, num_iters=200, callback=print_function) optimized_params_list = [] for i in range(len(seed_inputs)): random.seed(seed_inputs[i]) optimized_params_list.append(adam(objective_grad_dropout, init_params, step_size=step_size, num_iters=200, callback=print_function_dropout)) fig2, ax = plt.subplots() plt.cla() plt.title('Dropout Result Comparisons, dropout_rate:' + str(dropout_rate)) ax.set_xlabel("Possible Inputs") ax.set_ylabel("Neural Network Outputs") plot_inputs = np.linspace(-8, 8, num=400) for i in range(len(seed_inputs)): # Plot data and functions. outputs = neural_net_predict(optimized_params_list[i], np.expand_dims(plot_inputs, 1), False, True)
# Train batch by batch: 10 Adam iterations on the variational parameters
# per batch.
batch_idxs = make_batches(train_images.shape[0], batch_size)
num_batches = len(batch_idxs)
cur_dir = np.zeros(num_weights*2)

for epoch in range(num_epochs):
    batch_counter = 0
    for idxs in batch_idxs:

        def log_posterior(weights):
            return logprob(weights, train_images[idxs], train_labels[idxs])

        objective, gradient, unpack_params = black_box_variational_inference(
            log_posterior, num_weights, num_samples)

        # Disabled manual momentum update, kept for reference:
        #   grad_w = gradient(variational_params)
        #   cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_w
        #   variational_params -= learning_rate * cur_dir

        variational_params = adam(
            gradient,
            variational_params,
            num_batches=num_batches,
            batch_id=batch_counter,
            step_size=0.01,
            num_iters=10,
        )

        weights = extract_weights(variational_params, num_weights)
        print_perf(epoch, weights)
        batch_counter += 1
# Minibatch index sets for the batched loss.
batch_idxs = make_batches(N_data, batch_size=256)


def opt_loss(W, i):
    """Loss on the minibatch chosen by cycling through ``batch_idxs``."""
    idxs = batch_idxs[i % len(batch_idxs)]
    return loss(W, train_images[idxs], train_labels[idxs])


# Progress callback; the row it prints holds four values (iteration, train
# loss, train error, test error).
print(" Epoch | Train err | Test error ")


def print_perf(th, i, g):
    """Every 10th iteration, report loss and error on fixed data subsets."""
    if i % 10 != 0:
        return
    train_loss = loss(th, train_images[:1000], train_labels[:1000])
    test_perf = frac_err(th, test_images[:5000, :, :, :], test_labels[:5000, :])
    train_perf = frac_err(th, train_images[:1000, :, :, :], train_labels[:1000, :])
    row = "{epoch:15} | {train_loss:15} | {train_perf:15} | {test_perf:15} "
    print(row.format(epoch=i,
                     train_loss=train_loss,
                     train_perf=train_perf,
                     test_perf=test_perf))


# Optimise the weights with Adam.
W = adam(grad(opt_loss), W, callback=print_perf, num_iters=20000, step_size=.0005)
ax5.cla() ax6.cla() ax7.cla() ax.plot(inputs, targets, 'bx') ax.plot(plot_inputs, outputs) ax.set_xlabel("Possible Inputs") ax.set_ylabel("Neural Network Outputs") ax.set_ylim([-2, 2]) plt.draw() ax2.matshow(params[0][0].T, cmap='cool') ax2.set_xlabel("Hidder Layer 1 (Incoming Weights)") ax3.matshow(np.array([params[0][1]]).T, cmap='cool') ax3.set_ylabel("Hidder Layer 1 Bias") ax4.matshow(params[1][0].T, cmap='cool') ax4.set_xlabel("Hidden Layer 2 (Incoming Weights)") ax5.matshow(np.array([params[1][1]]).T, cmap='cool') ax5.set_ylabel("Hidder Layer 2 Bias") ax6.matshow(params[2][0], cmap='cool') ax6.set_xlabel("Hidden Layer 2 (Outgoing weights)") ax7.matshow(np.array([params[2][1]]), cmap='cool') ax7.set_ylabel("Output Bias") #plt.savefig(str(iter) + '.jpg') plt.pause(1.0 / 60.0) # The optimizers provided can optimize lists, tuples, or dicts of parameters. optimized_params = adam(objective_grad, init_params, step_size=step_size, num_iters=100, callback=print_function)
param_scale = 0.1

# Input rows and the desired output column.
inputs = np.array([
    [1, 1, 0],
    [1, 0, 0],
    [1, 1, 1],
    [0, 1, 1],
])
targets = np.array([[0], [0], [1], [1]])

L2_reg = 1.0

init_params = init_random_params(param_scale, layer_sizes)


def objective(params, iter):
    """Negative log-posterior on the full toy data set."""
    return -log_posterior(params, inputs, targets, L2_reg)


# Gradient of the objective via autograd.
objective_grad = grad(objective)

print('Train accuracy | Test accuracy')


def print_final(params, iter, gradient):
    """Print accuracy; both columns are evaluated on the same data."""
    train_acc = accuracy(params, inputs, targets)
    test_acc = accuracy(params, inputs, targets)
    print("{:20}|{:20}".format(train_acc, test_acc))


# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(
    objective_grad,
    init_params,
    step_size=0.001,
    num_iters=100,
    callback=print_final,
)
# Interactive figure that the callback redraws every iteration.
fig = plt.figure(figsize=(8, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.ion()
plt.show(block=False)


def callback(params, t, g):
    """Print the lower bound and redraw target and variational contours."""
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    plt.cla()

    def target_distribution(x):
        return np.exp(log_density(x, t))

    plot_isocontours(ax, target_distribution)

    mean, log_std = unpack_params(params)

    def variational_contour(x):
        # Diagonal covariance: exp(2 * log_std) on the diagonal.
        return mvn.pdf(x, mean, np.diag(np.exp(2 * log_std)))

    plot_isocontours(ax, variational_contour)
    plt.draw()
    plt.pause(1.0 / 30.0)


print("Optimizing variational parameters...")
init_mean = -1 * np.ones(D)
init_log_std = -5 * np.ones(D)
init_var_params = np.concatenate([init_mean, init_log_std])
variational_params = adam(
    gradient,
    init_var_params,
    step_size=0.1,
    num_iters=2000,
    callback=callback,
)
print("rms total for prognum", prog_num, "at breakpoint", bp, bp_rms) ax.plot(plot_inputs, outputs, 'g') LLHs = [] LLH_xs = [] def callback(params, t, g): LLH = -objective(params, t) LLH_xs.append(t) print("Iteration {} log likelihood {}".format(t, LLH)) LLHs.append(LLH) y_test_pred = predictions(params, X_test) rms_total = ascdata.RMSE(y_test, y_test_pred) print("rms total", rms_total) plot_prediction_data(X_bp1, y_bp1, params, ax1, 1, 4194659) plot_prediction_data(X_bp2, y_bp2, params, ax2, 3, 4194873) plt.draw() #plt.pause(1.0/60.0) rs = npr.RandomState(0) init_params = 10 * rs.randn(num_weights) print("Optimizing network parameters...") optimized_params = adam(grad(objective), init_params, step_size=0.5, num_iters=100, callback=callback) # fig.savefig('neuralnet_singletask_crossents.png') # plt.figure(2, facecolor='white') # plt.plot(LLH_xs, LLHs) # plt.savefig('neuralnet_singletask_LLH.png')
plt.ion()
plt.show(block=False)


def callback(params, t, g):
    """Print the lower bound and plot 10 functions drawn from the posterior."""
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    # Draw 10 weight samples; the generator is re-seeded on every call, so
    # the same noise is reused each iteration.
    mean, cov = unpack_params(params)
    rs = npr.RandomState(0)
    noise = rs.randn(10, num_weights)
    sample_weights = noise * np.sqrt(cov) + mean

    plot_inputs = np.linspace(-8, 8, num=200)
    outputs = predictions(sample_weights, np.expand_dims(plot_inputs, 1))

    # Redraw the data points and the sampled functions.
    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx')
    ax.plot(plot_inputs, outputs[:, :, 0].T)
    ax.set_ylim([-2, 3])
    plt.draw()
    plt.pause(1.0/60.0)


# Initial variational parameters: sampled mean followed by constant log-cov.
rs = npr.RandomState(0)
init_mean = rs.randn(num_weights)
init_log_cov = -5 * np.ones(num_weights)
init_var_params = np.concatenate([init_mean, init_log_cov])

print("Optimizing variational parameters...")
variational_params = adam(
    gradient,
    init_var_params,
    step_size=0.1,
    num_iters=1000,
    callback=callback,
)
def train_cnn(inputs, outputs, layer_specs, init_weights=None,
              param_scale=0.1, step_size=0.001, batch_size=128,
              num_epochs=50, L2_reg=1.0):
    """Train the convnet from ``build`` with Adam, reporting once per epoch.

    :param inputs: input array; axis 0 is sliced into minibatches and
        ``shape[1:]`` is passed to ``build`` as the input shape
    :param outputs: target array, sliced the same way
    :param layer_specs: forwarded to ``build``
    :param init_weights: starting weight vector; sampled as
        ``randn(num_weights) * param_scale`` from an unseeded generator
        when None
    :param param_scale: scale of the sampled initial weights
    :param step_size: Adam step size
    :param batch_size: rows per minibatch
    :param num_epochs: number of passes over the training split
    :param L2_reg: forwarded to ``build``
    :return: ``(pred_fun, loss_fun, trained_weights)``
    """
    # 80/20 split into training and validation parts.
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)
    input_shape = tr_inputs.shape

    # number of batches; float() keeps the ceil meaningful even under
    # Python 2 integer division.
    num_batches = int(np.ceil(tr_inputs.shape[0] / float(batch_size)))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # build CNN
    num_weights, pred_fun, loss_fun, frac_err = build(
        input_shape[1:], layer_specs, L2_reg)

    def batch_loss(weights, iter):
        idx = batch_indices(iter)
        return loss_fun(weights, tr_inputs[idx], tr_outputs[idx])

    loss_grad = grad(batch_loss)

    # init weights
    if init_weights is None:
        rs = npr.RandomState()
        init_weights = rs.randn(num_weights) * param_scale

    print(
        " Epoch | Train loss | Train err | Validation loss | Validation error "
    )

    def print_perf(weights, iter, gradients):
        # Full-dataset metrics are only evaluated at epoch boundaries.
        if iter % num_batches != 0:
            return
        va_perf = frac_err(weights, va_inputs, va_outputs)
        tr_perf = frac_err(weights, tr_inputs, tr_outputs)
        va_loss = loss_fun(weights, va_inputs, va_outputs)
        tr_loss = loss_fun(weights, tr_inputs, tr_outputs)
        print("{0:15}|{1:15}|{2:15}|{3:18}|{4:15}".format(
            iter // num_batches, tr_loss, tr_perf, va_loss, va_perf))

    # One Adam iteration consumes one minibatch, so an epoch is num_batches
    # iterations. (Previously num_iters=num_epochs ran only num_epochs
    # minibatch steps and reported each of them as an epoch.)
    trained_weights = adam(loss_grad, init_weights, step_size=step_size,
                           num_iters=num_epochs * num_batches,
                           callback=print_perf)

    return pred_fun, loss_fun, trained_weights
seed = npr.RandomState(0)


def objective(combined_params, iter):
    """Negative VAE lower bound on one minibatch, divided by ``data_dim``."""
    data_idx = batch_indices(iter)
    gen_params, rec_params = combined_params
    bound = vae_lower_bound(gen_params, rec_params, train_images[data_idx], seed)
    return -bound / data_dim


# Gradient of the objective via autograd.
objective_grad = grad(objective)

print(
    " Epoch | Objective | Fake probability | Real Probability "
)


def print_perf(combined_params, iter, grad):
    """Every 10th iteration, print the objective and save prior samples."""
    if iter % 10 != 0:
        return
    gen_params, rec_params = combined_params
    bound = np.mean(objective(combined_params, iter))
    print("{:15}|{:20}".format(iter // num_batches, bound))

    fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
    save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)


# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(
    objective_grad,
    combined_init_params,
    step_size=step_size,
    num_iters=num_epochs * num_batches,
    callback=print_perf,
)
elbos.append(elbo_val) if t % 50 == 0: print("Iteration {} lower bound {}".format(t, elbo_val)) init_mean = -1 * np.ones(D) init_log_std = -5 * np.ones(D) init_var_params = np.concatenate([init_mean, init_log_std]) variational_params = optfun(num_iters, init_var_params, callback) return np.array(elbos) # let's optimize this with a few different step sizes elbo_lists = [] step_sizes = [.1, .25, .5] for step_size in step_sizes: # optimize with standard gradient + adam optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size, num_iters=n, callback=cb) standard_lls = optimize_and_lls(optfun) # optimize with natural gradient + sgd, no momentum optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size, num_iters=n, callback=cb, mass=.001) natural_lls = optimize_and_lls(optnat) elbo_lists.append((standard_lls, natural_lls)) # visually compare the ELBO plt.figure(figsize=(12,8)) colors = ['b', 'k', 'g'] for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists): plt.plot(np.arange(len(stand_lls)), stand_lls, '--', label="standard (adam, step-size = %2.2f)"%ss, alpha=.5, c=col) plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
def objective(weights, t):
    """Negative log-probability of the data; ``t`` is unused."""
    return -logprob(weights, inputs, targets)


# Figure that the callback redraws every iteration.
fig = plt.figure(figsize=(12, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.show(block=False)


def callback(params, t, g):
    """Print the log likelihood and redraw the data with the current fit."""
    print("Iteration {} log likelihood {}".format(t, -objective(params, t)))

    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx')

    # 300 evenly spaced inputs as a column.
    plot_inputs = np.reshape(np.linspace(-7, 7, num=300), (300, 1))
    outputs = predictions(params, plot_inputs)
    ax.plot(plot_inputs, outputs)
    ax.set_ylim([-1, 1])

    plt.draw()
    plt.pause(1.0 / 60.0)


rs = npr.RandomState(0)
init_params = 0.1 * rs.randn(num_weights)

print("Optimizing network parameters...")
optimized_params = adam(
    grad(objective),
    init_params,
    step_size=0.01,
    num_iters=1000,
    callback=callback,
)
def experiment(train_data, valid_data, test_data, init_scale, batch_size, num_iters_hypernet, step_size_hypernet,
               num_iters_hyper, step_size_hyper, num_iters, graph_mod, global_seed=0):
    """Run the second experiment, which consists of fitting a hypernetwork, which outputs neural network parameters.

    These neural network parameters try to fit the training data with some additional loss for the hyperparameters.
    We try to optimize the hyperparameters given the learned neural network response through the hypernetwork.
    We observe how the hypernetwork performs on the training and testing, by graphing it against the true loss.
    The true loss is found by training a neural network to convergence at a discrete number of points.

    The function reads and writes module-level state: it assigns ``hyper_cur`` and its callbacks index into
    ``log_likelihoods``, ``train_performance``, ``valid_loss``, ``test_loss``, ``grad_norms_hyper``,
    ``grad_norms_hypernet`` and ``train_hypers`` and advance the counters ``global_opt_iteration`` and
    ``global_hyperopt_iteration``. Those names must exist at module level before this is called.

    :param train_data: The training data which is a tuple of (train_input, train_target).
    :param valid_data: The validation data which is a tuple of (valid_input, valid_target).
    :param test_data: The testing data which is a tuple of (test_input, test_target).
    :param init_scale: The scale (positive float) for the hypernetwork initialization.
    :param batch_size: The number of hyperparameters to sample for each iteration.
    :param num_iters_hypernet: The number of iterations (integer) to run the hypernetwork optimizer for.
    :param step_size_hypernet: The step size (positive float) for the hypernetwork optimizer.
    :param num_iters_hyper: The number of iterations (integer) to run the hyperparameter optimizer for.
    :param step_size_hyper: The step size (positive float) for the hyperparameter optimizer.
    :param num_iters: The number of iterations (integer) to run the optimization for.
    :param graph_mod: How many iterations (integer) to wait between each graph of the loss.
    :param global_seed: The seed (integer) to use when choosing a constant seed.
    :return: None.
    """
    assert init_scale > 0
    assert step_size_hypernet > 0 and step_size_hyper > 0
    assert num_iters > 0 and num_iters_hypernet > 0 and num_iters_hyper > 0

    global hyper_cur
    hyper_cur = -3.5  # Initialize the hyperparameter (float).

    # Define information about hyper loss and how hyper parameters are sampled.
    # NOTE(review): this is called a variance but is used below as a plain multiplier on a standard normal
    # draw; with the value 0 (10e-4 was the alternative) sampling always returns the hyperparameter itself.
    hyper_sample_var = 0

    def sample_hypers(hyper, rs):
        """Sample a hyperparameter.

        :param hyper: The current hyperparameter ([float]).
        :param rs: A numpy randomstate.
        :return: A sampled hyperparameter, reshaped to a single row.
        """
        return np.array([rs.randn() * hyper_sample_var + hyper]).reshape(1, -1)

    def hyper_loss(weights, hyper):
        """Find the loss for neural network that is dependent on the hyperparameter.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :return: The loss (float) of network dependent on the hyperparameter.
        """
        return -log_gaussian(weights, np.exp(hyper))

    example_hyper = sample_hypers(hyper_cur, npr.RandomState(global_seed))  # Test the sample function.
    assert example_hyper is not None

    train_inputs, train_targets = train_data
    valid_inputs, valid_targets = valid_data
    test_inputs, test_targets = test_data

    feature_ind = 1  # Axis 1 of the data arrays is read as the feature axis.
    elementary_input_size = np.shape(train_inputs)[feature_ind]
    elementary_output_size = np.shape(train_targets)[feature_ind]
    elementary_layer_sizes = [elementary_input_size, elementary_output_size]
    num_hypers = example_hyper.shape[feature_ind]  # The dimensionality of the hyperparameter space (integer).

    # Define neural network and function to turn a vector into its weight structure.
    example_elementary_params = init_random_params(init_scale, elementary_layer_sizes,
                                                   npr.RandomState(global_seed))
    flat_elementary_params, unflatten_vector_to_network_weights = flatten(example_elementary_params)
    assert hyper_loss(example_elementary_params, example_hyper) is not None
    num_elementary_params = len(flat_elementary_params)

    # Define a hypernetwork parametrized by some hyperparameters.
    hypernet_layer_sizes = [num_hypers, num_elementary_params]  # Note that there are no hidden units.

    objective_functions = get_loss_functions(unflatten_vector_to_network_weights, sample_hypers, hyper_loss,
                                             batch_size, train_inputs, train_targets, test_inputs, test_targets,
                                             valid_inputs, valid_targets, global_seed)
    hypernet, train_objective, valid_objective, test_objective = objective_functions[:4]
    hyper_train_objective, hyper_valid_objective, hyper_test_objective = objective_functions[4:-1]
    hyper_train_stochastic_objective = objective_functions[-1]

    # Next, train a neural network from scratch with different hyperparameter values.
    real_step_size = 0.0001  # The step size to use to find the real loss (float).
    real_num_iters = 1000  # The number of iterations to use to find the real loss (integer).
    range_min = -2.0  # The min log variance for the hyper parameter of the variance of weight distribution to graph.
    range_max = 4.0  # The max log variance for the hyper parameter of the variance of weight distribution to graph.
    num_visual_points = 10  # The number of points to test the real loss of - expensive (integer).
    real_hyper_range = np.linspace(range_min + 1.0, range_max - 1.0, num_visual_points)
    real_train_loss = np.zeros(real_hyper_range.shape)
    real_train_performance = np.zeros(real_hyper_range.shape)
    real_valid_loss = np.zeros(real_hyper_range.shape)
    real_test_loss = np.zeros(real_hyper_range.shape)
    min_real_valid_loss, min_real_hyper = 10e32, 10e32

    for i, hypers in enumerate(real_hyper_range):
        print("Optimizing network parameters: ", i)
        init_params = init_random_params(init_scale, elementary_layer_sizes, npr.RandomState(global_seed))

        def cur_obj(w, seed):
            """The current objective function of the neural network.

            :param w: The weights ([float]) of the neural network.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The current objective value (float).
            """
            # `hypers` is the loop variable; the closure is only used inside this same iteration.
            return train_objective(w, hypers, seed)

        # The adam used here is unpacked as four values; only the optimized parameters are needed.
        optimized_params, _, _, _ = adam(grad(cur_obj), init_params, step_size=real_step_size,
                                         num_iters=real_num_iters)
        real_train_loss[i] = train_objective(optimized_params, hypers, global_seed)
        real_train_performance[i] = real_train_loss[i] - hyper_loss(optimized_params, hypers)
        real_valid_loss[i] = valid_objective(optimized_params, hypers, global_seed)
        if real_valid_loss[i] < min_real_valid_loss:
            # Keep the best hyperparameter together with its validation loss.
            min_real_valid_loss, min_real_hyper = real_valid_loss[i], hypers
            print("Best hyperparameter found = ", min_real_hyper)
        real_test_loss[i] = test_objective(optimized_params, hypers, global_seed)

    fig, axs = create_figure_and_axs()

    # Set up the arrays to store information for plotting.
    num_hyper_test_points = 200  # Test a large number of hyperparameters with the learned function - cheap (integer)!
    learned_hyper_range = np.linspace(range_min, range_max, num_hyper_test_points)  # Hyperparameters to test.
    hyper_train_loss = np.zeros(learned_hyper_range.shape)  # Hypernetwork training loss per hyperparameter.
    # Hypernetwork training performance per hyperparameter. Note that performance is loss - regularization loss.
    hyper_train_performance = np.zeros(learned_hyper_range.shape)
    hyper_valid_loss = np.zeros(learned_hyper_range.shape)
    hyper_test_loss = np.zeros(learned_hyper_range.shape)

    def callback(hyper_weights, opt_iteration, g):
        """Do whatever work is desired on each optimization iteration.

        Draws graphs, prints information, and stores information.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global log_likelihoods, valid_loss, test_loss, grad_norms_hyper, grad_norms_hypernet, global_opt_iteration
        global hyper_cur

        log_likelihood = hyper_train_objective(hyper_weights, hyper_cur)
        log_likelihoods[global_opt_iteration] = log_likelihood  # Store the training loss.
        weights_cur = hypernet(hyper_weights, hyper_cur)
        # `train_performance` is a module-level array as well; it is only indexed, never rebound.
        train_performance[global_opt_iteration] = log_likelihood - hyper_loss(weights_cur, hyper_cur)
        valid_loss[global_opt_iteration] = hyper_valid_objective(hyper_weights, hyper_cur)
        test_loss[global_opt_iteration] = hyper_test_objective(hyper_weights, hyper_cur)

        grad_norm = np.sum([np.sum([np.sum(np.abs(weight_or_bias)) for weight_or_bias in layer]) for layer in g])
        grad_norms_hypernet[global_opt_iteration] = grad_norm
        # Carry the previous hyperparameter gradient norm forward; callback_outer overwrites the latest slot
        # whenever the hyperparameter is updated. On the very first call index -1 reads the last element.
        grad_norms_hyper[global_opt_iteration] = grad_norms_hyper[global_opt_iteration - 1]
        global_opt_iteration += 1
        print("Iteration {} Loss {} Grad L1 Norm {}".format(opt_iteration, log_likelihood, grad_norm))

        if global_opt_iteration % graph_mod == 0:  # Only print on every iteration that is a multiple of graph_mod.
            for ax in axs:  # Clear all of the axes.
                ax.cla()

            # Raw strings keep the LaTeX backslashes without relying on invalid escape sequences.
            axs[0].set_xlabel(r'Hyperparameter $\lambda$')
            axs[0].set_ylabel(r'Loss $\mathcal{L}$')

            for cur, hyper in enumerate(learned_hyper_range):
                hyper_train_loss[cur] = hyper_train_objective(hyper_weights, hyper)
                weights = hypernet(hyper_weights, hyper)
                hyper_train_performance[cur] = hyper_train_loss[cur] - hyper_loss(weights, hyper)
                hyper_valid_loss[cur] = hyper_valid_objective(hyper_weights, hyper)
                hyper_test_loss[cur] = hyper_test_objective(hyper_weights, hyper)

            axs[0].plot(real_hyper_range, real_train_loss, 'bx', ms=28, label='Train loss of optimized weights')
            axs[0].plot(learned_hyper_range, hyper_train_loss, 'b-', label='Train loss of hypernetwork weights')
            axs[0].set_ylim([-1.5, 3.8])

            axs[0].plot(real_hyper_range, real_valid_loss, 'rx', ms=28, label='Valid. loss of optimized weights')
            axs[0].plot(learned_hyper_range, hyper_valid_loss, 'r-', label='Valid. loss of hypernetwork weights')
            min_hyper_found = 1.836  # Known minimum from doing a search with 1000 points over this range.
            axs[0].axvline(x=min_hyper_found, c='k', linestyle='dashed',
                           label=r'Optimal hyperparameter $\lambda$')

            # Draw a scaled Gaussian bump centred on the current hyperparameter, resting on the bottom of the axes.
            pdf_range = np.linspace(hyper_cur - 0.5, hyper_cur + 0.5, 100)
            axs[0].plot(pdf_range, norm.pdf(pdf_range, loc=hyper_cur, scale=0.06) / 4.0 + axs[0].get_ylim()[0],
                        c='g', label=r'$p (\lambda | \hat{\lambda})$')

            for ax in axs:  # Create a legend for all the axes.
                ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.45), borderaxespad=0.0, fancybox=True,
                          framealpha=0.0, fontsize=28)

            setup_ax_and_save(axs, fig, 'hypernets_local_small')

    def callback_outer(hyper, opt_iteration, g):
        """Do whatever work is desired on each outer optimization iteration.

        Stores information.

        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global grad_norms_hyper, train_hypers, global_hyperopt_iteration

        # The inner callback has already advanced global_opt_iteration, so the newest slot is one behind it.
        latest = global_opt_iteration - 1
        grad_norms_hyper[latest] = np.abs(g)
        train_hypers[global_hyperopt_iteration] = hyper
        global_hyperopt_iteration += 1
        # Report the value that was just stored rather than the next, not yet written, slot.
        print("Outer Iteration {} Hyper {} Grad L1 Norm {}".format(global_hyperopt_iteration, hyper,
                                                                   grad_norms_hyper[latest]))

    init_hypernet_params = init_random_params(init_scale, hypernet_layer_sizes, npr.RandomState(global_seed))

    m_hyper = None  # A record of the current m for re-starting the Adam optimizer.
    v_hyper = None  # A record of the current v for re-starting the Adam optimizer.
    cur_iter_hyper = None  # A record of the current iteration for re-starting the Adam optimizer.

    # Both objectives look up hyper_cur / init_hypernet_params when they are called, so defining them once
    # outside the loop still sees the values updated by every iteration.
    def hyper_train_stochastic_objective_current(hyper_weights, seed):
        """The objective for the hypernetwork, with a fixed hyperparameter.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param seed: The seed (integer) for sampling a hyperparameter.
        :return: The hypernetwork's loss (float).
        """
        return hyper_train_stochastic_objective(hyper_cur, hyper_weights, seed)

    def valid_objective_current(hyper, seed):
        """The objective for the hyperparameter, with a fixed hypernetwork.

        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param seed: The seed (integer) for sampling a hyperparameter.
        :return: The validation loss (float).
        """
        return valid_objective(hypernet(init_hypernet_params, hyper), hyper, seed)

    for _ in range(num_iters):
        # Step 1: fit the hypernetwork around the current hyperparameter.
        init_hypernet_params = sgd(grad(hyper_train_stochastic_objective_current), init_hypernet_params,
                                   step_size=step_size_hypernet, num_iters=num_iters_hypernet,
                                   callback=callback, mass=0)

        # Step 2: update the hyperparameter through the hypernetwork. The optimizer state returned here is
        # fed back in on the next round so Adam continues instead of restarting.
        hyper_cur, m_hyper, v_hyper, cur_iter_hyper = adam(grad(valid_objective_current), hyper_cur,
                                                           step_size=step_size_hyper,
                                                           num_iters=num_iters_hyper,
                                                           callback=callback_outer, m=m_hyper, v=v_hyper,
                                                           offset=cur_iter_hyper)
        print("The current hyperparameter is:", hyper_cur)