Example #1
File: rnn.py Project: sanyu12/n2nRNNSRL
    def __init__(self, unit, x, d, n_layers, n_in, n_h, n_y, reg=0.0001):

        if unit == 'lstm':
            self.layers = lstm.layers
        else:
            self.layers = gru.layers

        self.x = x  # x: 1D: batch_size, 2D: n_words, 3D: n_fin
        self.d = d  # d: 1D: batch_size, 2D: n_words

        n_fin = n_in * 7 + 1
        batch = T.cast(self.d.shape[0], dtype='int32')

        params, o_layer, emit = self.layers(x=self.x,
                                            batch=batch,
                                            n_fin=n_fin,
                                            n_h=n_h,
                                            n_y=n_y,
                                            n_layers=n_layers)

        self.p_y = y_prob(o_layer, emit, self.d.dimshuffle(1, 0), batch)
        self.y_pred = vitabi(o_layer, emit, batch)

        self.nll = -T.mean(self.p_y)
        self.L2_sqr = L2_sqr(params)
        self.cost = self.nll + reg * self.L2_sqr / 2.
        self.errors = T.neq(self.y_pred, self.d)

        self.g = T.grad(self.cost, params)
        self.updates = adam(params, self.g)
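The adam(params, grads) used in this Theano example (and again in Examples #10 and #11) is not shown; it returns a list of (shared_variable, new_value) update pairs. A minimal sketch of such a function, with defaults lr, b1, b2, eps assumed rather than taken from the project:

import numpy as np
import theano
import theano.tensor as T

def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
    updates = []
    t = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    t_new = t + 1.
    for p, g in zip(params, grads):
        m = theano.shared(np.zeros_like(p.get_value()))
        v = theano.shared(np.zeros_like(p.get_value()))
        m_new = b1 * m + (1. - b1) * g           # first-moment estimate
        v_new = b2 * v + (1. - b2) * T.sqr(g)    # second-moment estimate
        m_hat = m_new / (1. - b1 ** t_new)       # bias correction
        v_hat = v_new / (1. - b2 ** t_new)
        updates.append((m, m_new))
        updates.append((v, v_new))
        updates.append((p, p - lr * m_hat / (T.sqrt(v_hat) + eps)))
    updates.append((t, t_new))
    return updates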
Example #2
def update_nn(init_var_params, batch_data, batch_labels):
    log_posterior = lambda weights, t: logprob(weights, batch_data, batch_labels)

    # Build variational objective.
    objective, gradient, unpack_params = \
        black_box_variational_inference(log_posterior, num_weights, num_samples=20)

    variational_params = adam(gradient, init_var_params, step_size=0.01, num_iters=50)

    return variational_params
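Most of the remaining examples call autograd's adam, which takes a gradient function of (params, iteration) and an initial parameter object. A self-contained toy run (toy_loss is a made-up objective; the import path varies across autograd versions):

import autograd.numpy as np
from autograd import grad
from autograd.misc.optimizers import adam  # older autograd: from autograd.optimizers import adam

def toy_loss(params, iter):
    # simple quadratic with its minimum at 3.0 in every coordinate
    return np.sum((params - 3.0) ** 2)

optimized = adam(grad(toy_loss), np.zeros(5), step_size=0.1, num_iters=500)
print(optimized)  # approaches [3. 3. 3. 3. 3.]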
Example #3
File: lstm.py Project: mhw32/adaware-nlp
def train_lstm(inputs,
               outputs,
               state_size,
               batch_size=256,
               param_scale=0.001,
               num_epochs=5,
               step_size=0.001):

    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)

    input_size = tr_inputs.shape[2]
    output_size = tr_outputs.shape[2]

    init_params = init_lstm_params(input_size,
                                   state_size,
                                   output_size,
                                   param_scale=param_scale,
                                   rs=npr.RandomState(0))

    num_batches = int(np.ceil(tr_inputs.shape[1] / batch_size))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx+1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -lstm_log_likelihood(
            params, tr_inputs[:, idx, :], tr_outputs[:, idx, :])

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |    Train accuracy  |    Train log-like  |  Holdout accuracy  |  Holdout log-like  ")

    def print_perf(params, iter, gradient):
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        train_ll = -lstm_log_likelihood(params, tr_inputs, tr_outputs)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = -lstm_log_likelihood(params, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            iter//num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad,
                            init_params,
                            step_size=step_size,
                            num_iters=num_epochs,
                            callback=print_perf)

    return optimized_params
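For reference, the core loop of autograd's adam is short. This paraphrase of autograd/misc/optimizers.py (minus the decorator that flattens nested parameter structures) may drift from the installed version, so verify against your copy:

import autograd.numpy as np

def adam(grad, x, callback=None, num_iters=100,
         step_size=0.001, b1=0.9, b2=0.999, eps=1e-8):
    """Adam as in Kingma & Ba (2014); operates on a flat parameter vector x."""
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback:
            callback(x, i, g)
        m = (1 - b1) * g + b1 * m          # first-moment estimate
        v = (1 - b2) * (g ** 2) + b2 * v   # second-moment estimate
        mhat = m / (1 - b1 ** (i + 1))     # bias correction
        vhat = v / (1 - b2 ** (i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x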
Example #4
def get_Aopt(inX, iny):
    X_train, y_train, X_test, y_test = ascdata.split_train_test(inX, iny)
    X_train = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), 1)
    X_test = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), 1)
    X_train_less, s_train = ascdata.split_X_s(X_train)
    X_test_less, s_test = ascdata.split_X_s(X_test)

    s_train_phi = ascdata.generate_phi(s_train, d, A_phi, b_phi)
    s_test_phi = ascdata.generate_phi(s_test, d, A_phi, b_phi)

    nfeatures = X_train.shape[1] - 1
    # Dimensions of phi(s)
    nfeatures_phi = d
    invT2 = 10

    def logprob(inA, inX, iny, ins_phi):
        RMS = 0
        for i in range(len(iny)):
            wi = np.dot(inA, inX[i])
            RMS_current = (iny[i] - np.dot(wi, ins_phi[i]))**2
            RMS += RMS_current
        return -RMS

    objective = lambda inA, t: -logprob(inA, X_train_less, y_train, s_train_phi)

    LLHs = []
    LLH_xs = []

    def callback(params, t, g):
        LLH = -objective(params, t)
        LLHs.append(LLH)
        LLH_xs.append(t)
        print("Iteration {} log likelihood {}".format(t, LLH))

    init_A = 0.00000000001*(np.ones((nfeatures_phi, nfeatures)))
    # init_A =  [[ -3.05236728e-04,  -9.50015728e-04,  -3.80139503e-04,   1.44010470e-04, -3.05236728e-04,
    #              -4.96117987e-04,  -1.02736409e-04,  -1.86416292e-04, -9.52628589e-04,  -1.55023279e-03,
    #              1.44717581e-04,   1.00000000e-11, -9.50028200e-04,  -4.96117987e-04,   1.00000000e-11,
    #              -3.05236728e-04, 1.77416412e-06,  -8.16665436e-06,   3.12622951e-05,  -8.25700143e-04,
    #              1.44627987e-04,   1.90211243e-05,  -8.28273186e-04,  -9.41349990e-04, -4.56671031e-04,
    #              9.79097070e-03,  -6.41866046e-04,  -7.79274856e-05, 1.44539330e-04,  -3.05236728e-04,
    #              -5.99188450e-04,  -7.29470175e-04, -6.69558174e-04,  -9.50028200e-04]]
    init_A = np.array(init_A)

    print("Optimizing network parameters...")
    optimized_params = adam(grad(objective), init_A,
                            step_size=0.01, num_iters=1000, callback=callback)

    Aopt = optimized_params
    print "Aopt = ", Aopt

    return Aopt, X_train_less, y_train, s_train, X_test_less, y_test, s_test, LLHs, LLH_xs
Example #5
def update_nn(init_var_params, batch_data, batch_labels, iteration):
    log_posterior = lambda weights, t: logprob(weights, batch_data,
                                               batch_labels)

    # Build variational objective.
    objective, gradient, unpack_params = \
        black_box_variational_inference(log_posterior, num_weights, num_samples=20)

    variational_params = adam(gradient,
                              init_var_params,
                              step_size=0.1,
                              num_iters=10)

    return variational_params, objective
Example #6
def train_nn(
        inputs, outputs, num_hiddens,
        batch_size=256, param_scale=0.1,
        num_epochs=5, step_size=0.001, L2_reg=1.0):

    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = ctd.split_data(
        inputs, out_data=outputs, frac=0.80)

    num_input_dims = tr_inputs.shape[1]
    num_output_dims = tr_outputs.shape[1]
    layer_sizes = [num_input_dims] + num_hiddens + [num_output_dims]
    init_params = init_random_params(param_scale, layer_sizes)
    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx+1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -log_posterior(
            params, tr_inputs[idx], tr_outputs[idx], L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |    Train accuracy  |    Train log-like  |  Holdout accuracy  |  Holdout log-like  ")

    def print_perf(params, iter, gradient):
        if iter % num_batches == 0:
            train_acc = accuracy(params, tr_inputs, tr_outputs)
            train_ll = log_posterior(params, tr_inputs, tr_outputs, L2_reg)
            valid_acc = accuracy(params, va_inputs, va_outputs)
            valid_ll = log_posterior(params, va_inputs, va_outputs, L2_reg)
            print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
                iter//num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # The optimizers provided can optimize lists, tuples, or dicts of
    # parameters.
    optimized_params = adam(
        objective_grad, init_params, step_size=step_size,
        num_iters=num_epochs * num_batches, callback=print_perf)

    return optimized_params
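The log_posterior helper used here (and in Examples #13 and #22) is not shown. In autograd's neural_net example it looks roughly like the following; treat this as a sketch for context, not the projects' exact code:

import autograd.numpy as np
from autograd.misc.flatten import flatten
from autograd.scipy.special import logsumexp  # autograd.scipy.misc on older versions

def l2_norm(params):
    """Squared L2 norm of params, flattened into a single vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)

def neural_net_predict(params, inputs):
    """MLP forward pass; returns normalized class log-probabilities."""
    for W, b in params:
        outputs = np.dot(inputs, W) + b
        inputs = np.tanh(outputs)
    return outputs - logsumexp(outputs, axis=1, keepdims=True)

def log_posterior(params, inputs, targets, L2_reg):
    log_prior = -L2_reg * l2_norm(params)
    log_lik = np.sum(neural_net_predict(params, inputs) * targets)
    return log_prior + log_lik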
Example #7
    def _create_optimizer(self):
        if self.optimizer_type == optimizers.L_BFGS:
            self.optimizer = l_bfgs(self.loss, self.iterations)
        elif self.optimizer_type == optimizers.ADAM:
            self.optimizer = adam(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)
        elif self.optimizer_type == optimizers.ADAGRAD:
            self.optimizer = adagrad(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)
        elif self.optimizer_type == optimizers.GRADIENT_DESCENT:
            self.optimizer = gradient_descent(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)
        elif self.optimizer_type == optimizers.RMSPROP:
            self.optimizer = rmsprop(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)
        elif self.optimizer_type == optimizers.ADADELTA:
            self.optimizer = adadelta(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)
        else:
            raise ValueError("Unsupported optimizer")
Example #8
def find_gamma(initial_gamma, loss, n_steps: int):
    """Finds gamma after n_steps minimzation of loss function.

    :param initial_gamma: Starting point for gradient optimization
    :param loss: Loss function to optimize (object containing taylor method).
    Taylor methods has to return two values:
    a) function at point
    b) gradient at point
    :param n_steps: number of steps for optimizer
    """
    initial_gamma = np.array([initial_gamma])  # GammaLoss oczekuje tablicy
    optimizer = adam(f=loss,
                     starting_point=initial_gamma,
                     learning_rate=.1,
                     beta1=0.9,
                     beta2=0.999,
                     epsilon=1e-8)
    gamma = initial_gamma
    for _ in range(n_steps):
        gamma, _, _ = next(optimizer)
    return gamma[0]
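The adam in this example is a generator rather than a driver loop: each next() performs one Adam step and yields the new point together with the last function value and gradient. A hedged sketch consistent with the docstring above; the source project's version may differ:

import numpy as np

def adam(f, starting_point, learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8):
    x = np.array(starting_point, dtype=float)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    t = 0
    while True:
        value, g = f.taylor(x)  # the loss object exposes a taylor() method
        t += 1
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g ** 2
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        x = x - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        yield x, value, g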
Example #9
File: rnn.py Project: AugustLONG/autograd
            training_text  = one_hot_to_string(train_inputs[:,t,:])
            predicted_text = one_hot_to_string(logprobs[:,t,:])
            print(training_text.replace('\n', ' ') + "|" +
                  predicted_text.replace('\n', ' '))

    def training_loss(params, iter):
        return -rnn_log_likelihood(params, train_inputs, train_inputs)

    def callback(weights, iter, gradient):
        if iter % 10 == 0:
            print("Iteration", iter, "Train loss:", training_loss(weights, 0))
            print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)

    print("Training RNN...")
    trained_params = adam(training_loss_grad, init_params, step_size=0.1,
                          num_iters=1000, callback=callback)

    print()
    print("Generating text from RNN...")
    num_letters = 30
    for t in range(20):
        text = ""
        for i in range(num_letters):
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)
Example #10
    def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
        :param y     : 1D: batch
        """

        self.input  = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx  = x_ctx
        self.x_dist = x_dist
        self.y      = y

        dim_x = dim_w * (2 + 4 + 20) + 1
        batch = y.shape[0]

        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h*3))
        self.W_h = theano.shared(sample_weights(dim_h*3, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_i, self.W_h, self.W_o]

        """ Input Layer """
        x_s = self.emb[x_span]     # 1D: batch, 2D: limit * 2,      3D: dim_w
        x_w = self.emb[x_word]     # 1D: batch, 2D: 4,              3D: dim_w
        x_c = self.emb[x_ctx]      # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
        x_d = self.W_d[x_dist]     # 1D: batch
        x_s_avg = T.concatenate([T.mean(x_s[:, :x_s.shape[1] // 2], 1), T.mean(x_s[:, x_s.shape[1] // 2:], 1)], 1)
        x = T.concatenate([x_s_avg, x_w.reshape((batch, -1)), x_c.reshape((batch, -1)), x_d.reshape((batch, 1))], 1)

        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

        """ Predicts """
        self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]

        """ Cost Function """
        self.nll = - T.sum(y * T.log(p_y) + (1. - y) * T.log((1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

        """ Update """
        self.grad = T.grad(self.cost, self.params)
        self.updates = adam(self.params, self.grad)

        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
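The binary_predict method is not included in this snippet. Given the stored thresholds vector and the (batch, 9) shape noted in the comments, it plausibly compares p_y against every threshold at once; a sketch, not the project's code:

    def binary_predict(self, p_y):
        # p_y: 1D: batch  ->  return: 1D: batch, 2D: n_thresholds
        return T.cast(p_y.dimshuffle(0, 'x') >= self.thresholds.dimshuffle('x', 0),
                      dtype='int32')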
Example #11
    def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab,
                 dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
        :param y     : 1D: batch
        """

        self.input = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx = x_ctx
        self.x_dist = x_dist
        self.y = y

        dim_x = dim_w * (2 + 4 + 20) + 1
        batch = y.shape[0]
        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h * 3))
        self.W_h = theano.shared(sample_weights(dim_h * 3, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_i, self.W_h, self.W_o]
        """ Input Layer """
        x_s = self.emb[x_span]  # 1D: batch, 2D: limit * 2,      3D: dim_w
        x_w = self.emb[x_word]  # 1D: batch, 2D: 4,              3D: dim_w
        x_c = self.emb[x_ctx]  # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
        x_d = self.W_d[x_dist]  # 1D: batch
        x_s_avg = T.concatenate([
            T.mean(x_s[:, :x_s.shape[1] // 2], 1),
            T.mean(x_s[:, x_s.shape[1] // 2:], 1)
        ], 1)
        x = T.concatenate([
            x_s_avg,
            x_w.reshape((batch, -1)),
            x_c.reshape((batch, -1)),
            x_d.reshape((batch, 1))
        ], 1)
        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))  # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h
        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch
        """ Predicts """
        self.thresholds = theano.shared(
            np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                       dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]
        """ Cost Function """
        self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(
            (1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2
        """ Update """
        self.grad = T.grad(self.cost, self.params)
        self.updates = adam(self.params, self.grad)
        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape(
            (y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result,
                                                    y.reshape((y.shape[0], 1)))
Example #12
x = tf.placeholder(tf.float32, [None, 784])
t = tf.placeholder(tf.float32, [None, 10])

layers = [
    Dense(784, 256, tf.nn.relu),
    Dense(256, 256, tf.nn.relu),
    Dense(256, 10, tf.nn.softmax)
]

y, params = f_props(layers, x)

cost = -tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1))

# Choose an optimizer from sgd, sgd_clip, momentum, nesterov_momentum, adagrad, adadelta, rmsprop, adam, adamax, smorms3
updates = adam(cost, params)

train = tf.group(*updates)
test = tf.argmax(y, axis=1)

n_epochs = 10
batch_size = 100
n_batches = train_X.shape[0] // batch_size

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        train_X, train_y = shuffle(train_X, train_y, random_state=random_state)
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
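The adam(cost, params) helper in this example is not a tf.train optimizer: it returns a list of assignment ops that the snippet then wraps in tf.group. A minimal sketch of such a helper, assuming TF v1 semantics (the variable names m, v, t are mine):

import tensorflow.compat.v1 as tf

def adam(cost, params, alpha=0.001, b1=0.9, b2=0.999, eps=1e-8):
    grads = tf.gradients(cost, params)
    t = tf.Variable(0., trainable=False)
    t_new = t.assign_add(1.)  # reading t_new forces the increment to run first
    updates = []
    for p, g in zip(params, grads):
        m = tf.Variable(tf.zeros(p.get_shape()), trainable=False)
        v = tf.Variable(tf.zeros(p.get_shape()), trainable=False)
        m_new = m.assign(b1 * m + (1. - b1) * g)
        v_new = v.assign(b2 * v + (1. - b2) * tf.square(g))
        m_hat = m_new / (1. - tf.pow(b1, t_new))  # bias-corrected first moment
        v_hat = v_new / (1. - tf.pow(b2, t_new))  # bias-corrected second moment
        updates.append(p.assign_sub(alpha * m_hat / (tf.sqrt(v_hat) + eps)))
    return updates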
Example #13
    num_epochs = 5
    step_size = 0.001

    print("Loading training data...")
    N, train_images, train_labels, test_images, test_labels = load_mnist()

    init_params = init_random_params(param_scale)

    num_batches = int(np.ceil(len(train_images) / batch_size))
    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx+1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -log_posterior(params, train_images[idx], train_labels[idx], L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print("     Epoch     |    Train accuracy  |       Test accuracy  ")
    def print_perf(params, iter, gradient):
        if iter % num_batches == 0:
            train_acc = accuracy(params, train_images, train_labels)
            test_acc  = accuracy(params, test_images, test_labels)
            print("{:15}|{:20}|{:20}".format(iter//num_batches, train_acc, test_acc))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad, init_params, step_size=step_size,
                            num_iters=num_epochs * num_batches, callback=print_perf)
Example #14
            print(training_text.replace('\n', ' ') + "|" +
                  predicted_text.replace('\n', ' '))

    def training_loss(params, iter):
        return -rnn_log_likelihood(params, train_inputs, train_inputs)

    def callback(weights, iter, gradient):
        if iter % 10 == 0:
            print("Iteration", iter, "Train loss:", training_loss(weights, 0))
            print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)
    
    print("loss", training_loss(init_params,0))
    print("grad", training_loss_grad(init_params,0))

    print("Training RNN...")
    trained_params = adam(training_loss_grad, init_params, step_size=0.1,
                          num_iters=1000, callback=callback)

    print()
    print("Generating text from RNN...")
    num_letters = 30
    for t in range(20):
        text = ""
        for i in range(num_letters):
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)
Example #15
def train_mlp(inputs,
              outputs,
              layer_sizes,  # don't include inputs and outputs
              batch_size=256,
              init_weights=None,
              param_scale=0.1,
              num_epochs=5,
              step_size=0.001,
              l1_lambda=0,
              l2_lambda=0,
              nonlinearity=util.tanh):

    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)

    # define num of batches
    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    # define nn arch
    num_input_dims = tr_inputs.shape[-1]
    num_output_dims = tr_outputs.shape[-1]
    layer_sizes = [num_input_dims] + layer_sizes + [num_output_dims]

    predictions, logprob, num_weights = build(layer_sizes=layer_sizes,
                                              nonlinearity=nonlinearity)

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx+1) * batch_size)

    # Define training objective
    def objective(weights, iter):
        idx = batch_indices(iter)
        return -logprob(weights, tr_inputs[idx], tr_outputs[idx]) \
            + l2_lambda * np.sum(np.power(weights,2)) \
            + l1_lambda * np.sum(np.abs(weights))

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |    Train cosine  |    Train log-like  |  Holdout cosine  |  Holdout log-like  ")

    def print_perf(weights, iter, gradient):
        # make predictions
        tr_preds = predictions(weights, tr_inputs)
        va_preds = predictions(weights, va_inputs)
        # get accuracy measurements
        train_acc = cms(tr_preds, tr_outputs)
        valid_acc = cms(va_preds, va_outputs)
        # get log likelihoods
        train_ll = -logprob(weights, tr_inputs, tr_outputs)
        valid_ll = -logprob(weights, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
                iter//num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # define init weights
    if init_weights is None:
        init_weights = param_scale * np.random.randn(num_weights)

    # optimize parameters
    trained_weights = adam(objective_grad,
                           init_weights,
                           step_size=step_size,
                           num_iters=num_epochs*num_batches,
                           callback=print_perf)

    return predictions, logprob, trained_weights
Example #16
    # Train with sgd
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    num_batches = len(batch_idxs)

    cur_dir = np.zeros(num_weights * 2)

    for epoch in range(num_epochs):
        batch_counter = 0
        for idxs in batch_idxs:
            log_posterior = lambda weights: logprob(weights, train_images[
                idxs], train_labels[idxs])

            objective, gradient, unpack_params = \
                black_box_variational_inference(log_posterior, num_weights, num_samples)
            '''
            grad_w = gradient(variational_params)
            cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_w
            variational_params -= learning_rate * cur_dir
            '''
            variational_params = adam(gradient,
                                      variational_params,
                                      num_batches=num_batches,
                                      batch_id=batch_counter,
                                      step_size=0.01,
                                      num_iters=10)

            weights = extract_weights(variational_params, num_weights)
            print_perf(epoch, weights)
            batch_counter += 1
Example #17
def train_mlp(inputs,
              outputs,
              init_weights=None,
              num_epochs=100,
              step_size=0.001,
              batch_size=128,
              param_scale=0.01,
              l1_lambda=0,
              l2_lambda=0,
              nonlinearity=identity):

    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs,
                             va_outputs) = util.split_data(inputs,
                                                           out_data=outputs,
                                                           frac=0.80)

    num_batches = int(np.ceil(tr_inputs.shape[0] / float(batch_size)))

    input_count = tr_inputs.shape[-1]
    output_count = tr_outputs.shape[-1]

    pred_fun, loglike_fun, num_weights = build(input_count,
                                               output_count,
                                               nonlinearity=nonlinearity)

    if init_weights is None:
        init_weights = np.random.randn(num_weights) * param_scale

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    def loss(weights, x, y):
        return -loglike_fun(weights, x, y) \
            + l1_lambda * np.sum(np.abs(weights)) \
            + l2_lambda * np.sum(np.power(weights, 2))

    def batch_loss(weights, iter):
        idx = batch_indices(iter)
        return loss(weights, tr_inputs[idx, :], tr_outputs[idx, :])

    print(
        "     Epoch     |    Train cosine  |    Train log-like  |  Holdout cosine  |  Holdout log-like  "
    )

    def print_perf(weights, iter, gradient):
        # make predictions
        tr_preds = pred_fun(weights, tr_inputs)
        va_preds = pred_fun(weights, va_inputs)
        # get accuracy measurements
        train_acc = cms(tr_preds, tr_outputs)
        valid_acc = cms(va_preds, va_outputs)
        # get log likelihoods
        train_ll = -loglike_fun(weights, tr_inputs, tr_outputs)
        valid_ll = -loglike_fun(weights, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(iter // num_batches,
                                                     train_acc, train_ll,
                                                     valid_acc, valid_ll))

    grad_fun = grad(batch_loss)
    trained_weights = adam(grad_fun,
                           init_weights,
                           step_size=step_size,
                           callback=print_perf,
                           num_iters=num_epochs * num_batches)

    return pred_fun, loglike_fun, trained_weights
Example #18
    testcostlist = []
    testcostlist_dropout = []
    traincostlist = []
    iterlist = []

    def print_function_dropout(params, iter, gradient):
        return

    def print_function(params, iter, gradient):
        use_dropout = False
        testcost = np.sum((neural_net_predict(params, testinputs, use_dropout) - testtargets)**2)
        testcostlist.append(testcost)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad, init_params, step_size=step_size,
                            num_iters=200, callback=print_function)

    optimized_params_list = []
    for i in range(len(seed_inputs)):
        random.seed(seed_inputs[i])
        optimized_params_list.append(adam(objective_grad_dropout, init_params, step_size=step_size, num_iters=200, callback=print_function_dropout))

    fig2, ax = plt.subplots()
    plt.cla()
    plt.title('Dropout Result Comparisons, dropout_rate:' + str(dropout_rate))
    ax.set_xlabel("Possible Inputs")
    ax.set_ylabel("Neural Network Outputs")
    plot_inputs = np.linspace(-8, 8, num=400)
    for i in range(len(seed_inputs)):
        # Plot data and functions.
        outputs = neural_net_predict(optimized_params_list[i], np.expand_dims(plot_inputs, 1), False, True)
Example #19
    # Train with sgd
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    num_batches = len(batch_idxs)

    cur_dir = np.zeros(num_weights*2)

    for epoch in range(num_epochs):
        batch_counter = 0
        for idxs in batch_idxs:
            log_posterior = lambda weights: logprob(weights, train_images[idxs], train_labels[idxs])

            objective, gradient, unpack_params = \
                black_box_variational_inference(log_posterior, num_weights, num_samples)

            '''
            grad_w = gradient(variational_params)
            cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_w
            variational_params -= learning_rate * cur_dir
            '''
            variational_params = adam(gradient,
                                      variational_params,
                                      num_batches=num_batches,
                                      batch_id=batch_counter,
                                      step_size=0.01,
                                      num_iters=10)

            weights = extract_weights(variational_params, num_weights)
            print_perf(epoch, weights)
            batch_counter += 1
Example #20
    # define batched loss
    batch_idxs = make_batches(N_data, batch_size=256)

    def opt_loss(W, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        return loss(W, train_images[idxs], train_labels[idxs])

    # define callback function
    print("    Epoch      |    Train err  |   Test error  ")

    def print_perf(th, i, g):
        if i % 10 == 0:
            train_loss = loss(th, train_images[:1000], train_labels[:1000])
            test_perf = frac_err(th, test_images[:5000, :, :, :],
                                 test_labels[:5000, :])
            train_perf = frac_err(th, train_images[:1000, :, :, :],
                                  train_labels[:1000, :])
            print("{epoch:15} | {train_loss:15} | {train_perf:15} | {test_perf:15} " . \
                format(epoch=i,
                       train_loss = train_loss,
                       train_perf=train_perf,
                       test_perf = test_perf))

    # optimize with adam
    W = adam(grad(opt_loss),
             W,
             callback=print_perf,
             num_iters=20000,
             step_size=.0005)
Example #21
        ax5.cla()
        ax6.cla()
        ax7.cla()
        ax.plot(inputs, targets, 'bx')
        ax.plot(plot_inputs, outputs)
        ax.set_xlabel("Possible Inputs")
        ax.set_ylabel("Neural Network Outputs")
        ax.set_ylim([-2, 2])
        plt.draw()
        ax2.matshow(params[0][0].T, cmap='cool')
        ax2.set_xlabel("Hidden Layer 1 (Incoming Weights)")
        ax3.matshow(np.array([params[0][1]]).T, cmap='cool')
        ax3.set_ylabel("Hidden Layer 1 Bias")
        ax4.matshow(params[1][0].T, cmap='cool')
        ax4.set_xlabel("Hidden Layer 2 (Incoming Weights)")
        ax5.matshow(np.array([params[1][1]]).T, cmap='cool')
        ax5.set_ylabel("Hidden Layer 2 Bias")
        ax6.matshow(params[2][0], cmap='cool')
        ax6.set_xlabel("Hidden Layer 2 (Outgoing weights)")
        ax7.matshow(np.array([params[2][1]]), cmap='cool')
        ax7.set_ylabel("Output Bias")
        #plt.savefig(str(iter) + '.jpg')
        plt.pause(1.0 / 60.0)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad,
                            init_params,
                            step_size=step_size,
                            num_iters=100,
                            callback=print_function)
Example #22
    param_scale = 0.1

    # Define the input arrays x and the desired output array y
    inputs = np.array([[1, 1, 0], [1, 0, 0], [1, 1, 1], [0, 1, 1]])
    targets = np.array([[0, 0, 1, 1]]).T

    L2_reg = 1.0

    init_params = init_random_params(param_scale, layer_sizes)

    # Define training objective
    def objective(params, iter):
        return -log_posterior(params, inputs, targets, L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print('Train accuracy      | Test accuracy')

    def print_final(params, iter, gradient):
        train_acc = accuracy(params, inputs, targets)
        test_acc = accuracy(params, inputs, targets)
        print("{:20}|{:20}".format(train_acc, test_acc))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad,
                            init_params,
                            step_size=0.001,
                            num_iters=100,
                            callback=print_final)
Example #23
    # Set up figure.
    fig = plt.figure(figsize=(8, 8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.ion()
    plt.show(block=False)

    def callback(params, t, g):
        print("Iteration {} lower bound {}".format(t, -objective(params, t)))

        plt.cla()
        target_distribution = lambda x: np.exp(log_density(x, t))
        plot_isocontours(ax, target_distribution)

        mean, log_std = unpack_params(params)
        variational_contour = lambda x: mvn.pdf(x, mean,
                                                np.diag(np.exp(2 * log_std)))
        plot_isocontours(ax, variational_contour)
        plt.draw()
        plt.pause(1.0 / 30.0)

    print("Optimizing variational parameters...")
    init_mean = -1 * np.ones(D)
    init_log_std = -5 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    variational_params = adam(gradient,
                              init_var_params,
                              step_size=0.1,
                              num_iters=2000,
                              callback=callback)
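This example assumes unpack_params splits the flat variational parameter vector into a mean half and a log-std half, as in autograd's black-box SVI example; a minimal sketch:

def unpack_params(params):
    # first D entries are the variational mean, the rest are log standard deviations
    D = len(params) // 2
    mean, log_std = params[:D], params[D:]
    return mean, log_std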
        print("rms total for prognum", prog_num, "at breakpoint", bp, bp_rms)
        ax.plot(plot_inputs, outputs, 'g')

    LLHs = []
    LLH_xs = []

    def callback(params, t, g):
        LLH = -objective(params, t)
        LLH_xs.append(t)
        print("Iteration {} log likelihood {}".format(t, LLH))
        LLHs.append(LLH)

        y_test_pred = predictions(params, X_test)
        rms_total = ascdata.RMSE(y_test, y_test_pred)
        print("rms total", rms_total)
        plot_prediction_data(X_bp1, y_bp1, params, ax1, 1, 4194659)
        plot_prediction_data(X_bp2, y_bp2, params, ax2, 3, 4194873)
        plt.draw()
        #plt.pause(1.0/60.0)

    rs = npr.RandomState(0)
    init_params = 10 * rs.randn(num_weights)

    print("Optimizing network parameters...")
    optimized_params = adam(grad(objective), init_params,
                            step_size=0.5, num_iters=100, callback=callback)

    # fig.savefig('neuralnet_singletask_crossents.png')
    # plt.figure(2, facecolor='white')
    # plt.plot(LLH_xs, LLHs)
    # plt.savefig('neuralnet_singletask_LLH.png')
Example #25
    plt.ion()
    plt.show(block=False)

    def callback(params, t, g):
        print("Iteration {} lower bound {}".format(t, -objective(params, t)))

        # Sample functions from posterior.
        mean, cov = unpack_params(params)
        rs = npr.RandomState(0)
        sample_weights = rs.randn(10, num_weights) * np.sqrt(cov) + mean
        plot_inputs = np.linspace(-8, 8, num=200)
        outputs = predictions(sample_weights, np.expand_dims(plot_inputs, 1))

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), 'bx')
        ax.plot(plot_inputs, outputs[:, :, 0].T)
        ax.set_ylim([-2, 3])
        plt.draw()
        plt.pause(1.0/60.0)

    # Initialize variational parameters
    rs = npr.RandomState(0)
    init_mean    = rs.randn(num_weights)
    init_log_cov = -5 * np.ones(num_weights)
    init_var_params = np.concatenate([init_mean, init_log_cov])

    print("Optimizing variational parameters...")
    variational_params = adam(gradient, init_var_params,
                              step_size=0.1, num_iters=1000, callback=callback)
Example #26
def train_cnn(inputs,
              outputs,
              layer_specs,
              init_weights=None,
              param_scale=0.1,
              step_size=0.001,
              batch_size=128,
              num_epochs=50,
              L2_reg=1.0):
    ''' wrapper function to train the convnet '''

    (tr_inputs, va_inputs), (tr_outputs,
                             va_outputs) = util.split_data(inputs,
                                                           out_data=outputs,
                                                           frac=0.80)

    input_shape = tr_inputs.shape
    num_data = tr_inputs.shape[0]

    # number of batches
    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # build CNN
    num_weights, pred_fun, loss_fun, frac_err = build(input_shape[1:],
                                                      layer_specs, L2_reg)

    def batch_loss(weights, iter):
        idx = batch_indices(iter)
        return loss_fun(weights, tr_inputs[idx], tr_outputs[idx])

    loss_grad = grad(batch_loss)

    # init weights
    if init_weights is None:
        rs = npr.RandomState()
        init_weights = rs.randn(num_weights) * param_scale

    print(
        "    Epoch      |   Train loss  |   Train err   | Validation loss  | Validation error  "
    )

    def print_perf(weights, epoch, gradients):
        va_perf = frac_err(weights, va_inputs, va_outputs)
        tr_perf = frac_err(weights, tr_inputs, tr_outputs)
        va_loss = loss_fun(weights, va_inputs, va_outputs)
        tr_loss = loss_fun(weights, tr_inputs, tr_outputs)
        print("{0:15}|{1:15}|{2:15}|{3:18}|{4:15}".format(
            epoch, tr_loss, tr_perf, va_loss, va_perf))

    # optimize parameters
    trained_weights = adam(loss_grad,
                           init_weights,
                           step_size=step_size,
                           num_iters=num_epochs,
                           callback=print_perf)

    return pred_fun, loss_fun, trained_weights
Example #27
    seed = npr.RandomState(0)

    def objective(combined_params, iter):
        data_idx = batch_indices(iter)
        gen_params, rec_params = combined_params
        return -vae_lower_bound(gen_params, rec_params, train_images[data_idx],
                                seed) / data_dim

    # Get gradients of objective using autograd.
    objective_grad = grad(objective)

    print("     Epoch     |    Objective  ")

    def print_perf(combined_params, iter, grad):
        if iter % 10 == 0:
            gen_params, rec_params = combined_params
            bound = np.mean(objective(combined_params, iter))
            print("{:15}|{:20}".format(iter // num_batches, bound))

            fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
            save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad,
                            combined_init_params,
                            step_size=step_size,
                            num_iters=num_epochs * num_batches,
                            callback=print_perf)
Example #28
            elbos.append(elbo_val)
            if t % 50 == 0:
                print("Iteration {} lower bound {}".format(t, elbo_val))

        init_mean    = -1 * np.ones(D)
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                                    num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                         num_iters=n, callback=cb, mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12,8))
    colors = ['b', 'k', 'g']
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(np.arange(len(stand_lls)), stand_lls,
                 '--', label="standard (adam, step-size = %2.2f)"%ss, alpha=.5, c=col)
        plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
Example #29
    objective = lambda weights, t: -logprob(weights, inputs, targets)

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params, t, g):
        print("Iteration {} log likelihood {}".format(t,
                                                      -objective(params, t)))

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), 'bx')
        plot_inputs = np.reshape(np.linspace(-7, 7, num=300), (300, 1))
        outputs = predictions(params, plot_inputs)
        ax.plot(plot_inputs, outputs)
        ax.set_ylim([-1, 1])
        plt.draw()
        plt.pause(1.0 / 60.0)

    rs = npr.RandomState(0)
    init_params = 0.1 * rs.randn(num_weights)

    print("Optimizing network parameters...")
    optimized_params = adam(grad(objective),
                            init_params,
                            step_size=0.01,
                            num_iters=1000,
                            callback=callback)
Example #30
def experiment(train_data, valid_data, test_data, init_scale, batch_size, num_iters_hypernet, step_size_hypernet,
               num_iters_hyper, step_size_hyper, num_iters, graph_mod, global_seed=0):
    """Run the second experiment: fit a hypernetwork that outputs neural network parameters.
    These neural network parameters try to fit the training data, with an additional loss term for the hyperparameters.
    We try to optimize the hyperparameters given the learned neural network response through the hypernetwork.
    We observe how the hypernetwork performs on training and testing data by graphing it against the true loss.
    The true loss is found by training a neural network to convergence at a discrete number of points.

    :param train_data: The training data which is a tuple of (train_input, train_target).
    :param valid_data: The validation data which is a tuple of (valid_input, valid_target).
    :param test_data: The testing data which is a tuple of (test_input, test_target).
    :param init_scale: The scale (positive float) for the hypernetwork initialization.
    :param batch_size: The number of hyperparameters to sample for each iteration.
    :param num_iters_hypernet: The number of iterations (integer) to run the hypernetwork optimizer for.
    :param step_size_hypernet: The step size (positive float) for the hypernetwork optimizer.
    :param num_iters_hyper: The number of iterations (integer) to run the hyperparameter optimizer for.
    :param step_size_hyper: The step size (positive float) for the hyperparameter optimizer.
    :param num_iters: The number of iterations (integer) to run the optimization for.
    :param graph_mod: How many iterations (integer) to wait between each graph of the loss.
    :param global_seed: The seed (integer) to use when choosing a constant seed.
    :return: None.
    """
    assert init_scale > 0
    assert step_size_hypernet > 0 and step_size_hyper > 0
    assert num_iters > 0 and num_iters_hypernet > 0 and num_iters_hyper > 0
    global hyper_cur
    hyper_cur = -3.5  # Initialize the hyperparameter (float).

    # Define information about hyper loss and how hyper parameters are sampled.
    hyper_sample_var = 0  # 10e-4  # The variance to use when sampling hyperparameters from a Gaussian distribution.

    def sample_hypers(hyper, rs):
        """Sample a hyperparameter.

        :param hyper: The current hyperparameter ([float]).
        :param rs: A numpy randomstate.
        :return: A sampled hyperparameter (float).
        """
        return np.array([rs.randn() * hyper_sample_var + hyper]).reshape(1, -1)

    def hyper_loss(weights, hyper):
        """Find the loss for the neural network that depends on the hyperparameter.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :return: The loss (float) of the network given the hyperparameter.
        """
        return -log_gaussian(weights, np.exp(hyper))

    example_hyper = sample_hypers(hyper_cur, npr.RandomState(global_seed))  # Test the sample function.
    assert example_hyper is not None

    train_inputs, train_targets = train_data
    valid_inputs, valid_targets = valid_data
    test_inputs, test_targets = test_data
    batch_ind, feature_ind = 0, 1
    elementary_input_size = np.shape(train_inputs)[feature_ind]
    elementary_output_size = np.shape(train_targets)[feature_ind]
    elementary_layer_sizes = [elementary_input_size, elementary_output_size]
    num_hypers = example_hyper.shape[feature_ind]  # The dimensionality of the hyperparameter space (integer).

    # Define neural network and function to turn a vector into its weight structure.
    example_elementary_params = init_random_params(init_scale, elementary_layer_sizes, npr.RandomState(global_seed))
    flat_elementary_params, unflatten_vector_to_network_weights = flatten(example_elementary_params)
    assert hyper_loss(example_elementary_params, example_hyper) is not None
    num_elementary_params = len(flat_elementary_params)

    # Define a hypernetwork parametrized by some hyperparameters.
    hypernet_layer_sizes = [num_hypers, num_elementary_params]  # Note that there are no hidden units.

    objective_functions = get_loss_functions(unflatten_vector_to_network_weights, sample_hypers, hyper_loss, batch_size,
                                             train_inputs, train_targets, test_inputs, test_targets, valid_inputs,
                                             valid_targets, global_seed)
    hypernet, train_objective, valid_objective, test_objective = objective_functions[:4]
    hyper_train_objective, hyper_valid_objective, hyper_test_objective = objective_functions[4:-1]
    hyper_train_stochastic_objective = objective_functions[-1]

    # Next, train a neural network from scratch with different hyperparameter values.
    real_step_size = 0.0001  # The step size to use to find the real loss (float).
    real_num_iters = 1000  # The number of iterations to use to find the real loss (integer).
    range_min = -2.0  # The min log variance for the hyper parameter of the variance of weight distribution to graph.
    range_max = 4.0  # The max log variance for the hyper parameter of the variance of weight distribution to graph.
    num_visual_points = 10  # The number of points to test the real loss of - expensive (integer).
    real_hyper_range = np.linspace(range_min + 1.0, range_max - 1.0, num_visual_points)
    real_train_loss = np.zeros(real_hyper_range.shape)
    real_train_performance = np.zeros(real_hyper_range.shape)
    real_valid_loss = np.zeros(real_hyper_range.shape)
    real_test_loss = np.zeros(real_hyper_range.shape)
    min_real_valid_loss, min_real_hyper = 10e32, 10e32
    for i, hypers in enumerate(real_hyper_range):
        print("Optimizing network parameters: ", i)
        init_params = init_random_params(init_scale, elementary_layer_sizes, npr.RandomState(global_seed))

        def cur_obj(w, seed):
            """The current objective function of the neural network.

            :param w: The weights ([float]) of the neural network.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The current objective value (float).
            """
            return train_objective(w, hypers, seed)

        optimized_params, _, _, _ = adam(grad(cur_obj), init_params, step_size=real_step_size, num_iters=real_num_iters)
        real_train_loss[i] = train_objective(optimized_params, hypers, global_seed)
        real_train_performance[i] = real_train_loss[i] - hyper_loss(optimized_params, hypers)
        real_valid_loss[i] = valid_objective(optimized_params, hypers, global_seed)
        if real_valid_loss[i] < min_real_valid_loss:
            min_real_valid_loss, min_real_hyper = real_valid_loss[i], hypers
            print("Best hyperparameter found = ", hypers)
        real_test_loss[i] = test_objective(optimized_params, hypers, global_seed)

    fig, axs = create_figure_and_axs()

    # Set up the arrays to store information for plotting.
    num_hyper_test_points = 200  # Test a large number of hyperparameters with the learned function - cheap (integer)!
    learned_hyper_range = np.linspace(range_min, range_max, num_hyper_test_points) # Hyperparameters to test.
    hyper_train_loss = np.zeros(learned_hyper_range.shape)  # Hypernetwork training loss per hyperparameter.
    hyper_train_performance = np.zeros(learned_hyper_range.shape)  # Hypernetwork training performance per
    # hyperparameter.  Note that performance is loss - regularization loss.
    hyper_valid_loss, hyper_test_loss = np.zeros(learned_hyper_range.shape), np.zeros(learned_hyper_range.shape)

    def callback(hyper_weights, opt_iteration, g):
        """Do whatever work is desired on each optimization iteration.
        Draws graphs, prints information, and stores information.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global log_likelihoods, valid_loss, test_loss, grad_norms_hyper, grad_norms_hypernet, global_opt_iteration
        global hyper_cur
        log_likelihood = hyper_train_objective(hyper_weights, hyper_cur)
        log_likelihoods[global_opt_iteration] = log_likelihood  # Store the training loss.
        weights_cur = hypernet(hyper_weights, hyper_cur)
        train_performance[global_opt_iteration] = log_likelihood - hyper_loss(weights_cur, hyper_cur)
        valid_loss[global_opt_iteration] = hyper_valid_objective(hyper_weights, hyper_cur)
        test_loss[global_opt_iteration] = hyper_test_objective(hyper_weights, hyper_cur)
        grad_norm = np.sum([np.sum([np.sum(np.abs(weight_or_bias)) for weight_or_bias in layer]) for layer in g])
        grad_norms_hypernet[global_opt_iteration] = grad_norm
        grad_norms_hyper[global_opt_iteration] = grad_norms_hyper[global_opt_iteration-1]
        global_opt_iteration += 1
        print("Iteration {} Loss {} Grad L1 Norm {}".format(opt_iteration, log_likelihood, grad_norm))

        if global_opt_iteration % graph_mod == 0:  # Only print on every iteration that is a multiple of graph_mod.
            [ax.cla() for ax in axs]  # Clear all of the axes.
            axs[0].set_xlabel('Hyperparameter $\lambda$'), axs[0].set_ylabel('Loss $\mathcal{L}$')

            for cur, hyper in enumerate(learned_hyper_range):
                hyper_train_loss[cur] = hyper_train_objective(hyper_weights, hyper)
                weights = hypernet(hyper_weights, hyper)
                hyper_train_performance[cur] = hyper_train_loss[cur] - hyper_loss(weights, hyper)
                hyper_valid_loss[cur] = hyper_valid_objective(hyper_weights, hyper)
                hyper_test_loss[cur] = hyper_test_objective(hyper_weights, hyper)

            axs[0].plot(real_hyper_range, real_train_loss, 'bx', ms=28, label='Train loss of optimized weights')
            axs[0].plot(learned_hyper_range, hyper_train_loss, 'b-', label='Train loss of hypernetwork weights')
            axs[0].set_ylim([-1.5, 3.8])

            axs[0].plot(real_hyper_range, real_valid_loss, 'rx', ms=28, label='Valid. loss of optimized weights')
            axs[0].plot(learned_hyper_range, hyper_valid_loss, 'r-', label='Valid. loss of hypernetwork weights')
            min_hyper_found = 1.836  # Known minimum from doing a search with 1000 points over this range.
            axs[0].axvline(x=min_hyper_found, c='k', linestyle='dashed', label='Optimal hyperparameter $\lambda$')

            pdf_range = np.linspace(hyper_cur - 0.5, hyper_cur + 0.5, 100)
            axs[0].plot(pdf_range, norm.pdf(pdf_range, loc=hyper_cur, scale=0.06) / 4.0 + axs[0].get_ylim()[0], c='g',
                        label='$p (\lambda | \hat{\lambda})$')

            [ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.45),
                       borderaxespad=0.0, fancybox=True, framealpha=0.0, fontsize=28)
             for ax in axs]  # Create a legend for all the axes.
            setup_ax_and_save(axs, fig, 'hypernets_local_small')

    def callback_outer(hyper, opt_iteration, g):
        """Do whatever work is desired on each outer optimization iteration.
        Stores information.

        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global grad_norms_hyper, train_hypers, global_hyperopt_iteration
        grad_norms_hyper[global_opt_iteration - 1] = np.abs(g)
        train_hypers[global_hyperopt_iteration] = hyper
        global_hyperopt_iteration += 1
        print("Outer Iteration {} Hyper {} Grad L1 Norm {}".format(global_hyperopt_iteration, hyper,
                                                                   grad_norms_hyper[global_opt_iteration]))

    init_hypernet_params = init_random_params(init_scale, hypernet_layer_sizes, npr.RandomState(global_seed))
    m_hyper = None  # A record of the current m for re-starting the Adam optimizer.
    v_hyper = None  # A record of the current v for re-starting the Adam optimizer.
    cur_iter_hyper = None  # A record of the current iteration for re-starting the Adam optimizer.
    for _ in range(num_iters):
        def hyper_train_stochastic_objective_current(hyper_weights, seed):
            """The objective for the hypernetwork, with a fixed hyperparameter.

            :param hyper_weights: The weights ([[float]]) of the hypernetwork.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The hypernetwork's loss (float).
            """
            return hyper_train_stochastic_objective(hyper_cur, hyper_weights, seed)

        init_hypernet_params = sgd(grad(hyper_train_stochastic_objective_current), init_hypernet_params,
                                   step_size=step_size_hypernet, num_iters=num_iters_hypernet, callback=callback,
                                   mass=0)

        def valid_objective_current(hyper, seed):
            """The objective for the hyperparameter, with a fixed hypernetwork.

            :param hyper: The hyperparameter (float) input to the hypernetwork.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The validation loss (float).
            """
            return valid_objective(hypernet(init_hypernet_params, hyper), hyper, seed)

        hyper_cur, m_hyper, v_hyper, cur_iter_hyper = adam(grad(valid_objective_current), hyper_cur,
                                                           step_size=step_size_hyper, num_iters=num_iters_hyper,
                                                           callback=callback_outer, m=m_hyper, v=v_hyper,
                                                           offset=cur_iter_hyper)
        print("The current hyperparameter is:", hyper_cur)