Example #1
def time_flatten():
    val = {'k':  npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
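The benchmark above only times the round trip; a minimal standalone sketch of the same API, assuming `flatten` is autograd's `autograd.misc.flatten` and `npr` is `autograd.numpy.random` (as the imports in these snippets suggest), shows what the two return values are:

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.misc import flatten   # older autograd releases expose this as autograd.util.flatten

val = {'k': npr.random((2, 2)), 'k3': 3.0, 'k4': [1.0, 4.0]}
vect, unflatten = flatten(val)       # vect: 1-D ndarray holding every leaf value
assert vect.ndim == 1 and vect.size == 2 * 2 + 1 + 2
val_recovered = unflatten(vect)      # same nested dict/list structure as val
vect_2, _ = flatten(val_recovered)
assert np.allclose(vect, vect_2)     # the round trip preserves all values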
Example #2
def test_dict_saxpy(motion, optimized, a, b, c):
    func = dict_saxpy
    func = tangent.tangent(func)

    func.__globals__['np'] = np
    df = tangent.grad(func, motion=motion, optimized=optimized, verbose=True)
    dx = df(dict(a=a, b=b, c=c))

    df_num = utils.numeric_grad(func)
    dx_num = df_num(dict(a=float(a), b=float(b), c=float(c)))
    flat_dx, _ = flatten(dx)
    flat_dx_num, _ = flatten(dx_num)
    assert np.allclose(flat_dx, flat_dx_num)
Example #3
def g(x, *args):
    fd_grad, unflatten_fd = flatten(tangent.init_grad(x))
    y = func(deepcopy(x), *args)
    seed = np.ones_like(y)
    for d in range(fd_grad.size):
        x_flat, unflatten_x = flatten(deepcopy(x))
        x_flat[d] += eps / 2
        a = np.array(func(unflatten_x(x_flat), *args))
        x_flat, unflatten_x = flatten(deepcopy(x))
        x_flat[d] -= eps / 2
        b = np.array(func(unflatten_x(x_flat), *args))
        fd_grad[d] = np.dot((a - b) / eps, seed)
    return unflatten_fd(fd_grad)
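The closure above is the inner function of a numeric-gradient checker and leans on `func`, `eps`, and `tangent.init_grad` from its enclosing scope. A self-contained sketch of the same central-difference idea for a plain 1-D NumPy input, without the tangent dependency, could look like this:

import numpy as np

def numeric_grad(func, eps=1e-6):
    """Central-difference gradient of a scalar-valued func of a 1-D array."""
    def g(x):
        x = np.asarray(x, dtype=float)
        fd = np.zeros_like(x)
        for d in range(x.size):
            step = np.zeros_like(x)
            step[d] = eps / 2
            fd[d] = (func(x + step) - func(x - step)) / eps
        return fd
    return g

# usage: the gradient of f(x) = sum(x**2) is 2*x
check = numeric_grad(lambda x: np.sum(x ** 2))
x0 = np.array([1.0, -2.0, 0.5])
assert np.allclose(check(x0), 2 * x0, atol=1e-4)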
Example #4
def time_flatten():
    val = {
        'k': npr.random((4, 4)),
        'k2': npr.random((3, 3)),
        'k3': 3.0,
        'k4': [1.0, 4.0, 7.0, 9.0],
        'k5': np.array([4., 5., 6.]),
        'k6': np.array([[7., 8.], [9., 10.]])
    }

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
Example #5
    def reset_weights(self):
        '''reinitialize NN weights (randomly)'''
        self.params_list = [init_weights(n_hidden=self.n_hidden)
                            for _ in range(self.Nvar)]

        flattened_params, unflat_func = flatten(self.params_list)
        self.flattened_params = flattened_params
        self.unflat_func = unflat_func
Example #6
def random_init(n):
    # create random weight matrix
    # input is [x,t], output is same dimension as x
    w1 = np.random.normal(0, 0.1, (n + 1, n + 1))
    w2 = np.random.normal(0, 0.1, (n, n + 1))
    b1 = np.random.normal(0, 0.1, (n + 1))
    b2 = np.random.normal(0, 0.1, (n))
    return flatten([w1, w2, b1, b2])
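Since `flatten` (assumed here to be autograd's, returning a vector plus an `unflatten` closure) is applied to the weight list, a caller of `random_init` gets both pieces back. A hypothetical usage sketch:

flat_params, unflatten = random_init(n=3)
assert flat_params.ndim == 1                 # all weights packed into one vector
w1, w2, b1, b2 = unflatten(flat_params)      # recover the original [w1, w2, b1, b2] list
assert w1.shape == (4, 4) and w2.shape == (3, 4)
assert b1.shape == (4,) and b2.shape == (3,)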
Example #7
def test_dict_saxpy(motion, optimized, a, b, c):
  func = dict_saxpy
  func = tangent.tangent(func)

  func.__globals__['np'] = np
  df = tangent.autodiff(
      func,
      motion=motion,
      optimized=optimized,
      verbose=True,
      input_derivative=INPUT_DERIVATIVE.DefaultOne)
  dx = df(dict(a=a, b=b, c=c))

  df_num = utils.numeric_grad(func)
  dx_num = df_num(dict(a=float(a), b=float(b), c=float(c)))
  flat_dx, _ = flatten(dx)
  flat_dx_num, _ = flatten(dx_num)
  assert np.allclose(flat_dx, flat_dx_num)
Example #8
def PrintPerf(Params, iter, _):
    if iter == 0:
        print("     Epoch     |    Train cost  ")
    if iter % 5 == 0:
        Cost = ObjectiveFunWrap(Params, iter)
        Gradient = flatten(ObjectiveGrad(Params, iter))
        print(
            str(iter) + '  ' + str(np.round(Cost, 6)) + '  ' +
            str(np.square(Gradient[0]).sum()))
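PrintPerf has the signature autograd's optimizers expect from a callback (`callback(params, iteration, gradient)`), and it relies on `flatten` returning a `(vector, unflatten)` pair, so `Gradient[0]` is the flat gradient vector and the last printed value is its squared L2 norm. A hypothetical hookup (a sketch only; `ObjectiveFunWrap` and `NetParams` are names assumed from the surrounding script, as hinted in Example #16 below):

from autograd import grad
from autograd.misc.optimizers import adam

ObjectiveGrad = grad(ObjectiveFunWrap, argnum=0)
TrainedParams = adam(ObjectiveGrad, NetParams, step_size=0.01,
                     num_iters=100, callback=PrintPerf)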
Example #9
def log_gaussian(weights, var):
    """Find the log probability of the weights given some centered, spherical Gaussian prior.

    :param weights: The parameters ([[float]]) of the neural network.
    :param var: The variance (positive float) of the Gaussian distribution.
    :return: The log probability (float) of the weights of the neural network.
    """
    assert var > 0
    flat_params, _ = flatten(weights)
    return -var * np.linalg.norm(flat_params, 2)  # np.mean(norm.logpdf(flat_params, 0, var))
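Note that the body returns a scaled negative L2 norm rather than the actual Gaussian log-density; the commented-out expression hints at the exact alternative. A sketch of that exact version, assuming autograd's `scipy.stats.norm` wrapper and its `flatten` are available (and noting that `scale` is a standard deviation, hence the square root of `var`):

import autograd.numpy as np
from autograd.misc import flatten
from autograd.scipy.stats import norm

def log_gaussian_density(weights, var):
    """Exact log-density of the flattened weights under N(0, var * I)."""
    flat_params, _ = flatten(weights)
    return np.sum(norm.logpdf(flat_params, 0, np.sqrt(var)))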
Example #10
def time_grad_flatten():
    val = {'k':  npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}

    vect, unflatten = flatten(val)
    def fun(vec):
        v = unflatten(vec)
        return np.sum(v['k5']) + np.sum(v['k6'])

    grad(fun)(vect)
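If the same statements were run outside the benchmark function (a sketch, assuming autograd's `grad` and the `vect`, `unflatten`, and `fun` defined above), the flat gradient can be mapped straight back onto the dict layout:

g_flat = grad(fun)(vect)            # 1-D gradient, same length as vect
g = unflatten(g_flat)               # same dict-of-arrays structure as val
assert np.allclose(g['k5'], 1.0)    # fun sums v['k5'] and v['k6'] ...
assert np.allclose(g['k6'], 1.0)
assert np.allclose(g['k3'], 0.0)    # ... so every other entry gets a zero gradient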
Example #11
def l2_norm(params):
    """Computes squared l2 norm of params. 
    
    Parameters
    ----------
    params : list of (weights, biases) tuples
        parameters of the net
    
    Returns
    -------
    float
        squared Euclidean norm of params
    """
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
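In practice l2_norm is usually added to a data-fitting loss as a weight-decay penalty. A hypothetical sketch (`predict` is an assumed forward function over the same (weights, biases) tuples, `l2_reg` an assumed penalty coefficient, and `np` is autograd.numpy):

def regularized_loss(params, inputs, targets, l2_reg=1e-3):
    preds = predict(params, inputs)               # assumed forward pass
    data_loss = np.mean((preds - targets) ** 2)   # mean squared error
    return data_loss + l2_reg * l2_norm(params)   # weight decay via l2_norm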
Example #12
def gradient_descent(g, w_unflat, alpha_choice, max_its, version, **kwargs):
    verbose = False
    if 'verbose' in kwargs:
        verbose = kwargs['verbose']

    # flatten the input function and weights; the gradient is taken of the original (unflattened) function
    g_flat, unflatten, w = flatten_func(g, w_unflat)
    grad = compute_grad(g)

    # record history
    w_hist = []
    w_hist.append(w_unflat)

    # main gradient descent loop
    for k in range(max_its):
        if verbose and k % 5 == 0:
            print('started iteration ' + str(k) + ' of ' + str(max_its))

        # check if diminishing steplength rule used
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k + 1)  # k + 1 avoids division by zero on the first iteration
        else:
            alpha = alpha_choice

        # plug in value into func and derivative
        grad_eval = grad(w_unflat)
        grad_eval, _ = flatten(grad_eval)

        ### normalized or unnormalized descent step? ###
        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                grad_norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            grad_eval /= grad_norm

        # take descent step
        w = w - alpha * grad_eval

        # record weight update
        w_unflat = unflatten(w)
        w_hist.append(w_unflat)

    if verbose:
        print('finished all ' + str(max_its) + ' iterations')

    return w_hist
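A hypothetical driver for gradient_descent above, assuming the imports this style of code normally uses (`from autograd import grad as compute_grad`, `from autograd.misc.flatten import flatten, flatten_func`, and `np` as autograd.numpy):

import autograd.numpy as np

def cost(w):                        # w is a (weight matrix, bias vector) tuple
    W, b = w
    return np.sum(W ** 2) + np.sum((b - 1.0) ** 2)

w_init = (np.ones((2, 2)), np.zeros(2))
w_hist = gradient_descent(cost, w_init, alpha_choice=0.1,
                          max_its=50, version='unnormalized')
W_final, b_final = w_hist[-1]       # W shrinks toward 0, b moves toward 1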
Example #13
def time_grad_flatten():
    val = {
        'k': npr.random((4, 4)),
        'k2': npr.random((3, 3)),
        'k3': 3.0,
        'k4': [1.0, 4.0, 7.0, 9.0],
        'k5': np.array([4., 5., 6.]),
        'k6': np.array([[7., 8.], [9., 10.]])
    }

    vect, unflatten = flatten(val)

    def fun(vec):
        v = unflatten(vec)
        return np.sum(v['k5']) + np.sum(v['k6'])

    grad(fun)(vect)
Example #14
    def gradient_descent(self, g, w_unflat, alpha, max_its, version, **kwargs):
        verbose = False
        if 'verbose' in kwargs:
            verbose = kwargs['verbose']

        # flatten the input function and weights; the gradient is taken of the original (unflattened) function
        g_flat, unflatten, w = flatten_func(g, w_unflat)
        grad = compute_grad(g)

        # record history
        w_hist = []
        w_hist.append(w_unflat)

        # main gradient descent loop
        for k in range(max_its):
            # plug in value into func and derivative
            grad_eval = grad(w_unflat)
            grad_eval, _ = flatten(grad_eval)

            ### normalized or unnormalized descent step? ###
            if version == 'normalized':
                grad_norm = np.linalg.norm(grad_eval)
                if grad_norm == 0:
                    grad_norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
                grad_eval /= grad_norm

            # take descent step
            w = w - alpha * grad_eval

            # record weight update
            w_unflat = unflatten(w)
            w_hist.append(w_unflat)

        if verbose:
            print('...optimization complete!')
            time.sleep(1.5)
            clear_output()

        return w_hist
Example #15
def experiment(train_data,
               valid_data,
               init_scale,
               num_iters_hypernet,
               step_size_hypernet,
               step_size,
               num_iters,
               batch_size_data,
               global_seed=0):
    """Run the second experiment, which consists of fitting a hypernetwork, which outputs neural network parameters.
    These neural network parameters try to fit the training data with some additional loss for the hyperparameters.
    We try to optimize the hyperparameters given the learned neural network response through the hypernetwork.
    We observe how the hypernetwork performs on the training and testing, by graphing it against the true loss.
    The true loss is found by training a neural network to convergence at a discrete number of points.

    :param train_data: The training data.
    :param valid_data: The validation data.
    :param init_scale: The scale (positive float) for the hypernetwork initialization.
    :param num_iters_hypernet: The number of iterations (integer) to run the hypernetwork optimizer for.
    :param step_size_hypernet: The step size (positive float) for the hypernetwork optimizer.
    :param step_size: The step size (positive float) for the loss approximation optimizer.
    :param num_iters: The number of iterations (integer) to run the optimization for.
    :param batch_size_data: The number of data points (integer) for a batch.
    :param global_seed: The seed (integer) to use when choosing a constant seed.
    :return: None, but saves pictures.
    """
    assert init_scale > 0
    assert step_size_hypernet > 0 and step_size > 0
    assert num_iters > 0 and num_iters_hypernet > 0

    def hyper_loss(weights, hyper):
        """Find the loss for neural network that is dependant on the hyperparameter.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :return: The loss (float) of the network, dependent on the hyperparameter.
        """
        return -log_gaussian(weights, hyper)

    train_inputs, train_targets = train_data
    valid_inputs, valid_target = valid_data
    batch_ind, feature_ind = 0, 1
    elementary_input_size = np.shape(train_inputs)[feature_ind]
    elementary_output_size = np.shape(train_targets)[feature_ind]
    elementary_layer_sizes = [elementary_input_size, elementary_output_size]
    num_hypers = 10  # The dimensionality of the hyperparameter space (integer).
    batch_size_elementary = 100  # The number of elementary data points to sample (i.e not hyperparameters).

    # Define neural network and function to turn a vector into its weight structure.
    example_elementary_params = init_random_params(
        init_scale, elementary_layer_sizes, npr.RandomState(global_seed))
    flat_elementary_params, unflatten_vector_to_network_weights = flatten(
        example_elementary_params)
    num_elementary_params = len(flat_elementary_params)

    rs_train = npr.RandomState(global_seed)

    def train_objective(weights, hyper, seed):
        """The objective for training a neural network.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param seed: The seed (integer) for sampling.
        :return: The training loss (float).
        """
        idx = rs_train.randint(len(train_inputs) - batch_size_elementary)
        return -pred_loss(
            weights, train_inputs[idx:idx + batch_size_elementary],
            train_targets[idx:idx + batch_size_elementary]) + hyper_loss(
                weights, hyper)

    def valid_objective(weights, hyper, seed):
        """The objective for validating a neural network.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param seed: The seed (integer) for sampling a hyperparameter.
        :return: The validation loss (float).
        """
        return -pred_loss(weights, valid_inputs, valid_target)

    # TODO: Rename valid_objective to prediction loss, and move train objective into data generator block

    pickle_name = 'learn_vs_true_loss_scatter.pickle'
    # Train a neural network from scratch with different hyperparameter values.
    if not os.path.isfile(pickle_name):
        real_step_size = 0.0001  # The step size to use to find the real loss (float).
        real_num_iters = 1000  # The number of iterations to use to find the real loss (integer).
        num_data = 2**10 * 10
        data_inputs, data_target_params, data_target_loss = [], [], []
        for i in range(num_data):
            hyper_train = rs_train.rand(num_hypers) * 6.0 - 3.0
            print("Optimizing network parameters: ", i)
            init_params = init_random_params(init_scale,
                                             elementary_layer_sizes,
                                             npr.RandomState(global_seed))

            def cur_obj(w, seed):
                """The current objective function of the neural network.

                :param w: The weights ([float]) of the neural network.
                :param seed: The seed (integer) for sampling a hyperparameter.
                :return: The current objective value (float).
                """
                return train_objective(w, hyper_train, seed)

            optimized_params, _, _, _ = adam(grad(cur_obj),
                                             init_params,
                                             step_size=real_step_size,
                                             num_iters=real_num_iters)
            loss = valid_objective(optimized_params, hyper_train, global_seed)
            data_inputs += [hyper_train]
            flatten_opt_param, unflatten_vector_to_network_weights = flatten(
                optimized_params)
            data_target_params += [flatten_opt_param]
            data_target_loss += [loss]
        data_inputs = np.array(data_inputs)
        data_target_params = np.array(data_target_params)
        data_target_loss = np.array(data_target_loss)

        with open(pickle_name, 'wb') as handle:
            pickle.dump(
                {
                    'inputs': data_inputs,
                    'target_params': data_target_params,
                    'target_loss': data_target_loss
                },
                handle,
                protocol=pickle.HIGHEST_PROTOCOL)

    with open(pickle_name, 'rb') as handle:
        pickle_data = pickle.load(handle)
        data_inputs = pickle_data['inputs']
        data_target_params = pickle_data['target_params']
        data_target_loss = pickle_data['target_loss']

    batch_size_sample = batch_size_data
    train_ind, valid_ind = batch_size_data, batch_size_data
    data_inputs_train = data_inputs[:train_ind]
    data_inputs_valid = data_inputs[valid_ind:]
    data_target_params_train = data_target_params[:train_ind]
    data_target_loss_train = data_target_loss[:train_ind]
    data_target_loss_valid = data_target_loss[valid_ind:]

    # New training for lambda, W, and lambda, Loss
    weight_layer_sizes = [num_hypers, num_elementary_params]
    init_weight_params = init_random_params(init_scale, weight_layer_sizes,
                                            npr.RandomState(global_seed))

    def train_weight_objective_loss(weights, seed):
        """The objective for training a neural network.

        :param weights: The weights ([[float]]) of the neural network.
        :param seed: The seed (integer) for sampling.
        :return: The training loss (float).
        """
        local_data_inputs = [
            rs_train.rand(num_hypers) * 6.0 - 3.0
            for _ in range(batch_size_sample)
        ]
        losses = [
            train_objective(
                unflatten_vector_to_network_weights(
                    nn_predict(weights, np.array([local_data_input]),
                               identity)[0]), local_data_input, global_seed)
            for local_data_input in local_data_inputs
        ]
        return np.mean(np.array(losses))

    def callback_weights_loss(weights, opt_iteration, g):
        """A callback for optimization.

        :param weights: The hypernetwork weights
        :param opt_iteration: The optimization iteration
        :param g: The gradient.
        :return: None
        """
        if opt_iteration % 10 == 0:
            print("Sampled Valid Loss Target: ", opt_iteration, ", Loss: ",
                  train_weight_objective_loss(weights, 0))

    weight_params_loss, _, _, _ = adam(grad(train_weight_objective_loss),
                                       init_weight_params,
                                       step_size=step_size_hypernet,
                                       num_iters=num_iters_hypernet + 100,
                                       callback=callback_weights_loss)

    init_weight_params = init_random_params(init_scale, weight_layer_sizes,
                                            npr.RandomState(global_seed))

    def train_weight_objective_loss_target(weights, seed):
        """The objective for training a neural network.

        :param weights: The weights ([[float]]) of the neural network.
        :param seed: The seed (integer) for sampling.
        :return: The training loss (float).
        """
        idx = rs_train.randint(
            np.maximum(len(data_inputs_train) - batch_size_data, 1))
        local_data_inputs = data_inputs_train[idx:idx + batch_size_data]
        losses = [
            train_objective(
                unflatten_vector_to_network_weights(
                    nn_predict(weights, np.array([local_data_input]),
                               identity)[0]), local_data_input, global_seed)
            for local_data_input in local_data_inputs
        ]
        return np.mean(np.array(losses))

    def callback_weights_loss_target(weights, opt_iteration, g):
        """A callback for optimization.

        :param weights: The hypernetwork weights
        :param opt_iteration: The optimization iteration
        :param g: The gradient.
        :return: None
        """
        if opt_iteration % 10 == 0:
            print("Fixed Valid Loss Target: ", opt_iteration, ", Loss: ",
                  train_weight_objective_loss_target(weights, 0))

    weight_params_loss_target, _, _, _ = adam(
        grad(train_weight_objective_loss_target),
        init_weight_params,
        step_size=step_size_hypernet,
        num_iters=num_iters_hypernet,
        callback=callback_weights_loss_target)

    print("Preparing the data for plotting...")
    kernel = RBF()
    gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=1)
    gp.fit(data_inputs_train, data_target_loss_train)
    gp_loss_predictions, sigma = gp.predict(data_inputs_valid, return_std=True)

    def hypernet_loss(weights, local_data_input):
        """Find the loss for the hypernetwork.

        :param weights: The hypernet weights
        :param local_data_input: A hyperparameter.
        :return: None
        """
        weight_predictions_valid = nn_predict(weights, [local_data_input],
                                              identity)
        weight_predictions_valid = unflatten_vector_to_network_weights(
            weight_predictions_valid[0])
        return valid_objective(weight_predictions_valid, None, global_seed)

    loss_weight_predictions_loss = [
        hypernet_loss(weight_params_loss, data_input)
        for data_input in data_inputs_valid
    ]
    loss_weight_predictions_loss_target = [
        hypernet_loss(weight_params_loss_target, data_input)
        for data_input in data_inputs_valid
    ]

    fig, axs = create_figure_and_axs(fig_width=21,
                                     fig_height=7,
                                     num_cols=3,
                                     ms_size=34)

    print("Drawing the scatter plot...")
    min_v, max_v = 0.6, 1.1
    axs[0].hexbin(data_target_loss_valid,
                  gp_loss_predictions,
                  extent=[min_v, max_v, min_v, max_v],
                  cmap='Reds',
                  mincnt=1)
    axs[1].hexbin(data_target_loss_valid,
                  loss_weight_predictions_loss_target,
                  extent=[min_v, max_v, min_v, max_v],
                  cmap='Greens',
                  mincnt=1)
    axs[2].hexbin(data_target_loss_valid,
                  loss_weight_predictions_loss,
                  extent=[min_v, max_v, min_v, max_v],
                  cmap='Blues',
                  mincnt=1)

    print(
        "____________________________________________________________________________"
    )
    print("Number of train data points: ", batch_size_data)
    print("GP Predicted Best: ", np.min(gp_loss_predictions),
          ", Actual Result: ",
          data_target_loss_valid[np.argmin(gp_loss_predictions)])
    print(
        "Fixed Hypernet Predicted Best: ",
        np.min(loss_weight_predictions_loss_target), ", Actual Result: ",
        data_target_loss_valid[np.argmin(loss_weight_predictions_loss_target)])
    print("Stochastic Hypernet Predicted Best: ",
          np.min(loss_weight_predictions_loss), ", Actual Result: ",
          data_target_loss_valid[np.argmin(loss_weight_predictions_loss)])
    print("Actual Best: ", np.min(data_target_loss_valid))
    print(
        "____________________________________________________________________________"
    )

    orient_line = np.linspace(min_v, max_v, 100)
    for ax in axs:
        ax.plot(orient_line, orient_line, color='k')
        ax.set_xlim([min_v, max_v])
        ax.set_ylim([min_v, max_v])

    # axs[0].set_title('GP Mean')
    # axs[1].set_title('Hyper-train fixed')
    # axs[2].set_title('Hyper-train')

    axs[0].set_ylabel('Inferred Loss')

    #axs[1].set_xlabel('True loss')

    axs[1].set_yticks([])
    axs[2].set_yticks([])

    axs[0].set_xticks([.7, .8, .9, 1.0])
    axs[1].set_xticks([.7, .8, .9, 1.0])
    axs[2].set_xticks([.7, .8, .9, 1.0])
    axs[0].set_yticks([.7, .8, .9, 1.0])
    setup_ax_and_save(axs,
                      fig,
                      'learn_vs_true_loss_scatter',
                      do_xticks=False,
                      do_yticks=False,
                      y_mod=750.0,
                      dpi=300)
    for key, ax in enumerate(axs):
        #if key is 0:
        extent = ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted())
        fig.savefig('figures/ax' + str(key) + '_scatter.png',
                    bbox_inches=extent.expanded(1.32, 1.15))
        fig.savefig('figures/ax' + str(key) + '_scatter.pdf',
                    bbox_inches=extent.expanded(1.32, 1.15))
        #else:
        #extent = full_extent(ax, do_yticks=False).transformed(fig.dpi_scale_trans.inverted())
        #fig.savefig('figures/ax' + str(key) + '_scatter.png', bbox_inches=extent.expanded(1.0, 1.15))

    print("Drawing the histograms...")
    [ax.cla() for ax in axs]

    bins = 50
    axs[0].hist(gp_loss_predictions - data_target_loss_valid,
                bins=bins,
                color='r',
                density=True,
                edgecolor='r')
    axs[1].hist(loss_weight_predictions_loss_target - data_target_loss_valid,
                bins=bins,
                color='g',
                density=True,
                edgecolor='g')
    axs[2].hist(loss_weight_predictions_loss - data_target_loss_valid,
                bins=bins,
                color='b',
                density=True,
                edgecolor='b')

    axs[0].set_ylabel('Frequency')
    axs[1].set_xlabel('Inferred - true loss')

    y_min, y_max = 10e32, -10e32
    for ax in axs:
        ylim = ax.get_ylim()
        y_min, y_max = np.minimum(y_min, ylim[0]), np.maximum(y_max, ylim[1])
    x_min, x_max = -0.35, 0.6
    for ax in axs:
        ax.set_xlim([x_min, x_max]), ax.set_ylim([y_min, y_max])
        ax.axvline(0, ymax=1.0, linestyle='--', color='Black')

    setup_ax_and_save(axs, fig, 'learn_vs_true_loss_hist', do_xticks=False)
    for key, ax in enumerate(axs):
        extent = full_extent(ax).transformed(fig.dpi_scale_trans.inverted())
        fig.savefig('figures/ax' + str(key) + '_hist.png', bbox_inches=extent)
        fig.savefig('figures/ax' + str(key) + '_hist.pdf', bbox_inches=extent)
Example #16
D = np.array([[0.], [0.]])

SysRaw = {'A': A, 'B': B, 'C': C, 'D': D, 'dt': dt, 'delay': 0}

# Define equivalent state-space system with delayed input
# Number of samples delay
NumDelay = 0
# SS system
Sys = InitiateDelayedinputSystem(SysRaw, NumDelay)

#%%
# Initialize neural net parameters
ParamsInitial = InitParams(LayerSizes)

# Flatten parameter tuple
ParamsInitialFlat, UnflattenParams = flatten(ParamsInitial)
NetParams = ParamsInitialFlat
# Get gradient of objective using autograd.
#ObjectiveGrad = grad(ObjectiveFunWrap,argnum=0)

#%%

algo = algorithm(cmaes(gen=50))
algo.set_verbosity(2)
prob = problem(test_fit_bitch())
pop = population(prob, 30)
pop = algo.evolve(pop)

uda = algo.extract(cmaes)
uda.get_log()
Example #17
    def reset_weights(self):
        """Reset parameters of a NN."""

        self.params_list = _init_weights(self.x, sizes=self.sizes)
        self.flattened_params, self.unflat_func = flatten(self.params_list)
Example #18
def l2_norm(params):
    """Computes l2 norm of params by flattening them into a vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
Example #19
def l2_norm(params):
    """Computes l2 norm of params by flattening them into a vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
Example #20
    idx = np.arange(n)

    def get_batch(X, Y, size):
        np.random.shuffle(idx)
        return X[idx[:size]], Y[idx[:size]]

    clip = 1.
    tcondition, tcount = .2, 10

    count = 0
    while True:
        X_batch, Y_batch = get_batch(X, Y, 100)

        gradients = grad(params, X_batch, Y_batch, forward)
        gnorm = np.sqrt(np.square(flatten(gradients)[0]).sum())
        gclip = clip / gnorm
        for p, g in zip(params, gradients):
            for j in range(2):
                if gclip < 1.:
                    g[j][:] *= gclip

                p[j][:] -= g[j]
                # gnorm += np.square(g[j]).sum()
        l = loss(params, X_batch, Y_batch, forward)
        # output, _ = forward(params, X_batch)
        print(l, gnorm)
        if l < tcondition:
            count += 1
            if count == tcount:
                break
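The clipping above rescales every parameter block by the same factor whenever the global gradient norm exceeds `clip`. An equivalent flat-vector sketch (reusing `gradients` and `clip` from the loop above, and assuming autograd-style `flatten`):

flat_g, unflatten_g = flatten(gradients)
gnorm = np.sqrt(np.sum(flat_g ** 2))      # global L2 norm across all blocks
if gnorm > clip:
    flat_g = flat_g * (clip / gnorm)      # rescale so the norm equals clip
clipped_gradients = unflatten_g(flat_g)   # back to the per-layer structure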
Example #21
def l2_norm(params):
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
Example #22
def experiment(train_data,
               valid_data,
               test_data,
               init_scale,
               batch_size,
               num_iters_hypernet,
               step_size_hypernet,
               graph_mod,
               global_seed=0):
    """Run the first experiment, which consists of fitting a hypernetwork, which outputs neural network parameters.
    These neural network parameters try to fit the training data with some additional loss for the hyperparameters.
    We observe how the hypernetwork performs on the training and testing, by graphing it against the true loss.
    The true loss is found by training a neural network to convergence at a discrete number of points.

    :param train_data: The training data which is a tuple of (train_input, train_target).
    :param valid_data: The validation data, which is a tuple of (valid_input, valid_target).
    :param test_data: The testing data which is a tuple of (test_input, test_target).
    :param init_scale: The scale (positive float) for the hypernetwork initialization.
    :param batch_size: The number of hyperparameters to sample for each iteration.
    :param num_iters_hypernet: The number of iterations (integer) to run the hypernetwork optimizer for.
    :param step_size_hypernet: The step size (positive float) for the hypernetwork optimizer.
    :param graph_mod: How many iterations (integer) to wait between each graph of the loss.
    :param global_seed: The seed (integer) to use when choosing a constant seed.
    :return: None.
    """
    assert init_scale > 0
    assert step_size_hypernet > 0
    assert num_iters_hypernet > 0
    global hyper_cur
    hyper_cur = 1.0  # Initialize the hyperparameter (float).

    # Define information about hyper loss and how hyper parameters are sampled.
    hyper_sample_var = 3.0  # The variance to use when sampling hyperparameters from a Gaussian distribution.

    def sample_hypers(hyper, rs):
        """Sample a hyperparameter.

        :param hyper: The current hyperparameter ([float]).
        :param rs: A numpy randomstate.
        :return: A sampled hyperparameter (float).
        """
        ret = np.array([rs.randn() * hyper_sample_var + hyper]).reshape(1, -1)
        return np.clip(ret, -100.0, 4.0)  # The bounds on our graph.

    def hyper_loss(weights, hyper):
        """Find the loss for neural network that is dependant on the hyperparameter.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :return: The loss (float) of the network, dependent on the hyperparameter.
        """
        return -log_gaussian(weights, np.exp(hyper))

    example_hyper = sample_hypers(
        hyper_cur, npr.RandomState(global_seed))  # Test the sample function.
    assert example_hyper is not None

    train_inputs, train_targets = train_data
    valid_inputs, valid_targets = valid_data
    test_inputs, test_targets = test_data
    batch_ind, feature_ind = 0, 1
    elementary_input_size = np.shape(train_inputs)[feature_ind]
    elementary_output_size = np.shape(train_targets)[feature_ind]
    elementary_layer_sizes = [elementary_input_size, elementary_output_size]
    num_hypers = example_hyper.shape[
        feature_ind]  # The dimensionality of the hyperparameter space (integer).

    # Define neural network and function to turn a vector into its weight structure.
    example_elementary_params = init_random_params(
        init_scale, elementary_layer_sizes, npr.RandomState(global_seed))
    flat_elementary_params, unflatten_vector_to_network_weights = flatten(
        example_elementary_params)
    assert hyper_loss(example_elementary_params, example_hyper) is not None
    num_elementary_params = len(flat_elementary_params)

    # Define a hypernetwork parametrized by some hyperparameters.
    hypernet_layer_sizes = [num_hypers, 50, num_elementary_params]  # One hidden layer of 50 units.

    objective_functions = get_loss_functions(
        unflatten_vector_to_network_weights, sample_hypers, hyper_loss,
        batch_size, train_inputs, train_targets, test_inputs, test_targets,
        valid_inputs, valid_targets, global_seed)
    hypernet, train_objective, valid_objective, test_objective = objective_functions[:4]
    hyper_train_objective, hyper_valid_objective, hyper_test_objective = objective_functions[4:-1]
    hyper_train_stochastic_objective = objective_functions[-1]

    # Next, train a neural network from scratch with different hyperparameter values.
    real_step_size = 0.0001  # The step size to use to find the real loss (float).
    real_num_iters = 1000  # The number of iterations to use to find the real loss (integer).
    range_min = -2.0  # The min log variance for the hyper parameter of the variance of weight distribution to graph.
    range_max = 4.0  # The max log variance for the hyper parameter of the variance of weight distribution to graph.
    num_visual_points = 10  # The number of points to test the real loss of - expensive (integer).
    real_hyper_range = np.linspace(range_min + 1.0, range_max - 1.0,
                                   num_visual_points)
    real_train_loss = np.zeros(real_hyper_range.shape)
    real_train_performance = np.zeros(real_hyper_range.shape)
    real_valid_loss = np.zeros(real_hyper_range.shape)
    real_test_loss = np.zeros(real_hyper_range.shape)
    min_real_valid_loss, min_real_hyper = 10e32, 10e32
    for i, hypers in enumerate(real_hyper_range):
        print("Optimizing network parameters: ", i)
        init_params = init_random_params(init_scale, elementary_layer_sizes,
                                         npr.RandomState(global_seed))

        def cur_obj(w, seed):
            """The current objective function of the neural network.

            :param w: The weights ([float]) of the neural network.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The current objective value (float).
            """
            return train_objective(w, hypers, seed)

        optimized_params, _, _, _ = adam(grad(cur_obj),
                                         init_params,
                                         step_size=real_step_size,
                                         num_iters=real_num_iters)
        real_train_loss[i] = train_objective(optimized_params, hypers,
                                             global_seed)
        real_train_performance[i] = real_train_loss[i] - hyper_loss(
            optimized_params, hypers)
        real_valid_loss[i] = valid_objective(optimized_params, hypers,
                                             global_seed)
        if real_valid_loss[i] < min_real_valid_loss:
            min_real_valid_loss = real_valid_loss[i]
            print("Best hyperparameter found = ", hypers)
        real_test_loss[i] = test_objective(optimized_params, hypers,
                                           global_seed)

    fig, axs = create_figure_and_axs()

    # Set up the arrays to store information for plotting.
    num_hyper_test_points = 200  # Test a large number of hyperparameters with the learned function - cheap (integer)!
    learned_hyper_range = np.linspace(
        range_min, range_max,
        num_hyper_test_points)  # Hyperparameters to test.
    hyper_train_loss = np.zeros(
        learned_hyper_range.shape
    )  # Hypernetwork training loss per hyperparameter.
    hyper_train_performance = np.zeros(
        learned_hyper_range.shape)  # Hypernetwork training performance per
    # hyperparameter.  Note that performance is loss - regularization loss.
    hyper_valid_loss, hyper_test_loss = np.zeros(
        learned_hyper_range.shape), np.zeros(learned_hyper_range.shape)

    def callback(hyper_weights, opt_iteration, g):
        """Do whatever work is desired on each optimization iteration.
        Draws graphs, prints information, and stores information.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global log_likelihoods, valid_loss, test_loss, grad_norms_hyper, grad_norms_hypernet, global_opt_iteration
        global hyper_cur
        log_likelihood = hyper_train_objective(hyper_weights, hyper_cur)
        log_likelihoods[
            global_opt_iteration] = log_likelihood  # Store the training loss.
        weights_cur = hypernet(hyper_weights, hyper_cur)
        train_performance[global_opt_iteration] = log_likelihood - hyper_loss(
            weights_cur, hyper_cur)
        valid_loss[global_opt_iteration] = hyper_valid_objective(
            hyper_weights, hyper_cur)
        test_loss[global_opt_iteration] = hyper_test_objective(
            hyper_weights, hyper_cur)
        grad_norm = np.sum([
            np.sum(
                [np.sum(np.abs(weight_or_bias)) for weight_or_bias in layer])
            for layer in g
        ])
        grad_norms_hypernet[global_opt_iteration] = grad_norm
        grad_norms_hyper[global_opt_iteration] = grad_norms_hyper[
            global_opt_iteration - 1]
        global_opt_iteration += 1
        print("Iteration {} Loss {} Grad L1 Norm {}".format(
            opt_iteration, log_likelihood, grad_norm))

        if global_opt_iteration % graph_mod == 0:  # Only print on every iteration that is a multiple of graph_mod.
            [ax.cla() for ax in axs]  # Clear all of the axes.
            axs[0].set_xlabel(r'Hyperparameter $\lambda$')
            axs[0].set_ylabel(
                r'Validation Loss $\mathcal{L}_{\mathrm{Valid.}}$')

            for cur, hyper in enumerate(learned_hyper_range):
                hyper_train_loss[cur] = hyper_train_objective(
                    hyper_weights, hyper)
                weights = hypernet(hyper_weights, hyper)
                hyper_train_performance[
                    cur] = hyper_train_loss[cur] - hyper_loss(weights, hyper)
                hyper_valid_loss[cur] = hyper_valid_objective(
                    hyper_weights, hyper)
                hyper_test_loss[cur] = hyper_test_objective(
                    hyper_weights, hyper)

            axs[0].plot(real_hyper_range,
                        real_valid_loss,
                        'kx',
                        ms=28,
                        label="Cross-validation")
            axs[0].plot(learned_hyper_range,
                        hyper_valid_loss,
                        'r-',
                        label="Optimized hypernetwork")
            min_hyper_found = 1.8  # Known minimum from doing a search with ~1000 points over this range.
            axs[0].axvline(x=min_hyper_found,
                           ymax=0.66,
                           c='k',
                           linestyle='dashed',
                           label=r'Optimal hyperparameter $\lambda^{*}$')

            [
                ax.legend(loc='upper left',
                          borderaxespad=0.0,
                          fancybox=True,
                          framealpha=0.0,
                          fontsize=28) for ax in axs
            ]
            setup_ax_and_save(axs, fig, 'hypernets_global_small')

    def hyper_train_stochastic_objective_current(hyper_weights, seed):
        """The objective for the hypernetwork, with a fixed hyperparameter.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param seed: The seed (integer) for sampling a hyperparameter.
        :return: The hypernetwork's loss (float).
        """
        return hyper_train_stochastic_objective(hyper_cur, hyper_weights, seed)

    init_hypernet_params = init_random_params(init_scale, hypernet_layer_sizes,
                                              npr.RandomState(global_seed))
    adam(grad(hyper_train_stochastic_objective_current),
         init_hypernet_params,
         step_size=step_size_hypernet,
         num_iters=num_iters_hypernet,
         callback=callback)