def init_optimizer(self, closure):
        def f_df(newparams, data):
            x, y_ = Variable(data['x']), Variable(data['y'])
            dfdtheta = []
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    p.grad.data.zero_()
                p.data = torch.from_numpy(newparams[i]).float()

            loss = closure(x, y_)

            for i, p in enumerate(self.params):
                dfdtheta.append(p.grad.data.numpy())

            loss = loss.data.numpy()
            return loss, dfdtheta

        # create the array of subfunction specific arguments
        sub_refs = []
        for i in range(self.N):
            # extract a single minibatch of training data.
            sub_refs.append({
                'x': self.data[i * self.batch_size:(i + 1) * self.batch_size, :, :, :],
                'y': self.target[i * self.batch_size:(i + 1) * self.batch_size]
            })
        params_init = []
        for p in self.params:
            params_init.append(p.data.numpy())

        optimizer = SFO(f_df, params_init, sub_refs)
        return optimizer
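
A minimal sketch of a closure compatible with the wrapper above; the model and
criterion names are placeholders, not taken from the original code. The only
requirement is that the closure runs a forward pass and calls backward() so that
f_df can read each parameter's .grad.

def make_closure(model, criterion):
    def closure(x, y_):
        loss = criterion(model(x), y_)  # forward pass on one minibatch
        loss.backward()                 # fills p.grad, read back by f_df as numpy arrays
        return loss
    return closure

# illustrative call from inside the class above:
# optimizer = self.init_optimizer(make_closure(self.model, torch.nn.CrossEntropyLoss()))
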
Example #2
    def SFO_variations(self, num_passes=20):
        """
        Train model using several variations on the standard SFO algorithm.
        """

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO standard'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init,
                             self.model.subfunction_references)
        x = self.optimizer.optimize(num_passes=num_passes)

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO all active'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper,
                             self.model.theta_init,
                             self.model.subfunction_references,
                             init_subf=len(self.model.subfunction_references))
        x = self.optimizer.optimize(num_passes=num_passes)

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO rank 1'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper,
                             self.model.theta_init,
                             self.model.subfunction_references,
                             hessian_algorithm='rank1')
        x = self.optimizer.optimize(num_passes=num_passes)

        self.learner_name = 'SFO random'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper,
                             self.model.theta_init,
                             self.model.subfunction_references,
                             subfunction_selection='random')
        x = self.optimizer.optimize(num_passes=num_passes)

        self.learner_name = 'SFO cyclic'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper,
                             self.model.theta_init,
                             self.model.subfunction_references,
                             subfunction_selection='cyclic')
        x = self.optimizer.optimize(num_passes=num_passes)
Example #3
    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """ Train model using SFO."""
        self.learner_name = learner_name
        print("\n\n" + self.learner_name)

        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init,
                             self.model.subfunction_references, **kwargs)
        # # check the gradients
        # self.optimizer.check_grad()
        x = self.optimizer.optimize(num_passes=num_passes)
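
The f_df_wrapper passed to SFO follows SFO's calling convention: it is called as
f_df(theta, subfunction_reference) and must return the subfunction's value together
with a gradient that mirrors the structure of theta. A minimal pass-through sketch,
assuming the model exposes its own f_df (the body is illustrative, not the original
wrapper):

    def f_df_wrapper(self, theta, subfunction_ref):
        # evaluate one subfunction (minibatch) and its gradient at theta;
        # dfdtheta must have the same structure (dict/list/array) as theta
        f, dfdtheta = self.model.f_df(theta, subfunction_ref)
        return f, dfdtheta
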
Example #4
def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses,
         batch_size, **kwargs):

    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    futures = []
    for i in range(geometry0.nsrc):
        args = [dm, i, smooth_model, geometry0, space_order]
        futures.append(forward_modeling.remote(*args))
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc),
                    dtype=np.float32)
    for i in range(geometry0.nsrc):
        dobs[:, i] = ray.get(futures[i])

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs,
                    [geometry0, smooth_model, space_order])

    # run the optimizer for npasses passes through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk
    scopy = theta.reshape(smooth_model.shape).astype(np.float32).copy(order='C')
    with open('output/dvel-final.bin', "wb") as f:
        f.write(scopy)

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
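
A hedged usage sketch of how main might be driven; every numerical value below
(grid shape, acquisition geometry, frequency, number of passes) is illustrative,
not taken from the original script, and ray must be initialized before the remote
forward-modeling calls are launched.

if __name__ == '__main__':
    import numpy as np
    import ray

    ray.init()
    xs = np.linspace(0., 1000., 9).reshape(-1, 1)    # source positions (illustrative)
    xr = np.linspace(0., 1000., 101).reshape(-1, 1)  # receiver positions (illustrative)
    main(shape=(101, 101), spacing=(10., 10.), origin=(0., 0.), nbl=40,
         space_order=8, xs=xs, xr=xr, tn=1000., f0=0.010, npasses=10,
         batch_size=2)
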
Example #5
def explore_MN(burnin_steps=2, test_steps=2):

    M_arr = []
    N_arr = []
    N = 100
    #N = 50
    for M in np.linspace(1, 1e6, 5):
        #for M in np.linspace(1, 1e3, 4):
        M_arr.append(int(M))
        N_arr.append(int(N))
    M = 1e6
    #M = 1e3
    for N in np.linspace(1, 200, 5):
        #for N in np.linspace(1,50,4):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []

    for ii in range(len(M_arr)):
        M = M_arr[ii]
        N = N_arr[ii]

        print "case %d of %d, M=%g, N=%g" % (ii + 1, len(M_arr), M, N)

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df,
                        model.theta_init,
                        model.subfunction_references,
                        display=1)
        # burn in the optimizer, to make sure internal state (e.g. the subspace) has reached its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in the optimizer during burn-in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append(t_diff / actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
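
A hedged sketch of how the returned timing arrays might be inspected; the plotting
details are illustrative and not part of the original experiment script. The first
five entries sweep M at fixed N=100 and the last five sweep N at fixed M=1e6,
matching the construction above.

import matplotlib.pyplot as plt

M_arr, N_arr, T_arr = explore_MN(burnin_steps=2, test_steps=2)
plt.plot(M_arr[:5], T_arr[:5], 'o-', label='varying M (N = 100)')
plt.plot(N_arr[5:], T_arr[5:], 's-', label='varying N (M = 1e6)')
plt.xlabel('M or N')
plt.ylabel('optimizer overhead per effective pass (s)')
plt.legend()
plt.show()
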
Example #6
    def __init__(self,
                 model,
                 calculate_full_objective=True,
                 num_projection_dims=5,
                 full_objective_per_pass=4):
        """
        Trains the model using a variety of optimization algorithms.
        This class also wraps the objective and gradient of the model,
        so that it can evaluate and store the full objective for each
        step in the optimization.

        This is much slower than calling the optimizers directly, because it
        evaluates the full objective and gradient several times per pass,
        rather than only a single subfunction.

        Designed to be used by figure_convergence.py.
        """

        self.model = model
        self.history = {
            'f': defaultdict(list),
            'x_projection': defaultdict(list),
            'events': defaultdict(list),
            'x': defaultdict(list)
        }

        # we use SFO to flatten/unflatten parameters for the other optimizers
        self.x_map = SFO(self.model.f_df, self.model.theta_init,
                         self.model.subfunction_references)
        self.xinit_flat = self.x_map.theta_original_to_flat(
            self.model.theta_init)
        self.calculate_full_objective = calculate_full_objective

        M = self.xinit_flat.shape[0]
        self.x_projection_matrix = np.random.randn(num_projection_dims,
                                                   M) / np.sqrt(M)

        self.num_subfunctions = len(self.model.subfunction_references)
        self.full_objective_period = int(self.num_subfunctions /
                                         full_objective_per_pass)
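
A hedged sketch of how the flattening map and projection matrix created above are
typically used: SFO's theta_original_to_flat turns the (possibly nested) parameter
set into a single column vector, which the random matrix then projects into
num_projection_dims dimensions for plotting optimizer trajectories. The helper name
is illustrative, not from the original figure_convergence.py code.

    def project_theta(self, theta):
        # flatten the parameter structure into a single (M, 1) vector
        theta_flat = self.x_map.theta_original_to_flat(theta)
        # low-dimensional projection used to visualize the optimization trajectory
        return np.dot(self.x_projection_matrix, theta_flat).ravel()
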
Example #7
def optim_vae_sfo(model,
                  x,
                  v_init,
                  w_init,
                  n_batch,
                  n_passes,
                  hook,
                  n_resample=20,
                  resample_keepmem=False,
                  bernoulli_x=False,
                  display=0):

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # create minibatches
    n_tot = x.itervalues().next().shape[1]
    minibatches = []
    n_minibatches = n_tot / n_batch
    if (n_tot % n_batch) != 0: raise Exception()

    # Divide into minibatches
    def make_minibatch(i):
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x: _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    for i in range(n_minibatches):
        minibatches.append(make_minibatch(i))

    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):

        i_minibatch = minibatch[0]
        x_minibatch = minibatch[1]
        eps_minibatch = minibatch[2]

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch,
                                                  eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + float(n_batch) / n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1
        f += float(n_batch) / n_tot * logpv
        f += float(n_batch) / n_tot * logpw

        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch

        #print 'norms gv:'
        #ndict.pNorm(gv)
        #print 'norms gw'
        #ndict.pNorm(gw)

        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    #optimizer.check_grad()

    # loop
    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem,
                                              minibatches[j])

    print "Finished!"
Example #8
    def train(self,
              images,
              batch_size=50,
              num_epochs=20,
              method='SGD',
              train_means=False,
              train_top_layer=False,
              momentum=0.9,
              learning_rate=1.,
              decay1=0.9,
              decay2=0.999,
              precondition=True):
        """
		@type  images: C{ndarray}/C{list}
		@param images: an array or a list of images
		"""

        print 'Preprocessing...'

        inputs, outputs = self._preprocess(images)

        if precondition:
            print 'Preconditioning...'

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers -
                        1] if train_top_layer else range(self.num_layers)

        print 'Creating SLSTMs...'

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity)

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params['slstm'][l])
            self.mcgsm._set_parameters(params['mcgsm'],
                                       {'train_means': train_means})

            # select batch and compute hidden activations
            Y = outputs[idx:idx + batch_size]
            H = inputs[idx:idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # ignore bottom-right pixel (BSDS300)
            df_dh[:, -1, -1] = 0.

            # average negative log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta['slstm'] = [0.] * self.num_layers

            for l in range(self.num_layers)[::-1]:
                if l not in train_layers:
                    break
                if l > min(train_layers):
                    # derivative with respect to inputs of layer l are derivatives
                    # of hidden states of layer l - 1
                    df_dtheta['slstm'][l] = self.slstm[l].backward(
                        df_dh, force_backward=True)
                    df_dh = df_dtheta['slstm'][l]['inputs']
                    del df_dtheta['slstm'][l]['inputs']

                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(
                H_flat, Y_flat, parameters={'train_means': train_means
                                            }) * log(2.) * self.mcgsm.dim_out

            return f, df_dtheta

        # collect current parameters
        params = {}
        params['slstm'] = [0.] * self.num_layers
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            params['slstm'][l] = self.slstm[l].parameters()
        params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

        # a start index for each batch
        start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

        print 'Training...'

        if method.upper() == 'SFO':
            try:
                # optimize using sum-of-functions optimizer
                optimizer = SFO(f_df,
                                params,
                                start_indices,
                                display=self.verbosity)
                params_opt = optimizer.optimize(num_passes=num_epochs)

                # set model parameters
                for l in range(self.num_layers):
                    self.slstm[l].set_parameters(params_opt['slstm'][l])
                self.mcgsm._set_parameters(params_opt['mcgsm'],
                                           {'train_means': train_means})

            except KeyboardInterrupt:
                pass

            return optimizer.hist_f_flat

        elif method.upper() == 'SGD':
            loss = []
            diff = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff['slstm'][l][key] = momentum * diff['slstm'][
                                l][key] - df['slstm'][l][key]
                            params['slstm'][l][key] = params['slstm'][l][
                                key] + learning_rate * diff['slstm'][l][key]

                    # update MCGSM parameters
                    diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                    params['mcgsm'] = params[
                        'mcgsm'] + learning_rate * diff['mcgsm']

                    if self.verbosity > 0:
                        print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):]))

            return loss

        elif method.upper() == 'ADAM':
            loss = []
            diff_mean = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }
            diff_sqrd = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff_mean['slstm'][l] = {}
                diff_sqrd['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff_mean['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])
                    diff_sqrd['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])

            # step counter
            t = 1

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # include bias correction in step width
                    step_width = learning_rate / (
                        1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                    t += 1

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                             + (1. - decay1) * df['slstm'][l][key]
                            diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                             + (1. - decay2) * square(df['slstm'][l][key])

                            params['slstm'][l][key] = params['slstm'][l][key] - \
                             step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                    # update MCGSM parameters
                    diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (
                        1. - decay1) * df['mcgsm']
                    diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (
                        1. - decay2) * square(df['mcgsm'])
                    params['mcgsm'] = params['mcgsm'] - \
                     step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                    if self.verbosity > 0:
                        print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):]))

            return loss

        else:
            raise ValueError('Unknown method \'{0}\'.'.format(method))
Example #9
import numpy as np
from numpy.random import randn
import matplotlib.pyplot as plt
from sfo import SFO

# set model and training data parameters
M = 20  # number of visible units
J = 10  # number of hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number of minibatches
# generate random training data
v = randn(M, D)

# create the array of subfunction specific arguments
sub_refs = []
for i in range(N):
    # extract a single minibatch of training data.
    sub_refs.append(v[:, i::N])

# initialize parameters
theta_init = {'W': randn(J, M), 'b_h': randn(J, 1), 'b_v': randn(M, 1)}
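# The objective/gradient function f_df is not shown in this excerpt. Below is a
# minimal, hedged sketch consistent with the parameter shapes above (a tied-weight
# sigmoid autoencoder with squared reconstruction error); it is an assumption, not
# the original definition. SFO calls it as f_df(theta, minibatch) and expects a
# scalar value plus a gradient with the same structure as theta.
def f_df(theta, v_batch):
    W, b_h, b_v = theta['W'], theta['b_h'], theta['b_v']
    n = v_batch.shape[1]
    h = 1. / (1. + np.exp(-(np.dot(W, v_batch) + b_h)))   # hidden activations
    v_hat = np.dot(W.T, h) + b_v                          # reconstruction
    f = np.sum((v_hat - v_batch) ** 2) / n                # mean squared error
    dv_hat = 2. * (v_hat - v_batch) / n
    db_v = dv_hat.sum(axis=1).reshape((-1, 1))
    dW = np.dot(h, dv_hat.T)                              # decoder contribution
    dh = np.dot(W, dv_hat)
    da = dh * h * (1. - h)                                 # back through the sigmoid
    db_h = da.sum(axis=1).reshape((-1, 1))
    dW += np.dot(da, v_batch.T)                            # encoder contribution
    return f, {'W': dW, 'b_h': db_h, 'b_v': db_v}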
# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)
# # uncomment the following line to test the gradient of f_df
# optimizer.check_grad()
# run the optimizer for 1 pass through the data
theta = optimizer.optimize(num_passes=1)
# continue running the optimizer for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
plt.plot(np.array(optimizer.hist_f_flat))
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
Example #10
    def getOptimizer(self):
        self.batches = self.getSFOBatches()
        return SFO(self.Net._getCost_dCost,
                   self.initial_p,
                   self.batches,
                   display=self.iprint)