def init_optimizer(self, closure):
    """Build an SFO optimizer with one subfunction per minibatch.

    ``closure`` is called as ``closure(x, y)`` with the wrapped inputs and
    targets of a minibatch.  Its return value is used as the loss, and the
    gradients are read from ``p.grad`` of every entry in ``self.params``
    afterwards, so it is expected to populate them.

    Returns the constructed ``SFO`` instance; no optimization is run here.
    """

    def f_df(newparams, data):
        # Objective/gradient callback handed to SFO:
        # (parameter list, subfunction reference) -> (loss, gradient list).
        inputs = Variable(data['x'])
        targets = Variable(data['y'])

        for index, param in enumerate(self.params):
            # Drop gradients left over from the previous evaluation.
            if p_has_grad(param):
                param.grad.data.zero_()
            # Load the parameter values proposed by the optimizer.
            param.data = torch.from_numpy(newparams[index]).float()

        loss_value = closure(inputs, targets)

        gradients = [param.grad.data.numpy() for param in self.params]
        return loss_value.data.numpy(), gradients

    def p_has_grad(param):
        # A parameter that has never been back-propagated has no gradient.
        return param.grad is not None

    # One subfunction reference (a contiguous slice of the data) per minibatch.
    minibatches = [
        {
            'x': self.data[k * self.batch_size:(k + 1) * self.batch_size, :, :, :],
            'y': self.target[k * self.batch_size:(k + 1) * self.batch_size],
        }
        for k in range(self.N)
    ]

    # Initial parameters as numpy arrays, in the order of self.params.
    initial_params = [param.data.numpy() for param in self.params]

    return SFO(f_df, initial_params, minibatches)
def SFO_variations(self, num_passes=20):
    """
    Train the model with several variations on the standard SFO algorithm.

    The variations are run one after another in a fixed order: standard,
    all subfunctions active from the start, rank-1 Hessian updates, random
    subfunction selection and cyclic subfunction selection.  Each run
    replaces ``self.learner_name`` and ``self.optimizer``.
    """

    def run_variation(name, reseed, **sfo_kwargs):
        # Train one SFO configuration for num_passes passes.
        if reseed:
            np.random.seed(0)  # make experiments repeatable
        self.learner_name = name
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper,
                             self.model.theta_init,
                             self.model.subfunction_references,
                             **sfo_kwargs)
        self.optimizer.optimize(num_passes=num_passes)

    run_variation('SFO standard', True)
    run_variation('SFO all active', True,
                  init_subf=len(self.model.subfunction_references))
    run_variation('SFO rank 1', True, hessian_algorithm='rank1')
    # The last two variations continue from the current random state
    # rather than reseeding.
    run_variation('SFO random', False, subfunction_selection='random')
    run_variation('SFO cyclic', False, subfunction_selection='cyclic')
def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
    """
    Train the model with a single SFO configuration.

    The optimizer is built from the wrapped objective, the model's initial
    parameters and its subfunction references; extra keyword arguments are
    forwarded to the SFO constructor.  The optimizer is stored on
    ``self.optimizer`` and then run for ``num_passes`` passes.
    """
    self.learner_name = learner_name
    print("\n\n" + self.learner_name)

    model = self.model
    self.optimizer = SFO(
        self.f_df_wrapper,
        model.theta_init,
        model.subfunction_references,
        **kwargs
    )
    # To check the gradients, call self.optimizer.check_grad() at this point.
    self.optimizer.optimize(num_passes=num_passes)
def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses,
         batch_size, **kwargs):
    """Invert for a reflectivity image with SFO and write the results to disk.

    Parameters
    ----------
    shape, spacing, origin, nbl, space_order
        Forwarded to ``get_true_model`` / ``get_smooth_model``;
        ``space_order`` is also passed on to the modeling routines.
    xs, xr
        Source and receiver arrays; only ``shape[0]`` of each (the number of
        sources / receivers) is used here.
    tn, f0
        Forwarded to ``set_geometry``.
    npasses
        Number of SFO passes through the data.
    batch_size
        Forwarded to ``set_subreferences`` when building the subfunctions.
    **kwargs
        Accepted and ignored.

    Side effects: writes the inverted model as float32, C-ordered raw bytes
    to ``output/dvel-final.bin`` and saves a convergence plot to
    ``output/history_sfo.png``.  Returns ``None``.
    """
    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    futures = [
        forward_modeling.remote(dm, i, smooth_model, geometry0, space_order)
        for i in range(geometry0.nsrc)
    ]

    # One column of observed data per source.
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc),
                    dtype=np.float32)
    for i, future in enumerate(futures):
        dobs[:, i] = ray.get(future)

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # Initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs,
                    [geometry0, smooth_model, space_order])

    # Run the optimizer for npasses passes through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk.  The array is prepared first and
    # the file is managed by a context manager so the handle is always
    # closed (and flushed), even if the write fails.
    scopy = theta.reshape(smooth_model.shape).astype(
        np.float32).copy(order='C')
    with open('output/dvel-final.bin', "wb") as out_file:
        out_file.write(scopy)

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
def explore_MN(burnin_steps=2, test_steps=2):
    """Measure SFO overhead per pass over a sweep of problem sizes.

    Two sweeps are run on ``models.toy``: the number of dimensions M is
    varied at a fixed number of subfunctions, then the number of
    subfunctions N is varied at a fixed number of dimensions.

    For every case the optimizer is first run for ``burnin_steps`` passes,
    then for ``test_steps`` more.  The recorded value is the increase in
    ``optimizer.time_pass - optimizer.time_func`` over the test phase,
    divided by the number of passes actually performed (subfunction
    evaluations / N).

    Returns a tuple ``(M_arr, N_arr, T_arr)`` of numpy arrays with one entry
    per case.
    """
    # Smaller sweep previously used for quick runs: N = 50 with
    # np.linspace(1, 1e3, 4) for M, and M = 1e3 with np.linspace(1, 50, 4)
    # for N.
    M_arr = []
    N_arr = []

    # Sweep the number of dimensions at a fixed number of subfunctions.
    N = 100
    for M in np.linspace(1, 1e6, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    # Sweep the number of subfunctions at a fixed number of dimensions.
    M = 1e6
    for N in np.linspace(1, 200, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []
    num_cases = len(M_arr)
    for case, (M, N) in enumerate(zip(M_arr, N_arr), start=1):
        # Single-argument print call: behaves the same on Python 2 and 3.
        print("case %d of %d, M=%g, N=%g" % (case, num_cases, M, N))

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)

        # initialize the optimizer
        optimizer = SFO(model.f_df, model.theta_init,
                        model.subfunction_references, display=1)

        # burn in the optimizer, to make sure the subspace has eg. reached
        # its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in the optimizer (pass time minus function time)
        # during burn-in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)

        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        steps1 = np.sum(optimizer.eval_count)

        # Normalize by the passes actually performed, derived from the
        # change in the evaluation counts rather than from test_steps.
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append((t1 - t0) / actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
def __init__(self, model, calculate_full_objective=True,
             num_projection_dims=5, full_objective_per_pass=4):
    """
    Set up a wrapper that trains ``model`` with a variety of optimizers.

    The wrapper also wraps the model's objective and gradient so that the
    full objective can be evaluated and stored for each step of the
    optimization.  This is WAY SLOWER than calling the optimizers directly,
    because it evaluates the FULL objective and gradient, rather than a
    single subfunction, several times per pass.

    Designed to be used by figure_convergence.py.
    """
    self.model = model

    # Per-learner traces; every entry maps a key to a growing list.
    self.history = dict(
        (name, defaultdict(list))
        for name in ('f', 'x_projection', 'events', 'x')
    )

    # An SFO instance is used purely to flatten/unflatten parameters for
    # the other optimizers.
    self.x_map = SFO(model.f_df, model.theta_init,
                     model.subfunction_references)
    self.xinit_flat = self.x_map.theta_original_to_flat(model.theta_init)

    self.calculate_full_objective = calculate_full_objective

    # Random projection applied to the flattened parameters, scaled by
    # 1/sqrt(number of flat parameters).
    num_flat = self.xinit_flat.shape[0]
    projection = np.random.randn(num_projection_dims, num_flat)
    self.x_projection_matrix = projection / np.sqrt(num_flat)

    self.num_subfunctions = len(model.subfunction_references)
    self.full_objective_period = int(
        self.num_subfunctions / full_objective_per_pass)
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook,
                  n_resample=20, resample_keepmem=False, bernoulli_x=False,
                  display=0):
    """Optimize the parameters ``v`` and ``w`` of ``model`` with SFO.

    Parameters
    ----------
    model
        Must provide ``gen_eps``, ``dL_dw`` and ``dlogpw_dw`` as used below.
    x
        Dict-like of arrays whose second axis indexes datapoints.  It is
        passed to ``ndict.shuffleCols`` before being split into minibatches
        (presumably shuffled in place -- confirm against ``ndict``).
    v_init, w_init
        Initial parameters, handed to SFO as ``{'v': v_init, 'w': w_init}``.
    n_batch
        Minibatch size; must divide the number of datapoints exactly.
    n_passes
        Number of passes through the data.
    hook
        Called after every pass as ``hook(i, v, w, LB)``, where ``LB`` is
        the average over that pass of the per-datapoint negative objective
        (before the prior terms are added).
    n_resample
        Minibatch ``j`` is rebuilt after pass ``i`` whenever
        ``i % n_resample == j % n_resample``; values <= 0 disable this.
    resample_keepmem
        Forwarded to ``optimizer.replace_subfunction``.
    bernoulli_x
        If true, ``x['x']`` of each minibatch is replaced by a binomial
        sample with ``p = x['x']`` when the minibatch is built.
    display
        Forwarded to the SFO constructor.

    Raises
    ------
    ValueError
        If the number of datapoints is not a multiple of ``n_batch``.
    """
    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # Number of datapoints, read from an arbitrary entry of x.
    # next(iter(...)) works on both Python 2 and 3.
    n_tot = next(iter(x.values())).shape[1]
    if n_tot % n_batch != 0:
        raise ValueError(
            'number of datapoints (%d) must be a multiple of n_batch (%d)'
            % (n_tot, n_batch))
    # Floor division keeps this an int on Python 3 as well.
    n_minibatches = n_tot // n_batch

    def make_minibatch(i):
        # Build [index, data slice, noise] for minibatch i.
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    # Divide into minibatches
    minibatches = [make_minibatch(i) for i in range(n_minibatches)]

    # Running sum / count of the per-pass bound; one-element lists so the
    # closure below can update them.
    L = [0.]
    n_L = [0]

    # Weight of the prior terms for a single minibatch.
    prior_weight = float(n_batch) / n_tot

    def f_df(w, minibatch):
        _, x_minibatch, eps_minibatch = minibatch

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(
            w['v'], w['w'], x_minibatch, eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + prior_weight * gv_prior[i] for i in gv}
        gw = {i: gw[i] + prior_weight * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())

        # Track the bound before the prior terms are added.
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1

        f += prior_weight * logpv
        f += prior_weight * logpw

        # SFO minimizes, so negate and normalize per datapoint.
        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch

        return f, {'v': gv, 'w': gw}

    theta_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, theta_init, minibatches, display=display)
    # optimizer.check_grad() can be called here to verify f_df.

    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0

        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem,
                                              minibatches[j])

    print("Finished!")
def train(self, images, batch_size=50, num_epochs=20, method='SGD',
          train_means=False, train_top_layer=False, momentum=0.9,
          learning_rate=1., decay1=0.9, decay2=0.999, precondition=True):
    """
    Train the SLSTM layers and the MCGSM on a set of images.

    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of images

    @type  batch_size: C{int}
    @param batch_size: number of examples per minibatch (the SLSTMs are
        created with at most C{self.MAX_BATCH_SIZE})

    @type  num_epochs: C{int}
    @param num_epochs: number of passes through the training data

    @type  method: C{str}
    @param method: one of 'SFO', 'SGD' or 'ADAM' (case-insensitive)

    @type  train_means: C{bool}
    @param train_means: forwarded to the MCGSM parameter routines

    @type  train_top_layer: C{bool}
    @param train_top_layer: if true, only the last SLSTM layer is trained

    @type  momentum: C{float}
    @param momentum: momentum coefficient (SGD only)

    @type  learning_rate: C{float}
    @param learning_rate: step width (SGD and ADAM)

    @type  decay1: C{float}
    @param decay1: decay of the running gradient mean (ADAM only)

    @type  decay2: C{float}
    @param decay2: decay of the running squared gradient (ADAM only)

    @type  precondition: C{bool}
    @param precondition: if true, inputs and outputs are passed through
        C{self._precondition} before training

    @rtype: C{list}
    @return: for SGD and ADAM the loss of every minibatch step; for SFO the
        optimizer's C{hist_f_flat}

    @raise ValueError: if C{method} is not recognized
    """
    print('Preprocessing...')

    inputs, outputs = self._preprocess(images)

    if precondition:
        print('Preconditioning...')

        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer \
        else range(self.num_layers)

    print('Creating SLSTMs...')

    # create SLSTMs
    for layer in range(self.num_layers):
        self.slstm[layer] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            num_channels=inputs.shape[3] if layer < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[layer],
            verbosity=self.verbosity)

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        for layer in train_layers:
            self.slstm[layer].set_parameters(params['slstm'][layer])
        self.mcgsm._set_parameters(params['mcgsm'],
                                   {'train_means': train_means})

        # select batch and compute hidden activations
        Y = outputs[idx:idx + batch_size]
        H = inputs[idx:idx + batch_size]

        for layer in range(self.num_layers):
            H = self.slstm[layer].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        # negative, so that dividing by it yields the *negative*
        # log-likelihood averaged over the columns of H_flat
        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # ignore bottom-right pixel (BSDS300)
        df_dh[:, -1, -1] = 0.

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta['slstm'] = [0.] * self.num_layers

        # back-propagate from the top layer down, stopping at the first
        # layer that is not being trained
        for layer in reversed(range(self.num_layers)):
            if layer not in train_layers:
                break
            if layer > min(train_layers):
                # derivative with respect to inputs of layer l are
                # derivatives of hidden states of layer l - 1
                df_dtheta['slstm'][layer] = self.slstm[layer].backward(
                    df_dh, force_backward=True)
                df_dh = df_dtheta['slstm'][layer]['inputs']
                del df_dtheta['slstm'][layer]['inputs']
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta['slstm'][layer] = self.slstm[layer].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(
            H_flat, Y_flat,
            parameters={'train_means': train_means}
        ) * log(2.) * self.mcgsm.dim_out

        return f, df_dtheta

    # collect current parameters; untrained layers keep the placeholder 0.
    params = {}
    params['slstm'] = [0.] * self.num_layers
    for layer in reversed(range(self.num_layers)):
        if layer not in train_layers:
            break
        params['slstm'][layer] = self.slstm[layer].parameters()
    params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

    def zero_state():
        # Zero-initialized accumulator with the same layout as params.
        state = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm']),
        }
        for layer in train_layers:
            state['slstm'][layer] = {}
            for key in params['slstm'][layer]:
                state['slstm'][layer][key] = zeros_like(
                    params['slstm'][layer][key])
        return state

    # a start index for each batch (a real list on Python 2 and 3)
    start_indices = list(range(0, inputs.shape[0] - batch_size + 1,
                               batch_size))

    print('Training...')

    method_name = method.upper()

    if method_name == 'SFO':
        # Constructed outside the try block so that `optimizer` is always
        # bound when its history is returned below.
        optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

        try:
            # optimize using sum-of-functions optimizer
            params_opt = optimizer.optimize(num_passes=num_epochs)

            # set model parameters; only trained layers hold real
            # parameters, the others still contain the placeholder
            for layer in train_layers:
                self.slstm[layer].set_parameters(params_opt['slstm'][layer])
            self.mcgsm._set_parameters(params_opt['mcgsm'],
                                       {'train_means': train_means})

        except KeyboardInterrupt:
            # allow the user to stop training early and keep the history
            pass

        return optimizer.hist_f_flat

    elif method_name == 'SGD':
        loss = []
        diff = zero_state()

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # update SLSTM parameters
                for layer in train_layers:
                    for key in params['slstm'][layer]:
                        diff['slstm'][layer][key] = \
                            momentum * diff['slstm'][layer][key] \
                            - df['slstm'][layer][key]
                        params['slstm'][layer][key] = \
                            params['slstm'][layer][key] \
                            + learning_rate * diff['slstm'][layer][key]

                # update MCGSM parameters
                diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                params['mcgsm'] = params['mcgsm'] \
                    + learning_rate * diff['mcgsm']

                if self.verbosity > 0:
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1],
                        mean(loss[-max([10, 20000 // batch_size]):])))

        return loss

    elif method_name == 'ADAM':
        loss = []
        diff_mean = zero_state()
        diff_sqrd = zero_state()

        # step counter
        t = 1

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # include bias correction in step width
                step_width = learning_rate / (1. - power(decay1, t)) \
                    * sqrt(1. - power(decay2, t))
                t += 1

                # update SLSTM parameters
                for layer in train_layers:
                    for key in params['slstm'][layer]:
                        diff_mean['slstm'][layer][key] = \
                            decay1 * diff_mean['slstm'][layer][key] \
                            + (1. - decay1) * df['slstm'][layer][key]
                        diff_sqrd['slstm'][layer][key] = \
                            decay2 * diff_sqrd['slstm'][layer][key] \
                            + (1. - decay2) * square(df['slstm'][layer][key])

                        params['slstm'][layer][key] = \
                            params['slstm'][layer][key] \
                            - step_width * diff_mean['slstm'][layer][key] \
                            / (1e-8 + sqrt(diff_sqrd['slstm'][layer][key]))

                # update MCGSM parameters
                diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] \
                    + (1. - decay1) * df['mcgsm']
                diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] \
                    + (1. - decay2) * square(df['mcgsm'])
                params['mcgsm'] = params['mcgsm'] \
                    - step_width * diff_mean['mcgsm'] \
                    / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                if self.verbosity > 0:
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1],
                        mean(loss[-max([10, 20000 // batch_size]):])))

        return loss

    else:
        raise ValueError('Unknown method \'{0}\'.'.format(method))
# Demo: fit a model with SFO on random data and plot the convergence trace.

# problem dimensions
M = 20      # number visible units
J = 10      # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number minibatches

# generate random training data
v = randn(M, D)

# Subfunction-specific arguments: minibatch i takes every N-th column of
# the data, starting at column i.
sub_refs = [v[:, i::N] for i in range(N)]

# initialize parameters
theta_init = {
    'W': randn(J, M),
    'b_h': randn(J, 1),
    'b_v': randn(M, 1),
}

# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)
# To test the gradient of f_df, call optimizer.check_grad() here.

# run the optimizer for 1 pass through the data ...
theta = optimizer.optimize(num_passes=1)
# ... and continue for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
convergence_trace = np.array(optimizer.hist_f_flat)
plt.plot(convergence_trace)
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
def getOptimizer(self):
    """Create the SFO optimizer for this object's network.

    The minibatches from ``self.getSFOBatches()`` are stored on
    ``self.batches`` and used as the subfunction references; the objective
    is ``self.Net._getCost_dCost``, the starting point ``self.initial_p``,
    and ``self.iprint`` is passed as the display level.
    """
    self.batches = self.getSFOBatches()
    objective = self.Net._getCost_dCost
    optimizer = SFO(objective, self.initial_p, self.batches,
                    display=self.iprint)
    return optimizer