def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses,
         batch_size, **kwargs):
    """Model observed data, invert it with SFO and write the results to disk.

    Parameters
    ----------
    shape, spacing, origin, nbl, space_order :
        Forwarded unchanged to ``get_true_model`` and ``get_smooth_model``;
        ``space_order`` is also passed to the modeling and objective routines.
    xs, xr :
        Source and receiver position arrays. Only ``shape[0]`` (the number of
        sources / receivers) is read here.
    tn, f0 :
        Forwarded to ``set_geometry`` together with ``t0=0``.
    npasses :
        Number of passes through the data given to ``SFO.optimize``.
    batch_size :
        Forwarded to ``set_subreferences`` when building the subfunction list.
    **kwargs :
        Accepted for call-site compatibility; not used in this function.

    Side effects
    ------------
    Writes ``output/dvel-final.bin`` (float32, C order) and
    ``output/history_sfo.png``.
    """
    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    futures = []
    for i in range(geometry0.nsrc):
        args = [dm, i, smooth_model, geometry0, space_order]
        futures.append(forward_modeling.remote(*args))

    # One column of observed data per source.
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc),
                    dtype=np.float32)
    for i, future in enumerate(futures):
        dobs[:, i] = ray.get(future)

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs,
                    [geometry0, smooth_model, space_order])

    # run the optimizer for npasses pass through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk. The array is prepared before the
    # file is opened so a reshape failure cannot leave a truncated file
    # behind, and the context manager guarantees the handle is closed.
    scopy = theta.reshape(smooth_model.shape).astype(
        np.float32).copy(order='C')
    with open('output/dvel-final.bin', "wb") as out_file:
        out_file.write(scopy)

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
def init_optimizer(self, closure):
    """Create an SFO optimizer over ``self.params``.

    ``closure(x, y)`` is called with one minibatch and must return the loss;
    the gradients are then read from ``p.grad`` of every entry in
    ``self.params``. The training set is cut into ``self.N`` consecutive
    minibatches of ``self.batch_size`` samples taken from ``self.data`` and
    ``self.target``.
    """

    def f_df(newparams, data):
        # Wrap the minibatch for autograd.
        inputs = Variable(data['x'])
        targets = Variable(data['y'])

        # Clear stale gradients and load the values proposed by the optimizer.
        for idx, param in enumerate(self.params):
            if param.grad is not None:
                param.grad.data.zero_()
            param.data = torch.from_numpy(newparams[idx]).float()

        loss = closure(inputs, targets)

        # Hand the gradients back as numpy arrays, in parameter order.
        grads = [param.grad.data.numpy() for param in self.params]
        return loss.data.numpy(), grads

    # One subfunction reference (a dict holding a minibatch) per batch.
    size = self.batch_size
    sub_refs = [
        {
            'x': self.data[k * size:(k + 1) * size, :, :, :],
            'y': self.target[k * size:(k + 1) * size],
        }
        for k in range(self.N)
    ]

    # Starting point: the current parameter values as numpy arrays.
    params_init = [param.data.numpy() for param in self.params]

    return SFO(f_df, params_init, sub_refs)
def __init__(self, model, calculate_full_objective=True, num_projection_dims=5, full_objective_per_pass=4):
    """
    Prepare the trainer: history containers, a parameter flattening helper
    and a random projection used to record low-dimensional parameter traces.

    model supplies f_df, theta_init and subfunction_references.
    calculate_full_objective is stored for later use by the wrapper.
    num_projection_dims is the number of rows of the random projection.
    full_objective_per_pass divides the number of subfunctions to obtain
    the recording period.
    """
    self.model = model

    # one defaultdict(list) per recorded quantity, keyed later by learner name
    self.history = {}
    for key in ('f', 'x_projection', 'events', 'x'):
        self.history[key] = defaultdict(list)

    # we use SFO to flatten/unflatten parameters for the other optimizers
    self.x_map = SFO(self.model.f_df, self.model.theta_init, self.model.subfunction_references)
    self.xinit_flat = self.x_map.theta_original_to_flat(self.model.theta_init)
    self.calculate_full_objective = calculate_full_objective

    # random projection matrix, scaled by the square root of the
    # flattened parameter dimension
    n_flat = self.xinit_flat.shape[0]
    projection = np.random.randn(num_projection_dims, n_flat)
    self.x_projection_matrix = projection/np.sqrt(n_flat)

    self.num_subfunctions = len(self.model.subfunction_references)
    self.full_objective_period = int(self.num_subfunctions/full_objective_per_pass)
def SFO_variations(self, num_passes=20):
    """
    Run SFO several times, each with a different configuration, recording
    every run under its own learner name.
    """
    # (learner name, reseed numpy's RNG before the run?, extra SFO keywords)
    # Only the first three runs are reseeded to make experiments repeatable.
    variations = (
        ('SFO standard', True, {}),
        ('SFO all active', True, {'init_subf': len(self.model.subfunction_references)}),
        ('SFO rank 1', True, {'hessian_algorithm': 'rank1'}),
        ('SFO random', False, {'subfunction_selection': 'random'}),
        ('SFO cyclic', False, {'subfunction_selection': 'cyclic'}),
    )

    for name, reseed, sfo_kwargs in variations:
        if reseed:
            np.random.seed(0)
        self.learner_name = name
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, self.model.subfunction_references, **sfo_kwargs)
        self.optimizer.optimize(num_passes=num_passes)
def explore_MN(burnin_steps=2, test_steps=2):
    """
    Time the SFO optimizer overhead on the toy model for a sweep over the
    number of dimensions M (at fixed N) followed by a sweep over the number
    of subfunctions N (at fixed M).

    burnin_steps passes are run first so the optimizer is warmed up, then
    test_steps further passes are timed. The time reported per case is the
    optimizer's own time (time_pass minus time_func) divided by the number
    of effective passes actually performed.

    Returns three arrays of equal length: the M values, the N values and
    the measured time per pass for each case.
    """
    # sweep over M with N held fixed
    # (a quicker alternative: N = 50 with np.linspace(1, 1e3, 4))
    N = 100
    M_arr = [int(M) for M in np.linspace(1, 1e6, 5)]
    N_arr = [int(N)] * len(M_arr)

    # sweep over N with M held fixed
    # (a quicker alternative: M = 1e3 with np.linspace(1, 50, 4))
    M = 1e6
    for N in np.linspace(1, 200, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []
    n_cases = len(M_arr)
    for case, (M, N) in enumerate(zip(M_arr, N_arr), start=1):
        # parenthesised single-argument print works on Python 2 and 3 alike
        print("case %d of %d, M=%g, N=%g" % (case, n_cases, M, N))

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df, model.theta_init, model.subfunction_references, display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in optimizer during burn-in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)

        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)

        # number of effective passes through the data during the timed phase
        actual_test_steps = float(steps1 - steps0)/float(N)
        T_arr.append(t_diff/actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
    """
    Train the model with SFO, recording the run under learner_name.
    Extra keyword arguments are forwarded to the SFO constructor.
    """
    self.learner_name = learner_name
    print("\n\n" + self.learner_name)

    theta_init = self.model.theta_init
    references = self.model.subfunction_references
    self.optimizer = SFO(self.f_df_wrapper, theta_init, references, **kwargs)

    # to check the gradients, call self.optimizer.check_grad() here
    self.optimizer.optimize(num_passes=num_passes)
def __init__(self,
             model,
             calculate_full_objective=True,
             num_projection_dims=5,
             full_objective_per_pass=4):
    """
    Prepare the trainer: history containers, a parameter flattening helper
    and a random projection used to record low-dimensional parameter traces.

    model supplies f_df, theta_init and subfunction_references.
    calculate_full_objective is stored for later use by the wrapper.
    num_projection_dims is the number of rows of the random projection.
    full_objective_per_pass divides the number of subfunctions to obtain
    the recording period.
    """
    self.model = model

    # one defaultdict(list) per recorded quantity, keyed later by learner name
    self.history = {}
    for key in ('f', 'x_projection', 'events', 'x'):
        self.history[key] = defaultdict(list)

    # we use SFO to flatten/unflatten parameters for the other optimizers
    self.x_map = SFO(self.model.f_df, self.model.theta_init,
                     self.model.subfunction_references)
    self.xinit_flat = self.x_map.theta_original_to_flat(
        self.model.theta_init)
    self.calculate_full_objective = calculate_full_objective

    # random projection matrix, scaled by the square root of the
    # flattened parameter dimension
    n_flat = self.xinit_flat.shape[0]
    projection = np.random.randn(num_projection_dims, n_flat)
    self.x_projection_matrix = projection / np.sqrt(n_flat)

    self.num_subfunctions = len(self.model.subfunction_references)
    self.full_objective_period = int(self.num_subfunctions /
                                     full_objective_per_pass)
def explore_MN(burnin_steps=2, test_steps=2):
    """
    Time the SFO optimizer overhead on the toy model for a sweep over the
    number of dimensions M (at fixed N) followed by a sweep over the number
    of subfunctions N (at fixed M).

    burnin_steps passes are run first so the optimizer is warmed up, then
    test_steps further passes are timed. The time reported per case is the
    optimizer's own time (time_pass minus time_func) divided by the number
    of effective passes actually performed.

    Returns three arrays of equal length: the M values, the N values and
    the measured time per pass for each case.
    """
    # sweep over M with N held fixed
    # (a quicker alternative: N = 50 with np.linspace(1, 1e3, 4))
    N = 100
    M_arr = [int(M) for M in np.linspace(1, 1e6, 5)]
    N_arr = [int(N)] * len(M_arr)

    # sweep over N with M held fixed
    # (a quicker alternative: M = 1e3 with np.linspace(1, 50, 4))
    M = 1e6
    for N in np.linspace(1, 200, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []
    n_cases = len(M_arr)
    for case, (M, N) in enumerate(zip(M_arr, N_arr), start=1):
        # parenthesised single-argument print works on Python 2 and 3 alike
        print("case %d of %d, M=%g, N=%g" % (case, n_cases, M, N))

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df,
                        model.theta_init,
                        model.subfunction_references,
                        display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in optimizer during burn-in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)

        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)

        # number of effective passes through the data during the timed phase
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append(t_diff / actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
# Problem dimensions.
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.0)  # number minibatches

# Generate random training data.
v = randn(M, D)

# The subfunction-specific arguments: minibatch i takes every N-th
# column of the training data, starting at column i.
sub_refs = [v[:, start::N] for start in range(N)]

# Initialize parameters (drawn in the order W, b_h, b_v).
W_init = randn(J, M)
b_h_init = randn(J, 1)
b_v_init = randn(M, 1)
theta_init = {"W": W_init, "b_h": b_h_init, "b_v": b_v_init}

# Initialize the optimizer.
# To test the gradient of f_df, call optimizer.check_grad() after this line.
optimizer = SFO(f_df, theta_init, sub_refs)

# Run the optimizer for 1 pass through the data ...
theta = optimizer.optimize(num_passes=1)
# ... then continue for another 20 passes through the data.
theta = optimizer.optimize(num_passes=20)

# Plot the convergence trace.
convergence_trace = np.array(optimizer.hist_f_flat)
plt.plot(convergence_trace)
plt.xlabel("Iteration")
plt.ylabel("Minibatch Function Value")
plt.title("Convergence Trace")
plt.show()
class train:
    """
    Trains the model using a variety of optimization algorithms.
    This class also wraps the objective and gradient of the model,
    so that it can evaluate and store the full objective for each
    step in the optimization.

    This is WAY SLOWER than just calling the optimizers, because
    it evaluates the FULL objective and gradient instead of a single
    subfunction several times per pass.

    Designed to be used by figure_convergence.py.
    """

    def __init__(self, model, calculate_full_objective=True, num_projection_dims=5, full_objective_per_pass=4):
        """
        Set up history storage, the parameter flattening helper and the
        random projection used to record parameter traces.

        model must provide f_df, theta_init, subfunction_references and
        (when calculate_full_objective is true) full_objective_references.
        num_projection_dims is the number of rows of the random projection.
        full_objective_per_pass divides the number of subfunctions to give
        the period, in subfunction evaluations, between history records.
        """
        self.model = model
        # one list of records per learner name, for each recorded quantity
        self.history = {'f': defaultdict(list), 'x_projection': defaultdict(list), 'events': defaultdict(list), 'x': defaultdict(list)}

        # we use SFO to flatten/unflatten parameters for the other optimizers
        self.x_map = SFO(self.model.f_df, self.model.theta_init, self.model.subfunction_references)
        self.xinit_flat = self.x_map.theta_original_to_flat(self.model.theta_init)
        self.calculate_full_objective = calculate_full_objective

        M = self.xinit_flat.shape[0]
        self.x_projection_matrix = np.random.randn(num_projection_dims, M)/np.sqrt(M)

        self.num_subfunctions = len(self.model.subfunction_references)
        # may be 0 when there are fewer subfunctions than
        # full_objective_per_pass; f_df_wrapper then records every step
        self.full_objective_period = int(self.num_subfunctions/full_objective_per_pass)

    def f_df_wrapper(self, *args, **kwargs):
        """
        This (slightly hacky) function stands between the optimizer and the
        objective function. It evaluates the objective on the full function
        every full_objective_period times a subfunction is evaluated, and
        stores the history of the full objective function value.

        args[0] is the parameter value and args[1] the subfunction
        reference; everything is passed through to model.f_df, whose
        (f, df) result is returned unchanged.
        """
        ## call the true subfunction objective function, passing through all parameters
        f, df = self.model.f_df(*args, **kwargs)

        if len(self.history['f'][self.learner_name]) == 0:
            # this is the first time step for this learner
            self.last_f = np.inf
            self.last_idx = -1
            self.nsteps_this_learner = 0
        self.nsteps_this_learner += 1

        # only record the step once every self.full_objective_period steps.
        # The period is tested first so a period of 0 or 1 never reaches the
        # modulo (a modulo by zero would emit a RuntimeWarning).
        if (self.full_objective_period > 1 and
                self.nsteps_this_learner % self.full_objective_period != 1):
            return f, df

        # the full objective function on all subfunctions
        if self.calculate_full_objective:
            new_f = 0.
            for ref in self.model.full_objective_references:
                new_f += self.model.f_df(args[0], ref)[0]
        else:
            new_f = f

        events = dict()  # holds anything special about this step
        # a unique identifier for the current subfunction
        new_idx = id(args[1])
        if 'SFO' in self.learner_name:
            events = dict(self.optimizer.events)

        # append the full objective value, projections, etc to the history
        self.history['f'][self.learner_name].append(new_f)
        x_proj = np.dot(self.x_projection_matrix, self.x_map.theta_original_to_flat(args[0])).ravel()
        self.history['x_projection'][self.learner_name].append(x_proj)
        self.history['events'][self.learner_name].append(events)
        # only the most recent parameter value is kept for each learner
        self.history['x'][self.learner_name] = args[0]
        print("full f %g" % (new_f))

        # store the prior values
        self.last_f = new_f
        self.last_idx = new_idx

        return f, df

    def f_df_wrapper_flattened(self, x_flat, subfunction_references, *args, **kwargs):
        """
        Calculate the subfunction objective and gradient.
        Takes a 1d parameter vector, and returns a 1d gradient, even
        if the parameters for f_df are a list or a dictionary.
        x_flat should be the flattened version of the parameters.

        The objective and gradient are summed over every entry of
        subfunction_references.
        """
        x = self.x_map.theta_flat_to_original(x_flat)
        f = 0.
        df = 0.
        for sr in subfunction_references:
            fl, dfl = self.f_df_wrapper(x, sr, *args, **kwargs)
            dfl_flat = self.x_map.theta_original_to_flat(dfl)
            f += fl
            df += dfl_flat
        return f, df.ravel()

    def SGD(self, num_passes=20):
        """ Train model using SGD with various learning rates """
        # get the number of minibatches
        N = len(self.model.subfunction_references)
        # step through all the hyperparameters.  eta is step length.
        for eta in 10**np.linspace(-5, 2, 8):
            # label this convergence trace using the optimizer name and hyperparameter
            self.learner_name = "SGD %.4f" % eta
            print("\n\n" + self.learner_name)

            # initialize the parameters
            x = self.xinit_flat.copy()
            ## perform stochastic gradient descent
            for _ in range(num_passes*N):  # number of minibatch evaluations
                # choose a minibatch at random
                idx = np.random.randint(N)
                sr = self.model.subfunction_references[idx]
                # evaluate the objective and gradient for that minibatch
                fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1, 1)), (sr,))
                # update the parameters
                x -= dfl.reshape(x.shape) * eta
                # if the objective has diverged, skip the rest of the run for this hyperparameter
                if not np.isfinite(fl):
                    print("Non-finite subfunction.")
                    break

    def LBFGS(self, num_passes=20):
        """ Train model using LBFGS """
        self.learner_name = "LBFGS"
        print("\n\n" + self.learner_name)
        # the optimizer's return value is not needed; the trace is recorded
        # by f_df_wrapper as a side effect of each function evaluation
        fmin_l_bfgs_b(
            self.f_df_wrapper_flattened,
            self.xinit_flat.copy(),
            disp=1,
            args=(self.model.subfunction_references, ),
            maxfun=num_passes)

    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """
        Train model using SFO, recording the run under learner_name.
        Extra keyword arguments are forwarded to the SFO constructor.
        """
        self.learner_name = learner_name
        print("\n\n" + self.learner_name)

        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, self.model.subfunction_references, **kwargs)
        # to check the gradients, call self.optimizer.check_grad() here
        self.optimizer.optimize(num_passes=num_passes)

    def SFO_variations(self, num_passes=20):
        """
        Train model using several variations on the standard SFO algorithm.
        """
        # (learner name, reseed numpy's RNG first?, extra SFO keywords)
        # NOTE(review): the last two variations are not reseeded, exactly as
        # before this refactoring -- confirm whether that is intentional.
        variations = (
            ('SFO standard', True, {}),
            ('SFO all active', True, {'init_subf': len(self.model.subfunction_references)}),
            ('SFO rank 1', True, {'hessian_algorithm': 'rank1'}),
            ('SFO random', False, {'subfunction_selection': 'random'}),
            ('SFO cyclic', False, {'subfunction_selection': 'cyclic'}),
        )
        for name, reseed, sfo_kwargs in variations:
            if reseed:
                np.random.seed(0)  # make experiments repeatable
            self.SFO(num_passes=num_passes, learner_name=name, **sfo_kwargs)

    def SAG(self, num_passes=20):
        """ Train model using SAG with line search, for various initial Lipschitz """
        # larger L is easier, so start large
        for L in 10**(-np.linspace(-3, 3, 7)):
            self.learner_name = "SAG %.4f" % L
            print("\n\n" + self.learner_name)
            self.optimizer = SAG(self.f_df_wrapper_flattened, self.xinit_flat.copy(), self.model.subfunction_references, L=L)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")

    def LBFGS_minibatch(self, num_passes=20, data_fraction=0.1, num_steps=10):
        """ Perform LBFGS on minibatches of size data_fraction of the full datastep, and with num_steps LBFGS steps per minibatch."""
        self.learner_name = "LBFGS minibatch"

        n_refs = len(self.model.subfunction_references)
        x = self.xinit_flat.copy()
        for _ in range(num_passes):
            # draw a fresh minibatch of subfunctions, without replacement
            idx = random_choice(n_refs, int(data_fraction*n_refs), replace=False)
            sr = [self.model.subfunction_references[ii] for ii in idx]
            # continue from the point reached on the previous minibatch
            x, _, _ = fmin_l_bfgs_b(
                self.f_df_wrapper_flattened,
                x,
                args=(sr, ),
                disp=1,
                maxfun=num_steps)

    def SGD_momentum(self, num_passes=20):
        """ Train model using SGD with various learning rates and momentums"""
        learning_rates = 10**np.linspace(-5, 2, 8)
        momentums = np.array([0.5, 0.9, 0.95, 0.99])
        params = product(learning_rates, momentums)
        N = len(self.model.subfunction_references)
        for eta, momentum in params:
            self.learner_name = "SGD_momentum eta=%.5f, mu=%.2f" % (eta, momentum)
            print("\n\n" + self.learner_name)
            # most recent value seen for each subfunction (nan = never visited)
            f = np.ones((N))*np.nan
            x = self.xinit_flat.copy()
            # Previous step
            inc = 0.0
            for epoch in range(num_passes):
                for minibatch in range(N):
                    idx = np.random.randint(N)
                    sr = self.model.subfunction_references[idx]
                    fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1, 1)), (sr,))
                    inc = momentum * inc - eta * dfl.reshape(x.shape)
                    x += inc
                    f[idx] = fl
                    if not np.isfinite(fl):
                        print("Non-finite subfunction.  Ending run.")
                        break
                # leave the epoch loop as well; the message was already
                # printed once by the inner loop
                if not np.isfinite(fl):
                    break
            print(np.mean(f[np.isfinite(f)]), "average finite value at last evaluation")

    def ADA(self, num_passes=20):
        """ Train model using ADAgrad with various learning rates """
        for eta in 10**np.linspace(-3, 1, 5):
            self.learner_name = "ADAGrad %.4f" % eta
            print("\n\n" + self.learner_name)
            self.optimizer = ADAGrad(self.f_df_wrapper_flattened, self.xinit_flat.copy(), self.model.subfunction_references, learning_rate=eta)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")
def train(self, images,
        batch_size=50,
        num_epochs=20,
        method='SGD',
        train_means=False,
        train_top_layer=False,
        momentum=0.9,
        learning_rate=1.,
        decay1=0.9,
        decay2=0.999,
        precondition=True):
    """
    Fit the spatial LSTM layers and the MCGSM to the given images using
    minibatch optimization.

    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of images

    @type  batch_size: C{int}
    @param batch_size: number of samples per minibatch

    @type  num_epochs: C{int}
    @param num_epochs: number of passes through the training data

    @type  method: C{str}
    @param method: one of 'SFO', 'SGD' or 'ADAM' (case-insensitive)

    @type  train_means: C{bool}
    @param train_means: forwarded to the MCGSM parameter routines

    @type  train_top_layer: C{bool}
    @param train_top_layer: if true, only the last SLSTM layer is trained

    @type  momentum: C{float}
    @param momentum: momentum coefficient (used by 'SGD' only)

    @type  learning_rate: C{float}
    @param learning_rate: step width (used by 'SGD' and 'ADAM')

    @type  decay1: C{float}
    @param decay1: decay of the gradient running mean ('ADAM' only)

    @type  decay2: C{float}
    @param decay2: decay of the squared-gradient running mean ('ADAM' only)

    @type  precondition: C{bool}
    @param precondition: if true, inputs and outputs are preconditioned

    @rtype: C{list}
    @return: loss recorded per minibatch evaluation; for 'SFO' this is the
    optimizer's C{hist_f_flat}

    @raise ValueError: if C{method} is not recognized
    """

    # single-argument parenthesised print behaves identically on
    # Python 2 and Python 3
    print('Preprocessing...')

    inputs, outputs = self._preprocess(images)

    if precondition:
        print('Preconditioning...')

        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

    print('Creating SLSTMs...')

    # create SLSTMs
    for l in range(self.num_layers):
        self.slstm[l] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            # the first layer sees the image channels, later layers see
            # the hidden units of the layer below
            num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[l],
            verbosity=self.verbosity)

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        for l in train_layers:
            self.slstm[l].set_parameters(params['slstm'][l])
        self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means})

        # select batch and compute hidden activations
        Y = outputs[idx:idx + batch_size]
        H = inputs[idx:idx + batch_size]

        for l in range(self.num_layers):
            H = self.slstm[l].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        # negative sign turns the log-likelihood into a loss
        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # ignore bottom-right pixel (BSDS300)
        df_dh[:, -1, -1] = 0.

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta['slstm'] = [0.] * self.num_layers

        # backpropagate from the top layer down, stopping at the first
        # layer that is not being trained
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            if l > min(train_layers):
                # derivative with respect to inputs of layer l are derivatives
                # of hidden states of layer l - 1
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True)
                df_dh = df_dtheta['slstm'][l]['inputs']
                del df_dtheta['slstm'][l]['inputs']
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(H_flat, Y_flat,
            parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out

        return f, df_dtheta

    # collect current parameters; untrained layers keep the 0. placeholder
    params = {}
    params['slstm'] = [0.] * self.num_layers
    for l in range(self.num_layers)[::-1]:
        if l not in train_layers:
            break
        params['slstm'][l] = self.slstm[l].parameters()
    params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

    # a start index for each batch (a real list on Python 2 and 3 alike)
    start_indices = list(range(0, inputs.shape[0] - batch_size + 1, batch_size))

    print('Training...')

    if method.upper() == 'SFO':
        # optimize using sum-of-functions optimizer; created outside the
        # try block so that it is always defined when the history is returned
        optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

        try:
            params_opt = optimizer.optimize(num_passes=num_epochs)

            # set model parameters; only trained layers carry real
            # parameters, the others hold the 0. placeholder
            for l in train_layers:
                self.slstm[l].set_parameters(params_opt['slstm'][l])
            self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means})

        except KeyboardInterrupt:
            # deliberate best effort: an interrupt ends training early and
            # the history collected so far is still returned
            pass

        return optimizer.hist_f_flat

    elif method.upper() == 'SGD':
        loss = []
        # momentum buffers, mirroring the layout of params
        diff = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
                        params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

                # update MCGSM parameters
                diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

                if self.verbosity > 0:
                    # epoch, latest loss, mean over the most recent losses
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

        return loss

    elif method.upper() == 'ADAM':
        loss = []
        # running means of the gradient and of the squared gradient
        diff_mean = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}
        diff_sqrd = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff_mean['slstm'][l] = {}
            diff_sqrd['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
                diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        # step counter
        t = 1

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # include bias correction in step width
                step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                t += 1

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                            + (1. - decay1) * df['slstm'][l][key]
                        diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                            + (1. - decay2) * square(df['slstm'][l][key])

                        params['slstm'][l][key] = params['slstm'][l][key] - \
                            step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                # update MCGSM parameters
                diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
                diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
                params['mcgsm'] = params['mcgsm'] - \
                    step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                if self.verbosity > 0:
                    # epoch, latest loss, mean over the most recent losses
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

        return loss

    else:
        raise ValueError('Unknown method \'{0}\'.'.format(method))
# Problem dimensions.
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number minibatches

# Generate random training data.
v = randn(M, D)

# The subfunction-specific arguments: minibatch i takes every N-th
# column of the training data, starting at column i.
sub_refs = [v[:, start::N] for start in range(N)]

# Initialize parameters (drawn in the order W, b_h, b_v).
W_init = randn(J, M)
b_h_init = randn(J, 1)
b_v_init = randn(M, 1)
theta_init = {'W': W_init, 'b_h': b_h_init, 'b_v': b_v_init}

# Initialize the optimizer.
# To test the gradient of f_df, call optimizer.check_grad() after this line.
optimizer = SFO(f_df, theta_init, sub_refs)

# Run the optimizer for 1 pass through the data ...
theta = optimizer.optimize(num_passes=1)
# ... then continue for another 20 passes through the data.
theta = optimizer.optimize(num_passes=20)

# Plot the convergence trace.
convergence_trace = np.array(optimizer.hist_f_flat)
plt.plot(convergence_trace)
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
pp.axes(xlim=(-xlm, xlm), ylim=(-ylm, ylm)) pp.scatter(forward_data[-1,:,0],forward_data[-1,:,1],c='b',alpha=.2) #pp.figure(7) #pp.suptitle('Histogram: Model Density vs. Distance from Origin') #pp.axes(xlim=(0.25,2.25),ylim=(0,5),xlabel='Distance from Origin',ylabel='Probability Density') #pp.hist(np.sqrt(np.sum(samples[-1]**2,axis=1)),50,normed=True,color='r') #pp.figure(8) #pp.suptitle(r'Learned $\beta$ Schedule') #pp.axes(xlabel='t', ylabel=r'$\beta$') #pp.plot(np.arange(nsteps),(1.0/(1.0+np.exp(-opt_params[-1])))*beta_max) pp.show() exit() if automate_training: optimizer = SFO(f_df, init_params, subfuncs) end_loss=99.0 while end_loss>-2.50: linalgerror=False try: opt_params = optimizer.optimize(num_passes=2) end_loss = f_df(opt_params,fdata)[0] except np.linalg.linalg.LinAlgError: linalgerror=True if np.isnan(end_loss) or linalgerror: mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32) mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32) mu_biases=np.zeros(nhid_mu).astype(np.float32) mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32) mu_b=np.zeros((ntgates, nx)).astype(np.float32)
# Compiling the sampling function
# NOTE(review): sample_updates is not passed to theano.function (e.g. as
# updates=) -- confirm this is intended.
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)
sample_T = theano.function(
    [mu_centersT, mu_spreadsT, mu_biasesT, mu_MT, mu_bT,
     mu_t_centersT, mu_t_spreadsT,
     cov_centersT, cov_spreadsT, cov_biasesT, cov_MT, cov_bT,
     cov_t_centersT, cov_t_spreadsT],
    samplesT,
    allow_input_downcast=True)


def sample(params):
    """Evaluate the compiled sampling function.

    ``params`` must be indexable at positions 0..13; those fourteen entries
    are passed to ``sample_T`` in the order of its input list.
    """
    return sample_T(*(params[i] for i in range(14)))


if automate_training:
    optimizer = SFO(f_df, init_params, subfuncs)
    end_loss = 99.0
    # keep optimizing, two passes at a time, until the loss evaluated on
    # fdata drops to the target value
    while end_loss > -2.50:
        linalgerror = False
        try:
            opt_params = optimizer.optimize(num_passes=2)
            end_loss = f_df(opt_params, fdata)[0]
        except np.linalg.LinAlgError:
            # public name of the exception; the np.linalg.linalg module
            # path used previously is not part of NumPy's public API
            linalgerror = True
        if np.isnan(end_loss) or linalgerror:
            # the run failed: draw fresh initial values for the mu network
            mu_centers = (np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
            mu_spreads = (np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
            mu_biases = np.zeros(nhid_mu).astype(np.float32)
            mu_M = (np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
            mu_b = np.zeros((ntgates, nx)).astype(np.float32)
class train:
    """
    Trains the model using a variety of optimization algorithms.
    This class also wraps the objective and gradient of the model,
    so that it can evaluate and store the full objective for each
    step in the optimization.

    This is WAY SLOWER than just calling the optimizers, because
    it evaluates the FULL objective and gradient instead of a single
    subfunction several times per pass.

    Designed to be used by figure_convergence.py.
    """

    def __init__(self, model, calculate_full_objective=True, num_projection_dims=5, full_objective_per_pass=4):
        """
        Set up the history containers and the flattening helper.

        model: object providing f_df, theta_init, subfunction_references
            and (when calculate_full_objective is True)
            full_objective_references.
        calculate_full_objective: if True, recorded history entries hold
            the objective summed over model.full_objective_references;
            otherwise the value of the subfunction just evaluated.
        num_projection_dims: number of rows of the random projection
            matrix applied to the flattened parameters for the history.
        full_objective_per_pass: how many history entries to record per
            pass through the subfunctions.
        """
        self.model = model
        self.history = {
            'f': defaultdict(list),
            'x_projection': defaultdict(list),
            'events': defaultdict(list),
            'x': defaultdict(list)
        }

        # we use SFO to flatten/unflatten parameters for the other optimizers
        self.x_map = SFO(self.model.f_df, self.model.theta_init,
                         self.model.subfunction_references)
        self.xinit_flat = self.x_map.theta_original_to_flat(
            self.model.theta_init)
        self.calculate_full_objective = calculate_full_objective

        M = self.xinit_flat.shape[0]
        self.x_projection_matrix = np.random.randn(num_projection_dims, M) / np.sqrt(M)

        self.num_subfunctions = len(self.model.subfunction_references)
        self.full_objective_period = self._objective_period(
            self.num_subfunctions, full_objective_per_pass)

    @staticmethod
    def _objective_period(num_subfunctions, full_objective_per_pass):
        """
        Number of subfunction evaluations between two recorded history
        entries.  Clamped to at least 1 so that having fewer subfunctions
        than full_objective_per_pass never yields a period of 0 (which
        would make f_df_wrapper take a modulus by zero).
        """
        return max(1, int(num_subfunctions / full_objective_per_pass))

    def f_df_wrapper(self, *args, **kwargs):
        """
        This (slightly hacky) function stands between the optimizer and the
        objective function.  It evaluates the objective on the full function
        once every full_objective_period subfunction evaluations, and stores
        the history of the full objective function value.

        args[0] is the parameter value and args[1] the subfunction reference;
        everything is passed through to model.f_df unchanged.  Returns the
        (f, df) pair of the subfunction that was actually requested.
        """
        ## call the true subfunction objective function, passing through all parameters
        f, df = self.model.f_df(*args, **kwargs)

        if len(self.history['f'][self.learner_name]) == 0:
            # this is the first time step for this learner
            self.last_f = np.inf
            self.last_idx = -1
            self.nsteps_this_learner = 0
        self.nsteps_this_learner += 1

        # only record the step once every self.full_objective_period steps
        if (self.full_objective_period > 1
                and self.nsteps_this_learner % self.full_objective_period != 1):
            return f, df

        # the full objective function on all subfunctions
        if self.calculate_full_objective:
            new_f = 0.
            for ref in self.model.full_objective_references:
                new_f += self.model.f_df(args[0], ref)[0]
        else:
            new_f = f

        events = dict()  # holds anything special about this step
        # a unique identifier for the current subfunction
        new_idx = id(args[1])
        if 'SFO' in self.learner_name:
            events = dict(self.optimizer.events)

        # append the full objective value, projections, etc to the history
        self.history['f'][self.learner_name].append(new_f)
        x_proj = np.dot(self.x_projection_matrix,
                        self.x_map.theta_original_to_flat(args[0])).ravel()
        self.history['x_projection'][self.learner_name].append(x_proj)
        self.history['events'][self.learner_name].append(events)
        # only the most recent parameter value is kept (not a list)
        self.history['x'][self.learner_name] = args[0]
        print("full f %g" % (new_f))

        # store the prior values
        self.last_f = new_f
        self.last_idx = new_idx

        return f, df

    def f_df_wrapper_flattened(self, x_flat, subfunction_references, *args, **kwargs):
        """
        Calculate the subfunction objective and gradient.
        Takes a 1d parameter vector, and returns a 1d gradient, even
        if the parameters for f_df are a list or a dictionary.
        x_flat should be the flattened version of the parameters.

        The objective and gradient are summed over every reference in
        subfunction_references.
        """
        x = self.x_map.theta_flat_to_original(x_flat)
        f = 0.
        df = 0.
        for sr in subfunction_references:
            fl, dfl = self.f_df_wrapper(x, sr, *args, **kwargs)
            dfl_flat = self.x_map.theta_original_to_flat(dfl)
            f += fl
            df += dfl_flat
        return f, df.ravel()

    def SGD(self, num_passes=20):
        """ Train model using SGD with various learning rates """

        # get the number of minibatches
        N = len(self.model.subfunction_references)
        # step through all the hyperparameters.  eta is step length.
        for eta in 10**np.linspace(-5, 2, 8):
            # label this convergence trace using the optimizer name and hyperparameter
            self.learner_name = "SGD %.4f" % eta
            print("\n\n" + self.learner_name)

            # initialize the parameters
            x = self.xinit_flat.copy()
            ## perform stochastic gradient descent
            for _ in range(num_passes * N):  # number of minibatch evaluations
                # choose a minibatch at random
                idx = np.random.randint(N)
                sr = self.model.subfunction_references[idx]
                # evaluate the objective and gradient for that minibatch
                fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1, 1)), (sr, ))
                # update the parameters
                x -= dfl.reshape(x.shape) * eta
                # if the objective has diverged, skip the rest of the run for this hyperparameter
                if not np.isfinite(fl):
                    print("Non-finite subfunction.")
                    break

    def LBFGS(self, num_passes=20):
        """ Train model using LBFGS (at most num_passes function evaluations) """

        self.learner_name = "LBFGS"
        print("\n\n" + self.learner_name)
        fmin_l_bfgs_b(self.f_df_wrapper_flattened,
                      self.xinit_flat.copy(),
                      disp=1,
                      args=(self.model.subfunction_references, ),
                      maxfun=num_passes)

    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """ Train model using SFO.  Extra keyword arguments go to the SFO constructor. """
        self.learner_name = learner_name
        print("\n\n" + self.learner_name)

        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init,
                             self.model.subfunction_references, **kwargs)
        # # check the gradients
        # self.optimizer.check_grad()
        self.optimizer.optimize(num_passes=num_passes)

    def SFO_variations(self, num_passes=20):
        """
        Train model using several variations on the standard SFO algorithm.
        """
        refs = self.model.subfunction_references

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO standard'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, refs)
        self.optimizer.optimize(num_passes=num_passes)

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO all active'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, refs,
                             init_subf=len(refs))
        self.optimizer.optimize(num_passes=num_passes)

        np.random.seed(0)  # make experiments repeatable
        self.learner_name = 'SFO rank 1'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, refs,
                             hessian_algorithm='rank1')
        self.optimizer.optimize(num_passes=num_passes)

        # NOTE(review): the two runs below are not re-seeded, unlike the
        # three above -- confirm whether that is intended.
        self.learner_name = 'SFO random'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, refs,
                             subfunction_selection='random')
        self.optimizer.optimize(num_passes=num_passes)

        self.learner_name = 'SFO cyclic'
        print("\n\n" + self.learner_name)
        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, refs,
                             subfunction_selection='cyclic')
        self.optimizer.optimize(num_passes=num_passes)

    def SAG(self, num_passes=20):
        """ Train model using SAG with line search, for various initial Lipschitz """

        # larger L is easier, so start large
        for L in 10**(-np.linspace(-3, 3, 7)):
            self.learner_name = "SAG %.4f" % L
            print("\n\n" + self.learner_name)
            self.optimizer = SAG(self.f_df_wrapper_flattened,
                                 self.xinit_flat.copy(),
                                 self.model.subfunction_references,
                                 L=L)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")

    def LBFGS_minibatch(self, num_passes=20, data_fraction=0.1, num_steps=10):
        """ Perform LBFGS on minibatches of size data_fraction of the full datastep, and with num_steps LBFGS steps per minibatch."""

        self.learner_name = "LBFGS minibatch"

        refs = self.model.subfunction_references
        x = self.xinit_flat.copy()
        for epoch in range(num_passes):
            # draw a fresh subset of the subfunctions (without replacement)
            idx = random_choice(len(refs),
                                int(data_fraction * len(refs)),
                                replace=False)
            sr = [refs[ii] for ii in idx]
            x, _, _ = fmin_l_bfgs_b(self.f_df_wrapper_flattened,
                                    x,
                                    args=(sr, ),
                                    disp=1,
                                    maxfun=num_steps)

    def SGD_momentum(self, num_passes=20):
        """ Train model using SGD with various learning rates and momentums"""

        learning_rates = 10**np.linspace(-5, 2, 8)
        momentums = np.array([0.5, 0.9, 0.95, 0.99])
        N = len(self.model.subfunction_references)
        for eta, momentum in product(learning_rates, momentums):
            self.learner_name = "SGD_momentum eta=%.5f, mu=%.2f" % (eta, momentum)
            print("\n\n" + self.learner_name)
            # most recent value seen for each subfunction (NaN = never evaluated)
            f = np.ones((N)) * np.nan
            x = self.xinit_flat.copy()
            # Previous step
            inc = 0.0
            diverged = False
            for epoch in range(num_passes):
                for minibatch in range(N):
                    idx = np.random.randint(N)
                    sr = self.model.subfunction_references[idx]
                    fl, dfl = self.f_df_wrapper_flattened(
                        x.reshape((-1, 1)), (sr, ))
                    inc = momentum * inc - eta * dfl.reshape(x.shape)
                    x += inc
                    f[idx] = fl
                    if not np.isfinite(fl):
                        print("Non-finite subfunction.  Ending run.")
                        diverged = True
                        break
                if diverged:
                    # abandon this hyperparameter setting (message already printed)
                    break
            print(np.mean(f[np.isfinite(f)]), "average finite value at last evaluation")

    def ADA(self, num_passes=20):
        """ Train model using ADAgrad with various learning rates """

        for eta in 10**np.linspace(-3, 1, 5):
            self.learner_name = "ADAGrad %.4f" % eta
            print("\n\n" + self.learner_name)
            self.optimizer = ADAGrad(self.f_df_wrapper_flattened,
                                     self.xinit_flat.copy(),
                                     self.model.subfunction_references,
                                     learning_rate=eta)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")
def getOptimizer(self):
    """Create the SFO optimizer for this trainer.

    Fetches the minibatches from ``getSFOBatches()``, stores them on
    ``self.batches`` and returns an ``SFO`` instance built from the
    network's ``_getCost_dCost`` function, the initial parameters
    ``self.initial_p`` and those batches, with ``display=self.iprint``.
    """
    sfo_batches = self.getSFOBatches()
    self.batches = sfo_batches
    sfo_kwargs = {'display': self.iprint}
    return SFO(self.Net._getCost_dCost, self.initial_p, sfo_batches, **sfo_kwargs)
def train(
    self,
    images,
    batch_size=50,
    num_epochs=20,
    method="SGD",
    train_means=False,
    train_top_layer=False,
    momentum=0.9,
    learning_rate=1.0,
    decay1=0.9,
    decay2=0.999,
    precondition=True,
):
    """
    Train model via stochastic gradient descent (SGD), ADAM, or
    sum-of-functions optimizer (SFO).

    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of training images (e.g., Nx32x32x3)

    @type  batch_size: C{int}
    @param batch_size: batch size used by SGD

    @type  num_epochs: C{int}
    @param num_epochs: number of passes through the training set

    @type  method: C{str}
    @param method: either 'SGD', 'SFO', or 'ADAM' (case-insensitive)

    @type  train_means: C{bool}
    @param train_means: whether or not to optimize the mean parameters of the MCGSM

    @type  train_top_layer: C{bool}
    @param train_top_layer: if true, only the MCGSM and spatial LSTM at the top layer is trained

    @type  momentum: C{float}
    @param momentum: momentum rate used by SGD

    @type  learning_rate: C{float}
    @param learning_rate: learning rate used by SGD and ADAM

    @type  decay1: C{float}
    @param decay1: hyperparameter used by ADAM

    @type  decay2: C{float}
    @param decay2: hyperparameter used by ADAM

    @type  precondition: C{bool}
    @param precondition: whether or not to perform conditional whitening

    @rtype: C{list}
    @return: evolution of negative log-likelihood (bits per pixel) over the
        training; for method 'SFO' the optimizer's C{hist_f_flat} is returned

    @raise ValueError: if the images are smaller than the input mask or the
        method is unknown
    """

    if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]:
        raise ValueError("Images too small.")

    if self.verbosity > 0:
        print("Preprocessing...")

    inputs, outputs = self._preprocess(images)

    if precondition:
        if self.verbosity > 0:
            print("Preconditioning...")

        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

    if self.verbosity > 0:
        print("Creating SLSTMs...")

    # create SLSTMs
    for l in range(self.num_layers):
        self.slstm[l] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[l],
            verbosity=self.verbosity,
        )

    def apply_params(values):
        # copy parameters of the trained layers and the MCGSM into the model;
        # entries of untrained layers are 0.0 placeholders and must be skipped
        for l in train_layers:
            self.slstm[l].set_parameters(values["slstm"][l])
        self.mcgsm._set_parameters(values["mcgsm"], {"train_means": train_means})

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        apply_params(params)

        # select batch and compute hidden activations
        Y = outputs[idx : idx + batch_size]
        H = inputs[idx : idx + batch_size]

        for l in range(self.num_layers):
            H = self.slstm[l].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        # negative sign turns the log-likelihood into a loss
        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta["slstm"] = [0.0] * self.num_layers

        for l in reversed(range(self.num_layers)):
            if l not in train_layers:
                break
            if l > min(train_layers):
                # derivative with respect to inputs of layer l are derivatives
                # of hidden states of layer l - 1
                df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh, force_backward=True)
                df_dh = df_dtheta["slstm"][l]["inputs"]
                del df_dtheta["slstm"][l]["inputs"]
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta["mcgsm"] = (
            self.mcgsm._parameter_gradient(H_flat, Y_flat, parameters={"train_means": train_means})
            * log(2.0)
            * self.mcgsm.dim_out
        )

        return f, df_dtheta

    # collect current parameters
    params = {}
    params["slstm"] = [0.0] * self.num_layers
    for l in reversed(range(self.num_layers)):
        if l not in train_layers:
            break
        params["slstm"][l] = self.slstm[l].parameters()
    params["mcgsm"] = self.mcgsm._parameters({"train_means": train_means})

    # a start index for each batch
    start_indices = list(range(0, inputs.shape[0] - batch_size + 1, batch_size))

    if self.verbosity > 0:
        print("Training...")

    if method.upper() == "SFO":
        # built outside the try block so `optimizer` is always bound below
        optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

        try:
            # optimize using sum-of-functions optimizer
            params_opt = optimizer.optimize(num_passes=num_epochs)

            # set model parameters (trained layers only)
            apply_params(params_opt)

        except KeyboardInterrupt:
            # allow the user to stop training early and keep the history
            pass

        return optimizer.hist_f_flat

    elif method.upper() == "SGD":
        loss = []
        diff = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

        for l in train_layers:
            diff["slstm"][l] = {}
            for key in params["slstm"][l]:
                diff["slstm"][l][key] = zeros_like(params["slstm"][l][key])

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f / log(2.0) / self.num_channels)

                # update SLSTM parameters
                for l in train_layers:
                    for key in params["slstm"][l]:
                        diff["slstm"][l][key] = momentum * diff["slstm"][l][key] - df["slstm"][l][key]
                        params["slstm"][l][key] = params["slstm"][l][key] + learning_rate * diff["slstm"][l][key]

                # update MCGSM parameters
                diff["mcgsm"] = momentum * diff["mcgsm"] - df["mcgsm"]
                params["mcgsm"] = params["mcgsm"] + learning_rate * diff["mcgsm"]

                if self.verbosity > 0:
                    print("{0:>5} {1:>10.4f} {2:>10.4f}".format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                    ))

        # f_df only installs parameters at the start of a step, so push the
        # result of the final update into the model explicitly
        apply_params(params)

        return loss

    elif method.upper() == "ADAM":
        loss = []
        diff_mean = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}
        diff_sqrd = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

        for l in train_layers:
            diff_mean["slstm"][l] = {}
            diff_sqrd["slstm"][l] = {}
            for key in params["slstm"][l]:
                diff_mean["slstm"][l][key] = zeros_like(params["slstm"][l][key])
                diff_sqrd["slstm"][l][key] = zeros_like(params["slstm"][l][key])

        # step counter
        t = 1

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f / log(2.0) / self.num_channels)

                # include bias correction in step width
                step_width = learning_rate / (1.0 - power(decay1, t)) * sqrt(1.0 - power(decay2, t))
                t += 1

                # update SLSTM parameters
                for l in train_layers:
                    for key in params["slstm"][l]:
                        grad = df["slstm"][l][key]
                        diff_mean["slstm"][l][key] = decay1 * diff_mean["slstm"][l][key] + (1.0 - decay1) * grad
                        diff_sqrd["slstm"][l][key] = decay2 * diff_sqrd["slstm"][l][key] + (1.0 - decay2) * square(grad)

                        params["slstm"][l][key] = params["slstm"][l][key] - step_width * diff_mean["slstm"][l][key] / (
                            1e-8 + sqrt(diff_sqrd["slstm"][l][key])
                        )

                # update MCGSM parameters
                diff_mean["mcgsm"] = decay1 * diff_mean["mcgsm"] + (1.0 - decay1) * df["mcgsm"]
                diff_sqrd["mcgsm"] = decay2 * diff_sqrd["mcgsm"] + (1.0 - decay2) * square(df["mcgsm"])
                params["mcgsm"] = params["mcgsm"] - step_width * diff_mean["mcgsm"] / (
                    1e-8 + sqrt(diff_sqrd["mcgsm"])
                )

                if self.verbosity > 0:
                    print("{0:>5} {1:>10.4f} {2:>10.4f}".format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                    ))

        # push the result of the final update into the model
        apply_params(params)

        return loss

    else:
        raise ValueError("Unknown method '{0}'.".format(method))
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook, n_resample=20, resample_keepmem=False, bernoulli_x=False, display=0):
    """Optimize the parameters of `model` with the sum-of-functions optimizer.

    model: object providing gen_eps(n_batch), dL_dw(v, w, x, eps) and
        dlogpw_dw(v, w).
    x: dict of arrays whose columns are datapoints; the columns are
        shuffled in place before the minibatches are cut.
    v_init, w_init: initial parameter dicts, optimized jointly as
        {'v': v_init, 'w': w_init}.
    n_batch: minibatch size; must divide the number of columns exactly.
    n_passes: number of SFO passes; hook is called after each one.
    hook: callable hook(i, v, w, LB), where LB is the average over the
        subfunction evaluations of pass i of -f/n_batch (computed before
        the prior terms are added to f).
    n_resample: if > 0, after pass i every minibatch j with
        j % n_resample == i % n_resample is rebuilt (fresh noise from
        gen_eps, and fresh binarization when bernoulli_x is set) and
        swapped into the optimizer.
    resample_keepmem: forwarded to optimizer.replace_subfunction.
    bernoulli_x: if True, x['x'] of each minibatch is replaced by a
        binomial(n=1) draw with probabilities given by its values.
    display: forwarded to the SFO constructor.

    Raises ValueError when n_batch does not divide the dataset size.
    """

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # create minibatches
    n_tot = next(iter(x.values())).shape[1]
    if (n_tot % n_batch) != 0:
        raise ValueError("dataset size (%d) is not a multiple of n_batch (%d)" % (n_tot, n_batch))
    n_minibatches = n_tot // n_batch

    # Divide into minibatches
    def make_minibatch(i):
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    minibatches = [make_minibatch(i) for i in range(n_minibatches)]

    # one-element lists so the closure below can update the running totals
    L = [0.]
    n_L = [0]

    # each minibatch carries the fraction n_batch/n_tot of the prior
    prior_weight = float(n_batch) / n_tot

    def f_df(w, minibatch):
        x_minibatch = minibatch[1]
        eps_minibatch = minibatch[2]

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch, eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + prior_weight * gv_prior[i] for i in gv}
        gw = {i: gw[i] + prior_weight * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())

        # accumulate the per-datapoint bound reported to `hook`
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1

        f += prior_weight * logpv
        f += prior_weight * logpw

        # SFO minimizes, so negate and normalize per datapoint
        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch

        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    #optimizer.check_grad()

    # loop
    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem, minibatches[j])

    print("Finished!")
def fit(self, train_X, optimizer, param_init = None, sample_every=None): self.opt = optimizer n_train, n_vis = train_X.shape batch_size = self.batch_size if sample_every == None: sample_every = 10000000 #theano.config.profile = True #theano.config.exception_verbosity='high' assert(n_vis == self.nv) train_X = self.shared_dataset(train_X) n_batches = np.ceil(n_train / float(batch_size)).astype('int') # theano variables for managing data (index minibatches, n examples in batch) index, n_ex = T.iscalars('batch_index', 'n_ex') batch_start = index*batch_size batch_stop = T.minimum(n_ex, (index + 1)*batch_size) effective_batch_size = batch_stop - batch_start # theano variables for learning lr = T.scalar('lr', dtype=theano.config.floatX) mom = T.scalar('mom', dtype=theano.config.floatX) if self.k == 1: # this one is for scaning over a batch and getting connectivity for each example # return grads too because T.grads through scan is awful # takes ~3x longer, but can experiment connectivity #K, grads = self.mpf.rbm_K2G(self.X, effective_batch_size) # this tiles out the minibatch matrix into a 3D tensor to compute connectivity #K, offs, y, y1, z= self.mpf.rbm_K(self.X, effective_batch_size) K = self.mpf.rbm_K(self.X, effective_batch_size) elif self.k == 2: if DEBUG: return_values = self.mpf.debug_rbm_K_2wise(self.X, effective_batch_size) K = return_values[-1] else: K = self.mpf.rbm_K_2wise(self.X, effective_batch_size) else: raise('NotImplemented') reg = self.L1_reg * self.mpf.L1 + self.L2_reg * self.mpf.L2 reg_grad = T.grad(reg, self.mpf.theta) # if not scan (tile out matrix into tensor) cost = K + reg grads = T.grad(cost, self.mpf.theta) # otherwise #grads = grads + reg_grad if param_init == None: self.mpf.theta.set_value(random_theta(D, DH, k=self.k)) else: self.mpf.theta.set_value(np.asarray(np.concatenate(param_init), dtype=theano.config.floatX)) if optimizer == 'sgd': updates = [] theta = self.mpf.theta theta_update = self.mpf.theta_update upd = mom * theta_update - lr 
* grads updates.append((theta_update, upd)) updates.append((theta, theta + upd)) print 'compiling theano function' if DEBUG: return_values = list(return_values) return_values.append(cost) return_values.append(grads) train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=return_values, updates=updates, givens={self.X: train_X[batch_start:batch_stop]}) else: train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=cost, updates=updates, givens={self.X: train_X[batch_start:batch_stop]}) self.current_epoch = 0 start = time.time() learning_rate_init = self.learning_rate while self.current_epoch < self.n_epochs: print 'epoch:', self.current_epoch self.current_epoch += 1 effective_mom = self.final_momentum if self.current_epoch > self.momentum_switchover else self.initial_momentum avg_epoch_cost = 0 last_debug = None for minibatch_idx in xrange(n_batches): avg_cost = train_model(minibatch_idx, n_train, self.learning_rate, effective_mom) #print '\t\t', np.isnan(gr).sum(), np.isnan(yy).sum(), np.isnan(yy1).sum(), np.isnan(zz).sum() if DEBUG: return_values, avg_cost, gradients = avg_cost[:-2], avg_cost[-2], avg_cost[-1] print_debug(return_values, last_debug) last_debug = return_values avg_epoch_cost += avg_cost #print '\t', minibatch_idx, avg_cost print '\t avg epoch cost:', avg_epoch_cost/n_batches self.learning_rate *= self.learning_rate_decay theta_fit = split_theta(self.mpf.theta.get_value(), self.mpf.n_visible, self.mpf.n_hidden, k=self.mpf.k) if (self.current_epoch % sample_every == 0): sample_and_save(theta_fit, self.mpf.n_hidden, self.current_epoch, learning_rate_init, self.mpf.k, self.opt) theta_opt = self.mpf.theta.get_value() end = time.time() elif optimizer == 'cg' or optimizer == 'bfgs': print "compiling theano functions" get_batch_size = theano.function([index, n_ex], effective_batch_size, name='get_batch_size') batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, 
name='batch_cost') batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost') batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost') def train_fn_cost_grads(theta_value): print 'nbatches', n_batches self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True) train_losses_grads = [batch_cost_gradst(i, n_train) for i in xrange(n_batches)] train_losses = [i[0] for i in train_losses_grads] train_grads = [i[1] for i in train_losses_grads] train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)] print len(train_losses), len(train_grads) print train_losses[0].shape, train_grads[0].shape returns = np.average(train_losses, weights=train_batch_sizes), np.average(train_grads, weights=train_batch_sizes, axis=0) return returns def train_fn_cost(theta_value): print 'nbatches', n_batches self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True) train_costs = [batch_cost(i, n_train) for i in xrange(n_batches)] train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)] return np.average(train_costs, weights=train_batch_sizes) def train_fn_grads(theta_value): print 'nbatches', n_batches self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True) train_grads = [batch_grads(i, n_train) for i in xrange(n_batches)] train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)] return np.average(train_grads, weights=train_batch_sizes, axis=0) ############### # TRAIN MODEL # ############### def my_callback(): print 'wtf' from scipy.optimize import minimize from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b if optimizer == 'cg': pass elif optimizer == 'bfgs': print 'using bfgs' #theta_opt, f_theta_opt, info = fmin_l_bfgs_b(train_fn, self.mpf.theta.get_value(), iprint=1, maxfun=self.n_epochs) start = time.time() disp = 
True print 'ready to minimize' #result_obj = minimize(train_fn, self.mpf.theta.get_value(), jac=True, method='BFGS', options={'maxiter':self.n_epochs, 'disp':disp}, callback=my_callback()) #theta_opt = fmin_bfgs(f=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs) theta_opt, fff, ddd = fmin_l_bfgs_b(func=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs) print 'done minimize ya right' end = time.time() elif optimizer == 'sof': print "compiling theano functions" batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost') batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost') batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost') def train_fn(theta_value, i): self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True) train_losses, train_grads = batch_cost_grads(i, n_train) return train_losses, train_grads ############### # TRAIN MODEL # ############### if param_init == None: theta.set_value(random_theta(D, DH)) else: w0, bh0, bv0 = param_init self.mpf.theta.set_value(np.asarray(np.concatenate((w0, bh0, bv0)), dtype=theano.config.floatX)) print 'using sof' sys.path.append('/export/mlrg/ebuchman/Programming/Sum-of-Functions-Optimizer') from sfo import SFO print 'n batches', n_batches print 'n epochs' , self.n_epochs optimizer = SFO(train_fn, self.mpf.theta.get_value(), np.arange(n_batches)) start = time.time() theta_opt = optimizer.optimize(num_passes = self.n_epochs) end = time.time() self.mpf.theta.set_value(theta_opt.astype(theano.config.floatX), borrow=True) return end-start
# Compile the Theano sampling function from the symbolic sample graph.
# NOTE(review): `sample_updates` returned by get_samps is not passed to
# theano.function here -- confirm that this is intentional.
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)
sample_T = theano.function(
    [mu_centersT, mu_spreadsT, mu_biasesT, mu_MT, mu_bT,
     cov_centersT, cov_spreadsT, cov_biasesT, cov_MT, cov_bT],
    samplesT,
    allow_input_downcast=True)


def sample(params):
    """Evaluate the compiled sampler on the first 10 entries of ``params``.

    The entries are passed positionally, so they must be ordered exactly
    like the input list handed to ``theano.function`` above.
    """
    return sample_T(*params[:10])


if automate_training:
    optimizer = SFO(f_df, init_params, subfuncs)
    end_loss = 99.0
    # Optimize two passes at a time until the loss evaluated on `fdata`
    # drops to -2.50 or below.
    while end_loss > -2.50:
        linalgerror = False
        try:
            opt_params = optimizer.optimize(num_passes=2)
            end_loss = f_df(opt_params, fdata)[0]
        except np.linalg.LinAlgError:
            # Public name of the exception; the `np.linalg.linalg`
            # submodule path used previously is deprecated in recent NumPy.
            linalgerror = True
        if np.isnan(end_loss) or linalgerror:
            # The run produced NaN or a linear-algebra failure: draw fresh
            # initial values for the mean-network parameters.
            mu_centers = (np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
            mu_spreads = (np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
            mu_biases = np.zeros(nhid_mu).astype(np.float32)
            mu_M = (np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
            mu_b = np.zeros((ntgates, nx)).astype(np.float32)
def train(self, images,
        batch_size=50,
        num_epochs=20,
        method='SGD',
        train_means=False,
        train_top_layer=False,
        momentum=0.9,
        learning_rate=1.,
        decay1=0.9,
        decay2=0.999,
        precondition=True):
    """
    Train the model with SGD, ADAM or the sum-of-functions optimizer (SFO).

    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of images

    @type  batch_size: C{int}
    @param batch_size: number of preprocessed examples per minibatch

    @type  num_epochs: C{int}
    @param num_epochs: number of passes through the training set

    @type  method: C{str}
    @param method: either 'SGD', 'SFO', or 'ADAM' (case-insensitive)

    @type  train_means: C{bool}
    @param train_means: forwarded to the MCGSM parameter getters/setters

    @type  train_top_layer: C{bool}
    @param train_top_layer: if true, only the MCGSM and the top spatial LSTM are trained

    @type  momentum: C{float}
    @param momentum: momentum rate used by SGD

    @type  learning_rate: C{float}
    @param learning_rate: learning rate used by SGD and ADAM

    @type  decay1: C{float}
    @param decay1: hyperparameter used by ADAM

    @type  decay2: C{float}
    @param decay2: hyperparameter used by ADAM

    @type  precondition: C{bool}
    @param precondition: whether or not to run C{_precondition} on the data

    @rtype: C{list}
    @return: loss value of every minibatch step; for method 'SFO' the
        optimizer's C{hist_f_flat} is returned

    @raise ValueError: if the method is unknown
    """

    print('Preprocessing...')

    inputs, outputs = self._preprocess(images)

    if precondition:
        print('Preconditioning...')

        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

    print('Creating SLSTMs...')

    # create SLSTMs
    for l in range(self.num_layers):
        self.slstm[l] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[l],
            verbosity=self.verbosity)

    def apply_params(values):
        # copy parameters of the trained layers and the MCGSM into the model;
        # entries of untrained layers are 0. placeholders and must be skipped
        for l in train_layers:
            self.slstm[l].set_parameters(values['slstm'][l])
        self.mcgsm._set_parameters(values['mcgsm'], {'train_means': train_means})

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        apply_params(params)

        # select batch and compute hidden activations
        Y = outputs[idx:idx + batch_size]
        H = inputs[idx:idx + batch_size]

        for l in range(self.num_layers):
            H = self.slstm[l].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        # negative sign turns the log-likelihood into a loss
        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # ignore bottom-right pixel (BSDS300)
        df_dh[:, -1, -1] = 0.

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta['slstm'] = [0.] * self.num_layers

        for l in reversed(range(self.num_layers)):
            if l not in train_layers:
                break
            if l > min(train_layers):
                # derivative with respect to inputs of layer l are derivatives
                # of hidden states of layer l - 1
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True)
                df_dh = df_dtheta['slstm'][l]['inputs']
                del df_dtheta['slstm'][l]['inputs']
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(H_flat, Y_flat,
            parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out

        return f, df_dtheta

    # collect current parameters
    params = {}
    params['slstm'] = [0.] * self.num_layers
    for l in reversed(range(self.num_layers)):
        if l not in train_layers:
            break
        params['slstm'][l] = self.slstm[l].parameters()
    params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

    # a start index for each batch
    start_indices = list(range(
        0, inputs.shape[0] - batch_size + 1, batch_size))

    print('Training...')

    if method.upper() == 'SFO':
        # built outside the try block so `optimizer` is always bound below
        optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

        try:
            # optimize using sum-of-functions optimizer
            params_opt = optimizer.optimize(num_passes=num_epochs)

            # set model parameters (trained layers only)
            apply_params(params_opt)

        except KeyboardInterrupt:
            # allow the user to stop training early and keep the history
            pass

        return optimizer.hist_f_flat

    elif method.upper() == 'SGD':
        loss = []
        diff = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
                        params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

                # update MCGSM parameters
                diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

                if self.verbosity > 0:
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

        # f_df only installs parameters at the start of a step, so push the
        # result of the final update into the model explicitly
        apply_params(params)

        return loss

    elif method.upper() == 'ADAM':
        loss = []
        diff_mean = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}
        diff_sqrd = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff_mean['slstm'][l] = {}
            diff_sqrd['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
                diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        # step counter
        t = 1

        for n in range(num_epochs):
            for b in start_indices:
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # include bias correction in step width
                step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                t += 1

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        grad = df['slstm'][l][key]
                        diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                            + (1. - decay1) * grad
                        diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                            + (1. - decay2) * square(grad)

                        params['slstm'][l][key] = params['slstm'][l][key] - \
                            step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                # update MCGSM parameters
                diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
                diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
                params['mcgsm'] = params['mcgsm'] - \
                    step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                if self.verbosity > 0:
                    print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

        # push the result of the final update into the model
        apply_params(params)

        return loss

    else:
        raise ValueError('Unknown method \'{0}\'.'.format(method))
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook, n_resample=20, resample_keepmem=False, bernoulli_x=False, display=0):
    """Fit the parameter sets ``v`` and ``w`` of ``model`` with the SFO optimizer.

    The columns of ``x`` are shuffled, split into ``n_tot // n_batch``
    minibatches (each paired with noise from ``model.gen_eps``) and handed to
    ``sfo.SFO`` as subfunction references.  The optimizer is then run one pass
    at a time; after every pass ``hook`` is called and a subset of the
    minibatches is regenerated with fresh noise.

    Parameters
    ----------
    model : object
        Must provide ``gen_eps(n_batch)``, ``dL_dw(v, w, x, eps)`` returning
        ``(logpx, logpz, logqz, gv, gw)`` and ``dlogpw_dw(v, w)`` returning
        ``(logpv, logpw, gv_prior, gw_prior)``.
    x : dict
        Dataset; every value must expose ``.shape[1]`` (the number of data
        points).  It is shuffled through ``ndict.shuffleCols`` (the return
        value is not used, so presumably in place -- confirm in ``ndict``).
    v_init, w_init : dict
        Initial parameters, passed to SFO as ``{'v': v_init, 'w': w_init}``.
    n_batch : int
        Minibatch size; must divide the number of data points exactly.
    n_passes : int
        Number of single-pass ``optimizer.optimize`` calls.
    hook : callable
        Called as ``hook(i, v, w, LB)`` after pass ``i``, where ``LB`` is the
        mean over that pass's ``f_df`` calls of
        ``-(logpx + logpz - logqz).sum() / n_batch``.
    n_resample : int
        After pass ``i``, minibatch ``j`` is regenerated when
        ``i % n_resample == j % n_resample``.  Values ``<= 0`` disable this.
    resample_keepmem : bool
        Forwarded as the second argument of ``optimizer.replace_subfunction``.
    bernoulli_x : bool
        If true, ``_x['x']`` of each minibatch is replaced by a binomial
        (n=1) draw with the original values as probabilities.
    display : int
        Forwarded to the ``SFO`` constructor.

    Raises
    ------
    ValueError
        If the number of data points is not a multiple of ``n_batch``.
    """
    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # Number of data points = number of columns of (any) entry of x.
    n_tot = next(iter(x.values())).shape[1]
    if n_tot % n_batch != 0:
        raise ValueError(
            'Number of data points ({0}) is not divisible by n_batch ({1}).'.format(n_tot, n_batch))
    # Floor division keeps this an int regardless of the division semantics in effect.
    n_minibatches = n_tot // n_batch

    def make_minibatch(i):
        """Return ``[i, columns of minibatch i, freshly generated noise]``."""
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    minibatches = [make_minibatch(i) for i in range(n_minibatches)]

    # One-element lists so that f_df can update the running totals in place.
    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):
        """Return the per-datapoint objective and its gradients for one minibatch."""
        _, x_minibatch, eps_minibatch = minibatch
        prior_weight = float(n_batch) / n_tot
        scale = -1. / n_batch

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch, eps_minibatch)

        # Get gradient w.r.t. priors, weighted by the minibatch's share of the data
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {k: gv[k] + prior_weight * gv_prior[k] for k in gv}
        gw = {k: gw[k] + prior_weight * gw_prior[k] for k in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())

        # Track the prior-free term for the value reported to `hook`.
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1

        f += prior_weight * logpv
        f += prior_weight * logpw

        # Negate and normalise per data point.
        for k in gv:
            gv[k] *= scale
        for k in gw:
            gw[k] *= scale
        f *= scale

        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0

        # Reset noise epsilon of some minibatches
        if n_resample > 0:
            for j in range(n_minibatches):
                if i % n_resample == j % n_resample:
                    minibatches[j] = make_minibatch(j)
                    optimizer.replace_subfunction(j, resample_keepmem, minibatches[j])

    print("Finished!")
# Compiling the sampling function samplesT, tT, sample_updates=get_samps(nsamps, paramsT) sample_T=theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T, covW0T, covW1T, covW2T, covb0T, covb1T, covb2T], samplesT, allow_input_downcast=True) def sample(params): out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5], params[6],params[7],params[8],params[9],params[10],params[11]) return out # Creating the optimizer optimizer = SFO(f_df, init_params, subfuncs) # Running the optimization init_loss = f_df(init_params,subfuncs[0])[0] print init_loss keyin='' while keyin!='y': opt_params = optimizer.optimize(num_passes=24*4) end_loss = f_df(opt_params,subfuncs[0])[0] print 'Current loss: ', end_loss W=opt_params[0] pp.scatter(W[0,:],W[1,:]); pp.show() keyin=raw_input('End optimization? (y)')
# Compiling the sampling function samplesT, tT, sample_updates=get_samps(nsamps, paramsT) sample_T=theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T, covW0T, covW1T, covW2T, covb0T, covb1T, covb2T], samplesT, allow_input_downcast=True) def sample(params): out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5], params[6],params[7],params[8],params[9],params[10],params[11]) return out # Creating the optimizer optimizer = SFO(f_df, init_params, subfuncs) # Running the optimization init_loss = f_df(init_params,subfuncs[0])[0] print init_loss keyin='' while keyin!='y': opt_params = optimizer.optimize(num_passes=12) end_loss = f_df(opt_params,subfuncs[0])[0] print 'Current loss: ', end_loss W=opt_params[0] pp.scatter(W[0,:],W[1,:]); pp.show() keyin=raw_input('End optimization? (y)')