def test_zero_optimal(self): """ minimizes the kl divergence between q and p using batch gradient descent and checks that the result is zero""" rng = np.random.RandomState([1,2,3]) dim = self.dim num_trials = 3 mu = rng.randn(dim).astype(floatX) beta = rng.uniform(.1,10.,(dim,)).astype(floatX) self.p.mu.set_value(mu) mu = rng.randn(dim).astype(floatX) self.q.mu.set_value(mu) self.p.beta.set_value(beta) beta = rng.uniform(.1,10.,(dim,)).astype(floatX) self.q.beta.set_value(beta) kl = kl_divergence(self.q,self.p) p = self.p q = self.q optimizer = BatchGradientDescent( max_iter = 100, line_search_mode = 'exhaustive', verbose = True, objective = kl, conjugate = True, params = [ p.mu, p.beta, q.mu, q.beta ], param_constrainers = [ p.censor_updates, q.censor_updates ]) #optimizer.verbose = True kl = optimizer.minimize() if kl < 0.: if config.floatX == 'float32': neg_tol = 4.8e-7 else: neg_tol = 0. if kl < - neg_tol: raise AssertionError("KL divergence should " "be non-negative but is "+ str(kl)) warnings.warn("KL divergence is not very numerically stable, evidently") tol = 5.4e-5 if kl > tol: print 'kl:',kl print 'tol:',tol assert kl <= tol assert not (kl > tol )
def createGradientFunctions(self):
    """Build the symbolic objective and compile the Theano functions.

    Stores on ``self``: ``negKL``, ``f``, ``logLike``, ``gradientfunction``
    and ``lowerboundfunction`` (all compiled with ``th.function``) and
    ``optimizer`` (a ``BatchGradientDescent`` minimizing ``-logp`` over
    ``[mu, logSigma, logLambd]`` with inputs ``[X, u, v]``).
    """
    # Symbolic inputs.
    X = T.dmatrices("X")
    # NOTE(review): mu, logSigma and f are rebound below and R is never used
    # in this block; only u and v survive from this unpacking.
    mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f", "R")

    # Randomly initialised variational parameters, one row per theta entry.
    mu = sharedX(np.random.normal(10, 10, (self.dimTheta, 1)), name='mu')
    logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)), name='logSigma')
    # NOTE(review): this shared logLambd is discarded by the very next
    # statement, which rebinds the name to a plain symbolic matrix with both
    # axes broadcastable -- confirm which of the two is intended.
    logLambd = sharedX(np.matrix(np.random.uniform(0, 10)), name='logLambd')
    logLambd = T.patternbroadcast(T.dmatrix("logLambd"), [1, 1])

    negKL = 0.5 * T.sum(1 + 2 * logSigma - mu**2 - T.exp(logSigma)**2)

    # theta = mu + sigma * v, with v supplied by the caller.
    theta = mu + T.exp(logSigma) * v
    W = theta

    # First column of X is the target, the remaining columns are regressors.
    y = X[:, 0]
    X_sim = X[:, 1:]
    f = (T.dot(X_sim, W) + u).flatten()

    gradvariables = [mu, logSigma, logLambd]

    # Gaussian log-density of y around f with log standard deviation logLambd.
    logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 * ((y - f) / (T.exp(logLambd)))**2)

    # presumably self.m is the number of data points -- TODO confirm
    logp = (negKL + logLike) / self.m
    # The optimizer minimizes, so hand it the negated bound.
    optimizer = -logp

    # NOTE(review): mu and logSigma are the sharedX objects created above;
    # verify that th.function accepts them in its inputs list.
    self.negKL = th.function([mu, logSigma], negKL, on_unused_input='ignore')
    self.f = th.function(gradvariables + [X, u, v], f, on_unused_input='ignore')
    self.logLike = th.function(gradvariables + [X, u, v], logLike, on_unused_input='ignore')

    # Gradients w.r.t. each entry of gradvariables, with logp appended last.
    derivatives = T.grad(logp, gradvariables)
    derivatives.append(logp)

    self.gradientfunction = th.function(gradvariables + [X, u, v], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [X, u, v], logp, on_unused_input='ignore')

    self.optimizer = BatchGradientDescent(objective=optimizer, params=gradvariables, inputs=[X, u, v], conjugate=True, max_iter=1)
def test_zero_optimal(self):
    """Check that gradient descent drives KL(q, p) down to (nearly) zero.

    p and q get random means and precisions, every parameter of both is
    optimized, and the returned objective must lie in [0, tol].
    """
    rng = np.random.RandomState([1, 2, 3])
    dim = self.dim
    num_trials = 3

    def draw_mu():
        return rng.randn(dim).astype(floatX)

    def draw_beta():
        return rng.uniform(.1, 10., (dim, )).astype(floatX)

    # Keep the draw order p.mu, p.beta, q.mu, q.beta so the values
    # produced by the seeded generator stay the same.
    p_mu = draw_mu()
    p_beta = draw_beta()
    self.p.mu.set_value(p_mu)
    self.q.mu.set_value(draw_mu())
    self.p.beta.set_value(p_beta)
    self.q.beta.set_value(draw_beta())

    objective = kl_divergence(self.q, self.p)
    p = self.p
    q = self.q

    optimizer = BatchGradientDescent(
        objective=objective,
        params=[p.mu, p.beta, q.mu, q.beta],
        param_constrainers=[p.censor_updates, q.censor_updates])

    result = optimizer.minimize()

    if result < 0.:
        raise AssertionError("KL divergence should "
                             "be non-negative but is " + str(result))

    tol = 5.4e-5
    assert result <= tol
    assert not (result > tol)
def fit(self, params=None, l1=.0, l2=.0):
    """Minimize the regularized symbolic loss with batch gradient descent.

    Parameters
    ----------
    params : list, optional
        Variables to optimize; defaults to ``[self.eta]``.
    l1 : float
        Weight of the L1 penalty summed over `params`.
    l2 : float
        Weight of the squared-L2 penalty summed over `params`.
    """
    nll = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta,
                             self.eps)

    if params is None:
        params = [self.eta]

    # One pass over params, collecting both symbolic penalty terms.
    penalty_terms = [(T.sum(abs(p)), T.sum(p**2)) for p in params]
    l1_penalty = sum((abs_term for abs_term, _ in penalty_terms), .0)
    l2_penalty = sum((sq_term for _, sq_term in penalty_terms), .0)

    objective = nll + l1 * l1_penalty + l2 * l2_penalty

    BatchGradientDescent(objective=objective,
                         params=params,
                         inputs=[],
                         verbose=1).minimize()
def fit(self, params=None, l1=.0, l2=.0):
    """
    Fit the model by minimizing the Leave One Out (LOO) loss, plus optional
    L1 / squared-L2 penalties on `params`, using batch gradient descent.

    When `params` is None only ``self.eta`` is optimized.
    """
    base_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                   self.eta, self.eps)

    trainable = [self.eta] if params is None else params

    # Symbolic accumulators for the two regularization terms.
    abs_sum = .0
    sq_sum = .0
    for theta in trainable:
        abs_sum += T.sum(abs(theta))
        sq_sum += T.sum(theta**2)

    penalized_loss = base_loss + l1 * abs_sum + l2 * sq_sum

    descent = BatchGradientDescent(objective=penalized_loss,
                                   params=trainable,
                                   inputs=[],
                                   verbose=1)
    descent.minimize()
def setup(self, model, dataset): """ Allows the training algorithm to do some preliminary configuration *before* we actually start training the model. The dataset is provided in case other derived training algorithms need to modify model based on the dataset. Parameters ---------- model : object A Python object representing the model to train loosely \ implementing the interface of models.model.Model. dataset : pylearn2.datasets.dataset.Dataset Dataset object used to draw training data """ self.model = model if self.cost is None: self.cost = model.get_default_cost() if self.batch_size is None: self.batch_size = model.force_batch_size else: batch_size = self.batch_size if self.set_batch_size: model.set_batch_size(batch_size) elif hasattr(model, 'force_batch_size'): if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size): raise ValueError("batch_size is %d but " + "model.force_batch_size is %d" % (batch_size, model.force_batch_size)) self.monitor = Monitor.get_monitor(model) self.monitor.set_theano_function_mode(self.theano_function_mode) data_specs = self.cost.get_data_specs(model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space, # named according to the sources. 
theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = 'BGD_[%s]' % source arg = space.make_theano_batch(name=name) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with their data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, **fixed_var_descr.fixed_vars) grads, grad_updates = self.cost.get_gradients( model, nested_args, **fixed_var_descr.fixed_vars) assert isinstance(grads, OrderedDict) assert isinstance(grad_updates, OrderedDict) if cost_value is None: raise ValueError("BGD is incompatible with " + str(self.cost) + " because it is intractable, but BGD uses the " + "cost function value to do line searches.") # obj_prereqs has to be a list of function f called with f(*data), # where data is a data tuple coming from the iterator. 
# this function enables capturing "mapping" and "f", while # enabling the "*data" syntax def capture(f, mapping=mapping): new_f = lambda *args: f(mapping.flatten(args, return_tuple=True)) return new_f obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch] if self.monitoring_dataset is not None: if (self.monitoring_batch_size is None and self.monitoring_batches is None): self.monitoring_batch_size = self.batch_size self.monitoring_batches = self.batches_per_iter self.monitor.setup(dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.monitoring_batch_size, num_batches=self.monitoring_batches, obj_prereqs=obj_prereqs, cost_monitoring_args=fixed_var_descr.fixed_vars) params = model.get_params() self.optimizer = BatchGradientDescent( objective=cost_value, gradients=grads, gradient_updates=grad_updates, params=params, param_constrainers=[model.censor_updates], lr_scalers=model.get_lr_scalers(), inputs=theano_args, verbose=self.verbose_optimization, max_iter=self.updates_per_batch, reset_alpha=self.reset_alpha, conjugate=self.conjugate, reset_conjugate=self.reset_conjugate, min_init_alpha=self.min_init_alpha, line_search_mode=self.line_search_mode, theano_function_mode=self.theano_function_mode, init_alpha=self.init_alpha) # These monitoring channels keep track of shared variables, # which do not need inputs nor data. if self.monitoring_dataset is not None: self.monitor.add_channel( name='ave_step_size', ipt=None, val=self.optimizer.ave_step_size, data_specs=(NullSpace(), ''), dataset=self.monitoring_dataset.values()[0]) self.monitor.add_channel( name='ave_grad_size', ipt=None, val=self.optimizer.ave_grad_size, data_specs=(NullSpace(), ''), dataset=self.monitoring_dataset.values()[0]) self.monitor.add_channel( name='ave_grad_mult', ipt=None, val=self.optimizer.ave_grad_mult, data_specs=(NullSpace(), ''), dataset=self.monitoring_dataset.values()[0]) self.first = True self.bSetup = True
max_beta=beta), -1) J = nce(model, X, T.concatenate(Y, axis=0)) accs = [] for Y_i in Y: pos_prob = 1. / ( 1. + T.exp(model.free_energy(X) - model.free_energy(Y_i))) acc = (pos_prob > .5).mean() accs.append(acc) acc = sum(accs) / float(len(accs)) print '\tinit accuracy ', function([], acc)() #Minimize the objective function with batch gradient descent minimizer = BatchGradientDescent( objective=J, params=model.get_params(), param_constrainers=[model.censor_updates]) print '\tinit obj:', minimizer.obj() #minimizer.verbose = True minimizer.minimize() print '\tfinal obj:', minimizer.obj() recovered_beta = model.beta.get_value() recovered_mu = model.mu.get_value() print '\trecovered beta:', recovered_beta print '\trecovered mu:', recovered_mu kl = kl_divergence(true, model) kl = function([], kl)()
def test_batch_gradient_descent(): """ Verify that batch gradient descent works by checking that it minimizes a quadratic function f(x) = x^T A x + b^T x + c correctly for several sampled values of A, b, and c. The ground truth minimizer is x = np.linalg.solve(A,-b)""" n = 3 A = T.matrix(name='A') b = T.vector(name='b') c = T.scalar(name='c') x = sharedX(np.zeros((n, )), name='x') half = np.cast[config.floatX](0.5) obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c minimizer = BatchGradientDescent(objective=obj, params=[x], inputs=[A, b, c]) num_samples = 3 rng = np.random.RandomState([1, 2, 3]) for i in xrange(num_samples): A = np.cast[config.floatX](rng.randn(1.5 * n, n)) A = np.cast[config.floatX](np.dot(A.T, A)) A += np.cast[config.floatX](np.identity(n) * .02) b = np.cast[config.floatX](rng.randn(n)) c = np.cast[config.floatX](rng.randn()) x.set_value(np.cast[config.floatX](rng.randn(n))) analytical_x = np.linalg.solve(A, -b) actual_obj = minimizer.minimize(A, b, c) actual_x = x.get_value() #Check that the value returned by the minimize method #is the objective function value at the parameters #chosen by the minimize method cur_obj = minimizer.obj(A, b, c) assert np.allclose(actual_obj, cur_obj) x.set_value(analytical_x) analytical_obj = minimizer.obj(A, b, c) #make sure the objective function is accurate to first 4 digits condition1 = not np.allclose(analytical_obj, actual_obj) condition2 = np.abs(analytical_obj - actual_obj) >= 1e-4 * np.abs(analytical_obj) if (config.floatX == 'float64' and condition1) \ or (config.floatX == 'float32' and condition2): print 'objective function value came out wrong on sample ', i print 'analytical obj', analytical_obj print 'actual obj', actual_obj """ The following section of code was used to verify that numerical error can make the objective function look non-convex print 'Checking for numerically induced non-convex behavior' def f(x): return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c x.set_value(actual_x) 
minimizer._compute_grad(A,b,c) minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() x = actual_x.copy() prev = f(x) print prev step_size = 1e-4 x += step_size * d cur = f(x) print cur cur_sgn = np.sign(cur-prev) flip_cnt = 0 for i in xrange(10000): x += step_size * d prev = cur cur = f(x) print cur prev_sgn = cur_sgn cur_sgn = np.sign(cur-prev) if cur_sgn != prev_sgn: print 'flip' flip_cnt += 1 if flip_cnt > 1: print "Non-convex!" from matplotlib import pyplot as plt y = [] x = actual_x.copy() for j in xrange(10000): y.append(f(x)) x += step_size * d plt.plot(y) plt.show() assert False print 'None found' """ #print 'actual x',actual_x #print 'A:' #print A #print 'b:' #print b #print 'c:' #print c x.set_value(actual_x) minimizer._compute_grad(A, b, c) x_grad = minimizer.param_to_grad_shared[x] actual_grad = x_grad.get_value() correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot( A.T, x.get_value()) + b if not np.allclose(actual_grad, correct_grad): print 'gradient was wrong at convergence point' print 'actual grad: ' print actual_grad print 'correct grad: ' print correct_grad print 'max difference: ', np.abs(actual_grad - correct_grad).max() assert False minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \ + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d)) g = np.dot(A, actual_x) + b deriv = np.dot(g, d) print 'directional deriv at actual', deriv print 'optimal step_len', step_len optimal_x = actual_x - d * step_len g = np.dot(A, optimal_x) + b deriv = np.dot(g, d) print 'directional deriv at optimal: ', deriv x.set_value(optimal_x) print 'obj at optimal: ', minimizer.obj(A, b, c) print 'eigenvalue range:' val, vec = np.linalg.eig(A) print(val.min(), val.max()) print 'condition number: ', (val.max() / val.min()) assert False
p, h = state p_shape = layer.get_output_space().shape i = p_shape[0] / 2 j = p_shape[1] / 2 act = p[0,filter_idx,i,j] obj = - act + norm_penalty * T.square(X).sum() assert obj.ndim == 0 optimizer = BatchGradientDescent(objective = obj, params = [X], inputs = None, param_constrainers = None, max_iter = 1000, verbose = True, tol = None, init_alpha = (.001, .005, .01, .05, .1)) optimizer.minimize() img = X.get_value()[0,:,:,:] print 'max mag: ',np.abs(img).max() print 'norm: ',np.square(img).sum() print 'min: ',img.min() print 'max: ',img.max() img /= np.abs(img).max()
def setup(self, model, dataset):
    """
    Configure monitoring, the noise-resampling function and the optimizer
    *before* training starts.

    Parameters
    ----------
    model: a Python object representing the model to train loosely
    implementing the interface of models.model.Model.
    dataset: a pylearn2.datasets.dataset.Dataset object used to
    draw training data
    """
    self.model = model
    self.monitor = Monitor.get_monitor(model)

    X = T.matrix()
    Y = T.matrix()

    dnce = DNCE(self.noise)

    if self.monitoring_dataset is not None:
        if not self.monitoring_dataset.has_targets():
            Y = None

        self.monitor.set_dataset(dataset=self.monitoring_dataset,
                                 mode="sequential",
                                 batch_size=self.batch_size,
                                 num_batches=self.monitoring_batches)

        X.tag.test_value = self.monitoring_dataset.get_batch_design(2)

        channels = model.get_monitoring_channels(X, Y)
        if not isinstance(channels, dict):
            raise TypeError("model.get_monitoring_channels must return a "
                            "dictionary, but it returned " + str(channels))

        # noise_per_clean is set only while the monitored objective is
        # built, then cleared again.
        dnce.noise_per_clean = self.noise_per_clean
        monitored_obj = dnce(model, X)
        dnce.noise_per_clean = None

        self.monitor.add_channel('DNCE', ipt=X, val=monitored_obj)

        # Y does not change inside the loop, so the input spec is fixed.
        channel_ipt = X if Y is None else (X, Y)
        for name, entry in channels.items():
            if isinstance(entry, tuple):
                assert len(entry) == 2
                value, prereqs = entry
            else:
                value, prereqs = entry, None
            self.monitor.add_channel(name=name,
                                     ipt=channel_ipt,
                                     val=value,
                                     prereqs=prereqs)

    # From here on X and Y are shared variables: one clean batch and
    # noise_per_clean noise batches refreshed by self.update_noise.
    X = sharedX(dataset.get_batch_design(1), 'X')
    Y = []
    updates = {}
    for _ in xrange(self.noise_per_clean):
        noise_batch = sharedX(X.get_value().copy())
        updates[noise_batch] = self.noise.random_design_matrix(X)
        Y.append(noise_batch)
    self.update_noise = function([], updates=updates)

    train_obj = dnce(model, X, Y)

    self.optimizer = BatchGradientDescent(
        objective=train_obj,
        params=model.get_params(),
        param_constrainers=[model.censor_updates],
        max_iter=5)

    self.X = X
    self.Y = Y

    self.first = True
    self.bSetup = True
def setup(self, model, dataset): """ Allows the training algorithm to do some preliminary configuration *before* we actually start training the model. The dataset is provided in case other derived training algorithms need to modify model based on the dataset. Parameters ---------- model: a Python object representing the model to train loosely implementing the interface of models.model.Model. dataset: a pylearn2.datasets.dataset.Dataset object used to draw training data """ self.model = model if self.cost is None: self.cost = model.get_default_cost() if self.batch_size is None: self.batch_size = model.force_batch_size else: batch_size = self.batch_size if self.set_batch_size: model.set_batch_size(batch_size) elif hasattr(model, 'force_batch_size'): if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size): raise ValueError("batch_size is %d but model.force_batch_size is %d" % (batch_size, model.force_batch_size)) self.monitor = Monitor.get_monitor(model) self.monitor.set_theano_function_mode(self.theano_function_mode) X = self.model.get_input_space().make_theano_batch() X.name = 'BGD_X' self.topo = X.ndim != 2 if self.topo: assert self.model.get_input_space().axes == ('b', 0, 1, 'c') Y = T.matrix() Y.name = 'BGD_Y' if config.compute_test_value != 'off': X.tag.test_value = self.model.get_input_space().get_origin_batch(self.batch_size).astype(X.dtype) Y_batch = self.model.get_output_space().get_origin_batch(self.batch_size).astype(Y.dtype) assert Y_batch.ndim == 2 for i in xrange(Y_batch.shape[0]): Y_batch[i, i % Y_batch.shape[1]] = 1 Y.tag.test_value = Y_batch fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y) self.on_load_batch = fixed_var_descr.on_load_batch if not self.cost.supervised: Y = None if self.cost.supervised: obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars) grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars) ipt = (X,Y) else: obj = self.cost(model, X, ** 
fixed_var_descr.fixed_vars) grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars) ipt = X Y = None assert isinstance(grads, OrderedDict) assert isinstance(grad_updates, OrderedDict) if obj is None: raise ValueError("BGD is incompatible with "+str(self.cost)+" because " " it is intractable, but BGD uses the cost function value to do " " line searches.") # TODO: replace the following if block with a call to monitor.setup (it does the same thing; # this will reduce code duplication) # may need to still manually add some BGD-specific channels like ave_step_size here if self.monitoring_dataset is not None: if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]): Y = None channels = model.get_monitoring_channels(X,Y) if not isinstance(channels, dict): raise TypeError("model.get_monitoring_channels must return a " "dictionary, but it returned " + str(channels)) channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars)) for dataset_name in self.monitoring_dataset: if dataset_name == '': prefix = '' else: prefix = dataset_name + '_' monitoring_dataset = self.monitoring_dataset[dataset_name] self.monitor.add_dataset(dataset=monitoring_dataset, mode="sequential", batch_size=self.batch_size, num_batches=self.monitoring_batches) # The monitor compiles all channels for the same dataset into one function, and # runs all prereqs before calling the function. So we only need to register the # on_load_batch prereq once per monitoring dataset. 
self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj, dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch) for name in channels: J = channels[name] if isinstance(J, tuple): assert len(J) == 2 J, prereqs = J else: prereqs = None if Y is not None: ipt = (X,Y) else: ipt = X self.monitor.add_channel(name= prefix + name, ipt=ipt, val=J, dataset = monitoring_dataset, prereqs=prereqs) if self.cost.supervised: ipts = [X, Y] else: ipts = [X] params = model.get_params() self.optimizer = BatchGradientDescent( objective = obj, gradients = grads, gradient_updates = grad_updates, params = params, param_constrainers = [ model.censor_updates ], lr_scalers = model.get_lr_scalers(), inputs = ipts, verbose = self.verbose_optimization, max_iter = self.updates_per_batch, reset_alpha = self.reset_alpha, conjugate = self.conjugate, reset_conjugate = self.reset_conjugate, min_init_alpha = self.min_init_alpha, line_search_mode = self.line_search_mode, theano_function_mode=self.theano_function_mode, init_alpha=self.init_alpha) if self.monitoring_dataset is not None: self.monitor.add_channel(name='ave_step_size', ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0]) self.monitor.add_channel(name='ave_grad_size', ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0]) self.monitor.add_channel(name='ave_grad_mult', ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0]) self.first = True self.bSetup = True
def setup(self, model, dataset):
    """
    Allows the training algorithm to do some preliminary configuration
    *before* we actually start training the model. The dataset is provided
    in case other derived training algorithms need to modify model based on
    the dataset.

    This variant keeps the batch (X, optional Y) and the drop masks in
    shared variables, compiles `self.update_mask` to resample the masks,
    registers monitoring channels and builds `self.optimizer`.

    Parameters
    ----------
    model: a Python object representing the model to train loosely
    implementing the interface of models.model.Model.
    dataset: a pylearn2.datasets.dataset.Dataset object used to
    draw training data
    """
    self.model = model

    # NOTE(review): set_batch_size is called before the None fallback
    # below, so it can receive batch_size=None -- confirm this is intended.
    if self.set_batch_size:
        model.set_batch_size(self.batch_size)

    if self.batch_size is None:
        self.batch_size = model.force_batch_size

    # The model is handed the cost and mask generator used for training.
    model.cost = self.cost
    model.mask_gen = self.mask_gen

    self.monitor = Monitor.get_monitor(model)
    self.monitor.set_theano_function_mode(self.theano_function_mode)
    # A single prereq object, reused for every channel registered below.
    prereq = self.get_setup_batch_object()
    #We want to use big batches. We need to make several theano calls on each
    #batch. To avoid paying the GPU latency every time, we use a shared variable
    #but the shared variable needs to stay allocated during the time that the
    #monitor is working, and we don't want the monitor to increase the memory
    #overhead. So we make the monitor work off of the same shared variable
    space = model.get_input_space()
    X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
    self.space = space
    rng = np.random.RandomState([2012, 7, 20])
    # Placeholder 0/1 mask with the same shape as the input batch.
    test_mask = space.get_origin_batch(model.batch_size)
    test_mask = rng.randint(0, 2, test_mask.shape)
    if hasattr(self.mask_gen, 'sync_channels') and self.mask_gen.sync_channels:
        # With synced channels the last axis is dropped; only 4-D batches
        # are handled.
        if test_mask.ndim != 4:
            raise NotImplementedError()
        test_mask = test_mask[:, :, :, 0]
        assert test_mask.ndim == 3
    drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
    self.drop_mask = drop_mask
    assert drop_mask.ndim == test_mask.ndim

    Y = None
    drop_mask_Y = None
    if self.cost.supervised:
        Y = sharedX(
            model.get_output_space().get_origin_batch(model.batch_size),
            'BGD_Y')
        self.Y = Y
        # One 0/1 mask entry per example for the targets.
        test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
        drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                              name='drop_mask_Y')
        self.drop_mask_Y = drop_mask_Y
        dmx, dmy = self.mask_gen(X, Y)
        updates = OrderedDict([ (drop_mask, dmx),\
                (drop_mask_Y, dmy)] )
    else:
        updates = OrderedDict([(drop_mask, self.mask_gen(X))])

    obj = self.cost(model,
                    X,
                    Y,
                    drop_mask=drop_mask,
                    drop_mask_Y=drop_mask_Y)
    gradients, gradient_updates = self.cost.get_gradients(
        model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

    if hasattr(model.inference_procedure, 'V_dropout'):
        # Each dropout variable is resampled as a binomial(n=1) draw
        # scaled by 1 / include_prob, in the same update as the masks.
        include_prob = model.inference_procedure.include_prob
        theano_rng = MRG_RandomStreams(2012 + 11 + 20)
        for elem in flatten([
                model.inference_procedure.V_dropout,
                model.inference_procedure.H_dropout
        ]):
            updates[elem] = theano_rng.binomial(
                p=include_prob, size=elem.shape, dtype=elem.dtype,
                n=1) / include_prob
    # Calling this function resamples every mask registered in `updates`.
    self.update_mask = function([], updates=updates)

    if self.monitoring_dataset is not None:
        # Y is dropped when no monitoring dataset has targets; this also
        # affects `self.inputs` built further down.
        if not any([
                dataset.has_targets()
                for dataset in self.monitoring_dataset.values()
        ]):
            Y = None
        assert X.name is not None
        channels = model.get_monitoring_channels(X, Y)
        if not isinstance(channels, dict):
            raise TypeError("model.get_monitoring_channels must return a "
                            "dictionary, but it returned " + str(channels))
        assert X.name is not None
        # Channels reported by the cost are merged into the model's
        # channels; on a name clash the cost's entry wins.
        wtf = self.cost.get_monitoring_channels(model,
                                                X=X,
                                                Y=Y,
                                                drop_mask=drop_mask,
                                                drop_mask_Y=drop_mask_Y)
        for key in wtf:
            channels[key] = wtf[key]

        for dataset_name in self.monitoring_dataset:
            if dataset_name == '':
                prefix = ''
            else:
                prefix = dataset_name + '_'
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            self.monitor.add_dataset(dataset=monitoring_dataset,
                                     mode="sequential",
                                     batch_size=self.batch_size,
                                     num_batches=self.monitoring_batches)
            #we only need to put the prereq in once to make sure it gets run
            #adding it more times shouldn't hurt, but be careful
            #each time you say "self.setup_batch" you get a new object with a
            #different id, and if you install n of those the prereq will run n
            #times. It won't cause any wrong results, just a big slowdown
            warnings.warn(
                "This is weird-- ipt=(X,Y)=tell the monitor to replace X, Y with the givens dict, "
                " but you don't actually want them to be replaced.")
            ipt = X
            if Y is not None:
                ipt = [X, Y]
            self.monitor.add_channel(prefix + 'objective',
                                     ipt=ipt,
                                     val=obj,
                                     dataset=monitoring_dataset,
                                     prereqs=[prereq])

            for name in channels:
                J = channels[name]
                # A channel is either a bare value or a (value, prereqs)
                # pair.
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = []

                # Copy before appending so the channel's own prereq
                # sequence is not mutated.
                prereqs = list(prereqs)
                prereqs.append(prereq)

                # NOTE: `ipt` is rebound here and its last value is reused
                # by the ave_* channels registered after the optimizer.
                if Y is not None:
                    ipt = (X, Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=prefix + name,
                                         ipt=ipt,
                                         val=J,
                                         dataset=monitoring_dataset,
                                         prereqs=prereqs)

    # The optimizer only receives explicit inputs when batches are
    # combined (accumulate mode).
    self.accumulate = self.combine_batches > 1
    if self.accumulate:
        self.inputs = [
            elem for elem in [X, Y, drop_mask, drop_mask_Y]
            if elem is not None
        ]
    else:
        self.inputs = None

    self.optimizer = BatchGradientDescent(
        objective=obj,
        inputs=self.inputs,
        verbose=1,
        gradients=gradients,
        gradient_updates=gradient_updates,
        params=model.get_params(),
        lr_scalers=model.get_lr_scalers(),
        param_constrainers=[model.censor_updates],
        max_iter=self.max_iter,
        tol=3e-7,
        init_alpha=self.init_alpha,
        reset_alpha=self.reset_alpha,
        conjugate=self.conjugate,
        reset_conjugate=self.reset_conjugate,
        min_init_alpha=self.min_init_alpha,
        line_search_mode=self.line_search_mode,
        accumulate=self.accumulate,
        theano_function_mode=self.theano_function_mode)

    self.X = X

    if self.monitoring_dataset is not None:
        # Optimizer statistics, attached to the first monitoring dataset.
        # NOTE(review): `ipt` is only bound if the dataset loop above ran
        # at least once -- confirm monitoring_dataset is never empty.
        self.monitor.add_channel(
            name='ave_step_size',
            ipt=ipt,
            val=self.optimizer.ave_step_size,
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_size',
            ipt=ipt,
            val=self.optimizer.ave_grad_size,
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_mult',
            ipt=ipt,
            val=self.optimizer.ave_grad_mult,
            dataset=self.monitoring_dataset.values()[0])

    self.first = True
    self.bSetup = True
def setup(self, model, dataset): """ Allows the training algorithm to do some preliminary configuration *before* we actually start training the model. The dataset is provided in case other derived training algorithms need to modify model based on the dataset. Parameters ---------- model: a Python object representing the model to train loosely implementing the interface of models.model.Model. dataset: a pylearn2.datasets.dataset.Dataset object used to draw training data """ self.model = model if self.batch_size is None: self.batch_size = model.force_batch_size else: batch_size = self.batch_size if self.set_batch_size: model.set_batch_size(batch_size) elif hasattr(model, 'force_batch_size'): if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size): raise ValueError( "batch_size is %d but model.force_batch_size is %d" % (batch_size, model.force_batch_size)) self.monitor = Monitor.get_monitor(model) X = self.model.get_input_space().make_theano_batch() self.topo = X.ndim != 2 Y = T.matrix() if self.cost.supervised: obj = self.cost(model, X, Y) grads, grad_updates = self.cost.get_gradients(model, X, Y) ipt = (X, Y) else: obj = self.cost(model, X) grads, grad_updates = self.cost.get_gradients(model, X) ipt = X if obj is None: raise ValueError( "BGD is incompatible with " + str(self.cost) + " because " " it is intractable, but BGD uses the cost function value to do " " line searches.") if self.monitoring_dataset is not None: if not any([ dataset.has_targets() for dataset in self.monitoring_dataset.values() ]): Y = None channels = model.get_monitoring_channels(X, Y) if not isinstance(channels, dict): raise TypeError("model.get_monitoring_channels must return a " "dictionary, but it returned " + str(channels)) channels.update(self.cost.get_monitoring_channels(model, X, Y)) for dataset_name in self.monitoring_dataset: if dataset_name == '': prefix = '' else: prefix = dataset_name + '_' monitoring_dataset = self.monitoring_dataset[dataset_name] 
self.monitor.add_dataset(dataset=monitoring_dataset, mode="sequential", batch_size=self.batch_size, num_batches=self.monitoring_batches) self.monitor.add_channel(prefix + 'objective', ipt=ipt, val=obj, dataset=monitoring_dataset) for name in channels: J = channels[name] if isinstance(J, tuple): assert len(J) == 2 J, prereqs = J else: prereqs = None if Y is not None: ipt = (X, Y) else: ipt = X self.monitor.add_channel(name=prefix + name, ipt=ipt, val=J, dataset=monitoring_dataset, prereqs=prereqs) if ipt is X: ipts = [X] else: ipts = ipt self.optimizer = BatchGradientDescent( objective=obj, gradients=grads, gradient_updates=grad_updates, params=model.get_params(), param_constrainers=[model.censor_updates], lr_scalers=model.get_lr_scalers(), inputs=ipts, verbose=True, max_iter=self.updates_per_batch, reset_alpha=self.reset_alpha, conjugate=self.conjugate, reset_conjugate=self.reset_conjugate, min_init_alpha=self.min_init_alpha, line_search_mode=self.line_search_mode) self.first = True self.bSetup = True
outputs = model.fprop(normed, return_all=True) output = outputs[layer_idx] neuron = output[tuple(idxs)] from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent bgd = BatchGradientDescent(objective=-neuron, params=[X], inputs=None, max_iter=100, lr_scalers=None, verbose=3, tol=None, init_alpha=None, min_init_alpha=1e-3, reset_alpha=True, conjugate=True, gradients=None, gradient_updates=None, accumulate=False, theano_function_mode=None, param_constrainers=None) bgd.minimize() X = normed.eval()[:,:,:,0].transpose(1,2,0) import numpy as np X /= np.abs(X).max() print (X.min(), X.max())