class SGD(TrainingAlgorithm): """ SGD = (Minibatch) Stochastic Gradient Descent. A TrainingAlgorithm that does stochastic gradient descent on minibatches of training examples. For theoretical background on this algorithm, see Yoshua Bengio's machine learning course notes on the subject: http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html Parameters ---------- learning_rate : float The learning rate to use. Train object callbacks can change the learning rate after each epoch. SGD update_callbacks can change it after each minibatch. cost : pylearn2.costs.cost.Cost, optional Cost object specifying the objective function to be minimized. Optionally, may be None. In this case, SGD will call the model's get_default_cost method to obtain the objective function. batch_size : int, optional The size of the batch to be used. If not specified, the model will be asked for the batch size, so you must have specified the batch size there. (Some models are rigidly defined to only work with one batch size) monitoring_batch_size : int, optional The size of the monitoring batches. monitoring_batches : int, optional At the start of each epoch, we run "monitoring", to evaluate quantities such as the validation set error. monitoring_batches, if specified, determines the number of batches to draw from the iterator for each monitoring dataset. Unnecessary if not using monitoring or if `monitor_iteration_mode` is 'sequential' and `batch_size` is specified (number of batches will be calculated based on full dataset size). TODO: make it possible to specify different monitoring_batches for each monitoring dataset. The Monitor itself already supports this. monitoring_dataset : Dataset or dictionary, optional If not specified, no monitoring is used. If specified to be a Dataset, monitor on that Dataset. If specified to be dictionary, the keys should be string names of datasets, and the values should be Datasets. All monitoring channels will be computed for all monitoring Datasets and will have the dataset name and an underscore prepended to them. monitor_iteration_mode : str, optional The iteration mode used to iterate over the examples in all monitoring datasets. If not specified, defaults to 'sequential'. TODO: make it possible to specify different modes for different datasets. termination_criterion : instance of \ pylearn2.termination_criteria.TerminationCriterion, optional Used to determine when the algorithm should stop running. If not specified, runs forever--or more realistically, until external factors halt the python process (Kansas 1977). update_callbacks : list, optional If specified, each member of the list should be a callable that accepts an SGD instance as its only argument. All callbacks will be called with this SGD instance after each SGD step. learning_rule : training_algorithms.learning_rule.LearningRule, optional A learning rule computes the new parameter values given old parameters and first-order gradients. If learning_rule is None, sgd.SGD will update parameters according to the standard SGD learning rule: .. code-block:: none param := param - learning_rate * d cost / d param This argument allows more sophisticated learning rules, such as SGD with momentum. init_momentum : float, **DEPRECATED** option Use learning_rule instead. If None, does not use momentum otherwise, use momentum and initialize the momentum coefficient to init_momentum. Callbacks can change this over time just like the learning rate. 
If the gradient is the same on every step, then the update taken by the SGD algorithm is scaled by a factor of 1/(1-momentum). See section 9 of Geoffrey Hinton's "A Practical Guide to Training Restricted Boltzmann Machines" for details. set_batch_size : bool, optional Defaults to False. If True, and batch_size conflicts with model.force_batch_size, will call model.set_batch_size(batch_size) in an attempt to change model.force_batch_size train_iteration_mode : str, optional Defaults to 'shuffled_sequential'. The iteration mode to use for iterating through training examples. batches_per_iter : int, optional The number of batches to draw from the iterator over training examples. If iteration mode is 'sequential' or 'shuffled_sequential', this is unnecessary; when unspecified we will iterate over all examples. theano_function_mode : a valid argument to theano.function's \ 'mode' parameter, optional The theano mode to compile the updates function with. Note that pylearn2 includes some wraplinker modes that are not bundled with theano. See pylearn2.devtools. These extra modes let you do things like check for NaNs at every step, or record md5 digests of all computations performed by the update function to help isolate problems with nondeterminism. monitoring_costs : list, optional a list of Cost instances. The Monitor will also include all channels defined by these Costs, even though we don't train using them. seed : valid argument to np.random.RandomState, optional The seed used for the random number generate to be passed to the training dataset iterator (if any) """ def __init__(self, learning_rate, cost=None, batch_size=None, monitoring_batch_size=None, monitoring_batches=None, monitoring_dataset=None, monitor_iteration_mode='sequential', termination_criterion=None, update_callbacks=None, learning_rule = None, init_momentum = None, set_batch_size = False, train_iteration_mode = None, batches_per_iter=None, theano_function_mode = None, monitoring_costs=None, seed=[2012, 10, 5], discriminator_steps=1): self.discriminator_steps = discriminator_steps self.train_generator = 0 if isinstance(cost, (list, tuple, set)): raise TypeError("SGD no longer supports using collections of " + "Costs to represent a sum of Costs. Use " + "pylearn2.costs.cost.SumOfCosts instead.") if init_momentum: warnings.warn("init_momentum interface is deprecated and will " "become officially unsuported as of May 9, 2014. Please use the " "`learning_rule` parameter instead, providing an object of type " "`pylearn2.training_algorithms.learning_rule.Momentum` instead") # Convert to new interface under the hood. 
self.learning_rule = Momentum(init_momentum) else: self.learning_rule = learning_rule self.learning_rate = sharedX(learning_rate, 'learning_rate') self.cost = cost self.batch_size = batch_size self.set_batch_size = set_batch_size self.batches_per_iter = batches_per_iter self._set_monitoring_dataset(monitoring_dataset) self.monitoring_batch_size = monitoring_batch_size self.monitoring_batches = monitoring_batches self.monitor_iteration_mode = monitor_iteration_mode if monitoring_dataset is None: if monitoring_batch_size is not None: raise ValueError("Specified a monitoring batch size " + "but not a monitoring dataset.") if monitoring_batches is not None: raise ValueError("Specified an amount of monitoring batches " + "but not a monitoring dataset.") self.termination_criterion = termination_criterion self._register_update_callbacks(update_callbacks) if train_iteration_mode is None: train_iteration_mode = 'shuffled_sequential' self.train_iteration_mode = train_iteration_mode self.first = True self.rng = make_np_rng(seed, which_method=["randn","randint"]) self.theano_function_mode = theano_function_mode self.monitoring_costs = monitoring_costs def setup(self, model, dataset): """ Compiles the theano functions needed for the train method. Parameters ---------- model : a Model instance dataset : Dataset """ if self.cost is None: self.cost = model.get_default_cost() inf_params = [param for param in model.get_params() if np.any(np.isinf(param.get_value()))] if len(inf_params) > 0: raise ValueError("These params are Inf: "+str(inf_params)) if any([np.any(np.isnan(param.get_value())) for param in model.get_params()]): nan_params = [param for param in model.get_params() if np.any(np.isnan(param.get_value()))] raise ValueError("These params are NaN: "+str(nan_params)) self.model = model self._synchronize_batch_size(model) model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() # test if force batch size and batch size if getattr(model, "force_batch_size", False) and \ any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for dataset in self.monitoring_dataset.values()) and \ not has_uniform_batch_size(self.monitor_iteration_mode): raise ValueError("Dataset size is not a multiple of batch size." "You should set monitor_iteration_mode to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential") data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. 
theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? cost_value.name = 'objective' # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost learning_rate = self.learning_rate if self.monitoring_dataset is not None: if (self.monitoring_batch_size is None and self.monitoring_batches is None): self.monitoring_batch_size = self.batch_size self.monitoring_batches = self.batches_per_iter self.monitor.setup(dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.monitoring_batch_size, num_batches=self.monitoring_batches, extra_costs=self.monitoring_costs, mode=self.monitor_iteration_mode) dataset_name = self.monitoring_dataset.keys()[0] monitoring_dataset = self.monitoring_dataset[dataset_name] #TODO: have Monitor support non-data-dependent channels self.monitor.add_channel(name='learning_rate', ipt=None, val=learning_rate, data_specs=(NullSpace(), ''), dataset=monitoring_dataset) if self.learning_rule: self.learning_rule.add_channels_to_monitor( self.monitor, monitoring_dataset) params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = 'sgd_params[%d]' % i self.params = params grads, updates = self.cost.get_gradients(model, nested_args, ** fixed_var_descr.fixed_vars) if not isinstance(grads, OrderedDict): raise TypeError(str(type(self.cost)) + ".get_gradients returned " + "something with " + str(type(grads)) + " as its " + "first member. 
Expected OrderedDict.") for param in grads: assert param in params for param in params: assert param in grads lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError("Tried to scale the learning rate on " +\ str(key)+" which is not an optimization parameter.") assert len(updates.keys()) == 0 def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False): updates = OrderedDict() assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator) if learn_discriminator: cur_params = model.discriminator.get_params() else: cur_params = model.generator.get_params() def check(): for param in params: if param not in cur_params: assert param not in updates cur_grads = OrderedDict() for param in cur_params: cur_grads[param] = grads[param] for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {'costname': cost_value.name, 'paramname': param.name}) assert grads[param].dtype == param.dtype cur_lr_scalers = OrderedDict() for param in cur_params: if param in lr_scalers: lr_scaler = lr_scalers[param] cur_lr_scalers[param] = lr_scaler log.info('Parameter and initial learning rate summary:') for param in cur_params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.) log.info('\t' + param_name + ': ' + str(lr)) updates.update(self.learning_rule.get_updates( learning_rate, cur_grads, cur_lr_scalers)) check() for param in cur_params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' check() model.modify_updates(updates) check() for param in cur_params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if np.any(np.isinf(update_val)): raise ValueError("debug value of %s contains infs" % update.name) if np.any(np.isnan(update_val)): raise ValueError("debug value of %s contains nans" % update.name) check() if dont_you_fucking_dare_touch_the_generator: for param in model.generator.get_params(): assert param not in updates with log_timing(log, 'Compiling sgd_update'): return function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode) self.d_func = get_func(1, 0, dont_you_fucking_dare_touch_the_generator=True) self.g_func = get_func(0, 1) def train(self, dataset): """ Runs one epoch of SGD training on the specified dataset. Parameters ---------- dataset : Dataset """ if not hasattr(self, 'd_func'): raise Exception("train called without first calling setup") # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if np.any(np.isnan(value)) or np.any(np.isinf(value)): raise Exception("NaN in " + param.name) self.first = False rng = self.rng if not is_stochastic(self.train_iteration_mode): rng = None data_specs = self.cost.get_data_specs(self.model) # The iterator should be built from flat data specs, so it returns # flat, non-redundent tuples of data. mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) if len(space_tuple) == 0: # No data will be returned by the iterator, and it is impossible # to know the size of the actual batch. 
# It is not decided yet what the right thing to do should be. raise NotImplementedError("Unable to train with SGD, because " "the cost does not actually use data from the data set. " "data_specs: %s" % str(data_specs)) flat_data_specs = (CompositeSpace(space_tuple), source_tuple) iterator = dataset.iterator(mode=self.train_iteration_mode, batch_size=self.batch_size, data_specs=flat_data_specs, return_tuple=True, rng = rng, num_batches = self.batches_per_iter) on_load_batch = self.on_load_batch i = 0 for batch in iterator: for callback in on_load_batch: callback(*batch) if self.train_generator and i == self.discriminator_steps: self.g_func(*batch) i = 0 else: self.d_func(*batch) i += 1 # iterator might return a smaller batch if dataset size # isn't divisible by batch_size # Note: if data_specs[0] is a NullSpace, there is no way to know # how many examples would actually have been in the batch, # since it was empty, so actual_batch_size would be reported as 0. actual_batch_size = flat_data_specs[0].np_batch_size(batch) self.monitor.report_batch(actual_batch_size) for callback in self.update_callbacks: callback(self) # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if np.any(np.isnan(value)) or np.any(np.isinf(value)): raise Exception("NaN in " + param.name) self.train_generator = not self.train_generator def continue_learning(self, model): """ Returns True if the algorithm should continue running, or False if it has reached convergence / started overfitting and should stop. Parameters ---------- model : a Model instance """ if self.termination_criterion is None: return True else: return self.termination_criterion.continue_learning(self.model)
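# Illustrative sketch (not part of the original class above; the helper name is
# hypothetical): a pure-Python walk-through of the discriminator/generator scheduling
# that train() implements with self.d_func / self.g_func. It only mirrors the
# `train_generator` toggle and the `i == self.discriminator_steps` test from the loop
# above, so the call pattern can be inspected without compiling any Theano functions.
def _sketch_gan_schedule(n_epochs=2, batches_per_epoch=5, discriminator_steps=1):
    calls = []
    train_generator = 0          # toggled at the end of every epoch, as in train()
    for epoch in range(n_epochs):
        i = 0
        for batch_index in range(batches_per_epoch):
            if train_generator and i == discriminator_steps:
                calls.append((epoch, batch_index, 'g_func'))
                i = 0
            else:
                calls.append((epoch, batch_index, 'd_func'))
                i += 1
        train_generator = not train_generator
    return calls
# Example: _sketch_gan_schedule() shows that generator updates are only taken in epochs
# where train_generator is truthy, and only after discriminator_steps discriminator
# updates have been taken since the last generator update.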
class SGD(TrainingAlgorithm): """ Stochastic Gradient Descent WRITEME: what is a good reference to read about this algorithm? A TrainingAlgorithm that does gradient descent on minibatches. """ def __init__(self, learning_rate, cost=None, batch_size=None, monitoring_batches=None, monitoring_dataset=None, monitor_iteration_mode='sequential', termination_criterion=None, update_callbacks=None, learning_rule = None, init_momentum = None, set_batch_size = False, train_iteration_mode = None, batches_per_iter=None, theano_function_mode = None, monitoring_costs=None, seed=[2012, 10, 5]): """ WRITEME learning_rate: The learning rate to use. Train object callbacks can change the learning rate after each epoch. SGD update_callbacks can change it after each minibatch. cost: a pylearn2.costs.cost.Cost object specifying the objective function to be minimized. Optionally, may be None. In this case, SGD will call the model's get_default_cost method to obtain the objective function. init_momentum: **DEPRECATED** if None, does not use momentum otherwise, use momentum and initialize the momentum coefficient to init_momentum. Callbacks can change this over time just like the learning rate. If the gradient is the same on every step, then the update taken by the SGD algorithm is scaled by a factor of 1/(1-momentum). See section 9 of Geoffrey Hinton's "A Practical Guide to Training Restricted Boltzmann Machines" for details. learning_rule: training_algorithms.learning_rule.LearningRule, a learning rule computes the new parameter values given old parameters and first-order gradients. If learning_rule is None, sgd.SGD will update parameters according to the standard SGD learning rule. set_batch_size: if True, and batch_size conflicts with model.force_batch_size, will call model.set_batch_size(batch_size) in an attempt to change model.force_batch_size theano_function_mode: The theano mode to compile the updates function with. Note that pylearn2 includes some wraplinker modes that are not bundled with theano. See pylearn2.devtools. These extra modes let you do things like check for NaNs at every step, or record md5 digests of all computations performed by the update function to help isolate problems with nondeterminism. Parameters are updated by the formula: inc := momentum * inc - learning_rate * d cost / d param param := param + inc """ if isinstance(cost, (list, tuple, set)): raise TypeError("SGD no longer supports using collections of Costs to represent " " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.") if init_momentum: warnings.warn("init_momentum interface is deprecated and will " "become officially unsuported as of May 9, 2014. Please use the " "`learning_rule` parameter instead, providing an object of type " "`pylearn2.training_algorithms.learning_rule.Momentum` instead") # Convert to new interface under the hood. 
self.learning_rule = Momentum(init_momentum) else: self.learning_rule = learning_rule self.learning_rate = sharedX(learning_rate, 'learning_rate') self.cost = cost self.batch_size = batch_size self.set_batch_size = set_batch_size self.batches_per_iter = batches_per_iter self._set_monitoring_dataset(monitoring_dataset) self.monitoring_batches = monitoring_batches self.monitor_iteration_mode = monitor_iteration_mode if monitoring_dataset is None: if monitoring_batches is not None: raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.") self.termination_criterion = termination_criterion self._register_update_callbacks(update_callbacks) if train_iteration_mode is None: train_iteration_mode = 'shuffled_sequential' self.train_iteration_mode = train_iteration_mode self.first = True self.rng = np.random.RandomState(seed) self.theano_function_mode = theano_function_mode self.monitoring_costs = monitoring_costs def setup(self, model, dataset): if self.cost is None: self.cost = model.get_default_cost() inf_params = [ param for param in model.get_params() if np.any(np.isinf(param.get_value())) ] if len(inf_params) > 0: raise ValueError("These params are Inf: "+str(inf_params)) if any([np.any(np.isnan(param.get_value())) for param in model.get_params()]): nan_params = [ param for param in model.get_params() if np.any(np.isnan(param.get_value())) ] raise ValueError("These params are NaN: "+str(nan_params)) self.model = model batch_size = self.batch_size if hasattr(model, "force_batch_size"): if model.force_batch_size > 0: if batch_size is not None: if batch_size != model.force_batch_size: if self.set_batch_size: model.set_batch_size(batch_size) else: raise ValueError("batch_size argument to SGD conflicts with model's force_batch_size attribute") else: self.batch_size = model.force_batch_size model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size = self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? 
cost_value.name = 'objective' # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost learning_rate = self.learning_rate if self.monitoring_dataset is not None: self.monitor.setup( dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.batch_size, num_batches=self.monitoring_batches, extra_costs=self.monitoring_costs, mode=self.monitor_iteration_mode ) dataset_name = self.monitoring_dataset.keys()[0] monitoring_dataset = self.monitoring_dataset[dataset_name] #TODO: have Monitor support non-data-dependent channels self.monitor.add_channel(name='learning_rate', ipt=None, val=learning_rate, data_specs=(NullSpace(), ''), dataset=monitoring_dataset) if self.learning_rule: self.learning_rule.add_channels_to_monitor( self.monitor, monitoring_dataset) params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = 'sgd_params[%d]' % i grads, updates = self.cost.get_gradients(model, nested_args, ** fixed_var_descr.fixed_vars) for param in grads: assert param in params for param in params: assert param in grads for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {'costname': cost_value.name, 'paramname': param.name}) lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError("Tried to scale the learning rate on " +\ str(key)+" which is not an optimization parameter.") log.info('Parameter and initial learning rate summary:') for param in params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * lr_scalers.get(param,1.) log.info('\t' + param_name + ': ' + str(lr)) if self.learning_rule: updates.update(self.learning_rule.get_updates( learning_rate, grads, lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict(safe_zip(params, [param - learning_rate * \ lr_scalers.get(param, 1.) * grads[param] for param in params]))) for param in params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' model.censor_updates(updates) for param in params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if np.any(np.isinf(update_val)): raise ValueError("debug value of %s contains infs" % update.name) if np.any(np.isnan(update_val)): raise ValueError("debug value of %s contains nans" % update.name) with log_timing(log, 'Compiling sgd_update'): self.sgd_update = function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode) self.params = params def train(self, dataset): if not hasattr(self, 'sgd_update'): raise Exception("train called without first calling setup") # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if np.any(np.isnan(value)) or np.any(np.isinf(value)): raise Exception("NaN in " + param.name) self.first = False rng = self.rng if not is_stochastic(self.train_iteration_mode): rng = None data_specs = self.cost.get_data_specs(self.model) # The iterator should be built from flat data specs, so it returns # flat, non-redundent tuples of data. 
mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) if len(space_tuple) == 0: # No data will be returned by the iterator, and it is impossible # to know the size of the actual batch. # It is not decided yet what the right thing to do should be. raise NotImplementedError("Unable to train with SGD, because " "the cost does not actually use data from the data set. " "data_specs: %s" % str(data_specs)) flat_data_specs = (CompositeSpace(space_tuple), source_tuple) iterator = dataset.iterator(mode=self.train_iteration_mode, batch_size=self.batch_size, data_specs=flat_data_specs, return_tuple=True, rng = rng, num_batches = self.batches_per_iter) on_load_batch = self.on_load_batch for batch in iterator: for callback in on_load_batch: callback(mapping.nest(batch)) self.sgd_update(*batch) # iterator might return a smaller batch if dataset size # isn't divisible by batch_size # Note: if data_specs[0] is a NullSpace, there is no way to know # how many examples would actually have been in the batch, # since it was empty, so actual_batch_size would be reported as 0. actual_batch_size = flat_data_specs[0].np_batch_size(batch) self.monitor.report_batch(actual_batch_size) for callback in self.update_callbacks: callback(self) # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if np.any(np.isnan(value)) or np.any(np.isinf(value)): raise Exception("NaN in " + param.name) def continue_learning(self, model): if self.termination_criterion is None: return True else: return self.termination_criterion.continue_learning(self.model)
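# Illustrative sketch (not part of the original class above; the helper names are
# hypothetical): framework-free versions of the two parameter updates referenced in
# this file, usable on NumPy arrays or plain floats with a made-up gradient. In the
# real class the same arithmetic is built as Theano update expressions and compiled
# into self.sgd_update.
def _sketch_sgd_step(param, grad, learning_rate, lr_scaler=1.):
    # Standard SGD rule used when learning_rule is None:
    #     param := param - learning_rate * lr_scaler * d cost / d param
    return param - learning_rate * lr_scaler * grad

def _sketch_momentum_step(param, grad, inc, learning_rate, momentum):
    # Momentum rule described in the docstring above:
    #     inc   := momentum * inc - learning_rate * d cost / d param
    #     param := param + inc
    inc = momentum * inc - learning_rate * grad
    return param + inc, inc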
class SGD(TrainingAlgorithm): """ SGD = (Minibatch) Stochastic Gradient Descent. A TrainingAlgorithm that does stochastic gradient descent on minibatches of training examples. For theoretical background on this algorithm, see Yoshua Bengio's machine learning course notes on the subject: http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html Parameters ---------- learning_rate : float The learning rate to use. Train object callbacks can change the learning rate after each epoch. SGD update_callbacks can change it after each minibatch. cost : pylearn2.costs.cost.Cost, optional Cost object specifying the objective function to be minimized. Optionally, may be None. In this case, SGD will call the model's get_default_cost method to obtain the objective function. batch_size : int, optional The size of the batch to be used. If not specified, the model will be asked for the batch size, so you must have specified the batch size there. (Some models are rigidly defined to only work with one batch size) monitoring_batch_size : int, optional The size of the monitoring batches. monitoring_batches : int, optional At the start of each epoch, we run "monitoring", to evaluate quantities such as the validation set error. monitoring_batches, if specified, determines the number of batches to draw from the iterator for each monitoring dataset. Unnecessary if not using monitoring or if `monitor_iteration_mode` is 'sequential' and `batch_size` is specified (number of batches will be calculated based on full dataset size). TODO: make it possible to specify different monitoring_batches for each monitoring dataset. The Monitor itself already supports this. monitoring_dataset : Dataset or dictionary, optional If not specified, no monitoring is used. If specified to be a Dataset, monitor on that Dataset. If specified to be dictionary, the keys should be string names of datasets, and the values should be Datasets. All monitoring channels will be computed for all monitoring Datasets and will have the dataset name and an underscore prepended to them. monitor_iteration_mode : str, optional The iteration mode used to iterate over the examples in all monitoring datasets. If not specified, defaults to 'sequential'. TODO: make it possible to specify different modes for different datasets. termination_criterion : instance of \ pylearn2.termination_criteria.TerminationCriterion, optional Used to determine when the algorithm should stop running. If not specified, runs forever--or more realistically, until external factors halt the python process (Kansas 1977). update_callbacks : list, optional If specified, each member of the list should be a callable that accepts an SGD instance as its only argument. All callbacks will be called with this SGD instance after each SGD step. learning_rule : training_algorithms.learning_rule.LearningRule, optional A learning rule computes the new parameter values given old parameters and first-order gradients. If learning_rule is None, sgd.SGD will update parameters according to the standard SGD learning rule: .. code-block:: none param := param - learning_rate * d cost / d param This argument allows more sophisticated learning rules, such as SGD with momentum. init_momentum : float, **DEPRECATED** option Use learning_rule instead. If None, does not use momentum otherwise, use momentum and initialize the momentum coefficient to init_momentum. Callbacks can change this over time just like the learning rate. 
If the gradient is the same on every step, then the update taken by the SGD algorithm is scaled by a factor of 1/(1-momentum). See section 9 of Geoffrey Hinton's "A Practical Guide to Training Restricted Boltzmann Machines" for details. set_batch_size : bool, optional Defaults to False. If True, and batch_size conflicts with model.force_batch_size, will call model.set_batch_size(batch_size) in an attempt to change model.force_batch_size train_iteration_mode : str, optional Defaults to 'shuffled_sequential'. The iteration mode to use for iterating through training examples. batches_per_iter : int, optional The number of batches to draw from the iterator over training examples. If iteration mode is 'sequential' or 'shuffled_sequential', this is unnecessary; when unspecified we will iterate over all examples. theano_function_mode : a valid argument to theano.function's \ 'mode' parameter, optional The theano mode to compile the updates function with. Note that pylearn2 includes some wraplinker modes that are not bundled with theano. See pylearn2.devtools. These extra modes let you do things like check for NaNs at every step, or record md5 digests of all computations performed by the update function to help isolate problems with nondeterminism. monitoring_costs : OrderedDict, optional A dictionary of Cost instances. Keys should be string containing the name of the cost. The Monitor will also include all channels defined by these Costs, even though we don't train using them. seed : valid argument to np.random.RandomState, optional The seed used for the random number generate to be passed to the training dataset iterator (if any) """ def __init__(self, learning_rate, cost=None, batch_size=None, monitoring_batch_size=None, monitoring_batches=None, monitoring_dataset=None, monitor_iteration_mode='sequential', termination_criterion=None, update_callbacks=None, learning_rule = None, init_momentum = None, set_batch_size = False, train_iteration_mode = None, batches_per_iter=None, theano_function_mode = None, monitoring_costs=None, seed=[2012, 10, 5]): if isinstance(cost, (list, tuple, set)): raise TypeError("SGD no longer supports using collections of " + "Costs to represent a sum of Costs. Use " + "pylearn2.costs.cost.SumOfCosts instead.") if init_momentum: warnings.warn("init_momentum interface is deprecated and will " "become officially unsuported as of May 9, 2014. Please use the " "`learning_rule` parameter instead, providing an object of type " "`pylearn2.training_algorithms.learning_rule.Momentum` instead") # Convert to new interface under the hood. 
self.learning_rule = Momentum(init_momentum) else: self.learning_rule = learning_rule self.learning_rate = sharedX(learning_rate, 'learning_rate') self.cost = cost self.batch_size = batch_size self.set_batch_size = set_batch_size self.batches_per_iter = batches_per_iter self._set_monitoring_dataset(monitoring_dataset) self.monitoring_batch_size = monitoring_batch_size self.monitoring_batches = monitoring_batches self.monitor_iteration_mode = monitor_iteration_mode if monitoring_dataset is None: if monitoring_batch_size is not None: raise ValueError("Specified a monitoring batch size " + "but not a monitoring dataset.") if monitoring_batches is not None: raise ValueError("Specified an amount of monitoring batches " + "but not a monitoring dataset.") self.termination_criterion = termination_criterion self._register_update_callbacks(update_callbacks) if train_iteration_mode is None: train_iteration_mode = 'shuffled_sequential' self.train_iteration_mode = train_iteration_mode self.first = True self.rng = make_np_rng(seed, which_method=["randn","randint"]) self.theano_function_mode = theano_function_mode self.monitoring_costs = monitoring_costs def _setup_monitor(self): """ Set up monitor to model the objective value, learning rate, momentum (if applicable), and extra channels defined by the cost. This method must be called after `learning_rule.get_updates`, since it may have an effect on `learning_rule.add_channels_to_monitor` (that is currently the case for `learning_rule.RMSProp`). """ if self.monitoring_dataset is not None: if (self.monitoring_batch_size is None and self.monitoring_batches is None): self.monitoring_batch_size = self.batch_size self.monitoring_batches = self.batches_per_iter self.monitor.setup(dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.monitoring_batch_size, num_batches=self.monitoring_batches, extra_costs=self.monitoring_costs, mode=self.monitor_iteration_mode) dataset_name = self.monitoring_dataset.keys()[0] monitoring_dataset = self.monitoring_dataset[dataset_name] #TODO: have Monitor support non-data-dependent channels self.monitor.add_channel(name='learning_rate', ipt=None, val=self.learning_rate, data_specs=(NullSpace(), ''), dataset=monitoring_dataset) if self.learning_rule: self.learning_rule.add_channels_to_monitor( self.monitor, monitoring_dataset) def setup(self, model, dataset): """ Compiles the theano functions needed for the train method. 
Parameters ---------- model : a Model instance dataset : Dataset """ if self.cost is None: self.cost = model.get_default_cost() inf_params = [param for param in model.get_params() if contains_inf(param.get_value())] if len(inf_params) > 0: raise ValueError("These params are Inf: "+str(inf_params)) if any([contains_nan(param.get_value()) for param in model.get_params()]): nan_params = [param for param in model.get_params() if contains_nan(param.get_value())] raise ValueError("These params are NaN: "+str(nan_params)) self.model = model self._synchronize_batch_size(model) model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() # test if force batch size and batch size has_force_batch_size = getattr(model, "force_batch_size", False) train_dataset_is_uneven = \ dataset.get_num_examples() % self.batch_size != 0 has_monitoring_datasets = \ self.monitoring_dataset is not None and \ len(self.monitoring_dataset) > 0 if has_monitoring_datasets: monitoring_datasets_are_uneven = \ any(d.get_num_examples() % self.batch_size != 0 for d in self.monitoring_dataset.values()) else: monitoring_datasets_are_uneven = False # or True, it doesn't matter if has_force_batch_size and train_dataset_is_uneven and \ not has_uniform_batch_size(self.train_iteration_mode): raise ValueError("Dataset size is not a multiple of batch size. " "You should set train_iteration_mode (and " "maybe monitor_iteration_mode) to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential") if has_force_batch_size and has_monitoring_datasets and \ monitoring_datasets_are_uneven and \ not has_uniform_batch_size(self.monitor_iteration_mode): raise ValueError("Dataset size is not a multiple of batch size. " "You should set monitor_iteration_mode to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential") data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? 
cost_value.name = 'objective' learning_rate = self.learning_rate params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = 'sgd_params[%d]' % i grads, updates = self.cost.get_gradients(model, nested_args, ** fixed_var_descr.fixed_vars) if not isinstance(grads, OrderedDict): raise TypeError(str(type(self.cost)) + ".get_gradients returned " + "something with" + str(type(grads)) + "as its " + "first member. Expected OrderedDict.") for param in grads: assert param in params for param in params: assert param in grads for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {'costname': cost_value.name, 'paramname': param.name}) assert grads[param].dtype == param.dtype lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError("Tried to scale the learning rate on " +\ str(key)+" which is not an optimization parameter.") log.info('Parameter and initial learning rate summary:') for param in params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * lr_scalers.get(param,1.) log.info('\t' + param_name + ': ' + str(lr)) if self.learning_rule: updates.update(self.learning_rule.get_updates( learning_rate, grads, lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict(safe_zip(params, [param - learning_rate * \ lr_scalers.get(param, 1.) * grads[param] for param in params]))) for param in params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' model.modify_updates(updates) for param in params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if contains_inf(update_val): raise ValueError("debug value of %s contains infs" % update.name) if contains_nan(update_val): raise ValueError("debug value of %s contains nans" % update.name) # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost. # We have to do that after learning_rule.get_updates has been # called, since it may have an effect on # learning_rule.add_channels_to_monitor (that is currently the case # for AdaDelta and RMSProp). self._setup_monitor() with log_timing(log, 'Compiling sgd_update'): self.sgd_update = function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode) self.params = params def train(self, dataset): """ Runs one epoch of SGD training on the specified dataset. Parameters ---------- dataset : Dataset """ if not hasattr(self, 'sgd_update'): raise Exception("train called without first calling setup") # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if not isfinite(value): raise Exception("NaN in " + param.name) self.first = False rng = self.rng if not is_stochastic(self.train_iteration_mode): rng = None data_specs = self.cost.get_data_specs(self.model) # The iterator should be built from flat data specs, so it returns # flat, non-redundent tuples of data. 
mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) if len(space_tuple) == 0: # No data will be returned by the iterator, and it is impossible # to know the size of the actual batch. # It is not decided yet what the right thing to do should be. raise NotImplementedError("Unable to train with SGD, because " "the cost does not actually use data from the data set. " "data_specs: %s" % str(data_specs)) flat_data_specs = (CompositeSpace(space_tuple), source_tuple) iterator = dataset.iterator(mode=self.train_iteration_mode, batch_size=self.batch_size, data_specs=flat_data_specs, return_tuple=True, rng = rng, num_batches = self.batches_per_iter) on_load_batch = self.on_load_batch for batch in iterator: for callback in on_load_batch: callback(*batch) self.sgd_update(*batch) # iterator might return a smaller batch if dataset size # isn't divisible by batch_size # Note: if data_specs[0] is a NullSpace, there is no way to know # how many examples would actually have been in the batch, # since it was empty, so actual_batch_size would be reported as 0. actual_batch_size = flat_data_specs[0].np_batch_size(batch) self.monitor.report_batch(actual_batch_size) for callback in self.update_callbacks: callback(self) # Make sure none of the parameters have bad values for param in self.params: value = param.get_value(borrow=True) if not isfinite(value): raise Exception("NaN in " + param.name) def continue_learning(self, model): """ Returns True if the algorithm should continue running, or False if it has reached convergence / started overfitting and should stop. Parameters ---------- model : a Model instance """ if self.termination_criterion is None: return True else: return self.termination_criterion.continue_learning(self.model)
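# Illustrative sketch (not part of the original class above; the helper name and the
# model/dataset arguments are placeholders): one way this SGD class is commonly wired
# together. Momentum is the learning rule this module already uses; EpochCounter is
# assumed to be available from pylearn2.termination_criteria. Treat the numeric values
# as placeholders rather than recommended settings.
def _sketch_build_sgd(model, train_set, valid_set):
    from pylearn2.termination_criteria import EpochCounter

    algorithm = SGD(learning_rate=0.01,
                    batch_size=100,
                    learning_rule=Momentum(0.5),
                    monitoring_dataset={'train': train_set,
                                        'valid': valid_set},
                    monitor_iteration_mode='sequential',
                    termination_criterion=EpochCounter(100),
                    train_iteration_mode='shuffled_sequential')
    algorithm.setup(model, train_set)
    return algorithm
# After setup(), calling algorithm.train(train_set) runs one epoch, and
# algorithm.continue_learning(model) reports whether the termination criterion still
# allows training to continue.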