Example #1
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.
    A TrainingAlgorithm that does stochastic gradient descent on minibatches
    of training examples.

    For theoretical background on this algorithm, see Yoshua Bengio's machine
    learning course notes on the subject:

    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Cost object specifying the objective function to be minimized.
        Optionally, may be None. In this case, SGD will call the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used.
        If not specified, the model will be asked for the batch size, so
        you must have specified the batch size there.
        (Some models are rigidly defined to only work with one batch size)
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        At the start of each epoch, we run "monitoring", to evaluate
        quantities such as the validation set error.
        monitoring_batches, if specified, determines the number of batches
        to draw from the iterator for each monitoring dataset.
        Unnecessary if not using monitoring or if `monitor_iteration_mode`
        is 'sequential' and `batch_size` is specified (number of
        batches will be calculated based on full dataset size).
        TODO: make it possible to specify different monitoring_batches
        for each monitoring dataset. The Monitor itself already supports
        this.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used.
        If specified to be a Dataset, monitor on that Dataset.
        If specified to be dictionary, the keys should be string names
        of datasets, and the values should be Datasets. All monitoring
        channels will be computed for all monitoring Datasets and will
        have the dataset name and an underscore prepended to them.
    monitor_iteration_mode : str, optional
        The iteration mode used to iterate over the examples in all
        monitoring datasets. If not specified, defaults to 'sequential'.
        TODO: make it possible to specify different modes for different
        datasets.
    termination_criterion : instance of \
        pylearn2.termination_criteria.TerminationCriterion, optional

        Used to determine when the algorithm should stop running.
        If not specified, runs forever--or more realistically, until
        external factors halt the python process (Kansas 1977).
    update_callbacks : list, optional
        If specified, each member of the list should be a callable that
        accepts an SGD instance as its only argument.
        All callbacks will be called with this SGD instance after each
        SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        A learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule:

        .. code-block:: none

            param := param - learning_rate * d cost / d param

        This argument allows more sophisticated learning rules, such
        as SGD with momentum.
    init_momentum : float, **DEPRECATED** option
        Use learning_rule instead.
        If None, does not use momentum; otherwise, use momentum and
        initialize the momentum coefficient to init_momentum. Callbacks
        can change this over time just like the learning rate. If the
        gradient is the same on every step, then the update taken by the
        SGD algorithm is scaled by a factor of 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines" for details.
    set_batch_size : bool, optional
        Defaults to False.
        If True, and batch_size conflicts with model.force_batch_size,
        will call model.set_batch_size(batch_size) in an attempt to
        change model.force_batch_size
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential'.
        The iteration mode to use for iterating through training examples.
    batches_per_iter : int, optional
        The number of batches to draw from the iterator over training
        examples.
        If iteration mode is 'sequential' or 'shuffled_sequential', this
        is unnecessary; when unspecified we will iterate over all examples.
    theano_function_mode : a valid argument to theano.function's \
        'mode' parameter, optional

        The theano mode to compile the updates function with. Note that
        pylearn2 includes some wraplinker modes that are not bundled with
        theano. See pylearn2.devtools. These extra modes let you do
        things like check for NaNs at every step, or record md5 digests
        of all computations performed by the update function to help
        isolate problems with nondeterminism.
    monitoring_costs : list, optional
        A list of Cost instances. The Monitor will also include all
        channels defined by these Costs, even though we don't train
        using them.
    seed : valid argument to np.random.RandomState, optional
        The seed used for the random number generator to be passed to the
        training dataset iterator (if any).
    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None,
                 set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5], discriminator_steps=1):
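        # GAN-style schedule: `discriminator_steps` discriminator updates are
        # taken for every generator update, and `train_generator` is toggled
        # at the end of each call to `train` (i.e. once per epoch).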

        self.discriminator_steps = discriminator_steps
        self.train_generator = 0

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsupported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum`.")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn","randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [param for param in model.get_params()
                      if np.any(np.isinf(param.get_value()))]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value()))
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if np.any(np.isnan(param.get_value()))]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # test if force batch size and batch size
        if getattr(model, "force_batch_size", False) and \
           any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for
               dataset in self.monitoring_dataset.values()) and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i
        self.params = params


        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with " + str(type(grads)) + " as its " +
                            "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        assert len(updates.keys()) == 0
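        # GAN-specific detail: from the same gradient expressions we compile
        # two separate Theano update functions, one restricted to the
        # discriminator's parameters and one restricted to the generator's
        # (see get_func below).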

        def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False):

            updates = OrderedDict()

            assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator)

            if learn_discriminator:
                cur_params = model.discriminator.get_params()
            else:
                cur_params = model.generator.get_params()

            def check():
                for param in params:
                    if param not in cur_params:
                        assert param not in updates

            cur_grads = OrderedDict()
            for param in cur_params:
                cur_grads[param] = grads[param]

            for param in grads:
                if grads[param].name is None and cost_value is not None:
                    grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                         {'costname': cost_value.name,
                                          'paramname': param.name})
                assert grads[param].dtype == param.dtype

            cur_lr_scalers = OrderedDict()
            for param in cur_params:
                if param in lr_scalers:
                    lr_scaler = lr_scalers[param]
                    cur_lr_scalers[param] = lr_scaler

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = param.name
                if param_name is None:
                    param_name = 'anon_param'
                lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
                log.info('\t' + param_name + ': ' + str(lr))

            updates.update(self.learning_rule.get_updates(
                    learning_rate, cur_grads, cur_lr_scalers))
            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()
            model.modify_updates(updates)
            check()
            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                update.name)

            check()

            if dont_you_fucking_dare_touch_the_generator:
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                return function(theano_args,
                                           updates=updates,
                                           name='sgd_update',
                                           on_unused_input='ignore',
                                           mode=self.theano_function_mode)
        self.d_func = get_func(1, 0, dont_you_fucking_dare_touch_the_generator=True)
        self.g_func = get_func(0, 1)

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """


        if not hasattr(self, 'd_func'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)


        on_load_batch = self.on_load_batch
        i = 0
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            if self.train_generator and i == self.discriminator_steps:
                self.g_func(*batch)
                i = 0
            else:
                self.d_func(*batch)
                i += 1
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)


        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.train_generator = not self.train_generator  # swap roles for the next epoch

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
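A minimal usage sketch for this GAN-flavoured variant is shown below. It assumes an adversarial model object exposing `generator` and `discriminator` sub-models (as `setup` requires) and pre-built `train_set`/`valid_set` datasets; those names are illustrative, and the manual loop only mirrors the call sequence that pylearn2's Train object would normally drive:

from pylearn2.termination_criteria import EpochCounter

# `model`, `train_set` and `valid_set` are assumed to exist and to satisfy the
# interfaces used by SGD.setup/train above (model.generator, model.discriminator,
# get_params, get_default_cost, ...).
algorithm = SGD(learning_rate=0.01,
                batch_size=100,
                discriminator_steps=1,      # one discriminator step per generator step
                monitoring_dataset={'valid': valid_set},
                termination_criterion=EpochCounter(max_epochs=100))

algorithm.setup(model, train_set)           # compiles d_func and g_func
while algorithm.continue_learning(model):
    algorithm.train(train_set)              # one epoch; roles alternate across epochs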
Example #2
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.

    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None, set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum;
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of Costs to represent "
                    " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsupported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum`.")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [ param for param in model.get_params() if np.any(np.isinf(param.get_value())) ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value())) for param in model.get_params()]):
            nan_params = [ param for param in model.get_params() if np.any(np.isnan(param.get_value())) ]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError("batch_size argument to SGD conflicts with model's force_batch_size attribute")
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name, batch_size = self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(
                    dataset=self.monitoring_dataset,
                    cost=self.cost,
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                    extra_costs=self.monitoring_costs,
                    mode=self.monitor_iteration_mode
                    )
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param,1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update( dict(safe_zip(params, [param - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
                                    for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" % update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" % update.name)


        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
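The update formulas documented in this example's docstring can be illustrated with a small, self-contained NumPy sketch (not pylearn2 code): plain SGD as used when no learning_rule is given, and the classic momentum rule configured by the deprecated init_momentum path.

import numpy as np

def sgd_step(param, grad, learning_rate, lr_scaler=1.0):
    # Standard SGD: param := param - learning_rate * lr_scaler * d cost / d param
    return param - learning_rate * lr_scaler * grad

def momentum_step(param, inc, grad, learning_rate, momentum):
    # Momentum:  inc   := momentum * inc - learning_rate * d cost / d param
    #            param := param + inc
    inc = momentum * inc - learning_rate * grad
    return param + inc, inc

# With a constant gradient the accumulated momentum step approaches
# -learning_rate * grad / (1 - momentum), i.e. the 1/(1-momentum) scaling
# mentioned in the docstring.
param, inc = np.zeros(3), np.zeros(3)
grad = np.ones(3)
for _ in range(1000):
    param, inc = momentum_step(param, inc, grad, learning_rate=0.1, momentum=0.9)
print(inc)  # ~ -0.1 * 1 / (1 - 0.9) = -1.0 per step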
Example #3
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.
    A TrainingAlgorithm that does stochastic gradient descent on minibatches
    of training examples.

    For theoretical background on this algorithm, see Yoshua Bengio's machine
    learning course notes on the subject:

    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Cost object specifying the objective function to be minimized.
        Optionally, may be None. In this case, SGD will call the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used.
        If not specified, the model will be asked for the batch size, so
        you must have specified the batch size there.
        (Some models are rigidly defined to only work with one batch size)
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        At the start of each epoch, we run "monitoring", to evaluate
        quantities such as the validation set error.
        monitoring_batches, if specified, determines the number of batches
        to draw from the iterator for each monitoring dataset.
        Unnecessary if not using monitoring or if `monitor_iteration_mode`
        is 'sequential' and `batch_size` is specified (number of
        batches will be calculated based on full dataset size).
        TODO: make it possible to specify different monitoring_batches
        for each monitoring dataset. The Monitor itself already supports
        this.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used.
        If specified to be a Dataset, monitor on that Dataset.
        If specified to be dictionary, the keys should be string names
        of datasets, and the values should be Datasets. All monitoring
        channels will be computed for all monitoring Datasets and will
        have the dataset name and an underscore prepended to them.
    monitor_iteration_mode : str, optional
        The iteration mode used to iterate over the examples in all
        monitoring datasets. If not specified, defaults to 'sequential'.
        TODO: make it possible to specify different modes for different
        datasets.
    termination_criterion : instance of \
        pylearn2.termination_criteria.TerminationCriterion, optional

        Used to determine when the algorithm should stop running.
        If not specified, runs forever--or more realistically, until
        external factors halt the python process (Kansas 1977).
    update_callbacks : list, optional
        If specified, each member of the list should be a callable that
        accepts an SGD instance as its only argument.
        All callbacks will be called with this SGD instance after each
        SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        A learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule:

        .. code-block:: none

            param := param - learning_rate * d cost / d param

        This argument allows more sophisticated learning rules, such
        as SGD with momentum.
    init_momentum : float, **DEPRECATED** option
        Use learning_rule instead.
        If None, does not use momentum; otherwise, use momentum and
        initialize the momentum coefficient to init_momentum. Callbacks
        can change this over time just like the learning rate. If the
        gradient is the same on every step, then the update taken by the
        SGD algorithm is scaled by a factor of 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines" for details.
    set_batch_size : bool, optional
        Defaults to False.
        If True, and batch_size conflicts with model.force_batch_size,
        will call model.set_batch_size(batch_size) in an attempt to
        change model.force_batch_size
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential'.
        The iteration mode to use for iterating through training examples.
    batches_per_iter : int, optional
        The number of batches to draw from the iterator over training
        examples.
        If iteration mode is 'sequential' or 'shuffled_sequential', this
        is unnecessary; when unspecified we will iterate over all examples.
    theano_function_mode : a valid argument to theano.function's \
        'mode' parameter, optional

        The theano mode to compile the updates function with. Note that
        pylearn2 includes some wraplinker modes that are not bundled with
        theano. See pylearn2.devtools. These extra modes let you do
        things like check for NaNs at every step, or record md5 digests
        of all computations performed by the update function to help
        isolate problems with nondeterminism.
    monitoring_costs : OrderedDict, optional
        A dictionary of Cost instances. Keys should be strings containing
        the name of the cost. The Monitor will also include all
        channels defined by these Costs, even though we don't train
        using them.
    seed : valid argument to np.random.RandomState, optional
        The seed used for the random number generator to be passed to the
        training dataset iterator (if any).
    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None,
                 set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5]):

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsupported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum`.")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn","randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def _setup_monitor(self):
        """
        Set up monitor to model the objective value, learning rate,
        momentum (if applicable), and extra channels defined by
        the cost.

        This method must be called after `learning_rule.get_updates`,
        since it may have an effect on `learning_rule.add_channels_to_monitor`
        (that is currently the case for `learning_rule.RMSProp`).
        """
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=self.learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [param for param in model.get_params()
                      if contains_inf(param.get_value())]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([contains_nan(param.get_value())
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if contains_nan(param.get_value())]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # test if force batch size and batch size
        has_force_batch_size = getattr(model, "force_batch_size", False)
        train_dataset_is_uneven = \
            dataset.get_num_examples() % self.batch_size != 0

        has_monitoring_datasets = \
            self.monitoring_dataset is not None and \
            len(self.monitoring_dataset) > 0

        if has_monitoring_datasets:
            monitoring_datasets_are_uneven = \
                any(d.get_num_examples() % self.batch_size
                    != 0 for d in self.monitoring_dataset.values())
        else:
            # Irrelevant: there are no monitoring datasets to check.
            monitoring_datasets_are_uneven = False

        if has_force_batch_size and train_dataset_is_uneven and \
           not has_uniform_batch_size(self.train_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set train_iteration_mode (and "
                             "maybe monitor_iteration_mode) to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        if has_force_batch_size and has_monitoring_datasets and \
           monitoring_datasets_are_uneven and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        learning_rate = self.learning_rate
        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with " + str(type(grads)) + " as its " +
                            "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})
            assert grads[param].dtype == param.dtype

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param,1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update( dict(safe_zip(params, [param - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
                                    for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.modify_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if contains_inf(update_val):
                    raise ValueError("debug value of %s contains infs" %
                            update.name)
                if contains_nan(update_val):
                    raise ValueError("debug value of %s contains nans" %
                            update.name)


        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost.
        # We have to do that after learning_rule.get_updates has been
        # called, since it may have an effect on
        # learning_rule.add_channels_to_monitor (that is currently the case
        # for AdaDelta and RMSProp).
        self._setup_monitor()

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN or Inf in " + param.name)

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
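
# Hypothetical usage sketch (added; not part of the example above): the
# pylearn2 Train object normally drives this class, but the same loop can be
# written by hand with setup(), train(), and continue_learning().  `my_model`
# and `my_dataset` are placeholders for whatever Model and Dataset the
# experiment uses, and EpochCounter stands in for any termination criterion.
#
#     algorithm = SGD(learning_rate=0.01,
#                     batch_size=100,
#                     monitoring_dataset={'train': my_dataset},
#                     termination_criterion=EpochCounter(max_epochs=10))
#     algorithm.setup(model=my_model, dataset=my_dataset)
#     while algorithm.continue_learning(my_model):
#         algorithm.train(dataset=my_dataset)
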
Example no. 4
0
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.

    """
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).
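                            For example, with momentum = 0.9 the long-run
                            update is 10 times the plain SGD step.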

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of Costs to "
                "represent a sum of Costs. Use "
                "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsupported as of May 9, 2014. Please use "
                "the `learning_rule` parameter instead, providing an object "
                "of type `pylearn2.training_algorithms.learning_rule.Momentum`."
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError(
                    "Specified a number of monitoring batches but no "
                    "monitoring dataset."
                )
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        nan_params = [
            param for param in model.get_params()
            if np.any(np.isnan(param.get_value()))
        ]
        if len(nan_params) > 0:
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError(
                                "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                            )
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
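        # Note (added): mapping.nest() is the inverse of the flatten() calls
        # above -- it rebuilds the nested structure described by data_specs
        # from the flat tuple, reusing one Theano variable wherever the same
        # (space, source) pair occurred more than once.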
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            # list() keeps this working on Python 3, where dict.keys() is a view
            dataset_name = list(self.monitoring_dataset.keys())[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +
                                 str(key) +
                                 " which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(
                self.learning_rule.get_updates(learning_rate, grads,
                                               lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(dict(safe_zip(
                params,
                [param - learning_rate * lr_scalers.get(param, 1.) * grads[param]
                 for param in params])))
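            # The dict built above applies the plain SGD rule to each
            # parameter:
            #     param := param - learning_rate * lr_scaler * grad
            # where lr_scaler comes from model.get_lr_scalers() and defaults
            # to 1.0 for parameters without an entry.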

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with SGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                                    batch_size=self.batch_size,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng,
                                    num_batches=self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
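

# A minimal, self-contained NumPy sketch (added; not pylearn2 code) of the
# momentum update described in the __init__ docstring above:
#     inc   := momentum * inc - learning_rate * d cost / d param
#     param := param + inc
# It minimizes f(w) = 0.5 * ||w||^2, whose gradient is w itself, so the
# parameters should shrink toward zero over the course of the loop.
import numpy as np


def momentum_sgd_demo(learning_rate=0.1, momentum=0.9, n_steps=100):
    w = np.array([1.0, -2.0])     # parameters being optimized
    inc = np.zeros_like(w)        # running update ("velocity")
    for _ in range(n_steps):
        grad = w                  # d cost / d param for 0.5 * ||w||^2
        inc = momentum * inc - learning_rate * grad
        w = w + inc
    return w

# e.g. momentum_sgd_demo() returns parameters whose magnitude has shrunk well
# below the starting values of 1.0 and -2.0.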