Example #1
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
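
A minimal construction sketch of the two spellings handled above: the deprecated init_momentum argument is converted to a Momentum learning rule (with a warning), while the preferred form passes the rule explicitly. The import paths are the standard pylearn2 ones and are assumed here rather than taken from this example.

# Hedged sketch, assuming the standard pylearn2 import paths.
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.training_algorithms.learning_rule import Momentum

# Deprecated spelling: converted to Momentum(0.5) under the hood and warns.
old_style = SGD(learning_rate=0.01, batch_size=100, init_momentum=0.5)

# Preferred spelling: supply the learning rule directly.
new_style = SGD(learning_rate=0.01, batch_size=100,
                learning_rule=Momentum(init_momentum=0.5))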
Example #2
def get_ae_pretrainer(layer, data, batch_size, epochs=30):
    init_lr = 0.05

    train_algo = SGD(
        batch_size=batch_size,
        learning_rate=init_lr,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=batch_size,
        monitoring_dataset=data,
        # for ContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')]]),
        # for HigherOrderContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')],
        #                             [0.5, cost.MethodCost(method='higher_order_penalty')]]),
        # for DenoisingAutoencoder:
        cost=MeanSquaredReconstructionError(),
        termination_criterion=EpochCounter(epochs))
    return Train(model=layer,
                 algorithm=train_algo,
                 dataset=data,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.9, start=0,
                                      saturate=25),
                     LinearDecayOverEpoch(start=1,
                                          saturate=25,
                                          decay_factor=.02)
                 ])
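
The Train object returned here is normally driven by its main_loop method, exactly as in Example #15 below. A brief usage sketch, assuming layer is an autoencoder model and data a pylearn2 Dataset already in scope:

# Hedged usage sketch for the helper above.
pretrainer = get_ae_pretrainer(layer, data, batch_size=100, epochs=30)
pretrainer.main_loop()  # runs SGD until EpochCounter stops it; the extensions
                        # adjust momentum and decay the learning rate per epoch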
Example #3
def get_layer_trainer_sgd_autoencoder(layer,
                                      trainset,
                                      batch_size=10,
                                      learning_rate=0.1,
                                      max_epochs=100,
                                      name=''):
    # configs on sgd
    train_algo = SGD(
        learning_rate=learning_rate,
        #             learning_rule = AdaDelta(),
        learning_rule=Momentum(init_momentum=0.5),
        cost=MeanSquaredReconstructionError(),
        batch_size=batch_size,
        monitoring_dataset=trainset,
        termination_criterion=EpochCounter(max_epochs=max_epochs),
        update_callbacks=None)

    log_callback = LoggingCallback(name)

    return Train(model=layer,
                 algorithm=train_algo,
                 extensions=[
                     log_callback,
                     OneOverEpoch(start=1, half_life=5),
                     MomentumAdjustor(final_momentum=0.7,
                                      start=10,
                                      saturate=100)
                 ],
                 dataset=trainset)
Example #4
    def __init__(
        self,
        learning_rate,
        cost=None,
        batch_size=None,
        monitoring_batches=None,
        monitoring_dataset=None,
        monitor_iteration_mode="sequential",
        termination_criterion=None,
        update_callbacks=None,
        learning_rule=None,
        init_momentum=None,
        set_batch_size=False,
        train_iteration_mode=None,
        batches_per_iter=None,
        theano_function_mode=None,
        monitoring_costs=None,
        seed=[2012, 10, 5],
    ):

        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of "
                + "Costs to represent a sum of Costs. Use "
                + "pylearn2.costs.cost.SumOfCosts instead."
            )

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, "learning_rate")
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " + "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = "shuffled_sequential"
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
Example #5
def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    manual = [
        param - learning_rate * scale for param, scale in zip(manual, scales)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))

    manual = [
        param - learning_rate * scale for param, scale in zip(manual, scales)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))
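
The manual expectation in this test mirrors the plain-SGD update with per-parameter scalers that appears in the setup method of Example #12 (param := param - learning_rate * lr_scalers.get(param, 1.) * grad). A hedged NumPy sketch of one such step, assuming SumOfParams has a gradient of one everywhere, which is what the expectation above encodes:

import numpy as np

learning_rate = 0.001
scales = [0.01, 0.02, 0.05, 1.0, 5.0]
shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
params = [np.zeros(shape) for shape in shapes]
grads = [np.ones_like(p) for p in params]   # SumOfParams gradient is 1 everywhere
# One SGD step with learning-rate scalers and no momentum.
params = [p - learning_rate * s * g for p, s, g in zip(params, scales, grads)]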
Example #6
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    vel = [
        -learning_rate * scale + i * momentum
        for scale, i in izip(scales, vel)
    ]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
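
The hand-crafted expectation above is the Nesterov variant: the velocity is updated first, and the parameter then takes a look-ahead step of momentum times the new velocity plus another gradient step. A hedged NumPy sketch of the recursion the two train calls verify; the gradient term is constant here because SumOfParams has unit gradients, with one lr scaler folded in:

import numpy as np

def nesterov_step(param, vel, grad, learning_rate, momentum):
    vel = momentum * vel - learning_rate * grad              # new velocity
    param = param + momentum * vel - learning_rate * grad    # look-ahead update
    return param, vel

param, vel = np.zeros(3), np.zeros(3)
for _ in range(2):                      # two sgd.train(dataset) calls in the test
    param, vel = nesterov_step(param, vel, grad=0.05,
                               learning_rate=0.001, momentum=0.5)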
Example #7
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))

    manual = [
        param - learning_rate * scale + i * momentum
        for param, scale, i in zip(manual, scales, inc)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))
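
This test encodes the classical momentum recursion documented in the SGD docstrings of Examples #11, #12 and #16: the increment accumulates the scaled gradient and the parameter moves by the increment. A hedged NumPy sketch of the two steps being checked, with the unit SumOfParams gradient and one lr scaler folded into a single constant gradient term:

import numpy as np

learning_rate, momentum = 0.001, 0.5
param, inc = np.zeros(9), np.zeros(9)
grad = 0.02 * np.ones(9)           # unit gradient times the .02 lr scaler
for _ in range(2):                 # two sgd.train(dataset) calls in the test
    inc = momentum * inc - learning_rate * grad   # inc := momentum*inc - lr*grad
    param = param + inc                           # param := param + inc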
Example #8
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))

    manual = [
        param - learning_rate * scale + i * momentum
        for param, scale, i in zip(manual, scales, inc)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))
Example #9
def get_finetuner(model, trainset, batch_size=100, epochs=100):
    train_algo = SGD(batch_size=batch_size,
                     learning_rule=Momentum(init_momentum=0.5),
                     learning_rate=0.5,
                     monitoring_batches=batch_size,
                     monitoring_dataset=trainset,
                     cost=Dropout(input_include_probs={'h0': .5},
                                  input_scales={'h0': 2.}),
                     termination_criterion=EpochCounter(epochs))
    path = DATA_DIR + 'model' + str(SUBMODEL) + 'saved_daex.pkl'
    return Train(model=model,
                 algorithm=train_algo,
                 dataset=trainset,
                 save_path=path,
                 save_freq=10,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.9,
                                      start=0,
                                      saturate=int(epochs * 0.8)),
                     LinearDecayOverEpoch(start=1,
                                          saturate=int(epochs * 0.7),
                                          decay_factor=.02)
                 ])
Example #10
def get_trainer2(model, trainset, epochs=50):
    train_algo = SGD(
        batch_size=bsize,
        learning_rate=0.5,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=bsize,
        monitoring_dataset=trainset,
        cost=Dropout(input_include_probs={'h0': .8}, input_scales={'h0': 1.}),
        termination_criterion=EpochCounter(epochs),
    )
    path = DATA_DIR + 'model2saved_conv.pkl'
    return Train(model=model,
                 algorithm=train_algo,
                 dataset=trainset,
                 save_path=path,
                 save_freq=1,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.7,
                                      start=0,
                                      saturate=int(epochs * 0.5)),
                     LinearDecayOverEpoch(start=1,
                                          saturate=int(epochs * 0.8),
                                          decay_factor=.01)
                 ])
Example #11
File: sgd.py Project: antran89/pylearn2
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None, set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of Costs to represent "
                    " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
Example #12
File: sgd.py Project: antran89/pylearn2
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.

    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None, set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of Costs to represent "
                    " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [ param for param in model.get_params() if np.any(np.isinf(param.get_value())) ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value())) for param in model.get_params()]):
            nan_params = [ param for param in model.get_params() if np.any(np.isnan(param.get_value())) ]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError("batch_size argument to SGD conflicts with model's force_batch_size attribute")
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name, batch_size = self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(
                    dataset=self.monitoring_dataset,
                    cost=self.cost,
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                    extra_costs=self.monitoring_costs,
                    mode=self.monitor_iteration_mode
                    )
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param,1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update( dict(safe_zip(params, [param - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
                                    for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" % update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" % update.name)


        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
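
When SGD is not wrapped in a Train object as in the other examples, the three methods above are driven directly: setup compiles the Theano update function once, train performs one pass of the training iterator, and continue_learning defers to the termination criterion. A hedged driver sketch; in practice Train.main_loop wraps this loop and also runs the model's Monitor between epochs, which epoch-based termination criteria rely on.

# Hedged sketch; `model`, `dataset` and `criterion` are assumed to be a
# pylearn2 Model, Dataset and TerminationCriterion already constructed.
from pylearn2.training_algorithms.sgd import SGD

algorithm = SGD(learning_rate=0.01, batch_size=100,
                termination_criterion=criterion)
algorithm.setup(model=model, dataset=dataset)   # compiles sgd_update
while algorithm.continue_learning(model):       # asks the termination criterion
    algorithm.train(dataset=dataset)            # one pass of the training iterator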
Example #13
File: sgd.py Project: wumiyang/pylearn2
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
        Parameters
        ----------
        learning_rate : float
            The learning rate to use. Train object callbacks can change the \
            learning rate after each epoch. SGD update_callbacks can change \
            it after each minibatch.
        cost : pylearn2.costs.cost.Cost
            Cost object specifying the objective function to be minimized. \
            Optionally, may be None. In this case, SGD will call the model's \
            get_default_cost method to obtain the objective function.
        batch_size : optional, int
            The size of the batch to be used.
            If not specified, the model will be asked for the batch size, so
            you must have specified the batch size there.
            (Some models are rigidly defined to only work with one batch size)
        monitoring_batches : optional, int
            At the start of each epoch, we run "monitoring", to evaluate
            quantities such as the validation set error.
            monitoring_batches, if specified, determines the number of batches
            to draw from the iterator for each monitoring dataset.
            Unnecessary if not using monitoring or if `monitor_iteration_mode`
            is 'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
            TODO: make it possible to specify different monitoring_batches
            for each monitoring dataset. The Monitor itself already supports
            this.
        monitoring_dataset : optional, a Dataset or dictionary
            If not specified, no monitoring is used.
            If specified to be a Dataset, monitor on that Dataset.
            If specified to be dictionary, the keys should be string names
            of datasets, and the values should be Datasets. All monitoring
            channels will be computed for all monitoring Datasets and will
            have the dataset name and an underscore prepended to them.
        monitor_iteration_mode : optional, str
            The iteration mode used to iterate over the examples in all
            monitoring datasets. If not specified, defaults to 'sequential'.
            TODO: make it possible to specify different modes for different
            datasets.
        termination_criterion : optional, instance of
            pylearn2.termination_criteria.TerminationCriterion
            Used to determine when the algorithm should stop running.
            If not specified, runs forever--or more realistically, until
            external factors halt the python process (Kansas 1977).
        update_callbacks : optional, list
            If specified, each member of the list should be a callable that
            accepts an SGD instance as its only argument.
            All callbacks will be called with this SGD instance after each
            SGD step.
        learning_rule : training_algorithms.learning_rule.LearningRule
            A learning rule computes the new parameter values given old \
            parameters and first-order gradients. If learning_rule is None, \
            sgd.SGD will update parameters according to the standard SGD \
            learning rule:
                param := param - learning_rate * d cost / d param
            This argument allows more sophisticated learning rules, such
            as SGD with momentum.
        init_momentum : **DEPRECATED** option, float
            Use learning_rule instead.
            If None, does not use momentum otherwise, use momentum and \
            initialize the momentum coefficient to init_momentum. Callbacks \
            can change this over time just like the learning rate. If the \
            gradient is the same on every step, then the update taken by the \
            SGD algorithm is scaled by a factor of 1/(1-momentum). See \
            section 9 of Geoffrey Hinton's "A Practical Guide to Training \
            Restricted Boltzmann Machines" for details.
        set_batch_size : optional, bool
            Defaults to False.
            If True, and batch_size conflicts with model.force_batch_size, \
            will call model.set_batch_size(batch_size) in an attempt to \
            change model.force_batch_size
        train_iteration_mode : optional, str
            Defaults to 'shuffled_sequential'.
            The iteration mode to use for iterating through training examples.
        batches_per_iter : optional, int
            The number of batches to draw from the iterator over training
            examples.
            If iteration mode is 'sequential' or 'shuffled_sequential', this
            is unnecessary; when unspecified we will iterate over all examples.
        theano_function_mode : optional, a valid argument to theano.function's
            'mode' parameter.
            The theano mode to compile the updates function with. Note that \
            pylearn2 includes some wraplinker modes that are not bundled with \
            theano. See pylearn2.devtools. These extra modes let you do \
            things like check for NaNs at every step, or record md5 digests \
            of all computations performed by the update function to help \
            isolate problems with nondeterminism.
        monitoring_costs : optional, list
            a list of Cost instances. The Monitor will also include all
            channels defined by these Costs, even though we don't train
            using them.
        seed : optional, valid argument to np.random.RandomState
            The seed used for the random number generator to be passed to the
            training dataset iterator (if any)
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
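
As the docstring notes, monitoring_dataset may be a dictionary whose keys become channel-name prefixes. A hedged sketch of that spelling (Example #15 below uses the same pattern); train_set and valid_set are assumed to be Datasets already in scope, and channel names such as 'valid_objective' illustrate the prefixing rather than quote actual output.

# Monitoring on two named datasets; each channel gets the key and an
# underscore prepended, e.g. 'train_objective' and 'valid_objective'.
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.termination_criteria import EpochCounter

algorithm = SGD(learning_rate=0.01,
                batch_size=100,
                monitoring_dataset={'train': train_set, 'valid': valid_set},
                monitor_iteration_mode='sequential',
                termination_criterion=EpochCounter(max_epochs=50))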
Example #14
    def __init__(self,
                 layers,
                 random_state=None,
                 learning_rule='sgd',
                 learning_rate=0.01,
                 learning_momentum=0.9,
                 dropout=False,
                 batch_size=1,
                 n_iter=None,
                 n_stable=50,
                 f_stable=0.001,
                 valid_set=None,
                 valid_size=0.0,
                 verbose=False,
                 **params):

        self.layers = []
        for i, layer in enumerate(layers):
            assert isinstance(layer, Layer),\
                "Specify each layer as an instance of a `sknn.mlp.Layer` object."

            # Layer names are optional, if not specified then generate one.
            if layer.name is None:
                label = "hidden" if i < len(layers) - 1 else "output"
                layer.name = "%s%i" % (label, i)

            # sklearn may pass layers in as additional named parameters, remove them.
            if layer.name in params:
                del params[layer.name]

            self.layers.append(layer)

        # Don't support any additional parameters that are not in the constructor.
        # These are specified only so `get_params()` can return named layers, for double-
        # underscore syntax to work.
        assert len(params) == 0,\
            "The specified additional parameters are unknown."

        self.random_state = random_state
        self.learning_rule = learning_rule
        self.learning_rate = learning_rate
        self.learning_momentum = learning_momentum
        self.dropout = dropout if type(dropout) is float else (
            0.5 if dropout else 0.0)
        self.batch_size = batch_size
        self.n_iter = n_iter
        self.n_stable = n_stable
        self.f_stable = f_stable
        self.valid_set = valid_set
        self.valid_size = valid_size
        self.verbose = verbose

        self.unit_counts = None
        self.input_space = None
        self.mlp = None
        self.weights = None
        self.vs = None
        self.ds = None
        self.trainer = None
        self.f = None
        self.train_set = None
        self.best_valid_error = float("inf")

        self.cost = "Dropout" if dropout else None
        if learning_rule == 'sgd':
            self._learning_rule = None
        # elif learning_rule == 'adagrad':
        #     self._learning_rule = AdaGrad()
        elif learning_rule == 'adadelta':
            self._learning_rule = AdaDelta()
        elif learning_rule == 'momentum':
            self._learning_rule = Momentum(learning_momentum)
        elif learning_rule == 'nesterov':
            self._learning_rule = Momentum(learning_momentum,
                                           nesterov_momentum=True)
        elif learning_rule == 'rmsprop':
            self._learning_rule = RMSProp()
        else:
            raise NotImplementedError(
                "Learning rule type `%s` is not supported." % learning_rule)

        self._setup()
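
This constructor maps string names onto pylearn2 learning rules ('momentum', 'nesterov', 'adadelta', 'rmsprop'). A hedged usage sketch, assuming the surrounding class is scikit-neuralnetwork's MLP base and that sknn.mlp.Classifier and sknn.mlp.Layer expose this interface (neither appears in the example itself):

# Hedged sketch of the string-based learning_rule interface.
from sknn.mlp import Classifier, Layer

nn = Classifier(
    layers=[Layer("Rectifier", units=100), Layer("Softmax")],
    learning_rule='nesterov',   # mapped to Momentum(..., nesterov_momentum=True) above
    learning_momentum=0.9,
    learning_rate=0.01,
    n_iter=25)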
Example #15
model = MLP(layers=[h0, h1, h2, h3, h4, y],
            batch_size=batchSize,
            input_space=inputSpace)

algorithm = SGD(learning_rate=1e-3,
                cost=MethodCost("cost_from_X"),
                batch_size=batchSize,
                monitoring_batch_size=batchSize,
                monitoring_dataset={
                    'train': train,
                    'valid': valid
                },
                monitor_iteration_mode="even_batchwise_shuffled_sequential",
                termination_criterion=EpochCounter(max_epochs=200),
                learning_rule=Momentum(init_momentum=0.1,
                                       nesterov_momentum=True),
                train_iteration_mode="even_batchwise_shuffled_sequential")

train = Train(dataset=train,
              model=model,
              algorithm=algorithm,
              save_path="ConvNet2.pkl",
              save_freq=1,
              extensions=[
                  MonitorBasedSaveBest(channel_name="valid_y_misclass",
                                       save_path="ConvNet2_best.pkl")
              ])

print("Starting training session")

train.main_loop()
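
After main_loop finishes, the MonitorBasedSaveBest extension above has written the model with the best valid_y_misclass to ConvNet2_best.pkl. A hedged follow-up showing how such a pickle is typically reloaded with pylearn2's serial utilities:

from pylearn2.utils import serial

best_model = serial.load("ConvNet2_best.pkl")   # model saved by MonitorBasedSaveBest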
Example #16
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of Costs to represent "
                " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError(
                    "Specified an amount of monitoring batches but not a monitoring dataset."
                )
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
Example #17
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.
    A TrainingAlgorithm that does stochastic gradient descent on minibatches
    of training examples.

    For theoretical background on this algorithm, see Yoshua Bengio's machine
    learning course notes on the subject:

    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Cost object specifying the objective function to be minimized.
        Optionally, may be None. In this case, SGD will call the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used.
        If not specified, the model will be asked for the batch size, so
        you must have specified the batch size there.
        (Some models are rigidly defined to only work with one batch size)
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        At the start of each epoch, we run "monitoring", to evaluate
        quantities such as the validation set error.
        monitoring_batches, if specified, determines the number of batches
        to draw from the iterator for each monitoring dataset.
        Unnecessary if not using monitoring or if `monitor_iteration_mode`
        is 'sequential' and `batch_size` is specified (number of
        batches will be calculated based on full dataset size).
        TODO: make it possible to specify different monitoring_batches
        for each monitoring dataset. The Monitor itself already supports
        this.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used.
        If specified to be a Dataset, monitor on that Dataset.
        If specified to be dictionary, the keys should be string names
        of datasets, and the values should be Datasets. All monitoring
        channels will be computed for all monitoring Datasets and will
        have the dataset name and an underscore prepended to them.
    monitor_iteration_mode : str, optional
        The iteration mode used to iterate over the examples in all
        monitoring datasets. If not specified, defaults to 'sequential'.
        TODO: make it possible to specify different modes for different
        datasets.
    termination_criterion : instance of \
        pylearn2.termination_criteria.TerminationCriterion, optional

        Used to determine when the algorithm should stop running.
        If not specified, runs forever--or more realistically, until
        external factors halt the python process (Kansas 1977).
    update_callbacks : list, optional
        If specified, each member of the list should be a callable that
        accepts an SGD instance as its only argument.
        All callbacks will be called with this SGD instance after each
        SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        A learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule:

        .. code-block:: none

            param := param - learning_rate * d cost / d param

        This argument allows more sophisticated learning rules, such
        as SGD with momentum.
    init_momentum : float, **DEPRECATED** option
        Use learning_rule instead.
        If None, does not use momentum; otherwise, use momentum and
        initialize the momentum coefficient to init_momentum. Callbacks
        can change this over time just like the learning rate. If the
        gradient is the same on every step, then the update taken by the
        SGD algorithm is scaled by a factor of 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines" for details.
    set_batch_size : bool, optional
        Defaults to False.
        If True, and batch_size conflicts with model.force_batch_size,
        will call model.set_batch_size(batch_size) in an attempt to
        change model.force_batch_size
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential'.
        The iteration mode to use for iterating through training examples.
    batches_per_iter : int, optional
        The number of batches to draw from the iterator over training
        examples.
        If iteration mode is 'sequential' or 'shuffled_sequential', this
        is unnecessary; when unspecified we will iterate over all examples.
    theano_function_mode : a valid argument to theano.function's \
        'mode' parameter, optional

        The theano mode to compile the updates function with. Note that
        pylearn2 includes some wraplinker modes that are not bundled with
        theano. See pylearn2.devtools. These extra modes let you do
        things like check for NaNs at every step, or record md5 digests
        of all computations performed by the update function to help
        isolate problems with nondeterminism.
    monitoring_costs : list, optional
        a list of Cost instances. The Monitor will also include all
        channels defined by these Costs, even though we don't train
        using them.
    seed : valid argument to np.random.RandomState, optional
        The seed used for the random number generator to be passed to the
        training dataset iterator (if any)
    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule=None, init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None, batches_per_iter=None,
                 theano_function_mode=None, monitoring_costs=None,
                 seed=[2012, 10, 5], discriminator_steps=1):

        self.discriminator_steps = discriminator_steps
        self.train_generator = 0

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn","randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [param for param in model.get_params()
                      if np.any(np.isinf(param.get_value()))]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value()))
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if np.any(np.isnan(param.get_value()))]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # Check that the monitoring datasets are compatible with a forced batch size
        if getattr(model, "force_batch_size", False) and \
           any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for
               dataset in self.monitoring_dataset.values()) and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i
        self.params = params


        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with " + str(type(grads)) + " as its " +
                            "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        assert len(updates.keys()) == 0

        def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False):

            updates = OrderedDict()

            assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator)

            if learn_discriminator:
                cur_params = model.discriminator.get_params()
            else:
                cur_params = model.generator.get_params()

            def check():
                for param in params:
                    if param not in cur_params:
                        assert param not in updates

            cur_grads = OrderedDict()
            for param in cur_params:
                cur_grads[param] = grads[param]

            for param in grads:
                if grads[param].name is None and cost_value is not None:
                    grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                         {'costname': cost_value.name,
                                          'paramname': param.name})
                assert grads[param].dtype == param.dtype

            cur_lr_scalers = OrderedDict()
            for param in cur_params:
                if param in lr_scalers:
                    lr_scaler = lr_scalers[param]
                    cur_lr_scalers[param] = lr_scaler

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = param.name
                if param_name is None:
                    param_name = 'anon_param'
                lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
                log.info('\t' + param_name + ': ' + str(lr))

            updates.update(self.learning_rule.get_updates(
                    learning_rate, cur_grads, cur_lr_scalers))
            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()
            model.modify_updates(updates)
            check()
            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                update.name)

            check()

            if dont_you_fucking_dare_touch_the_generator:
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                return function(theano_args,
                                updates=updates,
                                name='sgd_update',
                                on_unused_input='ignore',
                                mode=self.theano_function_mode)
        self.d_func = get_func(1, 0, dont_you_fucking_dare_touch_the_generator=True)
        self.g_func = get_func(0, 1)

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """


        if not hasattr(self, 'd_func'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)


        on_load_batch = self.on_load_batch
        i = 0
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            if self.train_generator and i == self.discriminator_steps:
                self.g_func(*batch)
                i = 0
            else:
                self.d_func(*batch)
                i += 1
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)


        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

        self.train_generator = not self.train_generator

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
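Example #17 adapts SGD for adversarial training: setup() compiles two update functions, d_func (discriminator parameters only) and g_func (generator parameters only), and train() interleaves them according to discriminator_steps, flipping train_generator at the end of each epoch. The framework-free sketch below restates that per-epoch schedule; step_discriminator and step_generator are hypothetical stand-ins for the compiled d_func and g_func.

def run_adversarial_epoch(batches, discriminator_steps, train_generator,
                          step_discriminator, step_generator):
    """Apply one discriminator update per batch, inserting a generator update
    after every `discriminator_steps` discriminator steps when
    train_generator is set."""
    i = 0
    for batch in batches:
        if train_generator and i == discriminator_steps:
            step_generator(batch)
            i = 0
        else:
            step_discriminator(batch)
            i += 1
    # Mirrors `self.train_generator = not self.train_generator` above: the
    # caller alternates which epochs are allowed to update the generator.
    return not train_generator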
Example #18
0
File: sgd.py Project: yosinski/pylearn2
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule=None, init_momentum=None, set_batch_size=False,
                 train_iteration_mode=None, batches_per_iter=None,
                 theano_function_mode=None, monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
        Parameters
        ----------
        learning_rate : float
            The learning rate to use. Train object callbacks can change the \
            learning rate after each epoch. SGD update_callbacks can change \
            it after each minibatch.
        cost : pylearn2.costs.cost.Cost
            Cost object specifying the objective function to be minimized. \
            Optionally, may be None. In this case, SGD will call the model's \
            get_default_cost method to obtain the objective function.
        batch_size : optional, int
            The size of the batch to be used.
            If not specified, the model will be asked for the batch size, so
            you must have specified the batch size there.
            (Some models are rigidly defined to only work with one batch size)
        monitoring_batches : optional, int
            At the start of each epoch, we run "monitoring", to evaluate
            quantities such as the validation set error.
            monitoring_batches, if specified, determines the number of batches
            to draw from the iterator for each monitoring dataset.
            Unnecessary if not using monitoring or if `monitor_iteration_mode`
            is 'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
            TODO: make it possible to specify different monitoring_batches
            for each monitoring dataset. The Monitor itself already supports
            this.
        monitoring_dataset : optional, a Dataset or dictionary
            If not specified, no monitoring is used.
            If specified to be a Dataset, monitor on that Dataset.
            If specified to be dictionary, the keys should be string names
            of datasets, and the values should be Datasets. All monitoring
            channels will be computed for all monitoring Datasets and will
            have the dataset name and an underscore prepended to them.
        monitor_iteration_mode : optional, str
            The iteration mode used to iterate over the examples in all
            monitoring datasets. If not specified, defaults to 'sequential'.
            TODO: make it possible to specify different modes for different
            datasets.
        termination_criterion : optional, instance of
            pylearn2.termination_criteria.TerminationCriterion
            Used to determine when the algorithm should stop running.
            If not specified, runs forever--or more realistically, until
            external factors halt the python process (Kansas 1977).
        update_callbacks : optional, list
            If specified, each member of the list should be a callable that
            accepts an SGD instance as its only argument.
            All callbacks will be called with this SGD instance after each
            SGD step.
        learning_rule : training_algorithms.learning_rule.LearningRule
            A learning rule computes the new parameter values given old \
            parameters and first-order gradients. If learning_rule is None, \
            sgd.SGD will update parameters according to the standard SGD \
            learning rule:
                param := param - learning_rate * d cost / d param
            This argument allows more sophisticated learning rules, such
            as SGD with momentum.
        init_momentum : **DEPRECATED** option, float
            Use learning_rule instead.
            If None, does not use momentum; otherwise, use momentum and \
            initialize the momentum coefficient to init_momentum. Callbacks \
            can change this over time just like the learning rate. If the \
            gradient is the same on every step, then the update taken by the \
            SGD algorithm is scaled by a factor of 1/(1-momentum). See \
            section 9 of Geoffrey Hinton's "A Practical Guide to Training \
            Restricted Boltzmann Machines" for details.
        set_batch_size : optional, bool
            Defaults to False.
            If True, and batch_size conflicts with model.force_batch_size, \
            will call model.set_batch_size(batch_size) in an attempt to \
            change model.force_batch_size
        train_iteration_mode : optional, str
            Defaults to 'shuffled_sequential'.
            The iteration mode to use for iterating through training examples.
        batches_per_iter : optional, int
            The number of batches to draw from the iterator over training
            examples.
            If iteration mode is 'sequential' or 'shuffled_sequential', this
            is unnecessary; when unspecified we will iterate over all examples.
        theano_function_mode : optional, a valid argument to theano.function's
            'mode' parameter.
            The theano mode to compile the updates function with. Note that \
            pylearn2 includes some wraplinker modes that are not bundled with \
            theano. See pylearn2.devtools. These extra modes let you do \
            things like check for NaNs at every step, or record md5 digests \
            of all computations performed by the update function to help \
            isolate problems with nondeterminism.
        monitoring_costs : optional, list
            a list of Cost instances. The Monitor will also include all
            channels defined by these Costs, even though we don't train
            using them.
        seed : optional, valid argument to np.random.RandomState
            The seed used for the random number generator to be passed to the
            training dataset iterator (if any)
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
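The init_momentum docstring in Example #18 claims that, for a constant gradient, the momentum update is equivalent to plain SGD with the step scaled by 1/(1-momentum). A small standalone numpy check of that claim (no pylearn2 code involved):

import numpy as np

learning_rate, momentum, grad = 0.01, 0.9, 1.0
inc = 0.0
for _ in range(200):
    # classical momentum update: inc := momentum * inc - learning_rate * grad
    inc = momentum * inc - learning_rate * grad
steady_state_step = learning_rate * grad / (1.0 - momentum)
# the asymptotic step size is the plain SGD step scaled by 1/(1-momentum)
assert np.isclose(-inc, steady_state_step)
print("%.6f %.6f" % (-inc, steady_state_step))  # both approximately 0.1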
Example #19
0
    def run_sgd(mode):
        # Must be seeded the same both times run_sgd is called
        disturb_mem.disturb_mem()
        rng = np.random.RandomState([2012, 11, 27])

        batch_size = 5
        train_batches = 3
        valid_batches = 4
        num_features = 2

        # Synthesize dataset with a linear decision boundary
        w = rng.randn(num_features)

        def make_dataset(num_batches):
            disturb_mem.disturb_mem()
            m = num_batches * batch_size
            X = rng.randn(m, num_features)
            y = np.zeros((m, 1))
            y[:, 0] = np.dot(X, w) > 0.

            rval = DenseDesignMatrix(X=X, y=y)

            rval.yaml_src = ""  # suppress no yaml_src warning

            X = rval.get_batch_design(batch_size)
            assert X.shape == (batch_size, num_features)

            return rval

        train = make_dataset(train_batches)
        valid = make_dataset(valid_batches)

        num_chunks = 10
        chunk_width = 2

        class ManyParamsModel(Model):
            """
            Make a model with lots of parameters, so that there are many
            opportunities for their updates to get accidentally re-ordered
            non-deterministically. This makes non-determinism bugs manifest
            more frequently.
            """
            def __init__(self):
                super(ManyParamsModel, self).__init__()
                self.W1 = [
                    sharedX(rng.randn(num_features, chunk_width))
                    for i in xrange(num_chunks)
                ]
                disturb_mem.disturb_mem()
                self.W2 = [
                    sharedX(rng.randn(chunk_width)) for i in xrange(num_chunks)
                ]
                self._params = safe_union(self.W1, self.W2)
                self.input_space = VectorSpace(num_features)
                self.output_space = VectorSpace(1)

        disturb_mem.disturb_mem()
        model = ManyParamsModel()
        disturb_mem.disturb_mem()

        class LotsOfSummingCost(Cost):
            """
            Make a cost whose gradient on the parameters involves summing many
            terms together, so that T.grad is more likely to sum things in a
            random order.
            """

            supervised = True

            def expr(self, model, data, **kwargs):
                self.get_data_specs(model)[0].validate(data)
                X, Y = data
                disturb_mem.disturb_mem()

                def mlp_pred(non_linearity):
                    Z = [T.dot(X, W) for W in model.W1]
                    H = map(non_linearity, Z)
                    Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                    pred = sum(Z)
                    return pred

                nonlinearity_predictions = map(
                    mlp_pred, [T.nnet.sigmoid, T.nnet.softplus, T.sqr, T.sin])
                pred = sum(nonlinearity_predictions)
                disturb_mem.disturb_mem()

                return abs(pred - Y[:, 0]).sum()

            def get_data_specs(self, model):
                data = CompositeSpace(
                    (model.get_input_space(), model.get_output_space()))
                source = (model.get_input_source(), model.get_target_source())
                return (data, source)

        cost = LotsOfSummingCost()

        disturb_mem.disturb_mem()

        algorithm = SGD(
            cost=cost,
            batch_size=batch_size,
            learning_rule=Momentum(.5),
            learning_rate=1e-3,
            monitoring_dataset={
                'train': train,
                'valid': valid
            },
            update_callbacks=[ExponentialDecay(decay_factor=2., min_lr=.0001)],
            termination_criterion=EpochCounter(max_epochs=5))

        disturb_mem.disturb_mem()

        train_object = Train(dataset=train,
                             model=model,
                             algorithm=algorithm,
                             extensions=[
                                 PolyakAveraging(start=0),
                                 MomentumAdjustor(final_momentum=.9,
                                                  start=1,
                                                  saturate=5),
                             ],
                             save_freq=0)

        disturb_mem.disturb_mem()

        train_object.main_loop()
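Example #19's run_sgd builds an identically seeded model, cost, and SGD instance on each call, so any difference between two runs points to nondeterministic ordering of the parameter updates. A hedged driver sketch, assuming run_sgd were extended to return the final parameter values (for instance via `return [p.get_value() for p in model.get_params()]`):

import numpy as np

def assert_run_sgd_deterministic(run_sgd, mode):
    # run the identically seeded training twice and compare final parameters
    first = run_sgd(mode)
    second = run_sgd(mode)
    for a, b in zip(first, second):
        assert np.array_equal(a, b), "non-deterministic parameter update detected"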
Example #20
0
File: sgd.py Project: sebastien-j/pylearn2
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.
    A TrainingAlgorithm that does stochastic gradient descent on minibatches
    of training examples.

    For theoretical background on this algorithm, see Yoshua Bengio's machine
    learning course notes on the subject:

    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Cost object specifying the objective function to be minimized.
        Optionally, may be None. In this case, SGD will call the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used.
        If not specified, the model will be asked for the batch size, so
        you must have specified the batch size there.
        (Some models are rigidly defined to only work with one batch size)
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        At the start of each epoch, we run "monitoring", to evaluate
        quantities such as the validation set error.
        monitoring_batches, if specified, determines the number of batches
        to draw from the iterator for each monitoring dataset.
        Unnecessary if not using monitoring or if `monitor_iteration_mode`
        is 'sequential' and `batch_size` is specified (number of
        batches will be calculated based on full dataset size).
        TODO: make it possible to specify different monitoring_batches
        for each monitoring dataset. The Monitor itself already supports
        this.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used.
        If specified to be a Dataset, monitor on that Dataset.
        If specified to be dictionary, the keys should be string names
        of datasets, and the values should be Datasets. All monitoring
        channels will be computed for all monitoring Datasets and will
        have the dataset name and an underscore prepended to them.
    monitor_iteration_mode : str, optional
        The iteration mode used to iterate over the examples in all
        monitoring datasets. If not specified, defaults to 'sequential'.
        TODO: make it possible to specify different modes for different
        datasets.
    termination_criterion : instance of \
        pylearn2.termination_criteria.TerminationCriterion, optional

        Used to determine when the algorithm should stop running.
        If not specified, runs forever--or more realistically, until
        external factors halt the python process (Kansas 1977).
    update_callbacks : list, optional
        If specified, each member of the list should be a callable that
        accepts an SGD instance as its only argument.
        All callbacks will be called with this SGD instance after each
        SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        A learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule:

        .. code-block:: none

            param := param - learning_rate * d cost / d param

        This argument allows more sophisticated learning rules, such
        as SGD with momentum.
    init_momentum : float, **DEPRECATED** option
        Use learning_rule instead.
        If None, does not use momentum; otherwise, use momentum and
        initialize the momentum coefficient to init_momentum. Callbacks
        can change this over time just like the learning rate. If the
        gradient is the same on every step, then the update taken by the
        SGD algorithm is scaled by a factor of 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines" for details.
    set_batch_size : bool, optional
        Defaults to False.
        If True, and batch_size conflicts with model.force_batch_size,
        will call model.set_batch_size(batch_size) in an attempt to
        change model.force_batch_size
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential'.
        The iteration mode to use for iterating through training examples.
    batches_per_iter : int, optional
        The number of batches to draw from the iterator over training
        examples.
        If iteration mode is 'sequential' or 'shuffled_sequential', this
        is unnecessary; when unspecified we will iterate over all examples.
    theano_function_mode : a valid argument to theano.function's \
        'mode' parameter, optional

        The theano mode to compile the updates function with. Note that
        pylearn2 includes some wraplinker modes that are not bundled with
        theano. See pylearn2.devtools. These extra modes let you do
        things like check for NaNs at every step, or record md5 digests
        of all computations performed by the update function to help
        isolate problems with nondeterminism.
    monitoring_costs : OrderedDict, optional
        A dictionary of Cost instances. Keys should be strings containing
        the name of the cost. The Monitor will also include all
        channels defined by these Costs, even though we don't train
        using them.
    seed : valid argument to np.random.RandomState, optional
        The seed used for the random number generator to be passed to the
        training dataset iterator (if any)
    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule=None, init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None, batches_per_iter=None,
                 theano_function_mode=None, monitoring_costs=None,
                 seed=[2012, 10, 5]):

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn","randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def _setup_monitor(self):
        """
        Set up monitor to model the objective value, learning rate,
        momentum (if applicable), and extra channels defined by
        the cost.

        This method must be called after `learning_rule.get_updates`,
        since it may have an effect on `learning_rule.add_channels_to_monitor`
        (that is currently the case for `learning_rule.RMSProp`).
        """
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=self.learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [param for param in model.get_params()
                      if contains_inf(param.get_value())]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([contains_nan(param.get_value())
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if contains_nan(param.get_value())]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # Check that the train and monitoring datasets are compatible with a
        # forced batch size
        has_force_batch_size = getattr(model, "force_batch_size", False)
        train_dataset_is_uneven = \
            dataset.get_num_examples() % self.batch_size != 0

        has_monitoring_datasets = \
            self.monitoring_dataset is not None and \
            len(self.monitoring_dataset) > 0

        if has_monitoring_datasets:
            monitoring_datasets_are_uneven = \
                any(d.get_num_examples() % self.batch_size
                    != 0 for d in self.monitoring_dataset.values())
        else:
            monitoring_datasets_are_uneven = False  # or True it doesn't matter

        if has_force_batch_size and train_dataset_is_uneven and \
           not has_uniform_batch_size(self.train_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set train_iteration_mode (and "
                             "maybe monitor_iteration_mode) to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        if has_force_batch_size and has_monitoring_datasets and \
           monitoring_datasets_are_uneven and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        learning_rate = self.learning_rate
        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with " + str(type(grads)) + " as its " +
                            "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})
            assert grads[param].dtype == param.dtype

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param,1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(dict(safe_zip(params,
                [param - learning_rate * lr_scalers.get(param, 1.) * grads[param]
                 for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.modify_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if contains_inf(update_val):
                    raise ValueError("debug value of %s contains infs" %
                            update.name)
                if contains_nan(update_val):
                    raise ValueError("debug value of %s contains nans" %
                            update.name)


        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost.
        # We have to do that after learning_rule.get_updates has been
        # called, since it may have an effect on
        # learning_rule.add_channels_to_monitor (that is currently the case
        # for AdaDelta and RMSProp).
        self._setup_monitor()

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN in " + param.name)

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
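Example #20 is stricter than the earlier variants: setup() raises a ValueError whenever a model forces a fixed batch size and any dataset's example count is not a multiple of batch_size, unless an even_* iteration mode is used, and monitoring_costs is now an OrderedDict keyed by channel name rather than a list. A hedged configuration sketch (train_set and valid_set are placeholder Dataset objects, e.g. the MNIST splits from Example #22 below; SGD and EpochCounter are imported as in that example):

batch_size = 100
# fall back to an even_* mode when the dataset size is not a multiple of the
# batch size, so the divisibility check in setup() passes
train_mode = ('shuffled_sequential'
              if train_set.get_num_examples() % batch_size == 0
              else 'even_shuffled_sequential')

algorithm = SGD(learning_rate=0.01,
                batch_size=batch_size,
                train_iteration_mode=train_mode,
                monitor_iteration_mode='even_sequential',
                monitoring_dataset={'train': train_set, 'valid': valid_set},
                termination_criterion=EpochCounter(10))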
Example #21
0
                                       dim=128,
                                       irange=0.001,
                                       init_bias=0)
    hidden_layer2 = mlp.RectifiedLinear(layer_name='hidden2',
                                        dim=128,
                                        irange=0.01,
                                        init_bias=0)
    hidden_layer3 = mlp.RectifiedLinear(layer_name='hidden3',
                                        dim=128,
                                        irange=0.01,
                                        init_bias=0)
    # create Softmax output layer
    output_layer = mlp.Softmax(3, 'output', irange=.1)
    # create Stochastic Gradient Descent trainer that runs for 1000 epochs
    cost = NegativeLogLikelihoodCost()
    rule = Momentum(0.9)
    # rule = Momentum(0.9, True)
    # update_callbacks=ExponentialDecay(1 + 1e-5, 0.001)
    trainer = sgd.SGD(learning_rate=0.01,
                      cost=cost,
                      batch_size=128,
                      termination_criterion=EpochCounter(1000),
                      monitoring_dataset=vds,
                      learning_rule=rule)
    layers = [hidden_layer, hidden_layer2, output_layer]
    # create neural net that takes two inputs
    ann = mlp.MLP(layers, nvis=ds.feat_cnt)

    trainer.setup(ann, ds)
    print trainer.cost
    # train neural net until the termination criterion is true
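Example #21 is cut off after the final comment. When a Train object is not used, the epoch loop that usually follows trainer.setup() looks roughly like the sketch below; this is the common manual pylearn2 pattern, not the original snippet's continuation.

while True:
    trainer.train(dataset=ds)
    ann.monitor.report_epoch()
    ann.monitor()                        # run the monitoring channels on vds
    if not trainer.continue_learning(ann):
        break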
Example #22
0
from pylearn2.costs.cost import MethodCost
from pylearn2.datasets.mnist import MNIST
from pylearn2.models.mlp import MLP, Sigmoid, Softmax
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.training_algorithms.learning_rule import Momentum, MomentumAdjustor
from pylearn2.termination_criteria import EpochCounter

train_set = MNIST(which_set='train', start=0, stop=50000)
valid_set = MNIST(which_set='train', start=50000, stop=60000)
test_set = MNIST(which_set='test')

model = MLP(nvis=784,
            layers=[Sigmoid(layer_name='h', dim=500, irange=0.01),
                    Softmax(layer_name='y', n_classes=10, irange=0.01)])

algorithm = SGD(batch_size=100, learning_rate=0.01,
                learning_rule=Momentum(init_momentum=0.5),
                monitoring_dataset={'train': train_set,
                                    'valid': valid_set,
                                    'test': test_set},
                cost=MethodCost('cost_from_X'),
                termination_criterion=EpochCounter(10))

train = Train(dataset=train_set, model=model, algorithm=algorithm,
              save_path="mnist_example.pkl", save_freq=1,
              extensions=[MomentumAdjustor(start=5, saturate=6,
                                           final_momentum=0.95)])

train.main_loop()
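Once train.main_loop() in Example #22 finishes, the pickled model can be reloaded and its monitoring history inspected. A brief hedged sketch; the channel name 'valid_y_misclass' is assumed from the usual "<dataset>_<layer channel>" naming convention for the Softmax layer 'y' and the 'valid' monitoring dataset.

from pylearn2.utils import serial

model = serial.load("mnist_example.pkl")
channel = model.monitor.channels['valid_y_misclass']
# one entry per monitored epoch; the last entry is the final validation error
print(channel.val_record[-1])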
Example #23
0
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.

    """
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: **DEPRECATED** if None, does not use momentum;
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            learning_rule: training_algorithms.learning_rule.LearningRule,
                           a learning rule computes the new parameter values given
                           old parameters and first-order gradients. If learning_rule
                           is None, sgd.SGD will update parameters according to
                           the standard SGD learning rule.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of Costs to represent "
                "a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "The init_momentum interface is deprecated and will become "
                "officially unsupported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum`."
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError(
                    "Specified a number of monitoring batches but not a "
                    "monitoring dataset."
                )
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        nan_params = [
            param for param in model.get_params()
            if np.any(np.isnan(param.get_value()))
        ]
        if len(nan_params) > 0:
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError(
                                "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                            )
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
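        # data_specs is a (space, sources) pair describing the data the cost
        # needs, e.g. (CompositeSpace([...]), ('features', 'targets')); the
        # mapping below flattens any nesting so each space/source appears once.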
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch
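        # on_load_batch is a list of callbacks that train() will apply to each
        # (re-nested) batch before calling the compiled sgd_update function.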

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Give the cost expression a default name so it shows up clearly
            # in the monitor.
            cost_value.name = 'objective'

        # Set up the monitor to track the objective value, learning rate,
        # momentum (if applicable), and any extra channels defined by
        # the cost.
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })

        lr_scalers = model.get_lr_scalers()
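        # lr_scalers maps a parameter to a per-parameter multiplier on the
        # global learning rate; parameters absent from the dict default to 1.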

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(
                self.learning_rule.get_updates(learning_rate, grads,
                                               lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(dict(safe_zip(
                params,
                [param - learning_rate * lr_scalers.get(param, 1.) * grads[param]
                 for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
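        # censor_updates gives the model a chance to modify or constrain the
        # proposed updates (e.g. to enforce norm constraints) before the
        # update function is compiled.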
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with SGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                                    batch_size=self.batch_size,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng,
                                    num_batches=self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN or Inf in " + param.name)

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
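
# Hedged usage sketch (not part of the original listing): how a driver such as
# pylearn2.train.Train typically exercises the three methods above. The names
# `my_model` and `my_dataset` are placeholders.
#
#     algorithm = SGD(learning_rate=0.01, batch_size=100,
#                     termination_criterion=EpochCounter(10))
#     algorithm.setup(my_model, my_dataset)
#     while True:
#         algorithm.train(my_dataset)
#         my_model.monitor.report_epoch()
#         my_model.monitor()
#         if not algorithm.continue_learning(my_model):
#             break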