def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """Store the SGD configuration and build the training RNG.

    Rejects the legacy "collection of costs" interface and converts the
    deprecated ``init_momentum`` argument into a ``Momentum`` learning
    rule under the hood.
    """
    # The old interface allowed passing several costs to be summed;
    # require the explicit SumOfCosts wrapper instead.
    if isinstance(cost, (list, tuple, set)):
        raise TypeError("SGD no longer supports using collections of Costs to "
                        "represent a sum of Costs. Use "
                        "pylearn2.costs.cost.SumOfCosts instead.")

    if init_momentum:
        # Deprecated path: warn, then translate into the new
        # learning_rule API.
        warnings.warn("init_momentum interface is deprecated and will "
                      "become officially unsuported as of May 9, 2014. "
                      "Please use the `learning_rule` parameter instead, "
                      "providing an object of type "
                      "`pylearn2.training_algorithms.learning_rule.Momentum`"
                      " instead")
        self.learning_rule = Momentum(init_momentum)
    else:
        self.learning_rule = learning_rule

    # Shared variable so callbacks can change the rate without
    # recompiling the update function.
    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")
    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    if train_iteration_mode is None:
        train_iteration_mode = 'shuffled_sequential'
    self.train_iteration_mode = train_iteration_mode
    self.first = True
    self.rng = make_np_rng(seed, which_method=["randn", "randint"])
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def get_ae_pretrainer(layer, data, batch_size, epochs=30):
    """Build a Train object that pretrains ``layer`` as an autoencoder.

    Uses momentum SGD on mean-squared reconstruction error for a fixed
    number of epochs, with momentum ramp-up and learning-rate decay.
    """
    initial_learning_rate = 0.05
    # For Contractive / HigherOrderContractive autoencoders, substitute a
    # SumOfCosts combining MeanSquaredReconstructionError with
    # MethodCost('contraction_penalty') (plus, for the higher-order
    # variant, MethodCost('higher_order_penalty')).
    algorithm = SGD(
        batch_size=batch_size,
        learning_rate=initial_learning_rate,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=batch_size,
        monitoring_dataset=data,
        cost=MeanSquaredReconstructionError(),
        termination_criterion=EpochCounter(epochs))
    schedule = [
        MomentumAdjustor(final_momentum=0.9, start=0, saturate=25),
        LinearDecayOverEpoch(start=1, saturate=25, decay_factor=.02),
    ]
    return Train(model=layer, algorithm=algorithm, dataset=data,
                 extensions=schedule)
def get_layer_trainer_sgd_autoencoder(layer, trainset, batch_size=10,
                                      learning_rate=0.1, max_epochs=100,
                                      name=''):
    """Build a Train object that fits ``layer`` as an autoencoder.

    Trains with momentum SGD on mean-squared reconstruction error for at
    most ``max_epochs`` epochs, logging progress under ``name``.
    """
    # An AdaDelta learning rule could be substituted for momentum here.
    algorithm = SGD(
        learning_rate=learning_rate,
        learning_rule=Momentum(init_momentum=0.5),
        cost=MeanSquaredReconstructionError(),
        batch_size=batch_size,
        monitoring_dataset=trainset,
        termination_criterion=EpochCounter(max_epochs=max_epochs),
        update_callbacks=None)
    schedule = [
        LoggingCallback(name),
        OneOverEpoch(start=1, half_life=5),
        MomentumAdjustor(final_momentum=0.7, start=10, saturate=100),
    ]
    return Train(model=layer, algorithm=algorithm,
                 extensions=schedule, dataset=trainset)
def __init__(
    self,
    learning_rate,
    cost=None,
    batch_size=None,
    monitoring_batches=None,
    monitoring_dataset=None,
    monitor_iteration_mode="sequential",
    termination_criterion=None,
    update_callbacks=None,
    learning_rule=None,
    init_momentum=None,
    set_batch_size=False,
    train_iteration_mode=None,
    batches_per_iter=None,
    theano_function_mode=None,
    monitoring_costs=None,
    seed=[2012, 10, 5],
):
    """Record the SGD configuration.

    Validates the cost argument, maps the deprecated ``init_momentum``
    onto a ``Momentum`` learning rule, and stores everything else as
    attributes for ``setup``/``train`` to use later.
    """
    # Collections of costs were retired in favor of SumOfCosts.
    if isinstance(cost, (list, tuple, set)):
        raise TypeError(
            "SGD no longer supports using collections of Costs to "
            "represent a sum of Costs. Use "
            "pylearn2.costs.cost.SumOfCosts instead."
        )

    if init_momentum:
        # Legacy argument: warn and wrap it in the new learning rule.
        warnings.warn(
            "init_momentum interface is deprecated and will become "
            "officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of "
            "type `pylearn2.training_algorithms.learning_rule.Momentum`"
            " instead"
        )
        self.learning_rule = Momentum(init_momentum)
    else:
        self.learning_rule = learning_rule

    # Shared variable: callbacks may adjust the rate between steps.
    self.learning_rate = sharedX(learning_rate, "learning_rate")
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError(
            "Specified an amount of monitoring batches "
            "but not a monitoring dataset."
        )
    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    if train_iteration_mode is None:
        train_iteration_mode = "shuffled_sequential"
    self.train_iteration_mode = train_iteration_mode
    self.first = True
    self.rng = make_np_rng(seed, which_method=["randn", "randint"])
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers
    """
    # Use a cost besides SumOfParams so data is actually drawn from the
    # training set and the expected number of updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def params_close_to(expected):
        return all(np.allclose(want, param.get_value())
                   for want, param in zip(expected, model.get_params()))

    expected = [param.get_value() for param in model.get_params()]

    # With zero momentum each step subtracts learning_rate * scale.
    expected = [param - learning_rate * scale
                for param, scale in zip(expected, scales)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)

    expected = [param - learning_rate * scale
                for param, scale in zip(expected, scales)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # Use a cost besides SumOfParams so data is actually drawn from the
    # training set and the expected number of updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    # NOTE(review): `shapes`, `scales` and `learning_rate` are read from
    # module scope here (unlike test_momentum, which defines them
    # locally) -- confirm they are defined at module level.
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    expected = [param.get_value() for param in model.get_params()]

    # Hand-rolled Nesterov step: velocity accumulates the scaled
    # gradient, while the applied update looks ahead by
    # momentum * velocity.
    velocity = [-learning_rate * scale for scale in scales]
    steps = [-learning_rate * scale + v * momentum
             for scale, v in izip(scales, velocity)]
    expected = [param + step for param, step in izip(expected, steps)]
    sgd.train(dataset=dataset)
    assert all(np.allclose(want, param.get_value())
               for want, param in izip(expected, model.get_params()))

    velocity = [-learning_rate * scale + v * momentum
                for scale, v in izip(scales, velocity)]
    steps = [-learning_rate * scale + v * momentum
             for scale, v in izip(scales, velocity)]
    expected = [param + step for param, step in izip(expected, steps)]
    sgd.train(dataset=dataset)
    assert all(np.allclose(want, param.get_value())
               for want, param in izip(expected, model.get_params()))
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # Use a cost besides SumOfParams so data is actually drawn from the
    # training set and the expected number of updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def params_close_to(expected):
        return all(np.allclose(want, param.get_value())
                   for want, param in zip(expected, model.get_params()))

    expected = [param.get_value() for param in model.get_params()]

    # First step: velocity starts at zero, so the update is just the
    # scaled gradient.
    velocity = [-learning_rate * scale for scale in scales]
    expected = [param + v for param, v in zip(expected, velocity)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)

    # Second step: the previous velocity decays by the momentum factor.
    expected = [param - learning_rate * scale + v * momentum
                for param, scale, v in zip(expected, scales, velocity)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """
    # Use a cost besides SumOfParams so data is actually drawn from the
    # training set and the expected number of updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def params_close_to(expected):
        return all(np.allclose(want, param.get_value())
                   for want, param in zip(expected, model.get_params()))

    expected = [param.get_value() for param in model.get_params()]

    # First step: velocity starts at zero, so each parameter moves by
    # its scaled gradient only.
    velocity = [-learning_rate * scale for scale in scales]
    expected = [param + v for param, v in zip(expected, velocity)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)

    # Second step: previous velocity contributes through the momentum
    # factor on top of the scaled gradient.
    expected = [param - learning_rate * scale + v * momentum
                for param, scale, v in zip(expected, scales, velocity)]
    sgd.train(dataset=dataset)
    assert params_close_to(expected)
def get_finetuner(model, trainset, batch_size=100, epochs=100):
    """Build a Train object that fine-tunes ``model`` with dropout SGD.

    Checkpoints the model every 10 epochs and schedules momentum
    ramp-up plus learning-rate decay over the run.
    """
    algorithm = SGD(
        batch_size=batch_size,
        learning_rule=Momentum(init_momentum=0.5),
        learning_rate=0.5,
        monitoring_batches=batch_size,
        monitoring_dataset=trainset,
        cost=Dropout(input_include_probs={'h0': .5},
                     input_scales={'h0': 2.}),
        termination_criterion=EpochCounter(epochs))
    save_path = DATA_DIR + 'model' + str(SUBMODEL) + 'saved_daex.pkl'
    schedule = [
        MomentumAdjustor(final_momentum=0.9, start=0,
                         saturate=int(epochs * 0.8)),
        LinearDecayOverEpoch(start=1, saturate=int(epochs * 0.7),
                             decay_factor=.02),
    ]
    return Train(model=model, algorithm=algorithm, dataset=trainset,
                 save_path=save_path, save_freq=10, extensions=schedule)
def get_trainer2(model, trainset, epochs=50):
    """Build a Train object for the second-stage model with dropout SGD.

    Saves a checkpoint every epoch and schedules momentum ramp-up plus
    learning-rate decay.

    NOTE(review): `bsize` is read from module scope -- confirm it is
    defined wherever this helper is imported.
    """
    algorithm = SGD(
        batch_size=bsize,
        learning_rate=0.5,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=bsize,
        monitoring_dataset=trainset,
        cost=Dropout(input_include_probs={'h0': .8},
                     input_scales={'h0': 1.}),
        termination_criterion=EpochCounter(epochs),
    )
    save_path = DATA_DIR + 'model2saved_conv.pkl'
    schedule = [
        MomentumAdjustor(final_momentum=0.7, start=0,
                         saturate=int(epochs * 0.5)),
        LinearDecayOverEpoch(start=1, saturate=int(epochs * 0.8),
                             decay_factor=.01),
    ]
    return Train(model=model, algorithm=algorithm, dataset=trainset,
                 save_path=save_path, save_freq=1, extensions=schedule)
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """
    Configure minibatch stochastic gradient descent.

    learning_rate: The learning rate to use. Train object callbacks can
        change the learning rate after each epoch. SGD update_callbacks
        can change it after each minibatch.
    cost: a pylearn2.costs.cost.Cost object specifying the objective
        function to be minimized. Optionally, may be None. In this case,
        SGD will call the model's get_default_cost method to obtain the
        objective function.
    init_momentum: **DEPRECATED** if None, does not use momentum;
        otherwise, use momentum and initialize the momentum coefficient
        to init_momentum. Callbacks can change this over time just like
        the learning rate. If the gradient is the same on every step,
        then the update taken by the SGD algorithm is scaled by a factor
        of 1/(1-momentum). See section 9 of Geoffrey Hinton's "A
        Practical Guide to Training Restricted Boltzmann Machines" for
        details.
    learning_rule: training_algorithms.learning_rule.LearningRule, a
        learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule.
    set_batch_size: if True, and batch_size conflicts with
        model.force_batch_size, will call model.set_batch_size(batch_size)
        in an attempt to change model.force_batch_size
    theano_function_mode: The theano mode to compile the updates function
        with. Note that pylearn2 includes some wraplinker modes that are
        not bundled with theano. See pylearn2.devtools. These extra modes
        let you do things like check for NaNs at every step, or record
        md5 digests of all computations performed by the update function
        to help isolate problems with nondeterminism.

    Parameters are updated by the formula:

    inc := momentum * inc - learning_rate * d cost / d param
    param := param + inc
    """
    if isinstance(cost, (list, tuple, set)):
        # BUGFIX: the original pair of concatenated fragments produced a
        # doubled space ("represent  a sum").
        raise TypeError("SGD no longer supports using collections of "
                        "Costs to represent a sum of Costs. Use "
                        "pylearn2.costs.cost.SumOfCosts instead.")

    # BUGFIX: compare against None explicitly so a deprecated but
    # explicit init_momentum=0.0 is still honored; the docstring
    # promises "if None, does not use momentum", but a bare truth test
    # silently dropped an explicit zero.
    if init_momentum is not None:
        # BUGFIX: "unsuported" typo corrected in the warning text.
        warnings.warn("init_momentum interface is deprecated and will "
                      "become officially unsupported as of May 9, 2014. "
                      "Please use the `learning_rule` parameter instead, "
                      "providing an object of type "
                      "`pylearn2.training_algorithms.learning_rule."
                      "Momentum` instead")
        # Convert to new interface under the hood.
        self.learning_rule = Momentum(init_momentum)
    else:
        self.learning_rule = learning_rule

    # Shared variable so callbacks can change the rate without
    # recompiling the update function.
    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")
    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    if train_iteration_mode is None:
        train_iteration_mode = 'shuffled_sequential'
    self.train_iteration_mode = train_iteration_mode
    self.first = True
    # RNG used by the (possibly stochastic) training-set iterator.
    # Note: `seed` is a list default but is never mutated, so the shared
    # default object is safe here.
    self.rng = np.random.RandomState(seed)
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.
    """

    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None,
                 set_batch_size = False,
                 train_iteration_mode = None, batches_per_iter=None,
                 theano_function_mode = None, monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
        Configure the minibatch SGD training algorithm.

        learning_rate: The learning rate to use. Train object callbacks
            can change the learning rate after each epoch. SGD
            update_callbacks can change it after each minibatch.
        cost: a pylearn2.costs.cost.Cost object specifying the objective
            function to be minimized. Optionally, may be None. In this
            case, SGD will call the model's get_default_cost method to
            obtain the objective function.
        init_momentum: **DEPRECATED** if None, does not use momentum
            otherwise, use momentum and initialize the momentum
            coefficient to init_momentum. Callbacks can change this over
            time just like the learning rate. If the gradient is the same
            on every step, then the update taken by the SGD algorithm is
            scaled by a factor of 1/(1-momentum). See section 9 of
            Geoffrey Hinton's "A Practical Guide to Training Restricted
            Boltzmann Machines" for details.
        learning_rule: training_algorithms.learning_rule.LearningRule,
            a learning rule computes the new parameter values given old
            parameters and first-order gradients. If learning_rule is
            None, sgd.SGD will update parameters according to the
            standard SGD learning rule.
        set_batch_size: if True, and batch_size conflicts with
            model.force_batch_size, will call
            model.set_batch_size(batch_size) in an attempt to change
            model.force_batch_size
        theano_function_mode: The theano mode to compile the updates
            function with. Note that pylearn2 includes some wraplinker
            modes that are not bundled with theano. See
            pylearn2.devtools. These extra modes let you do things like
            check for NaNs at every step, or record md5 digests of all
            computations performed by the update function to help isolate
            problems with nondeterminism.

        Parameters are updated by the formula:

        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc
        """
        if isinstance(cost, (list, tuple, set)):
            # NOTE(review): the concatenated message below yields a
            # doubled space ("represent  a sum") -- looks accidental.
            raise TypeError("SGD no longer supports using collections of Costs to represent "
                    " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")
        if init_momentum:
            # NOTE(review): a bare truth test silently ignores an
            # explicit init_momentum=0.0; the docstring says "if None,
            # does not use momentum", suggesting `is not None` was
            # intended -- confirm.
            warnings.warn("init_momentum interface is deprecated and will "
                    "become officially unsuported as of May 9, 2014. Please use the "
                    "`learning_rule` parameter instead, providing an object of type "
                    "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule
        # Shared variable so callbacks can change the rate without
        # recompiling the update function.
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        # RNG used by the (possibly stochastic) training-set iterator.
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        """
        Compile the Theano update function and wire up monitoring for
        ``model``. Must be called before ``train``.
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        # Refuse to start from parameters that are already Inf or NaN.
        inf_params = [param for param in model.get_params()
                      if np.any(np.isinf(param.get_value()))]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value()))
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if np.any(np.isnan(param.get_value()))]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        # Reconcile our batch_size with any size the model insists on.
        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError("batch_size argument to SGD conflicts with model's force_batch_size attribute")
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format
        # compatible with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            # NOTE(review): indexing .keys() like this only works on
            # Python 2 -- confirm the supported interpreter version.
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)
            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        # Give anonymous parameters stable names for update bookkeeping.
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)

        # The cost must provide a gradient for every parameter, and
        # nothing but the parameters.
        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})

        lr_scalers = model.get_lr_scalers()
        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param,1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            # Delegate the update formula (momentum, AdaDelta, ...) to
            # the learning rule.
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(dict(safe_zip(params, [param - learning_rate *
                lr_scalers.get(param, 1.) * grads[param]
                                    for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        # Let the model veto or modify updates (e.g. norm constraints).
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            # When Theano debug values are enabled, check update
            # expressions for Inf/NaN before compiling.
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                            update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                            update.name)

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        """
        Run one epoch of SGD updates over ``dataset``.
        """
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        # NOTE(review): the message says "NaN" but the check also fires
        # on Inf values.
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        # Deterministic iteration modes must not receive an RNG.
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it
        # returns flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            # Cost-defined callbacks see the batch in nested form.
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

    def continue_learning(self, model):
        """
        Return True while training should keep going.

        NOTE(review): the termination criterion is evaluated on
        ``self.model``, not on the ``model`` argument -- confirm this is
        intentional.
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """
    Configure minibatch stochastic gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size for gradient descent. Train object callbacks may
        adjust it between epochs; SGD update_callbacks may adjust it
        between minibatches.
    cost : pylearn2.costs.cost.Cost, optional
        Objective to minimize. When None, the model's
        get_default_cost() supplies the objective.
    batch_size : int, optional
        Minibatch size. When omitted, the model must specify the batch
        size (some models only work with one batch size).
    monitoring_batches : int, optional
        Number of batches drawn per monitoring dataset at the start of
        each epoch. Requires `monitoring_dataset`. Unnecessary for
        sequential monitoring with a known batch_size.
    monitoring_dataset : Dataset or dict, optional
        A Dataset, or a dict mapping string names to Datasets, to
        monitor on. Channel names get the dataset name and an
        underscore prepended. No monitoring when omitted.
    monitor_iteration_mode : str, optional
        Iteration mode over examples in all monitoring datasets;
        defaults to 'sequential'.
    termination_criterion : TerminationCriterion, optional
        Decides when the algorithm stops; runs forever when omitted.
    update_callbacks : list, optional
        Callables accepting this SGD instance as their only argument,
        called after each SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        Computes new parameter values from old parameters and
        first-order gradients. When None, the standard rule
        param := param - learning_rate * d cost / d param is used.
    init_momentum : float, optional
        **DEPRECATED** -- use learning_rule instead. When given, a
        Momentum learning rule is constructed with this coefficient.
    set_batch_size : bool, optional
        If True and batch_size conflicts with model.force_batch_size,
        call model.set_batch_size(batch_size) to try to change it.
    train_iteration_mode : str, optional
        Iteration mode over training examples; defaults to
        'shuffled_sequential'.
    batches_per_iter : int, optional
        Number of training batches drawn per epoch; unnecessary for
        (shuffled_)sequential modes, where all examples are used.
    theano_function_mode : optional
        Theano compilation mode for the update function; pylearn2's
        wraplinker modes add NaN checks or md5 digests of computations.
    monitoring_costs : list of Cost, optional
        Extra costs whose channels the Monitor reports without
        training on them.
    seed : valid argument to np.random.RandomState, optional
        Seed for the RNG handed to the training dataset iterator.
    """
    if isinstance(cost, (list, tuple, set)):
        raise TypeError("SGD no longer supports using collections of "
                        "Costs to represent a sum of Costs. Use "
                        "pylearn2.costs.cost.SumOfCosts instead.")

    if init_momentum:
        # Legacy momentum argument: emit the deprecation warning, then
        # translate it into the equivalent learning rule.
        warnings.warn("init_momentum interface is deprecated and will "
                      "become officially unsuported as of May 9, 2014. "
                      "Please use the `learning_rule` parameter instead, "
                      "providing an object of type "
                      "`pylearn2.training_algorithms.learning_rule."
                      "Momentum` instead")
        self.learning_rule = Momentum(init_momentum)
    else:
        self.learning_rule = learning_rule

    # Shared variable so callbacks can adjust the rate at runtime.
    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")
    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    if train_iteration_mode is None:
        train_iteration_mode = 'shuffled_sequential'
    self.train_iteration_mode = train_iteration_mode
    self.first = True
    self.rng = make_np_rng(seed, which_method=["randn", "randint"])
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def __init__(self, layers, random_state=None, learning_rule='sgd',
             learning_rate=0.01, learning_momentum=0.9, dropout=False,
             batch_size=1, n_iter=None, n_stable=50, f_stable=0.001,
             valid_set=None, valid_size=0.0, verbose=False, **params):
    """Build a scikit-learn-compatible neural network wrapper.

    Validates and names the given layers, stores all hyper-parameters
    as attributes, selects the pylearn2 learning rule matching the
    ``learning_rule`` string, and finishes by calling ``self._setup()``.

    Raises
    ------
    AssertionError
        If a layer is not a ``Layer`` instance, or unknown extra keyword
        parameters remain after layer-name keywords are removed.
    NotImplementedError
        If ``learning_rule`` is not one of the supported rule names.
    """
    self.layers = []
    for i, layer in enumerate(layers):
        assert isinstance(layer, Layer),\
            "Specify each layer as an instance of a `sknn.mlp.Layer` object."

        # Layer names are optional, if not specified then generate one.
        if layer.name is None:
            # The last layer is the output layer; all earlier ones are hidden.
            label = "hidden" if i < len(layers) - 1 else "output"
            layer.name = "%s%i" % (label, i)

        # sklearn may pass layers in as additional named parameters, remove them.
        if layer.name in params:
            del params[layer.name]

        self.layers.append(layer)

    # Don't support any additional parameters that are not in the constructor.
    # These are specified only so `get_params()` can return named layers, for double-
    # underscore syntax to work.
    # NOTE(review): `assert` is stripped under `python -O`; a raise would be
    # sturdier validation — confirm before changing the exception type.
    assert len(params) == 0,\
        "The specified additional parameters are unknown."

    self.random_state = random_state
    self.learning_rule = learning_rule
    self.learning_rate = learning_rate
    self.learning_momentum = learning_momentum
    # dropout may be a float (explicit probability) or a bool (0.5 if True).
    self.dropout = dropout if type(dropout) is float else (
        0.5 if dropout else 0.0)
    self.batch_size = batch_size
    self.n_iter = n_iter
    self.n_stable = n_stable
    self.f_stable = f_stable
    self.valid_set = valid_set
    self.valid_size = valid_size
    self.verbose = verbose

    # Internal state filled in lazily by later setup/fit steps.
    self.unit_counts = None
    self.input_space = None
    self.mlp = None
    self.weights = None
    self.vs = None
    self.ds = None
    self.trainer = None
    self.f = None
    self.train_set = None
    self.best_valid_error = float("inf")

    # "Dropout" selects the dropout cost downstream; None means default cost.
    self.cost = "Dropout" if dropout else None

    # Map the learning-rule name to a pylearn2 LearningRule instance.
    # None means plain SGD (no special rule).
    if learning_rule == 'sgd':
        self._learning_rule = None
    # elif learning_rule == 'adagrad':
    #     self._learning_rule = AdaGrad()
    elif learning_rule == 'adadelta':
        self._learning_rule = AdaDelta()
    elif learning_rule == 'momentum':
        self._learning_rule = Momentum(learning_momentum)
    elif learning_rule == 'nesterov':
        self._learning_rule = Momentum(learning_momentum,
                                       nesterov_momentum=True)
    elif learning_rule == 'rmsprop':
        self._learning_rule = RMSProp()
    else:
        raise NotImplementedError(
            "Learning rule type `%s` is not supported." % learning_rule)

    self._setup()
# Assemble the MLP from the six layers defined earlier (h0..h4 plus the
# softmax/output layer `y`) and train it with momentum SGD.
# Assumes h0..h4, y, batchSize, inputSpace, train and valid are defined
# above this point in the script — TODO confirm against the full file.
model = MLP(layers=[h0, h1, h2, h3, h4, y],
            batch_size=batchSize,
            input_space=inputSpace)

algorithm = SGD(learning_rate=1e-3,
                cost=MethodCost("cost_from_X"),
                batch_size=batchSize,
                monitoring_batch_size=batchSize,
                monitoring_dataset={'train': train, 'valid': valid},
                # "even_*" modes keep every batch the same size, which the
                # convolutional model requires.
                monitor_iteration_mode="even_batchwise_shuffled_sequential",
                termination_criterion=EpochCounter(max_epochs=200),
                learning_rule=Momentum(init_momentum=0.1,
                                       nesterov_momentum=True),
                train_iteration_mode="even_batchwise_shuffled_sequential")

# NOTE(review): `train` is rebound here from the training Dataset to the
# Train object; the dataset is still reachable via the keyword argument.
train = Train(dataset=train,
              model=model,
              algorithm=algorithm,
              save_path="ConvNet2.pkl",
              save_freq=1,
              extensions=[
                  # Checkpoint the best model by validation misclassification.
                  MonitorBasedSaveBest(channel_name="valid_y_misclass",
                                       save_path="ConvNet2_best.pkl")
              ])

print("Starting training session")
train.main_loop()
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """Configure a minibatch stochastic gradient descent trainer.

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change it
        after each epoch; SGD update_callbacks after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Objective to minimize. If None, the model's get_default_cost()
        is used.
    batch_size : int, optional
        Minibatch size; if None the model must define one.
    monitoring_batches : int, optional
        Number of batches drawn per monitoring dataset each epoch.
        Requires `monitoring_dataset`.
    monitoring_dataset : Dataset or dict, optional
        Dataset(s) to monitor on; dict keys are dataset names.
    monitor_iteration_mode : str, optional
        Iteration mode for monitoring datasets (default 'sequential').
    termination_criterion : TerminationCriterion, optional
        Decides when training stops; None means run forever.
    update_callbacks : list, optional
        Callables invoked with this SGD instance after each step.
    learning_rule : LearningRule, optional
        Computes parameter updates from gradients; None means plain SGD.
    init_momentum : float, optional
        **DEPRECATED** — use `learning_rule=Momentum(...)` instead.
    set_batch_size : bool, optional
        If True, call model.set_batch_size(batch_size) on conflict with
        model.force_batch_size.
    train_iteration_mode : str, optional
        Iteration mode for training (default 'shuffled_sequential').
    batches_per_iter : int, optional
        Batches drawn from the training iterator per epoch.
    theano_function_mode : optional
        Mode to compile the theano update function with.
    monitoring_costs : list, optional
        Extra Cost instances whose channels the Monitor should include.
    seed : valid argument to np.random.RandomState, optional
        Seed for the training-iterator RNG. The mutable default list is
        never mutated here, so it is safe, if unconventional.

    Raises
    ------
    TypeError
        If `cost` is a collection (use SumOfCosts instead).
    ValueError
        If `monitoring_batches` is given without `monitoring_dataset`.
    """
    if isinstance(cost, (list, tuple, set)):
        # Bug fix: the previous fragments concatenated to a double space
        # ("represent  a sum of Costs").
        raise TypeError(
            "SGD no longer supports using collections of Costs to represent "
            "a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

    if init_momentum:
        # Typo fix in the user-facing message: "unsuported" -> "unsupported".
        warnings.warn(
            "init_momentum interface is deprecated and will "
            "become officially unsupported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
        # Convert to new interface under the hood.
        self.learning_rule = Momentum(init_momentum)
    else:
        self.learning_rule = learning_rule

    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None:
        if monitoring_batches is not None:
            raise ValueError(
                "Specified an amount of monitoring batches but not a monitoring dataset."
            )
    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    if train_iteration_mode is None:
        train_iteration_mode = 'shuffled_sequential'
    self.train_iteration_mode = train_iteration_mode
    self.first = True
    self.rng = np.random.RandomState(seed)
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.

    A TrainingAlgorithm that does stochastic gradient descent on minibatches
    of training examples. This variant is specialized for adversarial
    training: it compiles two update functions, one for the model's
    discriminator and one for its generator, and `train` alternates between
    them (`discriminator_steps` discriminator updates per generator update).

    For theoretical background on plain SGD, see Yoshua Bengio's machine
    learning course notes on the subject:
    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Objective function to be minimized. If None, SGD calls the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used. If not specified, the model will
        be asked for the batch size.
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        Number of batches to draw from the iterator for each monitoring
        dataset at the start of each epoch. Unnecessary when not
        monitoring, or when the count can be derived from the dataset size.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used. A dict maps string dataset
        names to Datasets; channel names get the dataset name plus an
        underscore prepended.
    monitor_iteration_mode : str, optional
        Iteration mode for the monitoring datasets (default 'sequential').
    termination_criterion : TerminationCriterion, optional
        Determines when the algorithm stops; None means run forever.
    update_callbacks : list, optional
        Callables accepting this SGD instance, called after each SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        Computes new parameter values from old parameters and gradients.
        None means the standard rule:
        param := param - learning_rate * d cost / d param
    init_momentum : float, **DEPRECATED**
        Use learning_rule instead.
    set_batch_size : bool, optional
        If True and batch_size conflicts with model.force_batch_size, call
        model.set_batch_size(batch_size). Defaults to False.
    train_iteration_mode : str, optional
        Iteration mode for training (default 'shuffled_sequential').
    batches_per_iter : int, optional
        Batches drawn from the training iterator per epoch.
    theano_function_mode : optional
        Mode to compile the update functions with (see pylearn2.devtools
        for wraplinker modes useful for NaN checks / determinism digests).
    monitoring_costs : list, optional
        Extra Cost instances whose channels the Monitor should include.
    seed : valid argument to np.random.RandomState, optional
        Seed for the training-iterator RNG.
    discriminator_steps : int, optional
        Number of discriminator updates performed before each generator
        update (see `train`). Defaults to 1.
    """
    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule = None, init_momentum = None,
                 set_batch_size = False, train_iteration_mode = None,
                 batches_per_iter=None, theano_function_mode = None,
                 monitoring_costs=None, seed=[2012, 10, 5],
                 discriminator_steps=1):
        # How many discriminator steps to take per generator step, and a
        # flag (0/1) toggled each epoch selecting which player trains.
        self.discriminator_steps = discriminator_steps
        self.train_generator = 0
        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
                    "become officially unsuported as of May 9, 2014. Please use the "
                    "`learning_rule` parameter instead, providing an object of type "
                    "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            # Monitoring knobs make no sense without a monitoring dataset.
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn","randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Builds the cost graph, wires up monitoring channels, and compiles
        two update functions: `self.d_func` (discriminator-only updates)
        and `self.g_func` (generator-only updates).

        Parameters
        ----------
        model : a Model instance
            Must expose `discriminator` and `generator` sub-models.
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        # Refuse to start from already-broken parameters.
        inf_params = [param for param in model.get_params()
                      if np.any(np.isinf(param.get_value()))]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value()))
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if np.any(np.isnan(param.get_value()))]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # test if force batch size and batch size
        if getattr(model, "force_batch_size", False) and \
           any(dataset.get_design_matrix().shape[0] % self.batch_size != 0
               for dataset in self.monitoring_dataset.values()) and \
           not has_uniform_batch_size(self.monitor_iteration_mode):
            # NOTE(review): missing space after "batch size." in the
            # concatenated message below — left as-is (runtime string).
            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            # NOTE(review): dict.keys()[0] is Python-2-only; under Python 3
            # this raises TypeError (dict_keys is not subscriptable).
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                        self.monitor,
                        monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i
        self.params = params

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with" + str(type(grads)) + "as its " +
                            "first member. Expected OrderedDict.")

        # The cost must provide a gradient for every parameter and nothing
        # else.
        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        # This GAN variant does not support costs that supply their own
        # updates; the learning rule produces all of them below.
        assert len(updates.keys()) == 0

        def get_func(learn_discriminator, learn_generator,
                     dont_you_fucking_dare_touch_the_generator=False):
            # Compile an update function for exactly one of the two players.
            updates = OrderedDict()

            assert (learn_discriminator or learn_generator) and \
                not (learn_discriminator and learn_generator)

            if learn_discriminator:
                cur_params = model.discriminator.get_params()
            else:
                cur_params = model.generator.get_params()

            def check():
                # Invariant: parameters of the *other* player never receive
                # updates from this function.
                for param in params:
                    if param not in cur_params:
                        assert param not in updates

            cur_grads = OrderedDict()
            for param in cur_params:
                cur_grads[param] = grads[param]

            for param in grads:
                if grads[param].name is None and cost_value is not None:
                    grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                         {'costname': cost_value.name,
                                          'paramname': param.name})
                assert grads[param].dtype == param.dtype

            cur_lr_scalers = OrderedDict()
            for param in cur_params:
                if param in lr_scalers:
                    lr_scaler = lr_scalers[param]
                    cur_lr_scalers[param] = lr_scaler

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = param.name
                if param_name is None:
                    param_name = 'anon_param'
                lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
                log.info('\t' + param_name + ': ' + str(lr))

            updates.update(self.learning_rule.get_updates(
                learning_rate, cur_grads, cur_lr_scalers))

            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()
            # Let the model veto/censor updates (e.g. weight norm limits).
            model.modify_updates(updates)
            check()
            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                # If theano debug values are enabled, catch bad updates early.
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                         update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                         update.name)
            check()

            if dont_you_fucking_dare_touch_the_generator:
                # Extra paranoia for the discriminator function: assert the
                # generator's parameters are entirely absent from updates.
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                return function(theano_args,
                                updates=updates,
                                name='sgd_update',
                                on_unused_input='ignore',
                                mode=self.theano_function_mode)

        self.d_func = get_func(1, 0,
                               dont_you_fucking_dare_touch_the_generator=True)
        self.g_func = get_func(0, 1)

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Each epoch trains only one player: discriminator updates by
        default, with a generator update after every
        `discriminator_steps` batches when `self.train_generator` is set.
        The flag is toggled at the end of the epoch.

        Parameters
        ----------
        dataset : Dataset
        """
        if not hasattr(self, 'd_func'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        # NOTE(review): message says "NaN" even when the value is Inf.
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with SGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                batch_size=self.batch_size,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng, num_batches = self.batches_per_iter)

        on_load_batch = self.on_load_batch
        i = 0
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            if self.train_generator and i == self.discriminator_steps:
                # After discriminator_steps discriminator batches, take one
                # generator step and restart the count.
                self.g_func(*batch)
                i = 0
            else:
                self.d_func(*batch)
                i += 1
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)
        # Alternate which player trains on the next epoch.
        self.train_generator = not self.train_generator

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
            NOTE(review): the argument is ignored; the criterion is
            evaluated on `self.model` instead — confirm this is intended.
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """Set up a minibatch stochastic gradient descent trainer.

    Parameters
    ----------
    learning_rate : float
        Step size. Train callbacks may change it per epoch, SGD
        update_callbacks per minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Objective to minimize; None means use the model's default cost.
    batch_size : int, optional
        Minibatch size; if None, the model must supply one.
    monitoring_batches : int, optional
        Batches drawn per monitoring dataset each epoch. Only valid
        together with `monitoring_dataset`.
    monitoring_dataset : Dataset or dict, optional
        What to monitor on; a dict maps names to Datasets. None disables
        monitoring.
    monitor_iteration_mode : str, optional
        Iteration mode over monitoring datasets (default 'sequential').
    termination_criterion : TerminationCriterion, optional
        When to stop; None runs indefinitely.
    update_callbacks : list, optional
        Callables invoked with this SGD instance after every step.
    learning_rule : LearningRule, optional
        Update rule; None selects the plain SGD rule
        param := param - learning_rate * d cost / d param.
    init_momentum : float, optional
        **DEPRECATED** — use `learning_rule` with a Momentum object.
    set_batch_size : bool, optional
        If True, attempt model.set_batch_size(batch_size) on conflict
        with model.force_batch_size. Defaults to False.
    train_iteration_mode : str, optional
        Training iteration mode (default 'shuffled_sequential').
    batches_per_iter : int, optional
        Batches drawn from the training iterator per epoch.
    theano_function_mode : optional
        Mode used when compiling the theano update function.
    monitoring_costs : list, optional
        Extra Costs whose channels the Monitor should also report.
    seed : valid argument to np.random.RandomState, optional
        Seed for the training-iterator RNG.
    """
    if isinstance(cost, (list, tuple, set)):
        raise TypeError(
            "SGD no longer supports using collections of Costs to represent "
            "a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

    if not init_momentum:
        self.learning_rule = learning_rule
    else:
        warnings.warn(
            "init_momentum interface is deprecated and will become "
            "officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead")
        # Legacy path: wrap the old-style coefficient in the new rule object.
        self.learning_rule = Momentum(init_momentum)

    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode

    # A monitoring batch count without a monitoring dataset is a user error.
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")

    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)
    self.train_iteration_mode = ('shuffled_sequential'
                                 if train_iteration_mode is None
                                 else train_iteration_mode)
    self.first = True
    self.rng = np.random.RandomState(seed)
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def run_sgd(mode):
    """Run a small, fully seeded SGD session for determinism testing.

    Builds a synthetic linearly-separable dataset, a model with many small
    parameter chunks, and a cost whose gradient sums many terms, then runs
    a short training loop. The `disturb_mem` calls deliberately perturb the
    heap between steps so that any accidental dependence on memory layout
    (non-deterministic update ordering) shows up as diverging results.

    NOTE(review): `mode` is not referenced in this body — presumably it is
    meant to select a theano compilation mode; confirm against the caller.
    Statement order here is intentional — do not reorder.
    """
    # Must be seeded the same both times run_sgd is called
    disturb_mem.disturb_mem()
    rng = np.random.RandomState([2012, 11, 27])

    batch_size = 5
    train_batches = 3
    valid_batches = 4
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        # Labels are 1 where the point lies on the positive side of w.
        disturb_mem.disturb_mem()
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = np.zeros((m, 1))
        y[:, 0] = np.dot(X, w) > 0.

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        # Sanity-check that batches come out with the expected shape.
        X = rval.get_batch_design(batch_size)
        assert X.shape == (batch_size, num_features)

        return rval

    train = make_dataset(train_batches)
    valid = make_dataset(valid_batches)

    num_chunks = 10
    chunk_width = 2

    class ManyParamsModel(Model):
        """
        Make a model with lots of parameters, so that there are many
        opportunities for their updates to get accidentally re-ordered
        non-deterministically. This makes non-determinism bugs manifest
        more frequently.
        """

        def __init__(self):
            super(ManyParamsModel, self).__init__()
            self.W1 = [sharedX(rng.randn(num_features, chunk_width))
                       for i in xrange(num_chunks)]
            disturb_mem.disturb_mem()
            self.W2 = [sharedX(rng.randn(chunk_width))
                       for i in xrange(num_chunks)]
            self._params = safe_union(self.W1, self.W2)
            self.input_space = VectorSpace(num_features)
            self.output_space = VectorSpace(1)

    disturb_mem.disturb_mem()
    model = ManyParamsModel()
    disturb_mem.disturb_mem()

    class LotsOfSummingCost(Cost):
        """
        Make a cost whose gradient on the parameters involves summing many
        terms together, so that T.grad is more likely to sum things in a
        random order.
        """

        supervised = True

        def expr(self, model, data, **kwargs):
            # Sum predictions from several nonlinearities so the gradient
            # graph contains many independent terms.
            self.get_data_specs(model)[0].validate(data)
            X, Y = data
            disturb_mem.disturb_mem()

            def mlp_pred(non_linearity):
                Z = [T.dot(X, W) for W in model.W1]
                H = map(non_linearity, Z)
                Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                pred = sum(Z)
                return pred

            nonlinearity_predictions = map(
                mlp_pred, [T.nnet.sigmoid, T.nnet.softplus, T.sqr, T.sin])
            pred = sum(nonlinearity_predictions)
            disturb_mem.disturb_mem()

            # L1-style error against the binary targets.
            return abs(pred - Y[:, 0]).sum()

        def get_data_specs(self, model):
            # (input, target) composite spec for a supervised cost.
            data = CompositeSpace(
                (model.get_input_space(), model.get_output_space()))
            source = (model.get_input_source(), model.get_target_source())
            return (data, source)

    cost = LotsOfSummingCost()

    disturb_mem.disturb_mem()

    algorithm = SGD(cost=cost,
                    batch_size=batch_size,
                    learning_rule=Momentum(.5),
                    learning_rate=1e-3,
                    monitoring_dataset={'train': train, 'valid': valid},
                    update_callbacks=[ExponentialDecay(decay_factor=2.,
                                                       min_lr=.0001)],
                    termination_criterion=EpochCounter(max_epochs=5))

    disturb_mem.disturb_mem()

    train_object = Train(dataset=train,
                         model=model,
                         algorithm=algorithm,
                         extensions=[
                             PolyakAveraging(start=0),
                             MomentumAdjustor(final_momentum=.9, start=1,
                                              saturate=5),
                         ],
                         save_freq=0)

    disturb_mem.disturb_mem()

    train_object.main_loop()
class SGD(TrainingAlgorithm):
    """
    SGD = (Minibatch) Stochastic Gradient Descent.
    A TrainingAlgorithm that does stochastic gradient descent on
    minibatches of training examples.

    For theoretical background on this algorithm, see Yoshua Bengio's
    machine learning course notes on the subject:

    http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html

    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change the
        learning rate after each epoch. SGD update_callbacks can change
        it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        Cost object specifying the objective function to be minimized.
        Optionally, may be None. In this case, SGD will call the model's
        get_default_cost method to obtain the objective function.
    batch_size : int, optional
        The size of the batch to be used.
        If not specified, the model will be asked for the batch size, so
        you must have specified the batch size there.
        (Some models are rigidly defined to only work with one batch size)
    monitoring_batch_size : int, optional
        The size of the monitoring batches.
    monitoring_batches : int, optional
        At the start of each epoch, we run "monitoring", to evaluate
        quantities such as the validation set error.
        monitoring_batches, if specified, determines the number of batches
        to draw from the iterator for each monitoring dataset.
        Unnecessary if not using monitoring or if `monitor_iteration_mode`
        is 'sequential' and `batch_size` is specified (number of
        batches will be calculated based on full dataset size).
        TODO: make it possible to specify different monitoring_batches
        for each monitoring dataset. The Monitor itself already supports
        this.
    monitoring_dataset : Dataset or dictionary, optional
        If not specified, no monitoring is used.
        If specified to be a Dataset, monitor on that Dataset.
        If specified to be dictionary, the keys should be string names
        of datasets, and the values should be Datasets. All monitoring
        channels will be computed for all monitoring Datasets and will
        have the dataset name and an underscore prepended to them.
    monitor_iteration_mode : str, optional
        The iteration mode used to iterate over the examples in all
        monitoring datasets. If not specified, defaults to 'sequential'.
        TODO: make it possible to specify different modes for different
        datasets.
    termination_criterion : instance of \
        pylearn2.termination_criteria.TerminationCriterion, optional
        Used to determine when the algorithm should stop running.
        If not specified, runs forever--or more realistically, until
        external factors halt the python process (Kansas 1977).
    update_callbacks : list, optional
        If specified, each member of the list should be a callable that
        accepts an SGD instance as its only argument.
        All callbacks will be called with this SGD instance after each
        SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule, optional
        A learning rule computes the new parameter values given old
        parameters and first-order gradients. If learning_rule is None,
        sgd.SGD will update parameters according to the standard SGD
        learning rule:

        .. code-block:: none

            param := param - learning_rate * d cost / d param

        This argument allows more sophisticated learning rules, such
        as SGD with momentum.
    init_momentum : float, **DEPRECATED** option
        Use learning_rule instead.
        If None, does not use momentum otherwise, use momentum and
        initialize the momentum coefficient to init_momentum. Callbacks
        can change this over time just like the learning rate. If the
        gradient is the same on every step, then the update taken by the
        SGD algorithm is scaled by a factor of 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines" for details.
    set_batch_size : bool, optional
        Defaults to False.
        If True, and batch_size conflicts with model.force_batch_size,
        will call model.set_batch_size(batch_size) in an attempt to
        change model.force_batch_size
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential'.
        The iteration mode to use for iterating through training examples.
    batches_per_iter : int, optional
        The number of batches to draw from the iterator over training
        examples.
        If iteration mode is 'sequential' or 'shuffled_sequential', this
        is unnecessary; when unspecified we will iterate over all examples.
    theano_function_mode : a valid argument to theano.function's \
        'mode' parameter, optional
        The theano mode to compile the updates function with. Note that
        pylearn2 includes some wraplinker modes that are not bundled with
        theano. See pylearn2.devtools. These extra modes let you do
        things like check for NaNs at every step, or record md5 digests
        of all computations performed by the update function to help
        isolate problems with nondeterminism.
    monitoring_costs : OrderedDict, optional
        A dictionary of Cost instances. Keys should be string containing
        the name of the cost. The Monitor will also include all
        channels defined by these Costs, even though we don't train
        using them.
    seed : valid argument to np.random.RandomState, optional
        The seed used for the random number generate to be passed to the
        training dataset iterator (if any)
    """

    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batch_size=None, monitoring_batches=None,
                 monitoring_dataset=None, monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule=None, init_momentum=None,
                 set_batch_size=False, train_iteration_mode=None,
                 batches_per_iter=None, theano_function_mode=None,
                 monitoring_costs=None, seed=[2012, 10, 5]):
        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn("init_momentum interface is deprecated and will "
                          "become officially unsuported as of May 9, 2014. "
                          "Please use the "
                          "`learning_rule` parameter instead, providing an "
                          "object of type "
                          "`pylearn2.training_algorithms.learning_rule."
                          "Momentum` instead")
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            # Monitoring options are meaningless without a dataset to
            # monitor on; reject them early.
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def _setup_monitor(self):
        """
        Set up monitor to model the objective value, learning rate,
        momentum (if applicable), and extra channels defined by
        the cost.

        This method must be called after `learning_rule.get_updates`,
        since it may have an effect on
        `learning_rule.add_channels_to_monitor`
        (that is currently the case for `learning_rule.RMSProp`).
        """
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                # Fall back to the training batch geometry.
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            # Any monitoring dataset will do as the anchor for
            # data-independent channels.
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            # TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=self.learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor,
                    monitoring_dataset)

    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        # Refuse to start from already-corrupted parameters.
        inf_params = [param for param in model.get_params()
                      if contains_inf(param.get_value())]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any([contains_nan(param.get_value())
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if contains_nan(param.get_value())]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # test if force batch size and batch size
        has_force_batch_size = getattr(model, "force_batch_size", False)
        train_dataset_is_uneven = \
            dataset.get_num_examples() % self.batch_size != 0

        # BUGFIX: this used to be `self.monitoring_dataset.values() > 0`,
        # an ordering comparison between a list and an int that only
        # "worked" under Python 2's arbitrary cross-type ordering (and
        # always evaluated True there). Test emptiness explicitly.
        has_monitoring_datasets = \
            self.monitoring_dataset is not None and \
            len(self.monitoring_dataset) > 0

        if has_monitoring_datasets:
            monitoring_datasets_are_uneven = \
                any(d.get_num_examples() % self.batch_size != 0
                    for d in self.monitoring_dataset.values())
        else:
            monitoring_datasets_are_uneven = False  # or True it doesn't matter

        if has_force_batch_size and train_dataset_is_uneven and \
           not has_uniform_batch_size(self.train_iteration_mode):
            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set train_iteration_mode (and "
                             "maybe monitor_iteration_mode) to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        if has_force_batch_size and has_monitoring_datasets and \
           monitoring_datasets_are_uneven and \
           not has_uniform_batch_size(self.monitor_iteration_mode):
            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format
        # compatible with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        learning_rate = self.learning_rate
        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            # (fixed) error message previously lacked spaces around the
            # interpolated type name.
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with " + str(type(grads)) + " as its " +
                            "first member. Expected OrderedDict.")

        # The cost must provide a gradient for exactly the model's params.
        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})
            assert grads[param].dtype == param.dtype

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError(
                    "Tried to scale the learning rate on " +
                    str(key) + " which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(self.learning_rule.get_updates(
                learning_rate, grads, lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(dict(safe_zip(params,
                                         [param - learning_rate *
                                          lr_scalers.get(param, 1.) *
                                          grads[param]
                                          for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        # Give the model a chance to constrain/censor its own updates.
        model.modify_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if contains_inf(update_val):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if contains_nan(update_val):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost.
        # We have to do that after learning_rule.get_updates has been
        # called, since it may have an effect on
        # learning_rule.add_channels_to_monitor (that is currently the case
        # for AdaDelta and RMSProp).
        self._setup_monitor()

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values.
        # (fixed) isfinite rejects both NaN and Inf, so say so.
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN or Inf in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            # Deterministic iteration modes must not consume the RNG.
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with SGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                                    batch_size=self.batch_size,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng,
                                    num_batches=self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as
            # 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if not isfinite(value):
                raise Exception("NaN or Inf in " + param.name)

    def continue_learning(self, model):
        """
        Returns True if the algorithm should continue running, or False
        if it has reached convergence / started overfitting and should
        stop.

        Parameters
        ----------
        model : a Model instance
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)
dim=128, irange=0.001, init_bias=0) hidden_layer2 = mlp.RectifiedLinear(layer_name='hidden2', dim=128, irange=0.01, init_bias=0) hidden_layer3 = mlp.RectifiedLinear(layer_name='hidden3', dim=128, irange=0.01, init_bias=0) # create Softmax output layer output_layer = mlp.Softmax(3, 'output', irange=.1) # create Stochastic Gradient Descent trainer that runs for 400 epochs cost = NegativeLogLikelihoodCost() rule = Momentum(0.9) # rule = Momentum(0.9, True) # update_callbacks=ExponentialDecay(1 + 1e-5, 0.001) trainer = sgd.SGD(learning_rate=0.01, cost=cost, batch_size=128, termination_criterion=EpochCounter(1000), monitoring_dataset=vds, learning_rule=rule) layers = [hidden_layer, hidden_layer2, output_layer] # create neural net that takes two inputs ann = mlp.MLP(layers, nvis=ds.feat_cnt) trainer.setup(ann, ds) print trainer.cost # train neural net until the termination criterion is true
from pylearn2.costs.cost import MethodCost
from pylearn2.datasets.mnist import MNIST
from pylearn2.models.mlp import MLP, Sigmoid, Softmax
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.training_algorithms.learning_rule import Momentum, \
    MomentumAdjustor
from pylearn2.termination_criteria import EpochCounter

# Carve the 60k MNIST training images into a 50k train / 10k validation
# split, and load the held-out test set for monitoring.
train_set = MNIST(which_set='train', start=0, stop=50000)
valid_set = MNIST(which_set='train', start=50000, stop=60000)
test_set = MNIST(which_set='test')

# One sigmoid hidden layer feeding a 10-way softmax over digit classes.
hidden = Sigmoid(layer_name='h', dim=500, irange=0.01)
readout = Softmax(layer_name='y', n_classes=10, irange=0.01)
model = MLP(nvis=784, layers=[hidden, readout])

# Monitor all three splits under their conventional channel-name prefixes.
monitoring = {'train': train_set,
              'valid': valid_set,
              'test': test_set}

# Momentum SGD on the model's own cost_from_X objective, 10 epochs.
algorithm = SGD(batch_size=100,
                learning_rate=0.01,
                learning_rule=Momentum(init_momentum=0.5),
                monitoring_dataset=monitoring,
                cost=MethodCost('cost_from_X'),
                termination_criterion=EpochCounter(10))

# Ramp momentum from 0.5 up to 0.95 between epochs 5 and 6, and
# checkpoint the model every epoch.
ramp = MomentumAdjustor(start=5, saturate=6, final_momentum=0.95)
train = Train(dataset=train_set,
              model=model,
              algorithm=algorithm,
              save_path="mnist_example.pkl",
              save_freq=1,
              extensions=[ramp])
train.main_loop()
class SGD(TrainingAlgorithm):
    """
    Stochastic Gradient Descent

    WRITEME: what is a good reference to read about this algorithm?

    A TrainingAlgorithm that does gradient descent on minibatches.

    NOTE(review): this appears to be an older variant of the SGD class
    that also exists elsewhere in this file (np.random.RandomState
    seeding, `model.censor_updates`, monitor set up before gradients are
    built). Kept byte-identical; see the per-line notes below for the
    behavioral divergences.
    """

    def __init__(self, learning_rate, cost=None, batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None, update_callbacks=None,
                 learning_rule=None, init_momentum=None,
                 set_batch_size=False, train_iteration_mode=None,
                 batches_per_iter=None, theano_function_mode=None,
                 monitoring_costs=None, seed=[2012, 10, 5]):
        """
        WRITEME

        learning_rate: The learning rate to use.
            Train object callbacks can change the learning rate after each
            epoch. SGD update_callbacks can change it after each minibatch.
        cost: a pylearn2.costs.cost.Cost object specifying the objective
            function to be minimized.
            Optionally, may be None. In this case, SGD will call the
            model's get_default_cost method to obtain the objective
            function.
        init_momentum: **DEPRECATED** if None, does not use momentum
            otherwise, use momentum and initialize the momentum coefficient
            to init_momentum. Callbacks can change this over time just like
            the learning rate. If the gradient is the same on every step,
            then the update taken by the SGD algorithm is scaled by a
            factor of 1/(1-momentum). See section 9 of Geoffrey Hinton's
            "A Practical Guide to Training Restricted Boltzmann Machines"
            for details.
        learning_rule: training_algorithms.learning_rule.LearningRule,
            a learning rule computes the new parameter values given old
            parameters and first-order gradients. If learning_rule is None,
            sgd.SGD will update parameters according to the standard SGD
            learning rule.
        set_batch_size: if True, and batch_size conflicts with
            model.force_batch_size, will call
            model.set_batch_size(batch_size) in an attempt to change
            model.force_batch_size
        theano_function_mode: The theano mode to compile the updates
            function with. Note that pylearn2 includes some wraplinker
            modes that are not bundled with theano. See pylearn2.devtools.
            These extra modes let you do things like check for NaNs at
            every step, or record md5 digests of all computations performed
            by the update function to help isolate problems with
            nondeterminism.

        Parameters are updated by the formula:

        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc
        """
        # A collection of Costs is no longer accepted; callers must wrap
        # them in a SumOfCosts themselves.
        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of Costs to represent "
                " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        # learning_rate lives in a theano shared so callbacks can mutate
        # it between compiled updates.
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            # monitoring_batches is meaningless without a dataset.
            if monitoring_batches is not None:
                raise ValueError(
                    "Specified an amount of monitoring batches but not a monitoring dataset."
                )
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        # NOTE(review): seeds numpy directly; the other SGD variant in
        # this file uses make_np_rng for the same purpose.
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs

    def setup(self, model, dataset):
        # Compile the theano update function and wire up monitoring for
        # `model`; must run before train().
        if self.cost is None:
            self.cost = model.get_default_cost()

        # Refuse to start from already-corrupted parameters.
        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any([
                np.any(np.isnan(param.get_value()))
                for param in model.get_params()
        ]):
            nan_params = [
                param for param in model.get_params()
                if np.any(np.isnan(param.get_value()))
            ]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        # Reconcile our batch_size with any size the model insists on.
        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError(
                                "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                            )
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format
        # compatible with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        # NOTE(review): monitoring is configured here, BEFORE gradients
        # are built — the other SGD variant in this file defers this
        # until after learning_rule.get_updates.
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        # The cost must provide gradients for exactly the model's params.
        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(
                self.learning_rule.get_updates(learning_rate, grads,
                                               lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(
                dict(safe_zip(params, [param - learning_rate * \
                    lr_scalers.get(param, 1.) * grads[param]
                                       for param in params])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        # Give the model a chance to veto/clip its own updates
        # (older API name; newer variant calls modify_updates).
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params

    def train(self, dataset):
        # Run one epoch of SGD over `dataset`; setup() must have been
        # called first.
        if not hasattr(self, 'sgd_update'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            # Deterministic iteration modes must not consume the RNG.
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with SGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                                    batch_size=self.batch_size,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng,
                                    num_batches=self.batches_per_iter)

        on_load_batch = self.on_load_batch
        for batch in iterator:
            # NOTE(review): passes the nested batch as a single argument;
            # the other SGD variant in this file calls callback(*batch)
            # with the flat tuple instead — confirm which signature the
            # on_load_batch callbacks expect.
            for callback in on_load_batch:
                callback(mapping.nest(batch))
            self.sgd_update(*batch)
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as
            # 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

    def continue_learning(self, model):
        # Defer to the termination criterion; with none configured, SGD
        # runs until stopped externally.
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion.continue_learning(self.model)