def test_variational_cd():
    """Smoke test: VariationalCD gradients can be built for a small DBM
    via make_layer_to_symbolic_state."""
    visible = BinaryVector(nvis=100)
    hidden = BinaryVectorMaxPool(detector_layer_dim=500,
                                 pool_size=1,
                                 layer_name='h',
                                 irange=0.05,
                                 init_bias=-2.0)
    model = DBM(visible_layer=visible,
                hidden_layers=[hidden],
                batch_size=100,
                niter=1)
    cost = VariationalCD(num_chains=100, num_gibbs_steps=2)

    data_specs = cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    flat_spaces = mapping.flatten(data_specs[0], return_tuple=True)
    flat_sources = mapping.flatten(data_specs[1], return_tuple=True)

    # One symbolic batch per flat (space, source) pair, named after its
    # source, then nested back into the structure the cost expects.
    batches = tuple(sp.make_theano_batch(name='%s' % (src,))
                    for sp, src in safe_zip(flat_spaces, flat_sources))
    nested_args = mapping.nest(batches)
    grads, updates = cost.get_gradients(model, nested_args)
def get_gradients(model):
    """Build the symbolic gradients of ``model``'s default cost.

    Parameters
    ----------
    model : Model
        Must expose ``get_default_cost``, ``get_params`` and
        ``batch_size``.

    Returns
    -------
    grads : dict
        Maps each model parameter to its gradient expression.

    Raises
    ------
    KeyError
        If the cost fails to provide a gradient for some parameter.
    """
    cost = model.get_default_cost()
    data_specs = cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        # Bug fix: SGD.__class__.__name__ is the name of SGD's metaclass
        # ('type'), not 'SGD'; SGD.__name__ gives the intended label.
        name = '%s[%s]' % (SGD.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=model.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)
    nested_args = mapping.nest(theano_args)

    fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
    grads, updates = cost.get_gradients(model, nested_args,
                                        **fixed_var_descr.fixed_vars)

    # Sanity check: every parameter must have a gradient.  Indexing
    # raises KeyError on a missing entry (the debug print is removed).
    for param in model.get_params():
        grads[param]
    return grads
def setup(self, model, dataset):
    """
    Perform preliminary configuration *before* training starts.

    Stores the model, attaches a Monitor, and — when a monitoring
    dataset was supplied — registers every monitoring channel the model
    reports.

    Parameters
    ----------
    model : object
        Loosely implements the interface of models.model.Model; the
        model to train.
    dataset : pylearn2.datasets.dataset.Dataset
        Used to draw training data; provided so derived training
        algorithms can adapt the model to the dataset.
    """
    self.model = model
    self.monitor = Monitor.get_monitor(model)

    if self.monitoring_dataset is not None:
        # Data specifications needed by the model for monitoring.
        space, source = model.get_monitoring_data_specs()

        # Build one Theano variable per unique component of those specs
        # (usually X for inputs and Y for targets): flatten the specs,
        # make a batch for each flat entry, then restore the nesting
        # expected by the model's monitoring channels.
        mapping = DataSpecsMapping((space, source))
        flat_spaces = mapping.flatten(space, return_tuple=True)
        flat_sources = mapping.flatten(source, return_tuple=True)
        ipt = tuple(sp.make_theano_batch(name='monitor_%s' % src)
                    for (sp, src) in safe_zip(flat_spaces, flat_sources))
        nested_ipt = mapping.nest(ipt)

        self.monitor.add_dataset(dataset=self.monitoring_dataset,
                                 mode="sequential",
                                 batch_size=self.batch_size,
                                 num_batches=self.monitoring_batches)

        channels = model.get_monitoring_channels(nested_ipt)
        if not isinstance(channels, dict):
            raise TypeError("model.get_monitoring_channels must return a "
                            "dictionary, but it returned " + str(channels))
        for name in channels:
            value = channels[name]
            # A channel may come as (value, prereqs) or as a bare value.
            if isinstance(value, tuple):
                assert len(value) == 2
                value, prereqs = value
            else:
                prereqs = None
            self.monitor.add_channel(name=name,
                                     ipt=nested_ipt,
                                     val=value,
                                     prereqs=prereqs,
                                     data_specs=(space, source))
    self.first = True
    self.bSetup = True
def _build_data_specs(self):
    """
    Compute nested data_specs covering the model's monitoring input and
    every channel, plus the mapping used to flatten them.

    Called from redo_theano.
    """
    # Start from what the model itself requires.
    model_space, model_source = self.model.get_monitoring_data_specs()
    spaces = [model_space]
    sources = [model_source]
    # Append the specs of every registered channel.
    for channel in self.channels.values():
        ch_space, ch_source = channel.data_specs
        assert isinstance(ch_space, Space)
        spaces.append(ch_space)
        sources.append(ch_source)

    nested_space = CompositeSpace(spaces)
    nested_source = tuple(sources)
    self._nested_data_specs = (nested_space, nested_source)
    self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

    flat_space = self._data_specs_mapping.flatten(nested_space,
                                                  return_tuple=True)
    flat_source = self._data_specs_mapping.flatten(nested_source,
                                                   return_tuple=True)
    self._flat_data_specs = (CompositeSpace(flat_space), flat_source)
def train(self, dataset):
    """Run one epoch of SGD updates over ``dataset``.

    Parameters
    ----------
    dataset : Dataset
        Source of training batches.

    Raises
    ------
    Exception
        If ``setup`` was not called first, or if any parameter contains
        NaN/Inf before or after the epoch.
    NotImplementedError
        If the cost requests no data at all.
    """
    if not hasattr(self, 'sgd_update'):
        raise Exception("train called without first calling setup")

    # Make sure none of the parameters have bad values.
    self._check_params_are_finite()

    self.first = False
    rng = self.rng
    if not is_stochastic(self.train_iteration_mode):
        rng = None

    data_specs = self.cost.get_data_specs(self.model)
    # The iterator should be built from flat data specs, so it returns
    # flat, non-redundent tuples of data.
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
    if len(space_tuple) == 0:
        # No data will be returned by the iterator, and it is impossible
        # to know the size of the actual batch.
        # It is not decided yet what the right thing to do should be.
        raise NotImplementedError(
            "Unable to train with SGD, because "
            "the cost does not actually use data from the data set. "
            "data_specs: %s" % str(data_specs))
    flat_data_specs = (CompositeSpace(space_tuple), source_tuple)
    iterator = dataset.iterator(mode=self.train_iteration_mode,
                                batch_size=self.batch_size,
                                data_specs=flat_data_specs,
                                return_tuple=True,
                                rng=rng,
                                num_batches=self.batches_per_iter)

    on_load_batch = self.on_load_batch
    for batch in iterator:
        for callback in on_load_batch:
            callback(mapping.nest(batch))
        self.sgd_update(*batch)
        # iterator might return a smaller batch if dataset size
        # isn't divisible by batch_size
        # Note: if data_specs[0] is a NullSpace, there is no way to know
        # how many examples would actually have been in the batch,
        # since it was empty, so actual_batch_size would be reported as 0.
        actual_batch_size = flat_data_specs[0].np_batch_size(batch)
        self.monitor.report_batch(actual_batch_size)
        for callback in self.update_callbacks:
            callback(self)

    # Make sure none of the parameters have bad values.
    self._check_params_are_finite()

def _check_params_are_finite(self):
    """Raise if any model parameter contains NaN or Inf.

    Bug fix: the original message claimed "NaN" even when only Inf was
    found, and concatenating ``param.name`` crashed when the name was
    None; report both conditions and stringify the name.
    """
    for param in self.params:
        value = param.get_value(borrow=True)
        if np.any(np.isnan(value)) or np.any(np.isinf(value)):
            raise Exception("NaN or Inf in " + str(param.name))
def load_model(model_paths, costs, batch_size=100):
    """Load serialized layers and compile per-layer Theano functions.

    Parameters
    ----------
    model_paths : list of str
        Paths to pickled layer models (loaded with serial.load).
    costs : Cost or list of Cost
        One cost per path; a single cost is replicated for every path.
    batch_size : int
        Batch size used for every symbolic batch created here.

    Returns
    -------
    dict with keys 'layers', 'costs', 'comparative_costs', 'weights',
    'encoders', 'decoders', each a list with one entry per loaded path.
    """
    # Replicate a single cost object across all paths.
    if type(costs) is not list:
        costs = len(model_paths) * [costs]
    model = {}
    model['layers'] = []
    model['costs'] = []
    model['comparative_costs'] = []
    model['weights'] = []
    model['encoders'] = []
    model['decoders'] = []
    for i, path in enumerate(model_paths):
        if os.path.isfile(path):
            model['layers'].append(serial.load(path))
            # Compile encoder: input batch -> code.
            I = model['layers'][i].get_input_space().make_theano_batch(
                batch_size=batch_size)
            E = model['layers'][i].encode(I)
            model['encoders'].append(theano.function([I], E))
            # Compile decoder: code batch -> reconstruction.
            H = model['layers'][i].get_output_space().make_theano_batch(
                batch_size=batch_size)
            D = model['layers'][i].decode(H)
            model['decoders'].append(theano.function([H], D))
            model['weights'].append(model['layers'][i].get_weights())
            # Standard pylearn2 data_specs flatten/nest dance to build
            # the symbolic inputs for the cost expression.
            data_specs = costs[i].get_data_specs(model['layers'][i])
            mapping = DataSpecsMapping(data_specs)
            space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
            source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
            # Build a flat tuple of Theano Variables, one for each space.
            # We want that so that if the same space/source is specified
            # more than once in data_specs, only one Theano Variable
            # is generated for it, and the corresponding value is passed
            # only once to the compiled Theano function.
            theano_args = []
            for space, source in safe_zip(space_tuple, source_tuple):
                arg = space.make_theano_batch(batch_size=batch_size)
                theano_args.append(arg)
            theano_args = tuple(theano_args)
            # Methods of `self.cost` need args to be passed in a format
            # compatible with data_specs.
            nested_args = mapping.nest(theano_args)
            fixed_var_descr = costs[i].get_fixed_var_descr(
                model['layers'][i], nested_args)
            # NOTE(review): theano.function inputs are wrapped as
            # [nested_args]; if nested_args is a tuple of variables this
            # looks wrong (should probably be list(theano_args)) —
            # confirm against a call site before changing.
            model['costs'].append(
                theano.function([nested_args],
                                costs[i].expr(model['layers'][i],
                                              nested_args,
                                              **fixed_var_descr.fixed_vars)))
            I2 = model['layers'][i].get_input_space().make_theano_batch(
                batch_size=batch_size)
            # NOTE(review): `I` here is the encoder input variable built
            # above, so the comparative cost compares against the
            # encoder's input graph — presumably intentional; verify.
            model['comparative_costs'].append(
                theano.function([I, I2], costs[i].costs[0].cost(I, I2)))
        else:
            sys.exit("Whoa. " + path + " isn't a thing I know about!")
    return model
def train(self, dataset):
    """
    Run one epoch of SGD updates over ``dataset``.

    Requires ``setup`` to have been called first; refuses to run if any
    parameter contains NaN or Inf, and re-checks after the epoch.
    """
    if not hasattr(self, 'sgd_update'):
        raise Exception("train called without first calling setup")

    # Refuse to continue if any parameter already went bad.
    for param in self.params:
        value = param.get_value(borrow=True)
        if np.any(np.isnan(value)) or np.any(np.isinf(value)):
            raise Exception("NaN in " + param.name)

    self.first = False
    rng = self.rng if is_stochastic(self.train_iteration_mode) else None

    data_specs = self.cost.get_data_specs(self.model)
    # Build the iterator from flat specs so it yields flat,
    # non-redundant tuples of data.
    specs_map = DataSpecsMapping(data_specs)
    flat_spaces = specs_map.flatten(data_specs[0], return_tuple=True)
    flat_sources = specs_map.flatten(data_specs[1], return_tuple=True)
    if not flat_spaces:
        # The iterator would return no data, so the actual batch size
        # cannot be known; what to do here is still undecided.
        raise NotImplementedError("Unable to train with SGD, because "
            "the cost does not actually use data from the data set. "
            "data_specs: %s" % str(data_specs))

    flat_data_specs = (CompositeSpace(flat_spaces), flat_sources)
    iterator = dataset.iterator(mode=self.train_iteration_mode,
                                batch_size=self.batch_size,
                                data_specs=flat_data_specs,
                                return_tuple=True,
                                rng=rng,
                                num_batches=self.batches_per_iter)

    on_load_batch = self.on_load_batch
    for data in iterator:
        for callback in on_load_batch:
            callback(specs_map.nest(data))
        self.sgd_update(*data)
        # The iterator may yield a short batch when the dataset size is
        # not divisible by batch_size.  If data_specs[0] is a NullSpace
        # the batch is empty and np_batch_size reports 0.
        self.monitor.report_batch(flat_data_specs[0].np_batch_size(data))
        for callback in self.update_callbacks:
            callback(self)

    # Parameters must still be finite after the epoch.
    for param in self.params:
        value = param.get_value(borrow=True)
        if np.any(np.isnan(value)) or np.any(np.isinf(value)):
            raise Exception("NaN in " + param.name)
def train(self, dataset):
    """
    Run one pass of batch gradient descent over ``dataset``.

    Requires ``setup`` to have been completed (``self.bSetup``).
    """
    assert self.bSetup
    model = self.model
    rng = self.rng
    train_iteration_mode = "shuffled_sequential"
    if not is_stochastic(train_iteration_mode):
        rng = None

    data_specs = self.cost.get_data_specs(self.model)
    # Flat specs so the iterator yields flat, non-redundant tuples.
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
    if len(space_tuple) == 0:
        # The iterator would return no data and the actual batch size
        # would be unknowable; behaviour here is still undecided.
        raise NotImplementedError(
            "Unable to train with BGD, because "
            "the cost does not actually use data from the data set. "
            "data_specs: %s" % str(data_specs)
        )
    flat_data_specs = (CompositeSpace(space_tuple), source_tuple)
    iterator = dataset.iterator(
        mode=train_iteration_mode,
        batch_size=self.batch_size,
        num_batches=self.batches_per_iter,
        data_specs=flat_data_specs,
        return_tuple=True,
        rng=rng,
    )

    mode = self.theano_function_mode
    for data in iterator:
        # When a recording mode is active, log the targets seen.
        if "targets" in source_tuple and mode is not None \
                and hasattr(mode, "record"):
            Y = data[source_tuple.index("targets")]
            mode.record.handle_line(
                "data Y " + str(Y).replace("\n", " ") + "\n")
        for on_load_batch in self.on_load_batch:
            on_load_batch(mapping.nest(data))
        self.before_step(model)
        self.optimizer.minimize(*data)
        self.after_step(model)
        model.monitor.report_batch(flat_data_specs[0].np_batch_size(data))
def setup(self):
    """Compile ``self.theano_f_df``.

    The compiled function maps (symbolic parameter values, X, Y) to
    ``[cost, grad_0, ..., grad_{n-1}]``.  Adapted from
    pylearn2/training_algorithms/sgd.py.

    Fixes: Python-2-only debug ``print`` statements removed, dead
    initial ``self.X``/``self.Y`` placeholders removed (they were always
    overwritten by ``nested_args``), the weaker duplicate ``params``
    reassignment dropped, and commented-out code deleted.
    """
    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space, so a
    # (space, source) pair appearing several times in data_specs yields
    # a single variable passed only once to the compiled function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        theano_args.append(
            space.make_theano_batch(name=name,
                                    batch_size=self.batch_size))
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args in a format compatible with
    # data_specs.
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(self.model,
                                                    nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    self.X = nested_args[0]
    self.Y = nested_args[1]
    init_grads, updates = self.cost.get_gradients(self.model, nested_args)

    # Replace parameters with purely symbolic variables in case some
    # are shared.
    params = list(self.model.get_params())
    self.params = params
    symbolic_params = [self._convert_variable(p) for p in params]
    givens = dict(zip(params, symbolic_params))
    costfn = self.model.cost_from_X((self.X, self.Y))
    gradfns = [init_grads[p] for p in params]

    self.theano_f_df = theano.function(
        inputs=symbolic_params + [self.X, self.Y],
        outputs=[costfn] + gradfns,
        givens=givens)
def __init__(self, data_callbacks, data_specs):
    """
    Parameters
    ----------
    data_callbacks : callable or nested tuple of callables
        Callback(s) to run on the data; if a tuple (possibly nested),
        its structure mirrors ``data_specs``.
    data_specs : (space, source) pair
        Format and label associated with the data.
    """
    self.data_callbacks = data_callbacks
    self.data_specs = data_specs
    # Mapping used to flatten/nest values structured like data_specs.
    self._mapping = DataSpecsMapping(data_specs)
def train(self, dataset):
    """
    Run one epoch of batch gradient descent over ``dataset``.

    ``setup`` must have completed first (``self.bSetup``).
    """
    assert self.bSetup
    model = self.model
    train_iteration_mode = 'shuffled_sequential'
    rng = self.rng if is_stochastic(train_iteration_mode) else None

    data_specs = self.cost.get_data_specs(self.model)
    # Iterate with flat specs so the iterator returns flat,
    # non-redundant tuples.
    specs_map = DataSpecsMapping(data_specs)
    flat_spaces = specs_map.flatten(data_specs[0], return_tuple=True)
    flat_sources = specs_map.flatten(data_specs[1], return_tuple=True)
    if not flat_spaces:
        # No data would be produced, so the actual batch size cannot be
        # known; the right behaviour here has not been decided yet.
        raise NotImplementedError(
            "Unable to train with BGD, because "
            "the cost does not actually use data from the data set. "
            "data_specs: %s" % str(data_specs))
    flat_data_specs = (CompositeSpace(flat_spaces), flat_sources)
    iterator = dataset.iterator(mode=train_iteration_mode,
                                batch_size=self.batch_size,
                                num_batches=self.batches_per_iter,
                                data_specs=flat_data_specs,
                                return_tuple=True,
                                rng=rng)

    fn_mode = self.theano_function_mode
    for data in iterator:
        # Log the targets of each step when a recording mode is active.
        if ('targets' in flat_sources and fn_mode is not None
                and hasattr(fn_mode, 'record')):
            Y = data[flat_sources.index('targets')]
            stry = str(Y).replace('\n', ' ')
            fn_mode.record.handle_line('data Y ' + stry + '\n')
        for on_load_batch in self.on_load_batch:
            on_load_batch(specs_map.nest(data))
        self.before_step(model)
        self.optimizer.minimize(*data)
        self.after_step(model)
        actual_batch_size = flat_data_specs[0].np_batch_size(data)
        model.monitor.report_batch(actual_batch_size)
def setup(self):
    """Compile ``self.theano_f_df`` mapping (symbolic parameter values,
    X, Y) to ``[cost, grad_0, ..., grad_{n-1}]``.

    Adapted from pylearn2/training_algorithms/sgd.py.  Fixes applied:
    Python-2-only debug ``print`` statements removed, dead initial
    ``self.X``/``self.Y`` placeholders removed (always overwritten by
    ``nested_args`` below), the weaker duplicate ``params``
    reassignment dropped, and commented-out code deleted.
    """
    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # One Theano variable per unique (space, source) pair: duplicated
    # specs share a variable, which is then passed only once to the
    # compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        theano_args.append(
            space.make_theano_batch(name=name,
                                    batch_size=self.batch_size))
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args in a format compatible with
    # data_specs.
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(self.model,
                                                    nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    self.X = nested_args[0]
    self.Y = nested_args[1]
    init_grads, updates = self.cost.get_gradients(self.model, nested_args)

    # Replace parameters with purely symbolic variables in case some
    # are shared.
    params = list(self.model.get_params())
    self.params = params
    symbolic_params = [self._convert_variable(p) for p in params]
    givens = dict(zip(params, symbolic_params))
    costfn = self.model.cost_from_X((self.X, self.Y))
    gradfns = [init_grads[p] for p in params]

    self.theano_f_df = theano.function(
        inputs=symbolic_params + [self.X, self.Y],
        outputs=[costfn] + gradfns,
        givens=givens)
def test_nest_specs():
    """Check that nest() inverts flatten() for spaces, sources and data."""
    x1 = TT.matrix('x1')
    x2 = TT.matrix('x2')
    x3 = TT.matrix('x3')
    x4 = TT.matrix('x4')

    cases = [
        (VectorSpace(dim=10), 'target', x2),
        (CompositeSpace([VectorSpace(dim=3), VectorSpace(dim=9)]),
         ('features', 'features'),
         (x1, x4)),
        (CompositeSpace([VectorSpace(dim=3),
                         CompositeSpace([VectorSpace(dim=10),
                                         VectorSpace(dim=7)])]),
         ('features', ('target', 'features')),
         (x1, (x2, x3))),
    ]
    for space, source, data in cases:
        mapping = DataSpecsMapping((space, source))
        for original in (space, source, data):
            roundtripped = mapping.nest(mapping.flatten(original))
            assert_equal(roundtripped, original)
class CallbackCost(Cost):
    """
    A Cost that runs callbacks on the data.

    The returned expression is (sum of all data elements) multiplied by
    (sum of all model parameters).  Each callback is attached through a
    CallbackOp, so it only fires when this cost expression is part of
    the outputs of the compiled Theano graph; the expression is shaped
    so that the SGD algorithm will cause the CallbackOp to be evaluated.
    """

    def __init__(self, data_callbacks, data_specs):
        """
        Parameters
        ----------
        data_callbacks : callable or nested tuple of callables
            Callback(s) to run on the data, structured like data_specs.
        data_specs : (space, source) pair
            Format and label associated with the data.
        """
        self.data_callbacks = data_callbacks
        self.data_specs = data_specs
        self._mapping = DataSpecsMapping(data_specs)

    def get_data_specs(self, model):
        """Return the (space, source) pair this cost was built with."""
        return self.data_specs

    def expr(self, model, data):
        """Wrap each data element in its CallbackOp and build the cost."""
        self.get_data_specs(model)[0].validate(data)
        flat_callbacks = self._mapping.flatten(self.data_callbacks,
                                               return_tuple=True)
        flat_data = self._mapping.flatten(data, return_tuple=True)
        terms = []
        for (callback, var) in safe_zip(flat_callbacks, flat_data):
            wrapped = CallbackOp(callback)(var)
            # CallbackOp must be a pure pass-through of its single input.
            assert len(wrapped.owner.inputs) == 1
            assert var is wrapped.owner.inputs[0]
            terms.append(wrapped.sum())
        # sum() dispatches to theano.add on symbolic variables.
        total = sum(terms)
        param_total = sum([param.sum() for param in model.get_params()])
        return total * param_total
def get_fixed_var_descr(self, model, data, **kwargs):
    """Return a FixedVarDescr exposing ``unsup_counter`` plus an
    on-load hook that increments it once per batch."""
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)

    descr = FixedVarDescr()
    descr.fixed_vars = {'unsup_aux_var': unsup_counter}

    # The input to function should be a flat, non-redundent tuple.
    mapping = DataSpecsMapping(data_specs)
    data_tuple = mapping.flatten(data, return_tuple=True)

    bump = function([], updates=[(unsup_counter, unsup_counter + 1)])

    # The hook ignores the batch; it only advances the shared counter.
    def on_load(batch, mapping=mapping, bump=bump):
        return bump()

    descr.on_load_batch = [on_load]
    return descr
def test_nest_specs():
    """nest() must be the inverse of flatten() for every spec triple."""
    x1 = TT.matrix("x1")
    x2 = TT.matrix("x2")
    x3 = TT.matrix("x3")
    x4 = TT.matrix("x4")

    triples = (
        (VectorSpace(dim=10), "target", x2),
        (CompositeSpace([VectorSpace(dim=3), VectorSpace(dim=9)]),
         ("features", "features"),
         (x1, x4)),
        (CompositeSpace([VectorSpace(dim=3),
                         CompositeSpace([VectorSpace(dim=10),
                                         VectorSpace(dim=7)])]),
         ("features", ("target", "features")),
         (x1, (x2, x3))),
    )
    for space, source, data in triples:
        mapping = DataSpecsMapping((space, source))
        assert_equal(mapping.nest(mapping.flatten(space)), space)
        assert_equal(mapping.nest(mapping.flatten(source)), source)
        assert_equal(mapping.nest(mapping.flatten(data)), data)
def get_fixed_var_descr(self, model, data, **kwargs):
    """FixedVarDescr exposing ``unsup_aux_var``; its on-load hook runs a
    compiled update that increments the shared counter."""
    specs = self.get_data_specs(model)
    specs[0].validate(data)

    result = FixedVarDescr()
    result.fixed_vars = {'unsup_aux_var': unsup_counter}

    # The input to function should be a flat, non-redundent tuple.
    mapping = DataSpecsMapping(specs)
    flat = mapping.flatten(data, return_tuple=True)

    advance = function([], updates=[(unsup_counter, unsup_counter + 1)])

    # The hook ignores the batch contents entirely.
    def on_load(batch, mapping=mapping, advance=advance):
        return advance()

    result.on_load_batch = [on_load]
    return result
def get_fixed_var_descr(self, model, data):
    """Return a FixedVarDescr carrying ``sup_counter`` plus an on-load
    hook that bumps it once per (flattened) batch."""
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)

    descr = FixedVarDescr()
    descr.fixed_vars = {'sup_aux_var': sup_counter}
    descr.data_specs = data_specs

    # `function` wants a flat tuple of inputs, so flatten `data` first.
    mapping = DataSpecsMapping(data_specs)
    flat_data = mapping.flatten(data, return_tuple=True)
    theano_func = function(flat_data,
                           updates=[(sup_counter, sup_counter + 1)])

    # on_load_batch receives numerical data shaped like descr.data_specs,
    # so it must be flattened again before the call.  Default arguments
    # bind mapping/theano_func to their values at definition time.
    def on_load(batch, mapping=mapping, theano_func=theano_func):
        return theano_func(*mapping.flatten(batch, return_tuple=True))

    descr.on_load_batch = [on_load]
    return descr
def get_fixed_var_descr(self, model, data, **kwargs):
    """Return a FixedVarDescr exposing ``unsup_counter`` and a batch
    hook that increments it once per batch of actual data."""
    specs = self.get_data_specs(model)
    specs[0].validate(data)

    descr = FixedVarDescr()
    descr.fixed_vars = {'unsup_aux_var': unsup_counter}
    descr.data_specs = specs

    # `function` expects a flat, non-redundent tuple of inputs.
    mapping = DataSpecsMapping(specs)
    flat_inputs = mapping.flatten(data, return_tuple=True)
    increment = function(flat_inputs,
                         updates=[(unsup_counter, unsup_counter + 1)])

    # on_load_batch receives numerical data shaped like descr.data_specs,
    # so flatten it again before calling; default args bind at def time.
    def on_load(batch, mapping=mapping, increment=increment):
        return increment(*mapping.flatten(batch, return_tuple=True))

    descr.on_load_batch = [on_load]
    return descr
def test_flatten_specs():
    """Each (space, source) pair must flatten to the expected pair:
    duplicates merged, composite nesting removed."""
    cases = [
        # (None, None),
        (VectorSpace(dim=5), 'features',
         VectorSpace(dim=5), 'features'),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=2)]),
         ('features', 'features'),
         CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=2)]),
         ('features', 'features')),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'targets'),
         CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'targets')),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'features'),
         VectorSpace(dim=5),
         'features'),
        (CompositeSpace([VectorSpace(dim=5),
                         CompositeSpace([VectorSpace(dim=9),
                                         VectorSpace(dim=12)])]),
         ('features', ('features', 'targets')),
         CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets')),
        (CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets'),
         CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets')),
    ]
    for space, source, flat_space, flat_source in cases:
        mapping = DataSpecsMapping((space, source))
        assert_equal((flat_space, flat_source),
                     (mapping.flatten(space), mapping.flatten(source)))
def setup(self, model, dataset, algorithm):
    """Compile the cost function and remember the starting parameters.

    Stores the model's initial parameter vector in ``self.origin`` and
    compiles ``self.cost_fn`` mapping the cost's flat Theano inputs to
    the scalar cost value.

    Fix: the Python-2-only ``print`` statement is replaced with the
    single-argument call form, which behaves identically under Python 2
    and is valid Python 3.

    Parameters
    ----------
    model : Model
        Model being trained; supplies parameters and data specs.
    dataset : Dataset
        Unused here; part of the extension interface.
    algorithm : TrainingAlgorithm
        Supplies the cost whose expression is compiled.
    """
    self.origin = model.get_param_vector()
    cost = algorithm.cost

    # Build symbolic inputs for the cost via the standard pylearn2
    # data_specs flatten/nest procedure.
    data_specs = cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # One Theano variable per unique (space, source) pair, so duplicated
    # specs share a single variable passed once to the compiled function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `cost` need args in a format compatible with data_specs.
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch
    cost_value = cost.expr(model, nested_args,
                           **fixed_var_descr.fixed_vars)

    print("Compiling cost function...")
    self.cost_fn = function(theano_args, cost_value)
def get_fixed_var_descr(self, model, data, **kwargs):
    """FixedVarDescr carrying ``unsup_aux_var`` plus an on-load hook
    that advances the counter once per batch."""
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)

    result = FixedVarDescr()
    result.fixed_vars = {'unsup_aux_var': unsup_counter}
    result.data_specs = data_specs

    # The compiled function takes a flat, non-redundent tuple of inputs.
    mapping = DataSpecsMapping(data_specs)
    flat = mapping.flatten(data, return_tuple=True)
    bump = function(flat, updates=[(unsup_counter, unsup_counter + 1)])

    # Numerical batches arrive shaped like result.data_specs, so they
    # are flattened again inside the hook; default arguments capture
    # the values mapping/bump have when the lambda is defined.
    result.on_load_batch = [
        lambda batch, mapping=mapping, bump=bump:
            bump(*mapping.flatten(batch, return_tuple=True))]
    return result
def test_flatten_specs():
    """flatten() must merge duplicated (space, source) pairs and remove
    composite nesting, leaving already-flat specs untouched."""
    cases = (
        # (None, None),
        (VectorSpace(dim=5), 'features',
         VectorSpace(dim=5), 'features'),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=2)]),
         ('features', 'features'),
         CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=2)]),
         ('features', 'features')),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'targets'),
         CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'targets')),
        (CompositeSpace([VectorSpace(dim=5), VectorSpace(dim=5)]),
         ('features', 'features'),
         VectorSpace(dim=5),
         'features'),
        (CompositeSpace([VectorSpace(dim=5),
                         CompositeSpace([VectorSpace(dim=9),
                                         VectorSpace(dim=12)])]),
         ('features', ('features', 'targets')),
         CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets')),
        (CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets'),
         CompositeSpace([VectorSpace(dim=5),
                         VectorSpace(dim=9),
                         VectorSpace(dim=12)]),
         ('features', 'features', 'targets')),
    )
    for nested_space, nested_source, flat_space, flat_source in cases:
        mapping = DataSpecsMapping((nested_space, nested_source))
        observed = (mapping.flatten(nested_space),
                    mapping.flatten(nested_source))
        assert_equal((flat_space, flat_source), observed)
def get_composite_specs_and_mapping(self, model):
    """
    Build the composite data_specs and the mapping that flattens it.

    Returns the composite (space, sources) pair described by
    `get_composite_data_specs`, together with a DataSpecsMapping able to
    convert between it and a flat equivalent.  The mapping makes it
    possible to request data with flat specs and then nest the result
    back into the composite form so it can be dispatched among the
    sub-costs.  Helper for `get_data_specs` and `get_gradients`, and
    possibly other methods.
    """
    space, sources = self.get_composite_data_specs(model)
    mapping = DataSpecsMapping((space, sources))
    return (space, sources), mapping
def setup(self, dataset, cost, batch_size, num_batches=None,
          extra_costs=None, mode='sequential', obj_prereqs=None,
          cost_monitoring_args=None):
    """
    Attach monitoring datasets and one channel per cost to this monitor.

    Parameters
    ----------
    dataset : Dataset, dict mapping str -> Dataset, or None
        Dataset(s) to monitor; None makes this a no-op.
    cost : Cost
        Main objective, monitored under the (required) name 'objective'.
    batch_size : int
        Batch size used by the monitoring iterators.
    num_batches : int, optional
        Number of monitoring batches per dataset.
    extra_costs : dict, optional
        Additional named costs to monitor alongside the objective.
    mode : str
        Iteration mode for the monitoring iterators.
    obj_prereqs : optional
        Prerequisites attached to the 'objective' channel.
    cost_monitoring_args : dict, optional
        Defaulted to {} when None; not otherwise used here.
    """
    if dataset is None:
        return
    # Normalize `dataset` to a dict of name -> Dataset.
    if isinstance(dataset, Dataset):
        dataset = {'': dataset}
    else:
        assert isinstance(dataset, dict)
        assert all(isinstance(key, str) for key in dataset)
        assert all(isinstance(dataset[key], Dataset) for key in dataset)

    # Collect all costs under unique names; '' is reserved for the
    # main objective.
    if extra_costs is None:
        costs = {}
    else:
        assert isinstance(extra_costs, (OrderedDict, dict))
        costs = extra_costs
    assert '' not in costs
    costs[''] = cost

    if cost_monitoring_args is None:
        cost_monitoring_args = {}

    model = self.model

    # Composite data_specs: the specs of every cost, then the model's.
    cost_names = sorted(costs.keys())
    spaces = []
    sources = []
    for cname in cost_names:
        c_space, c_source = costs[cname].get_data_specs(model)
        spaces.append(c_space)
        sources.append(c_source)
    m_space, m_source = model.get_monitoring_data_specs()
    spaces.append(m_space)
    sources.append(m_source)
    nested_space = CompositeSpace(spaces)
    nested_sources = tuple(sources)

    # Flatten so only one symbolic variable is built per unique
    # (space, source) pair, then nest the variables back so each cost
    # receives exactly the part of the batch it asked for.
    mapping = DataSpecsMapping((nested_space, nested_sources))
    space_tuple = mapping.flatten(nested_space, return_tuple=True)
    source_tuple = mapping.flatten(nested_sources, return_tuple=True)
    ipt = tuple(sp.make_theano_batch(name='monitor_%s' % src,
                                     batch_size=None)
                for (sp, src) in safe_zip(space_tuple, source_tuple))
    nested_ipt = mapping.nest(ipt)

    seed = [[2013, 2, 22]] if is_stochastic(mode) else None

    for dataset_name in dataset:
        cur_dataset = dataset[dataset_name]
        self.add_dataset(dataset=cur_dataset,
                         mode=mode,
                         batch_size=batch_size,
                         num_batches=num_batches,
                         seed=seed)
        dprefix = '' if dataset_name == '' else dataset_name + '_'
        # The channel name 'objective' must not vary: callbacks that
        # respond to monitor values find it by that name.
        for i, cost_name in enumerate(cost_names):
            cur_cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            cost_value_list = cur_cost.expr(model, cost_ipt)
            cost_value = reduce(lambda a, b: a + b, cost_value_list)
            if cost_value is not None:
                if cost_name == '':
                    name = dprefix + 'objective'
                    prereqs = obj_prereqs
                else:
                    name = dprefix + cost_name
                    prereqs = None
                cur_cost.get_data_specs(model)[0].validate(cost_ipt)
                self.add_channel(name=name,
                                 ipt=cost_ipt,
                                 val=cost_value,
                                 data_specs=cur_cost.get_data_specs(model),
                                 dataset=cur_dataset,
                                 prereqs=prereqs)
def agent_train(self, terminal): """ Training function. terminal: boolean Whether current state is a terminal state. """ # Wait until we have enough data to train if self.action_count >= ((self.train.algorithm.batch_size+1)*self.k+1): tic = time() if self.train_setup == 0: self.train.main_loop() data_specs = self.train.algorithm.cost.get_data_specs( self.model) # The iterator should be built from flat data specs, so it # returns flat, non-redundent tuples of data. mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten( data_specs[1], return_tuple=True ) if len(space_tuple) == 0: # No data will be returned by the iterator, and it is # impossible to know the size of the actual batch. It # is not decided yet what the right thing to do should be. raise NotImplementedError( "Unable to train with SGD, because the cost does not" " actually use data from the data set. " "data_specs: %s" % str(data_specs) ) flat_data_specs = (CompositeSpace(space_tuple), source_tuple) self.flat_data_specs = flat_data_specs self.train_setup = 1 else: tic_iter = time() temp_iter = self.train.dataset.iterator( mode=self.train.algorithm.train_iteration_mode, batch_size=self.train.algorithm.batch_size, data_specs=self.flat_data_specs, return_tuple=True, rng=self.train.algorithm.rng, num_batches=self.train.algorithm.batches_per_iter ) toc_iter = time() log.debug('Iter creation time: %0.2f' % (toc_iter - tic_iter)) tic_next = time() batch = temp_iter.next() toc_next = time() log.debug('Iter next time: %0.2f' % (toc_next - tic_next)) tic_sgd = time() self.train.algorithm.sgd_update(*batch) toc_sgd = time() log.debug('SGD time: %0.2f' % (toc_sgd - tic_sgd)) log.info('Frames seen: %d' % self.all_time_total_frames) log.info('Epsilon: %0.10f' % self.epsilon) toc = time() self.episode_training_time += toc-tic log.debug('Real train time: %0.2f' % (toc-tic))
def setup(self, model, dataset):
    """
    Compiles the theano functions needed for the train method.

    Validates model parameters (no Inf/NaN), checks batch-size /
    iteration-mode compatibility with the datasets, builds symbolic
    inputs from the cost's data specs, assembles the SGD (or
    learning-rule) updates, and compiles ``self.sgd_update``.

    Parameters
    ----------
    model : a Model instance
        The model to be trained; its params and default cost are used.
    dataset : Dataset
        The training dataset (only its size is inspected here).

    Raises
    ------
    ValueError
        If any model parameter contains Inf or NaN, if a dataset size is
        incompatible with a forced batch size, or if a learning-rate
        scaler targets a non-parameter.
    TypeError
        If the cost's get_gradients does not return an OrderedDict.
    """
    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start from corrupted parameters.
    inf_params = [param for param in model.get_params()
                  if contains_inf(param.get_value())]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: " + str(inf_params))
    if any([contains_nan(param.get_value())
            for param in model.get_params()]):
        nan_params = [param for param in model.get_params()
                      if contains_nan(param.get_value())]
        raise ValueError("These params are NaN: " + str(nan_params))
    self.model = model

    self._synchronize_batch_size(model)
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    self.monitor._sanity_check()

    # Test whether a forced batch size conflicts with uneven dataset sizes.
    has_force_batch_size = getattr(model, "force_batch_size", False)
    train_dataset_is_uneven = \
        dataset.get_num_examples() % self.batch_size != 0

    # FIX: was `self.monitoring_dataset.values() > 0`, a Python 2
    # list-vs-int comparison that is always True; the intent is clearly
    # "a non-empty dict of monitoring datasets".
    has_monitoring_datasets = \
        self.monitoring_dataset is not None and \
        len(self.monitoring_dataset) > 0

    if has_monitoring_datasets:
        monitoring_datasets_are_uneven = \
            any(d.get_num_examples() % self.batch_size != 0
                for d in self.monitoring_dataset.values())
    else:
        monitoring_datasets_are_uneven = False  # or True it doesn't matter

    if has_force_batch_size and train_dataset_is_uneven and \
       not has_uniform_batch_size(self.train_iteration_mode):
        # FIX: added the missing space after "batch size." in the message.
        raise ValueError("Dataset size is not a multiple of batch size. "
                         "You should set train_iteration_mode (and "
                         "maybe monitor_iteration_mode) to "
                         "even_sequential, even_shuffled_sequential or "
                         "even_batchwise_shuffled_sequential")

    if has_force_batch_size and has_monitoring_datasets and \
       monitoring_datasets_are_uneven and \
       not has_uniform_batch_size(self.monitor_iteration_mode):
        raise ValueError("Dataset size is not a multiple of batch size. "
                         "You should set monitor_iteration_mode to "
                         "even_sequential, even_shuffled_sequential or "
                         "even_batchwise_shuffled_sequential")

    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space.
    # We want that so that if the same space/source is specified
    # more than once in data_specs, only one Theano Variable
    # is generated for it, and the corresponding value is passed
    # only once to the compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                ** fixed_var_descr.fixed_vars)

    if cost_value is not None and cost_value.name is None:
        # Give the objective a stable display name for the monitor.
        cost_value.name = 'objective'

    learning_rate = self.learning_rate
    params = list(model.get_params())
    assert len(params) > 0
    for i, param in enumerate(params):
        if param.name is None:
            param.name = 'sgd_params[%d]' % i

    grads, updates = self.cost.get_gradients(model, nested_args,
                                             ** fixed_var_descr.fixed_vars)
    if not isinstance(grads, OrderedDict):
        # FIX: added missing spaces around the interpolated type name.
        raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                        "something with " + str(type(grads)) + " as its " +
                        "first member. Expected OrderedDict.")

    # The gradient dict and the parameter list must agree exactly.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                 {'costname': cost_value.name,
                                  'paramname': param.name})
        assert grads[param].dtype == param.dtype

    lr_scalers = model.get_lr_scalers()

    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +
                             str(key) +
                             " which is not an optimization parameter.")

    log.info('Parameter and initial learning rate summary:')
    for param in params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.learning_rule:
        updates.update(self.learning_rule.get_updates(
            learning_rate, grads, lr_scalers))
    else:
        # Use standard SGD updates with fixed learning rate.
        updates.update(dict(safe_zip(params,
                                     [param - learning_rate *
                                      lr_scalers.get(param, 1.) *
                                      grads[param]
                                      for param in params])))

    for param in params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    # Let the model constrain / censor the updates (e.g. norm clipping).
    model.modify_updates(updates)
    for param in params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        for update_val in get_debug_values(update):
            if contains_inf(update_val):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if contains_nan(update_val):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost.
    # We have to do that after learning_rule.get_updates has been
    # called, since it may have an effect on
    # learning_rule.add_channels_to_monitor (that is currently the case
    # for AdaDelta and RMSProp).
    self._setup_monitor()

    with log_timing(log, 'Compiling sgd_update'):
        self.sgd_update = function(theano_args,
                                   updates=updates,
                                   name='sgd_update',
                                   on_unused_input='ignore',
                                   mode=self.theano_function_mode)
    self.params = params
def setup(self, model, dataset):
    """
    Allows the training algorithm to do some preliminary configuration
    *before* we actually start training the model. The dataset is provided
    in case other derived training algorithms need to modify model based on
    the dataset.

    Builds the symbolic cost/gradient graph from the cost's data specs,
    wires up monitoring, and constructs the BatchGradientDescent
    optimizer stored in ``self.optimizer``.

    Parameters
    ----------
    model : object
        A Python object representing the model to train.
        Loosely implementing the interface of models.model.Model.
    dataset : pylearn2.datasets.dataset.Dataset
        Dataset object used to draw training data

    Raises
    ------
    TypeError
        If the cost reports itself as stochastic (BGD needs exact
        cost values for line searches).
    ValueError
        If batch_size conflicts with model.force_batch_size, or the
        cost has no tractable value expression.
    """
    self.model = model

    if self.cost is None:
        self.cost = model.get_default_cost()

    try:
        if self.cost.is_stochastic():
            raise TypeError("BGD is not compatible with stochastic "
                            "costs.")
    except NotImplementedError:
        # Cost does not implement is_stochastic; proceed with a warning
        # rather than refusing to train.
        warnings.warn("BGD is not compatible with stochastic costs "
                      "and cannot determine whether the current cost is "
                      "stochastic.")

    if self.batch_size is None:
        self.batch_size = model.force_batch_size
    else:
        batch_size = self.batch_size
        if self.set_batch_size:
            model.set_batch_size(batch_size)
        elif hasattr(model, 'force_batch_size'):
            if not (model.force_batch_size is None or
                    model.force_batch_size <= 0 or
                    batch_size == model.force_batch_size):
                # FIX: previously this was written as
                #   "..." + "... %d" % (batch_size, force_batch_size)
                # where % bound only to the second literal (one %d, two
                # args), so the raise itself crashed with a TypeError.
                # Parenthesize so both placeholders are formatted.
                raise ValueError(("batch_size is %d but " +
                                  "model.force_batch_size is %d") %
                                 (batch_size, model.force_batch_size))

    self.monitor = Monitor.get_monitor(model)
    self.monitor.set_theano_function_mode(self.theano_function_mode)

    data_specs = self.cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space,
    # named according to the sources.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = 'BGD_[%s]' % source
        arg = space.make_theano_batch(name=name)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with their data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                ** fixed_var_descr.fixed_vars)
    grads, grad_updates = self.cost.get_gradients(
        model, nested_args, ** fixed_var_descr.fixed_vars)

    assert isinstance(grads, OrderedDict)
    assert isinstance(grad_updates, OrderedDict)

    if cost_value is None:
        raise ValueError("BGD is incompatible with " + str(self.cost) +
                         " because it is intractable, but BGD uses the " +
                         "cost function value to do line searches.")

    # obj_prereqs has to be a list of function f called with f(*data),
    # where data is a data tuple coming from the iterator.
    # this function enables capturing "mapping" and "f", while
    # enabling the "*data" syntax
    def capture(f, mapping=mapping):
        new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
        return new_f

    obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

    if self.monitoring_dataset is not None:
        if (self.monitoring_batch_size is None and
                self.monitoring_batches is None):
            self.monitoring_batch_size = self.batch_size
            self.monitoring_batches = self.batches_per_iter
        self.monitor.setup(
            dataset=self.monitoring_dataset,
            cost=self.cost,
            batch_size=self.monitoring_batch_size,
            num_batches=self.monitoring_batches,
            obj_prereqs=obj_prereqs,
            cost_monitoring_args=fixed_var_descr.fixed_vars)

    params = model.get_params()

    self.optimizer = BatchGradientDescent(
        objective=cost_value,
        gradients=grads,
        gradient_updates=grad_updates,
        params=params,
        param_constrainers=[model.modify_updates],
        lr_scalers=model.get_lr_scalers(),
        inputs=theano_args,
        verbose=self.verbose_optimization,
        max_iter=self.updates_per_batch,
        reset_alpha=self.reset_alpha,
        conjugate=self.conjugate,
        reset_conjugate=self.reset_conjugate,
        min_init_alpha=self.min_init_alpha,
        line_search_mode=self.line_search_mode,
        theano_function_mode=self.theano_function_mode,
        init_alpha=self.init_alpha)

    # These monitoring channels keep track of shared variables,
    # which do not need inputs nor data.
    if self.monitoring_dataset is not None:
        self.monitor.add_channel(
            name='ave_step_size',
            ipt=None,
            val=self.optimizer.ave_step_size,
            data_specs=(NullSpace(), ''),
            dataset=first_value(self.monitoring_dataset))
        self.monitor.add_channel(
            name='ave_grad_size',
            ipt=None,
            val=self.optimizer.ave_grad_size,
            data_specs=(NullSpace(), ''),
            dataset=first_value(self.monitoring_dataset))
        self.monitor.add_channel(
            name='ave_grad_mult',
            ipt=None,
            val=self.optimizer.ave_grad_mult,
            data_specs=(NullSpace(), ''),
            dataset=first_value(self.monitoring_dataset))

    self.first = True
    self.bSetup = True
def setup(self, model, dataset):
    """
    Allows the training algorithm to do some preliminary configuration
    *before* we actually start training the model. The dataset is provided
    in case other derived training algorithms need to modify model based on
    the dataset.

    Builds the symbolic cost/gradient graph from the cost's data specs,
    wires up monitoring, and constructs the BatchGradientDescent
    optimizer stored in ``self.optimizer``.

    Parameters
    ----------
    model : object
        A Python object representing the model to train loosely \
        implementing the interface of models.model.Model.
    dataset : pylearn2.datasets.dataset.Dataset
        Dataset object used to draw training data

    Raises
    ------
    ValueError
        If batch_size conflicts with model.force_batch_size, or the
        cost has no tractable value expression.
    """
    self.model = model

    if self.cost is None:
        self.cost = model.get_default_cost()

    if self.batch_size is None:
        self.batch_size = model.force_batch_size
    else:
        batch_size = self.batch_size
        if self.set_batch_size:
            model.set_batch_size(batch_size)
        elif hasattr(model, 'force_batch_size'):
            # NOTE(review): unlike the newer variant of this method, there
            # is no `force_batch_size is None` guard here; `None <= 0` is
            # True under Python 2, which this file targets.
            if not (model.force_batch_size <= 0 or
                    batch_size == model.force_batch_size):
                # FIX: previously this was written as
                #   "..." + "... %d" % (batch_size, force_batch_size)
                # where % bound only to the second literal (one %d, two
                # args), so the raise itself crashed with a TypeError.
                # Parenthesize so both placeholders are formatted.
                raise ValueError(("batch_size is %d but " +
                                  "model.force_batch_size is %d") %
                                 (batch_size, model.force_batch_size))

    self.monitor = Monitor.get_monitor(model)
    self.monitor.set_theano_function_mode(self.theano_function_mode)

    data_specs = self.cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space,
    # named according to the sources.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = 'BGD_[%s]' % source
        arg = space.make_theano_batch(name=name)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with their data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                **fixed_var_descr.fixed_vars)
    grads, grad_updates = self.cost.get_gradients(
        model, nested_args, **fixed_var_descr.fixed_vars)

    assert isinstance(grads, OrderedDict)
    assert isinstance(grad_updates, OrderedDict)

    if cost_value is None:
        raise ValueError("BGD is incompatible with " + str(self.cost) +
                         " because it is intractable, but BGD uses the " +
                         "cost function value to do line searches.")

    # obj_prereqs has to be a list of function f called with f(*data),
    # where data is a data tuple coming from the iterator.
    # this function enables capturing "mapping" and "f", while
    # enabling the "*data" syntax
    def capture(f, mapping=mapping):
        new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
        return new_f

    obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

    if self.monitoring_dataset is not None:
        if (self.monitoring_batch_size is None and
                self.monitoring_batches is None):
            self.monitoring_batch_size = self.batch_size
            self.monitoring_batches = self.batches_per_iter
        self.monitor.setup(dataset=self.monitoring_dataset,
                           cost=self.cost,
                           batch_size=self.monitoring_batch_size,
                           num_batches=self.monitoring_batches,
                           obj_prereqs=obj_prereqs,
                           cost_monitoring_args=fixed_var_descr.fixed_vars)

    params = model.get_params()

    self.optimizer = BatchGradientDescent(
        objective=cost_value,
        gradients=grads,
        gradient_updates=grad_updates,
        params=params,
        param_constrainers=[model.censor_updates],
        lr_scalers=model.get_lr_scalers(),
        inputs=theano_args,
        verbose=self.verbose_optimization,
        max_iter=self.updates_per_batch,
        reset_alpha=self.reset_alpha,
        conjugate=self.conjugate,
        reset_conjugate=self.reset_conjugate,
        min_init_alpha=self.min_init_alpha,
        line_search_mode=self.line_search_mode,
        theano_function_mode=self.theano_function_mode,
        init_alpha=self.init_alpha)

    # These monitoring channels keep track of shared variables,
    # which do not need inputs nor data.
    # NOTE(review): `.values()[0]` is Python 2 only (dict views are not
    # indexable in Python 3).
    if self.monitoring_dataset is not None:
        self.monitor.add_channel(
            name='ave_step_size',
            ipt=None,
            val=self.optimizer.ave_step_size,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_size',
            ipt=None,
            val=self.optimizer.ave_grad_size,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_mult',
            ipt=None,
            val=self.optimizer.ave_grad_mult,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])

    self.first = True
    self.bSetup = True
def setup(self, dataset, cost, batch_size, num_batches=None,
          extra_costs=None, mode='sequential', obj_prereqs=None,
          cost_monitoring_args=None):
    """
    Sets up the monitor for a cost minimization problem.
    Adds channels defined by both the model and the cost for
    the specified dataset(s), as well as a channel called 'objective'
    defined by the costs' __call__ method.

    Parameters
    ----------
    dataset : pylearn2.datasets.Dataset
        Dataset or dictionary mapping string names to Datasets.
        If string names are used, then for every dataset, each channel
        defined by the model or cost will be replicated with that
        dataset's name followed by an underscore as the prefix. For
        example, if your cost defines a channel called 'misclass', and
        datasets is {'train' : train_dataset, 'valid' : valid_dataset}
        you will get channels called 'train_misclass' and
        'valid_misclass'.
    cost : pylearn2.costs.Cost
        The cost being optimized by training. The value of the cost will
        appear as the `objective` channel. Its `get_monitoring_channels`
        method will also be used to supply other channels.
    batch_size : int
        Batch size used by the iterators of each monitoring dataset.
    num_batches : int, optional
        Number of batches drawn from each monitoring dataset.
    extra_costs : OrderedDict, optional
        A dictionary mapping channel names to Cost objects.
        Their value will appear as the specified channel name.
        They will also provide more monitoring channels via their
        `get_monitoring_channels` method.
    mode : str, optional
        Iteration mode passed on to `add_dataset` (default 'sequential').
    obj_prereqs : None, or list of functions
        Functions to pass as prerequisites to the `objective` channel.
    cost_monitoring_args : dict
        Dictionary of kwargs that will be passed to
        `cost.get_monitoring_channels()` (but not for the extra_costs).
    """
    if dataset is None:
        return
    # Normalize `dataset` to a dict mapping name prefixes to Datasets.
    if isinstance(dataset, Dataset):
        dataset = {'': dataset}
    else:
        assert isinstance(dataset, dict)
        assert all(isinstance(key, str) for key in dataset)
        assert all(isinstance(dataset[key], Dataset) for key in dataset)

    if extra_costs is None:
        costs = {}
    else:
        costs = extra_costs
    # '' is reserved for the main cost, whose channel is named 'objective'.
    assert '' not in costs
    costs[''] = cost

    if cost_monitoring_args is None:
        cost_monitoring_args = {}

    model = self.model

    # Build a composite data_specs containing the specs for all costs,
    # then the specs of the model
    cost_names = sorted(costs.keys())
    spaces = []
    sources = []
    for c in cost_names:
        c_space, c_source = costs[c].get_data_specs(model)
        spaces.append(c_space)
        sources.append(c_source)

    # Ask the model for the data_specs needed
    m_space, m_source = model.get_monitoring_data_specs()
    spaces.append(m_space)
    sources.append(m_source)

    nested_space = CompositeSpace(spaces)
    nested_sources = tuple(sources)

    # Flatten this data_specs, so we build only one symbolic Theano
    # variable for each of the unique (space, source) pairs.
    mapping = DataSpecsMapping((nested_space, nested_sources))
    space_tuple = mapping.flatten(nested_space, return_tuple=True)
    source_tuple = mapping.flatten(nested_sources, return_tuple=True)
    ipt = tuple(
        space.make_theano_batch(name='monitor_%s' % source,
                                batch_size=None)
        for (space, source) in safe_zip(space_tuple, source_tuple))

    # Build a nested tuple from ipt, to dispatch the appropriate parts
    # of the ipt batch to each cost
    nested_ipt = mapping.nest(ipt)

    custom_channels = {}
    for i, cost_name in enumerate(cost_names):
        if cost_name == '':
            prefix = ''
        else:
            prefix = cost_name + '_'
        cost = costs[cost_name]
        cost_ipt = nested_ipt[i]
        raw_channels = cost.get_monitoring_channels(model, cost_ipt)
        channels = {}
        for name in raw_channels:
            # We need three things: the value itself (raw_channels[name]),
            # the input variables (cost_ipt), and the data_specs for
            # these input variables ((spaces[i], sources[i]))
            channels[prefix + name] = (raw_channels[name],
                                       cost_ipt,
                                       (spaces[i], sources[i]))
        custom_channels.update(channels)

    # Use the last inputs from nested_ipt for the model
    model_channels = model.get_monitoring_channels(nested_ipt[-1])
    channels = {}
    for name in model_channels:
        # Note: some code used to consider that model_channels[name]
        # could be a a (channel, prereqs) pair, this is not supported.
        channels[name] = (model_channels[name],
                          nested_ipt[-1],
                          (spaces[-1], sources[-1]))
    custom_channels.update(channels)

    if is_stochastic(mode):
        # FIX: was `[[2013, 02, 22]]` — a Python 2 octal-style literal
        # that is a SyntaxError under Python 3 and inconsistent with the
        # `[2013, 2, 22]` spelling used elsewhere in this file. Same
        # value, portable spelling.
        seed = [[2013, 2, 22]]
    else:
        seed = None

    for dataset_name in dataset:
        cur_dataset = dataset[dataset_name]
        self.add_dataset(dataset=cur_dataset,
                         mode=mode,
                         batch_size=batch_size,
                         num_batches=num_batches,
                         seed=seed)
        if dataset_name == '':
            dprefix = ''
        else:
            dprefix = dataset_name + '_'
        # These channel name 'objective' must not vary, since callbacks
        # that respond to the values in the monitor use the name to find
        # it.
        for i, cost_name in enumerate(cost_names):
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            cost_value = cost.expr(model, cost_ipt)
            if cost_value is not None:
                if cost_name == '':
                    name = dprefix + 'objective'
                    prereqs = obj_prereqs
                else:
                    name = dprefix + cost_name
                    prereqs = None

                cost.get_data_specs(model)[0].validate(cost_ipt)
                self.add_channel(name=name,
                                 ipt=cost_ipt,
                                 val=cost_value,
                                 data_specs=cost.get_data_specs(model),
                                 dataset=cur_dataset,
                                 prereqs=prereqs)

        for key in custom_channels:
            val, ipt, data_specs = custom_channels[key]
            data_specs[0].validate(ipt)
            self.add_channel(name=dprefix + key,
                             ipt=ipt,
                             val=val,
                             data_specs=data_specs,
                             dataset=cur_dataset)
def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                data_specs=None):
    """
    Asks the monitor to start tracking a new value.  Can be called
    even after the monitor is already in use.

    Parameters
    ----------
    name : str
        The display name in the monitor.
    ipt : tensor_like
        The symbolic tensor which should be clamped to the data.
        (or a list/tuple containing symbolic tensors, following the
        data_specs)
    val : tensor_like
        The value (function of `ipt`) to be tracked.
    dataset : pylearn2.datasets.Dataset
        Which dataset to compute this channel on
    prereqs : list of callables that take a list of numpy tensors
        Each prereq must be called exactly once per each new batch of
        data drawn *from dataset* before the channel value is computed
        if two channels provide a prereq with exactly the same id, that
        prereq will only be called once
    data_specs : (space, source) pair
        Identifies the order, format and semantics of ipt

    Raises
    ------
    ValueError
        If data_specs cannot be inferred, if an input of `val` is not
        covered by `ipt`, if the dataset is ambiguous/unknown, or if
        a channel with this name already exists.
    """
    # Promote plain numbers to a Theano variable so every channel value
    # is symbolic.  (`long` is Python 2 only.)
    if isinstance(val, (float, int, long)):
        val = np.cast[theano.config.floatX](val)
    val = T.as_tensor_variable(val)

    if data_specs is None:
        warnings.warn("parameter 'data_specs' should be provided when " +
                      "calling add_channel. We will build a default one.",
                      stacklevel=2)
        # Normalize ipt to a tuple (or None) before inspecting its length.
        if isinstance(ipt, list):
            ipt = tuple(ipt)
        if ipt is not None and not isinstance(ipt, tuple):
            ipt = (ipt,)

        if ipt is None:
            # Channel needs no data at all.
            data_specs = (NullSpace(), '')
        elif len(ipt) == 0:
            data_specs = (CompositeSpace([]), ())
        elif hasattr(dataset, 'get_data_specs'):
            dataset_space, dataset_source = dataset.get_data_specs()
            # Only two shapes of ipt can be inferred from the dataset:
            # a single 'features' input, or a (features, targets) pair.
            if (len(ipt) == 1 and
                    dataset_source is not None and
                    (not isinstance(dataset_source, tuple) or
                        len(dataset_source) == 1) and
                    'features' in dataset_source):
                data_specs = (dataset_space, dataset_source)
            elif (len(ipt) == 2 and
                    dataset_source == ('features', 'targets')):
                data_specs = (dataset_space, dataset_source)
            else:
                raise ValueError("Cannot infer default data_specs for " +
                                 "the following input points and " +
                                 "dataset: ipt = %s, dataset = %s" %
                                 (ipt, dataset))

    data_specs[0].validate(ipt)

    mapping = DataSpecsMapping(data_specs)
    flat_ipt = mapping.flatten(ipt)
    if not isinstance(flat_ipt, tuple):
        flat_ipt = (flat_ipt,)
    # Every non-shared, non-constant input of `val`'s graph must be one
    # of the declared inputs, otherwise the compiled function could not
    # feed it.
    inputs = theano.gof.graph.inputs([val])
    for elem in inputs:
        if not hasattr(elem, 'get_value') and \
           not isinstance(elem, theano.gof.graph.Constant):
            if elem not in flat_ipt:
                raise ValueError("Unspecified input: " + str(elem) +
                                 ". This may be due to an incorrect " +
                                 "implementation of a cost's " +
                                 "get_data_specs() method, or of a " +
                                 "model's get_monitoring_data_specs() " +
                                 "method.")

    # Optional record-mode logging (used by recorded/replayed theano
    # function modes).
    mode = self.theano_function_mode
    if mode is not None and hasattr(mode, 'record'):
        mode.record.handle_line('Adding monitor channel '+name+'\n')
        assert isinstance(flat_ipt, tuple)
        if len(flat_ipt) != 1:
            for elem in flat_ipt:
                mode.record.handle_line('Includes input var ' +
                                        var_descriptor(elem) + '\n')
        else:
            mode.record.handle_line(name + ' input var is ' +
                                    var_descriptor(flat_ipt[0]) + '\n')
        mode.record.handle_line('channel ' + name + ' is ' +
                                var_descriptor(val) + '\n')

    if dataset is None:
        # Fall back to the monitor's only dataset; ambiguous otherwise.
        if len(self._datasets) == 1:
            dataset = self._datasets[0]
        elif len(self._datasets) == 0:
            raise ValueError(_err_no_data)
        else:
            raise ValueError(_err_ambig_data)

    try:
        self._datasets.index(dataset)
    except ValueError:
        reraise_as(ValueError("The dataset specified is not one of the " +
                              "monitor's datasets"))

    if name in self.channels:
        raise ValueError("Tried to create the same channel twice (%s)" %
                         name)

    self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                         dataset, prereqs)
    # Mark compiled functions stale so redo_theano runs before next use.
    self._dirty = True
class Monitor(object): """ A class for monitoring Models while they are being trained. A monitor object records the number of minibatches and number of examples the model has trained, as well as any number of "channels" that track quantities of interest (examples: the objective function, measures of hidden unit activity, reconstruction error, sum of squared second derivatives, average norm of the weight vectors, etc.) """ def __init__(self, model): """ Makes a monitor for `model`. Assumes the model has not been trained at all yet. Parameters ---------- model : pylearn2.models.model.Model instance """ self.training_succeeded = False self.model = model self.channels = OrderedDict() self._num_batches_seen = 0 self._examples_seen = 0 self._epochs_seen = 0 self._datasets = [] self._iteration_mode = [] self._batch_size = [] self._num_batches = [] self._dirty = True self._rng_seed = [] self.names_to_del = ['theano_function_mode'] self.t0 = time.time() self.theano_function_mode = None # Initialize self._nested_data_specs, self._data_specs_mapping, # and self._flat_data_specs self._build_data_specs() def _build_data_specs(self): """ Computes a nested data_specs for input and all channels Also computes the mapping to flatten it. This function is called from redo_theano. 
""" # Ask the model what it needs m_space, m_source = self.model.get_monitoring_data_specs() input_spaces = [m_space] input_sources = [m_source] for channel in self.channels.values(): space = channel.data_specs[0] assert isinstance(space, Space) input_spaces.append(space) input_sources.append(channel.data_specs[1]) nested_space = CompositeSpace(input_spaces) nested_source = tuple(input_sources) self._nested_data_specs = (nested_space, nested_source) self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs) flat_space = self._data_specs_mapping.flatten(nested_space, return_tuple=True) flat_source = self._data_specs_mapping.flatten(nested_source, return_tuple=True) self._flat_data_specs = (CompositeSpace(flat_space), flat_source) def set_theano_function_mode(self, mode): """ Parameters ---------- mode : theano.compile.Mode Theano functions for the monitoring channels will be compiled and run using this mode. """ if self.theano_function_mode != mode: self._dirty = True self.theano_function_mode = mode def add_dataset(self, dataset, mode='sequential', batch_size=None, num_batches=None, seed=None): """ Determines the data used to calculate the values of each channel. Parameters ---------- dataset : object A `pylearn2.datasets.Dataset` object. mode : str or object, optional Iteration mode; see the docstring of the `iterator` method \ on `pylearn2.datasets.Dataset` for details. batch_size : int, optional The size of an individual batch. Optional if `mode` is \ 'sequential' and `num_batches` is specified (batch size \ will be calculated based on full dataset size). num_batches : int, optional The total number of batches. Unnecessary if `mode` is \ 'sequential' and `batch_size` is specified (number of \ batches will be calculated based on full dataset size). seed : int, optional Optional. The seed to be used for random iteration modes. 
""" # The user can ommit using lists if only one dataset is set if not isinstance(dataset, list): dataset = [dataset] if not isinstance(mode, list): mode = [mode] if not isinstance(batch_size, list): batch_size = [batch_size] if not isinstance(num_batches, list): num_batches = [num_batches] if seed is None: seed = [None] * len(dataset) if not isinstance(seed, list): seed = [seed] if len(mode) != len(dataset): raise ValueError("Received " + str(len(dataset)) + " dataset but " + str(len(mode)) + " modes.") if any([len(l) != len(dataset) for l in [batch_size, seed]]): raise ValueError("make sure each dataset has its iteration " + "batch size and number of batches.") for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size, num_batches, seed): try: it = d.iterator(mode=m, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) except ValueError as exc: raise ValueError("invalid iteration parameters in " + "Monitor.add_dataset: " + str(exc)) if it.stochastic: # Must be a seed, not a random number generator. If it were a # random number generator, different iterators using it would # update its state, so we would not get the same iterator # each time. Also, must not be None, because this makes the # iterator pick a seed based on the clock if sd is None: raise TypeError("Monitor requires a seed when using " + "stochastic iteration modes.") if not isinstance(sd, (list, tuple, int)): raise TypeError("Monitor requires a seed (not a random " + "number generator) when using " + "stochastic iteration modes.") else: # The iterator should catch this, but let's double-check assert sd is None if not d in self._datasets: self._datasets.append(d) self._iteration_mode.append(m) self._batch_size.append(b) self._num_batches.append(n) self._rng_seed.append(sd) def __call__(self): """ Runs the model on the monitoring dataset in order to add one data point to each of the channels. 
""" # If the channels have changed at all, we need to recompile the theano # functions used to compute them if self._dirty: self.redo_theano() datasets = self._datasets # Set all channels' val_shared to 0 self.begin_record_entry() for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode, self._batch_size, self._num_batches, self.accum, self._rng_seed, self.num_examples): if isinstance(d, basestring): d = yaml_parse.load(d) raise NotImplementedError() # need to put d back into self._datasets myiterator = d.iterator(mode=i, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) # If self._flat_data_specs is empty, no channel needs data, # so we do not need to call the iterator in order to average # the monitored values across different batches, we only # have to call them once. if len(self._flat_data_specs[1]) == 0: X = () self.run_prereqs(X, d) a(*X) else: actual_ne = 0 for X in myiterator: # X is a flat (not nested) tuple self.run_prereqs(X, d) a(*X) actual_ne += self._flat_data_specs[0].np_batch_size(X) # end for X if actual_ne != ne: raise RuntimeError("At compile time, your iterator said " "it had %d examples total, but at " "runtime it gave us %d." 
% (ne, actual_ne)) # end for d log.info("Monitoring step:") log.info("\tEpochs seen: %d" % self._epochs_seen) log.info("\tBatches seen: %d" % self._num_batches_seen) log.info("\tExamples seen: %d" % self._examples_seen) t = time.time() - self.t0 for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key): channel = self.channels[channel_name] channel.time_record.append(t) channel.batch_record.append(self._num_batches_seen) channel.example_record.append(self._examples_seen) channel.epoch_record.append(self._epochs_seen) val = channel.val_shared.get_value() channel.val_record.append(val) # TODO: use logging infrastructure so that user can configure # formatting if abs(val) < 1e4: val_str = str(val) else: val_str = '%.3e' % val log.info("\t%s: %s" % (channel_name, val_str)) def run_prereqs(self, data, dataset): """ Runs all "prerequistie functions" on a batch of data. Always called right before computing the monitoring channels on that batch. Parameters ---------- data : tuple or Variable a member of the Space used as input to the monitoring functions dataset : Dataset the Dataset the data was drawn from """ if dataset not in self.prereqs: return for prereq in self.prereqs[dataset]: prereq(*data) def get_batches_seen(self): """ Returns the number of batches the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly). """ return self._num_batches_seen def get_epochs_seen(self): """ Returns ------- epochs_seen : int The number of epochs the model has been trained on. One "epoch" is one pass through Dataset.iterator. """ return self._epochs_seen def get_examples_seen(self): """ Returns ------- examples_seen : int The number of examples the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly) """ return self._examples_seen def report_batch(self, num_examples): """ Call this whenever the model has learned on another batch of examples. 
Report how many examples were learned on. Parameters ---------- num_examples : int The number of examples learned on in this minibatch. """ self._examples_seen += num_examples self._num_batches_seen += 1 def report_epoch(self): """ Call this whenever the model has completed another "epoch" of learning. We regard one pass through Dataset.iterator as one epoch. """ self._epochs_seen += 1 def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is called any time we need to evaluate the channels and the channel definitions have changed since last we called it, or if the theano functions are unavailable for any other reason (first time they are needed after construction or deserialization, etc.) All channels are compiled as part of the same theano function so that the theano optimizations can eliminate subexpressions that are shared between multiple channels. """ self._dirty = False # Recompute the data specs, since the channels may have changed. self._build_data_specs() init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data # the model acts on batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]] theano_args = self._flat_data_specs[0].make_theano_batch(batch_names) # Get a symbolic expression of the batch size # We do it here, rather than for each 
channel, because channels with an # empty data_specs do not use data, and are unable to extract the batch # size. The case where the whole data specs is empty is not supported. batch_size = self._flat_data_specs[0].batch_size(theano_args) # Also get a nested representation, for joint iteration # with each of channel.graph_input nested_theano_args = self._data_specs_mapping.nest(theano_args) if not isinstance(nested_theano_args, tuple): nested_theano_args = (nested_theano_args, ) assert len(nested_theano_args) == (len(self.channels) + 1) log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including ' + 'channel ' + key + '\n') log.info('\t%s' % key) it = [ d.iterator(mode=i, num_batches=n, batch_size=b, data_specs=self._flat_data_specs, return_tuple=True) for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size) ] self.num_examples = [ np.cast[config.floatX](float(i.num_examples)) for i in it ] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for i, channel in enumerate(self.channels.values()): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] cur_num_examples = self.num_examples[index] u = updates[index] # Flatten channel.graph_input and the appropriate part of # nested_theano_args, to iterate jointly over them. 
c_mapping = DataSpecsMapping(channel.data_specs) channel_inputs = c_mapping.flatten(channel.graph_input, return_tuple=True) inputs = c_mapping.flatten(nested_theano_args[i + 1], return_tuple=True) for (channel_X, X) in safe_izip(channel_inputs, inputs): assert channel_X not in g or g[channel_X] is X assert channel_X.type == X.type, (channel_X.type, X.type) g[channel_X] = X if batch_size == 0: # No channel does need any data, so there is not need to # average results, and we will call the accum functions only # once. # TODO: better handling of channels not needing data when # some other channels need data. assert len(self._flat_data_specs[1]) == 0 val = channel.val else: if n == 0: raise ValueError("Iterating over 0 examples results in " + "divide by 0") val = (channel.val * T.cast(batch_size, config.floatX) / cur_num_examples) u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' + key.name + ' has dtype ' + key.dtype + ' but is driven by an expression ' + 'with type ' + up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key ' + var_descriptor(elem) + '\n') mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n') for elem in u: mode.record.handle_line('u key ' + var_descriptor(elem) + '\n') mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n') function_name = 'Monitor.accum[%d]' % idx if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just # monitor the model parameters, or some shared variable updated # by the training algorithm, so we need to ignore the unused # input 
error self.accum.append( function(theano_args, givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del( [name for name in final_names if name not in init_names]) def register_names_to_del(self, names): """ Register names of fields that should be deleted before pickling. Parameters ---------- names : list A list of attribute names as strings. """ for name in names: if name not in self.names_to_del: self.names_to_del.append(name) def __getstate__(self): """ In order to avoid pickling a copy of the dataset whenever a monitor is saved, the __getstate__ method replaces the dataset field with the dataset's yaml source. This is not a perfect solution because it won't work with job resuming, which would require saving the state of the dataset's random number generator. Like in the Model class, we also need to avoid saving any Theano functions, so we delete everything that can be regenerated with `redo_theano` by deleting the fields in `self.names_to_del` """ # Patch old pickled monitors if not hasattr(self, '_datasets'): self._datasets = [self._dataset] del self._dataset temp = self._datasets if self._datasets: self._datasets = [] for dataset in temp: if isinstance(dataset, basestring): self._datasets.append(dataset) else: try: self._datasets.append(dataset.yaml_src) except AttributeError: warnings.warn('Trained model saved without ' + 'indicating yaml_src') d = copy.copy(self.__dict__) self._datasets = temp for name in self.names_to_del: if name in d: del d[name] return d def __setstate__(self, d): """ Sets the object to have the state described by `d`. Parameters ---------- d : dict A dictionary mapping string names of fields to values for these fields. 
""" # patch old pkl files if '_dataset' in d: d['_datasets'] = [d['_dataset']] del d['_dataset'] self.__dict__.update(d) def add_channel(self, name, ipt, val, dataset=None, prereqs=None, data_specs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name : str The display name in the monitor. ipt : tensor_like The symbolic tensor which should be clamped to the data. \ (or a list/tuple containing symbolic tensors, following the \ data_specs) val : tensor_like The value (function of `ipt`) to be tracked. dataset : pylearn2.datasets.Dataset Which dataset to compute this channel on prereqs : list of callables that take a list of numpy tensors Each prereq must be called exactly once per each new batch of \ data drawn *from dataset* before the channel value is computed \ if two channels provide a prereq with exactly the same id, that \ prereq will only be called once data_specs : (space, source) pair Identifies the order, format and semantics of ipt """ if isinstance(val, (float, int, long)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if data_specs is None: warnings.warn("parameter 'data_specs' should be provided when " + "calling add_channel. 
We will build a default one.", stacklevel=2) if isinstance(ipt, list): ipt = tuple(ipt) if ipt is not None and not isinstance(ipt, tuple): ipt = (ipt, ) if ipt is None: data_specs = (NullSpace(), '') elif len(ipt) == 0: data_specs = (CompositeSpace([]), ()) elif hasattr(dataset, 'get_data_specs'): dataset_space, dataset_source = dataset.get_data_specs() if (len(ipt) == 1 and dataset_source is not None and (not isinstance(dataset_source, tuple) or len(dataset_source) == 1) and 'features' in dataset_source): data_specs = (dataset_space, dataset_source) elif (len(ipt) == 2 and dataset_source == ('features', 'targets')): data_specs = (dataset_space, dataset_source) else: raise ValueError("Cannot infer default data_specs for " + "the following input points and " + "dataset: ipt = %s, dataset = %s" % (ipt, dataset)) data_specs[0].validate(ipt) mapping = DataSpecsMapping(data_specs) flat_ipt = mapping.flatten(ipt) if not isinstance(flat_ipt, tuple): flat_ipt = (flat_ipt, ) inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and \ not isinstance(elem, theano.gof.graph.Constant): if elem not in flat_ipt: raise ValueError("Unspecified input: " + str(elem) + ". 
This may be due to an incorrect " + "implementation of a cost's " + "get_data_specs() method, or of a " + "model's get_monitoring_data_specs() " + "method.") mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel ' + name + '\n') assert isinstance(flat_ipt, tuple) if len(flat_ipt) != 1: for elem in flat_ipt: mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n') else: mode.record.handle_line(name + ' input var is ' + var_descriptor(flat_ipt[0]) + '\n') mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: raise ValueError("The dataset specified is not one of the " + "monitor's datasets") if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) self.channels[name] = MonitorChannel(ipt, val, name, data_specs, dataset, prereqs) self._dirty = True def _sanity_check(self): """ Sometimes we serialize models and then load them somewhere else but still try to use their Monitor, and the Monitor is in a mangled state. I've added some calls to _sanity_check to try to catch when that happens. Not sure what to do for a long term fix. I think it requires making theano graphs serializable first. """ for name in self.channels: channel = self.channels[name] assert hasattr(channel, 'prereqs') @classmethod def get_monitor(cls, model): """ Returns a model's monitor. If the model doesn't have a monitor yet, installs one and returns that. Parameters ---------- model : object An object that implements the `Model` interface specified in \ `pylearn2.models`. 
""" if hasattr(model, 'monitor'): rval = model.monitor rval._sanity_check() else: rval = Monitor(model) model.monitor = rval return rval # TODO: find out if this method is used anywhere, remove if not. @property def batch_size(self): """ Returns ------- batch_size : int The size of the batches used for monitoring """ return self._batch_size # TODO: find out if this method is used anywhere, remove if not. @property def num_batches(self): """ Returns ------- num_batches : int The number of batches used for monitoring """ return self._num_batches def setup(self, dataset, cost, batch_size, num_batches=None, extra_costs=None, mode='sequential', obj_prereqs=None, cost_monitoring_args=None): """ Sets up the monitor for a cost minimization problem. Adds channels defined by both the model and the cost for the specified dataset(s), as well as a channel called 'objective' defined by the costs' __call__ method. Parameters ---------- dataset : pylearn2.datasets.Dataset Dataset or dictionary mapping string names to Datasets. If \ string names are used, then for every dataset, each channel \ defined by the model or cost will be replicated with that \ dataset's name followed by an underscore as the prefix. For \ example, if your cost defines a channel called 'misclass', and \ datasets is {'train' : train_dataset, 'valid' : valid_dataset} \ you will get channels called 'train_misclass' and 'valid_misclass'. cost : pylearn2.costs.Cost The cost being optimized by training. The value of the cost will appear as the `objective` channel. Its `get_monitoring_channels` method will also be used to supply other channels. extra_costs : OrderedDict, optional A dictionary mapping channel names to Cost objects. Their value will appear as the specified channel name. They will also provide more monitoring channels via their `get_monitoring_channels` method. obj_prereqs : None, or list of functions Functions to pass as prerequisites to the `objective` channel. 
cost_monitoring_args : dict Dictionary of kwargs that will be passed to \ `cost.get_monitoring_channels()` (but not for the extra_costs). """ if dataset is None: return if isinstance(dataset, Dataset): dataset = {'': dataset} else: assert isinstance(dataset, dict) assert all(isinstance(key, str) for key in dataset) assert all(isinstance(dataset[key], Dataset) for key in dataset) if extra_costs is None: costs = {} else: costs = extra_costs assert '' not in costs costs[''] = cost if cost_monitoring_args is None: cost_monitoring_args = {} model = self.model # Build a composite data_specs containing the specs for all costs, # then the specs of the model cost_names = sorted(costs.keys()) spaces = [] sources = [] for c in cost_names: c_space, c_source = costs[c].get_data_specs(model) spaces.append(c_space) sources.append(c_source) # Ask the model for the data_specs needed m_space, m_source = model.get_monitoring_data_specs() spaces.append(m_space) sources.append(m_source) nested_space = CompositeSpace(spaces) nested_sources = tuple(sources) # Flatten this data_specs, so we build only one symbolic Theano # variable for each of the unique (space, source) pairs. 
mapping = DataSpecsMapping((nested_space, nested_sources)) space_tuple = mapping.flatten(nested_space, return_tuple=True) source_tuple = mapping.flatten(nested_sources, return_tuple=True) ipt = tuple( space.make_theano_batch(name='monitor_%s' % source, batch_size=None) for (space, source) in safe_zip(space_tuple, source_tuple)) # Build a nested tuple from ipt, to dispatch the appropriate parts # of the ipt batch to each cost nested_ipt = mapping.nest(ipt) custom_channels = {} for i, cost_name in enumerate(cost_names): if cost_name == '': prefix = '' else: prefix = cost_name + '_' cost = costs[cost_name] cost_ipt = nested_ipt[i] raw_channels = cost.get_monitoring_channels(model, cost_ipt) channels = {} for name in raw_channels: # We need three things: the value itself (raw_channels[name]), # the input variables (cost_ipt), and the data_specs for # these input variables ((spaces[i], sources[i])) channels[prefix + name] = (raw_channels[name], cost_ipt, (spaces[i], sources[i])) custom_channels.update(channels) # Use the last inputs from nested_ipt for the model model_channels = model.get_monitoring_channels(nested_ipt[-1]) channels = {} for name in model_channels: # Note: some code used to consider that model_channels[name] # could be a a (channel, prereqs) pair, this is not supported. channels[name] = (model_channels[name], nested_ipt[-1], (spaces[-1], sources[-1])) custom_channels.update(channels) if is_stochastic(mode): seed = [[2013, 02, 22]] else: seed = None for dataset_name in dataset: cur_dataset = dataset[dataset_name] self.add_dataset(dataset=cur_dataset, mode=mode, batch_size=batch_size, num_batches=num_batches, seed=seed) if dataset_name == '': dprefix = '' else: dprefix = dataset_name + '_' # These channel name 'objective' must not vary, since callbacks # that respond to the values in the monitor use the name to find # it. 
for i, cost_name in enumerate(cost_names): cost = costs[cost_name] cost_ipt = nested_ipt[i] cost_value = cost.expr(model, cost_ipt) if cost_value is not None: if cost_name == '': name = dprefix + 'objective' prereqs = obj_prereqs else: name = dprefix + cost_name prereqs = None cost.get_data_specs(model)[0].validate(cost_ipt) self.add_channel(name=name, ipt=cost_ipt, val=cost_value, data_specs=cost.get_data_specs(model), dataset=cur_dataset, prereqs=prereqs) for key in custom_channels: val, ipt, data_specs = custom_channels[key] data_specs[0].validate(ipt) self.add_channel(name=dprefix + key, ipt=ipt, val=val, data_specs=data_specs, dataset=cur_dataset)
    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset

        Notes
        -----
        NOTE(review): this variant assumes `model` has `discriminator`
        and `generator` sub-models (a GAN-style setup) — confirm
        against the model class before reuse.
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        # Refuse to start training from parameters that are already
        # Inf or NaN.
        inf_params = [param for param in model.get_params()
                      if np.any(np.isinf(param.get_value()))]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: "+str(inf_params))
        if any([np.any(np.isnan(param.get_value()))
                for param in model.get_params()]):
            nan_params = [param for param in model.get_params()
                          if np.any(np.isnan(param.get_value()))]
            raise ValueError("These params are NaN: "+str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # test if force batch size and batch size
        if getattr(model, "force_batch_size", False) and \
           any(dataset.get_design_matrix().shape[0] %
               self.batch_size != 0
               for dataset in self.monitoring_dataset.values()) and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor,
                    monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        self.params = params

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 ** fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                            "something with" + str(type(grads)) + "as its " +
                            "first member. Expected OrderedDict.")

        # Gradients and params must be in one-to-one correspondence.
        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        # This setup does not support costs that supply their own updates.
        assert len(updates.keys()) == 0

        def get_func(learn_discriminator, learn_generator,
                     dont_you_fucking_dare_touch_the_generator=False):
            # Compiles one update function that trains exactly one of the
            # two sub-models (discriminator XOR generator), leaving the
            # other's parameters untouched.

            updates = OrderedDict()

            assert (learn_discriminator or learn_generator) and \
                not (learn_discriminator and learn_generator)

            if learn_discriminator:
                cur_params = model.discriminator.get_params()
            else:
                cur_params = model.generator.get_params()

            def check():
                # Invariant: no update may be registered for a parameter
                # outside the sub-model currently being trained.
                for param in params:
                    if param not in cur_params:
                        assert param not in updates

            cur_grads = OrderedDict()
            for param in cur_params:
                cur_grads[param] = grads[param]

            for param in grads:
                if grads[param].name is None and cost_value is not None:
                    grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                         {'costname': cost_value.name,
                                          'paramname': param.name})
                assert grads[param].dtype == param.dtype

            cur_lr_scalers = OrderedDict()
            for param in cur_params:
                if param in lr_scalers:
                    lr_scaler = lr_scalers[param]
                    cur_lr_scalers[param] = lr_scaler

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = param.name
                if param_name is None:
                    param_name = 'anon_param'
                lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
                log.info('\t' + param_name + ': ' + str(lr))

            updates.update(self.learning_rule.get_updates(
                learning_rate, cur_grads, cur_lr_scalers))

            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()
            # Give the model a chance to censor/modify the updates
            # (e.g. max-norm constraints).
            model.modify_updates(updates)
            check()
            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                         update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                         update.name)
            check()

            if dont_you_fucking_dare_touch_the_generator:
                # Extra paranoia: assert that no generator parameter is
                # updated by the discriminator's training function.
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                return function(theano_args,
                                updates=updates,
                                name='sgd_update',
                                on_unused_input='ignore',
                                mode=self.theano_function_mode)

        self.d_func = get_func(
            1, 0, dont_you_fucking_dare_touch_the_generator=True)
        self.g_func = get_func(0, 1)
def setup_training(self): """ Sets up training function. """ training_batch_size = self.mini_batch_size cost = self.cnn.get_default_cost() data_specs = cost.get_data_specs(self.cnn) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=training_batch_size).astype("float32") theano_args.append(arg) theano_args = tuple(theano_args) y_hat = self.cnn.fprop(theano_args[0]) self.fprop_func = theano.function([theano_args[0]], y_hat) cost = self.cnn.cost(theano_args[1], y_hat) lr_scalers = self.cnn.get_lr_scalers() params = list(self.cnn.get_params()) grads = T.grad(cost, params, disconnected_inputs='ignore') gradients = OrderedDict(izip(params, grads)) rms_vals_dict = OrderedDict(izip(params, self.rms_vals)) updates = OrderedDict() updates.update(dict(safe_zip(params, [param - self.learning_rate * (gradients[param] / T.sqrt(rms_vals_dict[param] + 1e-8)) for param in params]))) rmsprop_updates = OrderedDict() rmsprop_updates.update(dict(safe_zip(self.rms_vals, [(rms_vals_dict[param] * .9) + (T.sqr(gradients[param]) * .1) for param in params]))) self.training = theano.function(theano_args, updates=updates, on_unused_input='ignore') self.rmsprop_update = theano.function(theano_args, updates=rmsprop_updates, on_unused_input='ignore') temp = T.tensor4() self.dimshuf_func = theano.function([temp], temp.dimshuffle(1, 2, 3, 0)) #self.grads_func = theano.function(theano_args, grads) self.cost_function = theano.function(theano_args, cost)
    def setup_training(self):
        """
        Sets up training function.

        Compiles RMSProp-with-momentum training machinery for
        ``self.cnn``: a gradient-caching function, the parameter update,
        the RMS and momentum state updates, and helper functions for
        fprop, axis shuffling, and debugging.
        """
        training_batch_size = self.mini_batch_size

        cost = self.cnn.get_default_cost()

        data_specs = cost.get_data_specs(self.cnn)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        #theano_args contains information about the shape of each layer
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(
                name=name,
                batch_size=training_batch_size).astype("float32")
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        y_hat = self.cnn.fprop(theano_args[0])

        #function used for faster fprop
        self.fprop_func = theano.function([theano_args[0]], y_hat)

        cost = self.cnn.cost(theano_args[1], y_hat)

        #params is the list of layers in the NN
        params = list(self.cnn.get_params())
        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        # Per-parameter optimizer state (shared variables assumed to be
        # created in __init__ — confirm against the class constructor).
        rms_vals_dict = OrderedDict(izip(params, self.rms_vals))
        momentum_vals_dict = OrderedDict(izip(params, self.momentum_vals))
        grad_vals_dict = OrderedDict(izip(params, self.grad_vals))

        grad_update = OrderedDict()
        grad_update.update(dict(safe_zip(self.grad_vals,
                                         [gradients[param]
                                          for param in params])))

        #function used for getting gradients
        #this is so that we only calculate gradients once, then
        #the same values are used for updating momentum, rmsprop, and training
        self.grad_update_func = theano.function(theano_args,
                                                updates=grad_update,
                                                on_unused_input='ignore')

        # Parameter step: RMSProp-scaled gradient plus momentum term.
        # Takes no inputs: it reads the cached grad_vals shared state.
        updates = OrderedDict()
        updates.update(dict(safe_zip(params,
                                     [param - self.learning_rate *
                                      (grad_vals_dict[param] /
                                       T.sqrt(rms_vals_dict[param] + 1e-8)) +
                                      (self.momentum_step_size *
                                       momentum_vals_dict[param])
                                      for param in params])))

        rmsprop_updates = OrderedDict()
        #rmsprop update function
        rmsprop_updates.update(dict(safe_zip(self.rms_vals,
                                             [(rms_vals_dict[param] * .9) +
                                              (T.sqr(grad_vals_dict[param]) *
                                               .1)
                                              for param in params])))

        self.training = theano.function([], updates=updates,
                                        on_unused_input='ignore')
        self.rmsprop_update = theano.function([], updates=rmsprop_updates,
                                              on_unused_input='ignore')

        momentum_updates = OrderedDict()
        #momentum update function
        momentum_updates.update(dict(safe_zip(self.momentum_vals,
                                              [-self.learning_rate *
                                               (grad_vals_dict[param] /
                                                T.sqrt(rms_vals_dict[param] +
                                                       1e-8)) +
                                               (self.momentum_step_size *
                                                momentum_vals_dict[param])
                                               for param in params])))
        self.momentum_update = theano.function([], updates=momentum_updates,
                                               on_unused_input='ignore')

        temp = T.tensor4()
        #function used for shuffling dimensions into c01b format
        self.dimshuf_func = theano.function([temp],
                                            temp.dimshuffle(1, 2, 3, 0))

        #functions to get grads and costs for debugging
        self.grads_func = theano.function(theano_args, grads)
        self.cost_function = theano.function(theano_args, cost)
    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the
            cost will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective`
            channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            costs = extra_costs
        assert '' not in costs
        # The main cost is stored under the empty name so its channels
        # get no prefix.
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)
        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple,
                                                    source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
def __init__(self, data_callbacks, data_specs):
    """
    Store the callbacks together with the data_specs describing the
    data they will be called on.

    Parameters
    ----------
    data_callbacks : callables
        The callbacks to be invoked on batches of data.
    data_specs : (space, source) pair
        Format/semantics specification for the data the callbacks
        receive; a DataSpecsMapping is precomputed from it for later
        flattening/nesting of batches.
    """
    # Precompute the flatten/nest mapping once, up front.
    self._mapping = DataSpecsMapping(data_specs)
    self.data_specs = data_specs
    self.data_callbacks = data_callbacks
def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                data_specs=None):
    """
    Asks the monitor to start tracking a new value.  Can be called
    even after the monitor is already in use.

    Parameters
    ----------
    name : str
        The display name in the monitor.
    ipt : tensor_like
        The symbolic tensor which should be clamped to the data
        (or a list/tuple containing symbolic tensors, following the
        data_specs).
    val : tensor_like
        The value (function of `ipt`) to be tracked.
    dataset : pylearn2.datasets.Dataset
        Which dataset to compute this channel on.
    prereqs : list of callables that take a list of numpy tensors
        Each prereq must be called exactly once per each new batch of
        data drawn *from dataset* before the channel value is computed;
        if two channels provide a prereq with exactly the same id, that
        prereq will only be called once.
    data_specs : (space, source) pair
        Identifies the order, format and semantics of ipt.
    """
    # Promote numeric constants to symbolic variables so they can be
    # compiled into the same theano function as the other channels.
    if isinstance(val, (float, int, long)):
        val = np.cast[theano.config.floatX](val)

    val = T.as_tensor_variable(val)

    if data_specs is None:
        warnings.warn("parameter 'data_specs' should be provided when " +
                      "calling add_channel. We will build a default one.",
                      stacklevel=2)
        if isinstance(ipt, list):
            ipt = tuple(ipt)
        if ipt is not None and not isinstance(ipt, tuple):
            ipt = (ipt, )

        # Infer a default (space, source) pair from the shape of `ipt`
        # and the dataset's own data specs.  Only the common cases
        # (no input, empty input, features-only, features+targets) are
        # handled; anything else must pass explicit data_specs.
        if ipt is None:
            data_specs = (NullSpace(), '')
        elif len(ipt) == 0:
            data_specs = (CompositeSpace([]), ())
        elif hasattr(dataset, 'get_data_specs'):
            dataset_space, dataset_source = dataset.get_data_specs()
            if (len(ipt) == 1 and
                    dataset_source is not None and
                    (not isinstance(dataset_source, tuple) or
                        len(dataset_source) == 1) and
                    'features' in dataset_source):
                data_specs = (dataset_space, dataset_source)
            elif (len(ipt) == 2 and
                    dataset_source == ('features', 'targets')):
                data_specs = (dataset_space, dataset_source)
            else:
                raise ValueError("Cannot infer default data_specs for " +
                                 "the following input points and " +
                                 "dataset: ipt = %s, dataset = %s"
                                 % (ipt, dataset))

    data_specs[0].validate(ipt)

    mapping = DataSpecsMapping(data_specs)
    flat_ipt = mapping.flatten(ipt)
    if not isinstance(flat_ipt, tuple):
        flat_ipt = (flat_ipt, )

    # Every graph input of `val` that is neither a shared variable
    # (has get_value) nor a constant must be fed through `ipt`,
    # otherwise the compiled function would have an unfed input.
    inputs = theano.gof.graph.inputs([val])
    for elem in inputs:
        if not hasattr(elem, 'get_value') and \
           not isinstance(elem, theano.gof.graph.Constant):
            if elem not in flat_ipt:
                raise ValueError("Unspecified input: " + str(elem) +
                                 ". This may be due to an incorrect " +
                                 "implementation of a cost's " +
                                 "get_data_specs() method, or of a " +
                                 "model's get_monitoring_data_specs() " +
                                 "method.")

    # Optional debug-record mode: log the channel's inputs and value.
    mode = self.theano_function_mode
    if mode is not None and hasattr(mode, 'record'):
        mode.record.handle_line('Adding monitor channel ' + name + '\n')
        assert isinstance(flat_ipt, tuple)
        if len(flat_ipt) != 1:
            for elem in flat_ipt:
                mode.record.handle_line('Includes input var ' +
                                        var_descriptor(elem) + '\n')
        else:
            mode.record.handle_line(name + ' input var is ' +
                                    var_descriptor(flat_ipt[0]) + '\n')
        mode.record.handle_line('channel ' + name + ' is ' +
                                var_descriptor(val) + '\n')

    # Default to the monitor's only dataset; ambiguous with several.
    if dataset is None:
        if len(self._datasets) == 1:
            dataset = self._datasets[0]
        elif len(self._datasets) == 0:
            raise ValueError(_err_no_data)
        else:
            raise ValueError(_err_ambig_data)

    try:
        self._datasets.index(dataset)
    except ValueError:
        raise ValueError("The dataset specified is not one of the " +
                         "monitor's datasets")

    if name in self.channels:
        raise ValueError("Tried to create the same channel twice (%s)" %
                         name)

    self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                         dataset, prereqs)
    # Mark compiled theano functions stale; redo_theano must run before
    # the next monitoring pass.
    self._dirty = True
def __init__(self, nmap, input_space=None, nvisx=None, nvisy=None,
             input_source=("featuresX", "featuresY"), act_enc=None,
             act_dec=None, irange=1e-3, rng=9001):
    """
    Construct a two-input gated model with `nmap` mapping units.

    Parameters
    ----------
    nmap : int
        Number of mapping units; must be positive.
    input_space : Space, optional
        A CompositeSpace of two components describing the X and Y
        inputs.  Ignored (rebuilt) when nvisx/nvisy are given.
    nvisx, nvisy : int, optional
        Numbers of visible X / Y units; when both are given, a
        CompositeSpace of two VectorSpaces is built from them.
    input_source : tuple of str
        Data source names matching the structure of `input_space`.
    act_enc, act_dec : None, str or callable
        Encoder/decoder activation: None or "linear" means identity;
        otherwise a callable, or a name resolved against globals(),
        tensor.nnet, then tensor.
    irange : float
        Range used for weight initialization.
    rng : int or RandomState
        Seed or generator for the numpy RNG.
    """
    Block.__init__(self)
    Model.__init__(self)
    assert nmap > 0, "Number of mapping units must be positive"

    # NOTE: `and` binds tighter than `or`, so this reads as
    # (nvisx and nvisy given) or (input_space given).
    if nvisx is not None and nvisy is not None or input_space is not None:
        if nvisx is not None and nvisy is not None:
            assert nvisx > 0, "Number of visx units must be non-negative"
            assert nvisy > 0, "Number of visy units must be non-negative"
            input_space = CompositeSpace([VectorSpace(nvisx),
                                          VectorSpace(nvisy)])
            self.nvisx = nvisx
            self.nvisy = nvisy
        elif isinstance(input_space.components[0], Conv2DSpace):
            # Flatten each Conv2DSpace component to a unit count.
            rx, cx = input_space.components[0].shape
            chx = input_space.components[0].num_channels
            ry, cy = input_space.components[1].shape
            chy = input_space.components[1].num_channels
            self.nvisx = rx * cx * chx
            self.nvisy = ry * cy * chy
        else:
            # NOTE(review): an input_space whose first component is not a
            # Conv2DSpace (e.g. plain VectorSpaces without nvisx/nvisy)
            # lands here -- confirm this restriction is intended.
            raise NotImplementedError(
                str(type(self)) + " does not support that input_space.")
    # Check whether the input_space and input_source structures match
    try:
        DataSpecsMapping((input_space, input_source))
    except ValueError:
        raise ValueError("The structures of `input_space`, %s, and "
                         "`input_source`, %s do not match. If you "
                         "specified a CompositeSpace as an input, "
                         "be sure to specify the data sources as well."
                         % (input_space, input_source))

    self.input_space = input_space
    self.input_source = input_source
    self.nmap = nmap
    self.output_space = VectorSpace(self.nmap)
    self._initialize_visbiasX(self.nvisx)  # self.visbiasX
    self._initialize_visbiasY(self.nvisy)  # self.visbiasY
    self._initialize_mapbias()             # self.mapbias
    self.irange = irange
    self.rng = make_np_rng(rng, which_method="randn")
    # Derive the theano RNG's seed from the numpy RNG for reproducibility.
    seed = int(self.rng.randint(2**30))
    self.s_rng = make_theano_rng(seed, which_method="uniform")

    def _resolve_callable(conf, conf_attr):
        # Resolve an activation spec: None/"linear" -> identity (None);
        # a callable is used as-is; otherwise treat it as a name and
        # look it up in globals(), tensor.nnet, then tensor.
        if conf[conf_attr] is None or conf[conf_attr] == "linear":
            return None
        # If it's a callable, use it directly.
        if hasattr(conf[conf_attr], '__call__'):
            return conf[conf_attr]
        elif (conf[conf_attr] in globals() and
              hasattr(globals()[conf[conf_attr]], '__call__')):
            return globals()[conf[conf_attr]]
        elif hasattr(tensor.nnet, conf[conf_attr]):
            return getattr(tensor.nnet, conf[conf_attr])
        elif hasattr(tensor, conf[conf_attr]):
            return getattr(tensor, conf[conf_attr])
        else:
            raise ValueError("Couldn't interpret %s value: '%s'" %
                             (conf_attr, conf[conf_attr]))

    self.act_enc = _resolve_callable(locals(), 'act_enc')
    self.act_dec = _resolve_callable(locals(), 'act_dec')
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and
    the channel definitions have changed since last we called it,
    or if the theano functions are unavailable for any other reason
    (first time they are needed after construction or
    deserialization, etc.)

    All channels are compiled as part of the same theano function
    so that the theano optimizations can eliminate subexpressions
    that are shared between multiple channels.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    init_names = dir(self)

    # Group prereqs per dataset, deduplicated by identity so a prereq
    # shared by several channels runs only once per batch.
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    # begin_record_entry zeroes every channel accumulator before a new
    # monitoring pass.
    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry'
        )
    updates = OrderedDict()
    givens = OrderedDict()
    # Get the appropriate kind of theano variable to represent the data
    # the model acts on
    batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
    theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with
    # an empty data_specs do not use data, and are unable to extract the
    # batch size. The case where the whole data specs is empty is not
    # supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args,)
    assert len(nested_theano_args) == (len(self.channels) + 1)

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                     data_specs=self._flat_data_specs,
                     return_tuple=True)
          for d, i, n, b in safe_izip(self._datasets,
                                      self._iteration_mode,
                                      self._num_batches,
                                      self._batch_size)]
    # BUG FIX: self.num_examples was assigned twice in a row here; the
    # second assignment silently discarded the floatX cast.  Keep only
    # the cast version, consistent with the sibling implementation of
    # redo_theano in this file.
    self.num_examples = [np.cast[config.floatX](float(i.num_examples))
                         for i in it]

    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        g = givens[index]
        # Precompute 1/num_examples; the accumulated value below is a
        # batch-size-weighted running mean of the channel value.
        inv_cur_num_examples = as_floatX(1. / self.num_examples[index])
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel does need any data, so there is not need to
            # average results, and we will call the accum functions only
            # once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            # NOTE(review): `n` is the leaked loop variable from the `it`
            # list comprehension above (the last dataset's num_batches);
            # confirm this guard is intentional (works in Python 2 only).
            if n == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            val = (channel.val * T.cast(batch_size, config.floatX) *
                   inv_cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error
            self.accum.append(function(theano_args,
                                       givens=g,
                                       updates=u,
                                       mode=self.theano_function_mode,
                                       name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    final_names = dir(self)
    self.register_names_to_del([name for name in final_names
                                if name not in init_names])
def setup(self, model, dataset):
    """
    Allows the training algorithm to do some preliminary configuration
    *before* we actually start training the model. The dataset is
    provided in case other derived training algorithms need to modify
    model based on the dataset.

    Parameters
    ----------
    model : object
        A Python object representing the model to train loosely
        implementing the interface of models.model.Model.
    dataset : pylearn2.datasets.dataset.Dataset
        Dataset object used to draw training data
    """
    self.model = model

    if self.cost is None:
        self.cost = model.get_default_cost()

    # Negotiate the batch size with the model's force_batch_size, if any.
    if self.batch_size is None:
        self.batch_size = model.force_batch_size
    else:
        batch_size = self.batch_size
        if self.set_batch_size:
            model.set_batch_size(batch_size)
        elif hasattr(model, 'force_batch_size'):
            if not (model.force_batch_size <= 0 or
                    batch_size == model.force_batch_size):
                # BUG FIX: the original applied % only to the second
                # string literal ("..." + "..." % tuple), which raised a
                # formatting TypeError instead of this ValueError.
                raise ValueError("batch_size is %d but "
                                 "model.force_batch_size is %d" %
                                 (batch_size, model.force_batch_size))

    self.monitor = Monitor.get_monitor(model)
    self.monitor.set_theano_function_mode(self.theano_function_mode)

    data_specs = self.cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space,
    # named according to the sources.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = 'BGD_[%s]' % source
        arg = space.make_theano_batch(name=name)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with their data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                ** fixed_var_descr.fixed_vars)
    grads, grad_updates = self.cost.get_gradients(
        model, nested_args, ** fixed_var_descr.fixed_vars)

    assert isinstance(grads, OrderedDict)
    assert isinstance(grad_updates, OrderedDict)

    # BGD performs line searches on the objective, so an intractable
    # (None) cost expression cannot be used.
    if cost_value is None:
        raise ValueError("BGD is incompatible with " + str(self.cost) +
                         " because it is intractable, but BGD uses the " +
                         "cost function value to do line searches.")

    # obj_prereqs has to be a list of function f called with f(*data),
    # where data is a data tuple coming from the iterator.
    # this function enables capturing "mapping" and "f", while
    # enabling the "*data" syntax
    def capture(f, mapping=mapping):
        new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
        return new_f

    obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

    if self.monitoring_dataset is not None:
        self.monitor.setup(
            dataset=self.monitoring_dataset,
            cost=self.cost,
            batch_size=self.batch_size,
            num_batches=self.monitoring_batches,
            obj_prereqs=obj_prereqs,
            cost_monitoring_args=fixed_var_descr.fixed_vars)

    # NOTE: a long-dead commented-out block that manually registered
    # monitoring channels (superseded by self.monitor.setup above) was
    # removed here.

    params = model.get_params()

    self.optimizer = BatchGradientDescent(
        objective=cost_value,
        gradients=grads,
        gradient_updates=grad_updates,
        params=params,
        param_constrainers=[model.censor_updates],
        lr_scalers=model.get_lr_scalers(),
        inputs=theano_args,
        verbose=self.verbose_optimization,
        max_iter=self.updates_per_batch,
        reset_alpha=self.reset_alpha,
        conjugate=self.conjugate,
        reset_conjugate=self.reset_conjugate,
        min_init_alpha=self.min_init_alpha,
        line_search_mode=self.line_search_mode,
        theano_function_mode=self.theano_function_mode,
        init_alpha=self.init_alpha)

    # These monitoring channels keep track of shared variables,
    # which do not need inputs nor data.
    if self.monitoring_dataset is not None:
        self.monitor.add_channel(
            name='ave_step_size',
            ipt=None,
            val=self.optimizer.ave_step_size,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_size',
            ipt=None,
            val=self.optimizer.ave_grad_size,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_mult',
            ipt=None,
            val=self.optimizer.ave_grad_mult,
            data_specs=(NullSpace(), ''),
            dataset=self.monitoring_dataset.values()[0])

    self.first = True
    self.bSetup = True
def setup(self, model, dataset):
    """
    Compiles the theano functions needed for the train method.

    Parameters
    ----------
    model : a Model instance
    dataset : Dataset (unused here; kept for interface compatibility)
    """
    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start from parameters that are already Inf/NaN.
    inf_params = [param for param in model.get_params()
                  if np.any(np.isinf(param.get_value()))]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: "+str(inf_params))
    if any([np.any(np.isnan(param.get_value()))
            for param in model.get_params()]):
        nan_params = [param for param in model.get_params()
                      if np.any(np.isnan(param.get_value()))]
        raise ValueError("These params are NaN: "+str(nan_params))
    self.model = model

    self._synchronize_batch_size(model)
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    self.monitor._sanity_check()

    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space.
    # We want that so that if the same space/source is specified
    # more than once in data_specs, only one Theano Variable
    # is generated for it, and the corresponding value is passed
    # only once to the compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                ** fixed_var_descr.fixed_vars)

    if cost_value is not None and cost_value.name is None:
        # Concatenate the name of all tensors in theano_args !?
        cost_value.name = 'objective'

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost
    learning_rate = self.learning_rate
    if self.monitoring_dataset is not None:
        self.monitor.setup(dataset=self.monitoring_dataset,
                           cost=self.cost,
                           batch_size=self.batch_size,
                           num_batches=self.monitoring_batches,
                           extra_costs=self.monitoring_costs,
                           mode=self.monitor_iteration_mode)
        dataset_name = self.monitoring_dataset.keys()[0]
        monitoring_dataset = self.monitoring_dataset[dataset_name]
        #TODO: have Monitor support non-data-dependent channels
        self.monitor.add_channel(name='learning_rate',
                                 ipt=None,
                                 val=learning_rate,
                                 data_specs=(NullSpace(), ''),
                                 dataset=monitoring_dataset)
        if self.learning_rule:
            self.learning_rule.add_channels_to_monitor(
                self.monitor, monitoring_dataset)

    params = list(model.get_params())
    assert len(params) > 0
    # Give anonymous parameters a name so updates/grads can be labeled.
    for i, param in enumerate(params):
        if param.name is None:
            param.name = 'sgd_params[%d]' % i

    grads, updates = self.cost.get_gradients(model, nested_args,
                                             ** fixed_var_descr.fixed_vars)
    if not isinstance(grads, OrderedDict):
        raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                        "something with" + str(type(grads)) + "as its " +
                        "first member. Expected OrderedDict.")

    # The gradient dict and the parameter list must match exactly.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                 {'costname': cost_value.name,
                                  'paramname': param.name})
        assert grads[param].dtype == param.dtype

    lr_scalers = model.get_lr_scalers()
    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +\
                             str(key)+" which is not an optimization parameter.")

    log.info('Parameter and initial learning rate summary:')
    for param in params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * lr_scalers.get(param,1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.learning_rule:
        updates.update(self.learning_rule.get_updates(
            learning_rate, grads, lr_scalers))
    else:
        # Use standard SGD updates with fixed learning rate.
        updates.update(dict(safe_zip(params, [param - learning_rate * \
            lr_scalers.get(param, 1.) * grads[param]
            for param in params])))

    for param in params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    # Let the model constrain/modify the updates (e.g. weight norms).
    model.censor_updates(updates)
    for param in params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        # Eagerly check theano debug values for Inf/NaN in the updates.
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    with log_timing(log, 'Compiling sgd_update'):
        self.sgd_update = function(theano_args,
                                   updates=updates,
                                   name='sgd_update',
                                   on_unused_input='ignore',
                                   mode=self.theano_function_mode)
    self.params = params
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and
    the channel definitions have changed since last we called it,
    or if the theano functions are unavailable for any other reason
    (first time they are needed after construction or
    deserialization, etc.)

    All channels are compiled as part of the same theano function
    so that the theano optimizations can eliminate subexpressions
    that are shared between multiple channels.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    init_names = dir(self)
    # Group prereqs per dataset, deduplicated by identity so a prereq
    # shared by several channels runs only once per batch.
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    # begin_record_entry zeroes every channel accumulator before a new
    # monitoring pass.
    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry')
    updates = OrderedDict()
    givens = OrderedDict()
    # Get the appropriate kind of theano variable to represent the data
    # the model acts on
    batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
    theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with
    # an empty data_specs do not use data, and are unable to extract the
    # batch size. The case where the whole data specs is empty is not
    # supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args, )
    assert len(nested_theano_args) == (len(self.channels) + 1)

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    it = [
        d.iterator(mode=i, num_batches=n, batch_size=b,
                   data_specs=self._flat_data_specs,
                   return_tuple=True)
        for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)
    ]
    self.num_examples = [
        np.cast[config.floatX](float(i.num_examples)) for i in it
    ]

    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        d = self._datasets[index]
        g = givens[index]
        cur_num_examples = self.num_examples[index]
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel does need any data, so there is not need to
            # average results, and we will call the accum functions only
            # once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            # NOTE(review): `n` here is the leaked loop variable from the
            # `it` list comprehension above (last dataset's num_batches);
            # confirm this guard is intentional (Python 2 behavior).
            if n == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            # Accumulate a batch-size-weighted running mean of the value.
            val = (channel.val * T.cast(batch_size, config.floatX) /
                   cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error
            self.accum.append(
                function(theano_args,
                         givens=g,
                         updates=u,
                         mode=self.theano_function_mode,
                         name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    final_names = dir(self)
    self.register_names_to_del(
        [name for name in final_names if name not in init_names])
def setup(self, model, dataset):
    """
    Compiles the theano functions needed for the train method.

    Parameters
    ----------
    model : a Model instance
    dataset : Dataset (unused here; kept for interface compatibility)
    """
    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start from parameters that are already Inf/NaN.
    inf_params = [param for param in model.get_params()
                  if np.any(np.isinf(param.get_value()))]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: "+str(inf_params))
    if any([np.any(np.isnan(param.get_value()))
            for param in model.get_params()]):
        nan_params = [param for param in model.get_params()
                      if np.any(np.isnan(param.get_value()))]
        raise ValueError("These params are NaN: "+str(nan_params))
    self.model = model

    # Negotiate the batch size against the model's force_batch_size:
    # a conflicting explicit batch_size either overrides the model
    # (when set_batch_size) or is an error; a missing batch_size is
    # taken from the model.
    batch_size = self.batch_size
    if hasattr(model, "force_batch_size"):
        if model.force_batch_size > 0:
            if batch_size is not None:
                if batch_size != model.force_batch_size:
                    if self.set_batch_size:
                        model.set_batch_size(batch_size)
                    else:
                        raise ValueError("batch_size argument to SGD " +
                                         "conflicts with model's " +
                                         "force_batch_size attribute")
            else:
                self.batch_size = model.force_batch_size
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    self.monitor._sanity_check()

    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space.
    # We want that so that if the same space/source is specified
    # more than once in data_specs, only one Theano Variable
    # is generated for it, and the corresponding value is passed
    # only once to the compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                ** fixed_var_descr.fixed_vars)

    if cost_value is not None and cost_value.name is None:
        # Concatenate the name of all tensors in theano_args !?
        cost_value.name = 'objective'

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost
    learning_rate = self.learning_rate
    if self.monitoring_dataset is not None:
        self.monitor.setup(
            dataset=self.monitoring_dataset,
            cost=self.cost,
            batch_size=self.batch_size,
            num_batches=self.monitoring_batches,
            extra_costs=self.monitoring_costs,
            mode=self.monitor_iteration_mode
        )
        dataset_name = self.monitoring_dataset.keys()[0]
        monitoring_dataset = self.monitoring_dataset[dataset_name]
        #TODO: have Monitor support non-data-dependent channels
        self.monitor.add_channel(name='learning_rate',
                                 ipt=None,
                                 val=learning_rate,
                                 data_specs=(NullSpace(), ''),
                                 dataset=monitoring_dataset)
        if self.learning_rule:
            self.learning_rule.add_channels_to_monitor(
                self.monitor, monitoring_dataset)

    params = list(model.get_params())
    assert len(params) > 0
    # Give anonymous parameters a name so updates/grads can be labeled.
    for i, param in enumerate(params):
        if param.name is None:
            param.name = 'sgd_params[%d]' % i

    grads, updates = self.cost.get_gradients(model, nested_args,
                                             ** fixed_var_descr.fixed_vars)
    if not isinstance(grads, OrderedDict):
        raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                        "something with" + str(type(grads)) + "as its " +
                        "first member. Expected OrderedDict.")

    # The gradient dict and the parameter list must match exactly.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                 {'costname': cost_value.name,
                                  'paramname': param.name})
        assert grads[param].dtype == param.dtype

    lr_scalers = model.get_lr_scalers()
    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +\
                             str(key)+" which is not an optimization parameter.")

    log.info('Parameter and initial learning rate summary:')
    for param in params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * lr_scalers.get(param,1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.learning_rule:
        updates.update(self.learning_rule.get_updates(
            learning_rate, grads, lr_scalers))
    else:
        # Use standard SGD updates with fixed learning rate.
        updates.update(dict(safe_zip(params, [param - learning_rate * \
            lr_scalers.get(param, 1.) * grads[param]
            for param in params])))

    for param in params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    # Let the model constrain/modify the updates (e.g. weight norms).
    model.censor_updates(updates)
    for param in params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        # Eagerly check theano debug values for Inf/NaN in the updates.
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    with log_timing(log, 'Compiling sgd_update'):
        self.sgd_update = function(theano_args,
                                   updates=updates,
                                   name='sgd_update',
                                   on_unused_input='ignore',
                                   mode=self.theano_function_mode)
    self.params = params
def setup(self, model, dataset):
    """
    Compiles the theano functions needed for the train method.

    Unlike plain SGD setup, this builds *two* compiled update
    functions, ``self.d_func`` and ``self.g_func``, each of which
    updates the parameters of exactly one player (discriminator or
    generator) of an adversarial pair.

    Parameters
    ----------
    model : Model
        Expected to expose ``discriminator`` and ``generator``
        sub-models (see ``get_func`` below).
    dataset : Dataset
        Accepted for interface compatibility; not used directly here.
    """
    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start training from parameters that are already
    # Inf or NaN -- they would poison every subsequent update.
    inf_params = [param for param in model.get_params()
                  if np.any(np.isinf(param.get_value()))]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: " + str(inf_params))
    if any([np.any(np.isnan(param.get_value()))
            for param in model.get_params()]):
        nan_params = [param for param in model.get_params()
                      if np.any(np.isnan(param.get_value()))]
        raise ValueError("These params are NaN: " + str(nan_params))
    self.model = model

    self._synchronize_batch_size(model)
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    self.monitor._sanity_check()

    # If the model forces a fixed batch size, every monitoring dataset
    # must divide evenly into batches unless the iteration mode already
    # guarantees uniform batch sizes.
    if getattr(model, "force_batch_size", False) and \
       any(dataset.get_design_matrix().shape[0] % self.batch_size != 0
           for dataset in self.monitoring_dataset.values()) and \
       not has_uniform_batch_size(self.monitor_iteration_mode):

        raise ValueError("Dataset size is not a multiple of batch size."
                         "You should set monitor_iteration_mode to "
                         "even_sequential, even_shuffled_sequential or "
                         "even_batchwise_shuffled_sequential")

    data_specs = self.cost.get_data_specs(self.model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space.
    # We want that so that if the same space/source is specified
    # more than once in data_specs, only one Theano Variable
    # is generated for it, and the corresponding value is passed
    # only once to the compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `self.cost` need args to be passed in a format compatible
    # with data_specs
    nested_args = mapping.nest(theano_args)

    fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = self.cost.expr(model, nested_args,
                                **fixed_var_descr.fixed_vars)

    if cost_value is not None and cost_value.name is None:
        # Concatenate the name of all tensors in theano_args !?
        cost_value.name = 'objective'

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost
    learning_rate = self.learning_rate
    if self.monitoring_dataset is not None:
        if (self.monitoring_batch_size is None and
                self.monitoring_batches is None):
            # Default the monitoring batching to the training batching.
            self.monitoring_batch_size = self.batch_size
            self.monitoring_batches = self.batches_per_iter
        self.monitor.setup(dataset=self.monitoring_dataset,
                           cost=self.cost,
                           batch_size=self.monitoring_batch_size,
                           num_batches=self.monitoring_batches,
                           extra_costs=self.monitoring_costs,
                           mode=self.monitor_iteration_mode)
        dataset_name = self.monitoring_dataset.keys()[0]
        monitoring_dataset = self.monitoring_dataset[dataset_name]
        # TODO: have Monitor support non-data-dependent channels
        self.monitor.add_channel(name='learning_rate',
                                 ipt=None,
                                 val=learning_rate,
                                 data_specs=(NullSpace(), ''),
                                 dataset=monitoring_dataset)

        if self.learning_rule:
            self.learning_rule.add_channels_to_monitor(
                self.monitor,
                monitoring_dataset)

    params = list(model.get_params())
    assert len(params) > 0
    for i, param in enumerate(params):
        if param.name is None:
            # Give anonymous shared variables a stable debug name.
            param.name = 'sgd_params[%d]' % i
    self.params = params

    grads, updates = self.cost.get_gradients(model, nested_args,
                                             **fixed_var_descr.fixed_vars)
    if not isinstance(grads, OrderedDict):
        # NOTE(review): the message below is missing spaces around the
        # interpolated type ("something with<type>as its") -- confirm
        # before changing, since tools may match on the exact text.
        raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
                        "something with" + str(type(grads)) + "as its " +
                        "first member. Expected OrderedDict.")

    # The cost must provide a gradient for every parameter, and only
    # for actual optimization parameters.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    lr_scalers = model.get_lr_scalers()

    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +\
                    str(key)+" which is not an optimization parameter.")

    # The per-player updates are built below inside get_func, so the
    # cost itself must not have requested any updates of its own.
    assert len(updates.keys()) == 0

    def get_func(learn_discriminator, learn_generator,
                 dont_you_fucking_dare_touch_the_generator=False):
        # Compile an update function that trains exactly one player:
        # either the discriminator or the generator, never both.
        updates = OrderedDict()

        assert (learn_discriminator or learn_generator) and \
            not (learn_discriminator and learn_generator)

        if learn_discriminator:
            cur_params = model.discriminator.get_params()
        else:
            cur_params = model.generator.get_params()

        def check():
            # Invariant: parameters of the non-learning player must
            # never acquire an update.
            for param in params:
                if param not in cur_params:
                    assert param not in updates

        cur_grads = OrderedDict()
        for param in cur_params:
            cur_grads[param] = grads[param]

        # Name all gradients (not only the current player's) so the
        # compiled graph is readable while debugging.
        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})
            assert grads[param].dtype == param.dtype

        cur_lr_scalers = OrderedDict()
        for param in cur_params:
            if param in lr_scalers:
                lr_scaler = lr_scalers[param]
                cur_lr_scalers[param] = lr_scaler

        log.info('Parameter and initial learning rate summary:')
        for param in cur_params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * cur_lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        # NOTE(review): unlike plain SGD setup, there is no fallback
        # for self.learning_rule being None -- a learning rule appears
        # to be required here. Confirm against callers.
        updates.update(self.learning_rule.get_updates(
            learning_rate, cur_grads, cur_lr_scalers))

        check()

        for param in cur_params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        check()
        # Let the model constrain its own updates (e.g. norm limits).
        model.modify_updates(updates)
        check()
        for param in cur_params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            # When theano test values are active, catch bad updates
            # before paying for compilation.
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs"
                                     % update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans"
                                     % update.name)
        check()

        if dont_you_fucking_dare_touch_the_generator:
            # Extra paranoia for the discriminator step: verify the
            # generator's parameters received no updates at all.
            for param in model.generator.get_params():
                assert param not in updates

        with log_timing(log, 'Compiling sgd_update'):
            return function(theano_args, updates=updates,
                            name='sgd_update',
                            on_unused_input='ignore',
                            mode=self.theano_function_mode)

    # One compiled step per player.
    self.d_func = get_func(1, 0,
                           dont_you_fucking_dare_touch_the_generator=True)
    self.g_func = get_func(0, 1)
def main(): parser = argparse.ArgumentParser(description='Pylearn2 lab.') parser.add_argument('-s', '--save', action='store_true', help='Save the resulting images') parser.add_argument( '-q', '--quit', action='store_true', help='Quit after plotting instead of dropping into IPython') parser.add_argument('directory', type=str, help='Which results directory to use') args = parser.parse_args() # OLD #config_file_path = '/home/jason/s/deep_learning/pylearn/pred_net.yaml' #train = yaml_parse.load_path(config_file_path) #train = serial.load_train_file(config_file_path) #result_prefix = '/home/jason/s/pylearn2/pylearn2/pred/results/' result_prefix = '/u/yosinski/s/galatea/fish/results/' result_dir = os.path.join(result_prefix, args.directory) print 'loading train object...' #train = serial.load_train_file(os.path.join(result_dir, 'pred_net.yaml')) train = serial.load_train_file(os.path.join(result_dir, 'model.yaml')) print 'loading saved model...' #model = serial.load(os.path.join(result_dir, 'pred_net.pkl')) model = serial.load(os.path.join(result_dir, 'model.pkl')) print 'done.' 
print 'model was trained on:' print model.dataset_yaml_src if train.algorithm.cost is not None: data_specs = train.algorithm.cost.get_data_specs(model) else: data_specs = train.model.get_default_cost().get_data_specs(train.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) flat_data_specs = (CompositeSpace(space_tuple), source_tuple) num_frames = model.num_frames num_batches = 100 batch_size = train.algorithm.batch_size if train.algorithm.batch_size else 20 * num_frames train_dataset = train.dataset valid_dataset = train.algorithm.monitoring_dataset['valid'] rng = train.algorithm.rng if not is_stochastic(train.algorithm.train_iteration_mode): rng = None train_iterator = train_dataset.iterator( mode=train.algorithm.train_iteration_mode, batch_size=batch_size, data_specs=flat_data_specs, return_tuple=True, rng=rng, num_batches=num_batches * 10) valid_iterator = valid_dataset.iterator( mode=train.algorithm.train_iteration_mode, batch_size=batch_size, data_specs=flat_data_specs, return_tuple=True, # No rng override num_batches=num_batches * 10) train_batches = [train_iterator.next() for ii in range(num_batches)] valid_batches = [valid_iterator.next() for ii in range(num_batches)] print 'got batches with shape:' for dat in train_batches[0]: print ' ', dat.shape ######################### # Plot costs ######################### # Plot costs over time ch_train_objective = model.monitor.channels['train_objective'] ch_valid_objective = model.monitor.channels['valid_objective'] x_vals = ch_train_objective.epoch_record x_label = 'epoch' plot(x_vals, ch_train_objective.val_record, 'b-') plot(x_vals, ch_valid_objective.val_record, 'r-') legend(('train', 'valid')) if args.save: savefig(os.path.join(result_dir, 'costs_lin.png')) savefig(os.path.join(result_dir, 'costs_lin.pdf')) if args.save: gca().set_yscale('log') savefig(os.path.join(result_dir, 
'costs_log.png')) savefig(os.path.join(result_dir, 'costs_log.pdf')) gca().set_yscale('linear') ######################### # Compute some accuracies ######################### try: model.fns.feat_to_compout except: model.redo_theano() all_acc_id = [] all_xy_errs = [] print 'Training set:' print ' acc_id\tx_err\ty_err' for bb, batch in enumerate(train_batches): feat, ids, xy = batch idsN_floatX = array(ids.argmax(1), dtype=theano.config.floatX) acc_id = model.fns.wiskott_id_accuracy(feat, idsN_floatX) all_acc_id.append(acc_id) xy_errs = model.fns.wiskott_xy_errors(feat, xy[:, 0:2]) all_xy_errs.append(xy_errs) # Old numpy way #ids_hat,xy_hat = model.fns.feat_to_idxy(feat) #idx_true = np.where( ids == 1 )[1] #idx_hat = np.where(np.sign(ids_hat.T - ids_hat.max(1)).T + 1)[1] #n_correct += (idx_true == idx_hat).sum() #n_total += len(idx_true) print '%2d:\t%g,\t%g,\t%g' % (bb, acc_id, xy_errs[0], xy_errs[1]) ######################### # Embed ######################### if not args.quit: # Start shell ipshell() print 'done.'
def setup_impl(self, model, dataset, algorithm):
    """
    Probe the cost surface around the model's current parameters.

    Samples ``self.num_points`` random coordinates in a random
    ``self.num_basis_vectors``-dimensional subspace of parameter
    space, evaluates the cost at each, fits a quadratic model to the
    sampled values, then moves the model's parameters to the fitted
    quadratic's (possibly norm-constrained) minimum.
    """
    cost = algorithm.cost

    # Flattened copy of all model parameters: the "root" point the
    # probe is centered on.
    root = model.get_param_vector()
    dim = root.size

    rng = self.rng

    # Random low-dimensional coordinates at which to probe the cost.
    points = rng.randn(self.num_points, self.num_basis_vectors)
    points = points.astype(root.dtype)
    points *= self.scale

    if self.include_root:
        # Make the first probe point the current parameters themselves.
        points[0, :] = 0.

    if not hasattr(self, 'cost_fn'):
        # Build the data_specs plumbing needed to evaluate the cost
        # function (first call only; the compiled function is cached).
        # =======================================
        data_specs = cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)

        fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = cost.expr(model, nested_args,
                               **fixed_var_descr.fixed_vars)
        # End of data_specs plumbing.
        # ======================

        print "Compiling cost function..."
        cost_fn = function(theano_args, cost_value)
        self.cost_fn = cost_fn
    else:
        cost_fn = self.cost_fn

    cost_values = np.zeros(self.num_points)

    # One fixed batch (labels converted to one-hot) reused for every
    # cost evaluation.
    data = list(dataset.get_batch_design(self.batch_size,
                                         include_labels=True))
    from pylearn2.utils.one_hot import one_hot
    data[1] = one_hot(data[1])

    # Choose the directions spanning the probed subspace.
    if self.method == 'gaussian':
        # NOTE(review): rng.normal(loc, scale) does not take a shape;
        # this probably meant rng.randn(dim, self.num_basis_vectors)
        # or rng.normal(size=(dim, self.num_basis_vectors)) -- confirm.
        basis = rng.normal(dim, self.num_basis_vectors).astype(root.dtype)
    elif self.method == 'element':
        # Axis-aligned directions: one randomly chosen coordinate each.
        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            basis[rng.randint(dim), i] = 1.
    elif self.method == 'gradient':
        # Directions given by single-example cost gradients.
        if not hasattr(self, 'grad_fn'):
            self.grad_fn = function(theano_args,
                                    grad(cost_value, model.get_params()))
        grad_fn = self.grad_fn

        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            ipt = list(dataset.get_batch_design(1, include_labels=True))
            label = ipt[1]
            assert label.size == 1
            label = label[0]
            # NOTE(review): hard-codes 10 classes, and rebinds the name
            # `one_hot`, shadowing the helper imported above -- confirm
            # the 'gradient' branch never needs that helper afterwards.
            one_hot = np.zeros((1, 10,), dtype='float32')
            one_hot[0, label] = 1
            ipt[1] = one_hot
            g = grad_fn(*ipt)
            # Flatten the per-parameter gradients into one basis column.
            basis[:, i] = np.concatenate([e.reshape(e.size) for e in g],
                                         axis=0)
    else:
        assert False

    # Normalize each direction to unit length.
    basis /= np.sqrt(np.square(basis).sum(axis=0))

    # Orthogonalize basis (Gram-Schmidt).
    for i in xrange(self.num_basis_vectors):
        v = basis[:, i].copy()
        # NOTE(review): xrange(i - 1) skips the immediately preceding
        # vector; standard Gram-Schmidt would use xrange(i) -- confirm
        # whether this off-by-one is intentional.
        for j in xrange(i - 1):
            u = basis[:, j].copy()
            v -= np.dot(u, v) * u
        norm = np.sqrt(np.square(v).sum())
        assert norm > 1e-4
        v /= norm
        basis[:, i] = v

    # Evaluate the cost at each probe point: root + basis . point.
    for i in xrange(self.num_points):
        print "Evaluating cost at point ", i
        point = points[i, :]
        full_point = root + np.dot(basis, point)
        model.set_param_vector(full_point)
        cost_values[i] = cost_fn(*data)
        print cost_values[i]

    from pylearn2.utils import sharedX
    import theano.tensor as T

    print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"

    if not hasattr(self, 'fit_quad'):
        # First call: build the quadratic surrogate
        #     pred(x) = x^T mat x + b^T x + c
        # and the optimizer that fits it to the sampled cost values.
        points = sharedX(points)
        #from theano import config
        #config.compute_test_value = 'raise'
        cost_values = sharedX(cost_values)
        A = sharedX(np.zeros((self.num_basis_vectors,
                              self.num_basis_vectors)))
        if self.psd:
            # Parameterize the quadratic term as A^T A to keep it PSD.
            mat = T.dot(A.T, A)
        else:
            mat = A
        b = sharedX(np.zeros(self.num_basis_vectors))
        c = sharedX(0.)

        half_quad = T.dot(points, mat)
        quad = (points * half_quad).sum(axis=1)
        lin = T.dot(points, b)
        pred = quad + lin + c

        from pylearn2.optimization.batch_gradient_descent \
            import BatchGradientDescent

        mse = T.square(pred - cost_values).mean()
        mae = abs(pred - cost_values).mean()
        # self.fitting_cost selects 'mse' or 'mae' by local-variable
        # name via locals().
        obj = locals()[self.fitting_cost]

        fit_quad = BatchGradientDescent(obj, params=[A, b, c],
                                        max_iter=self.num_basis_vectors ** 2,
                                        verbose=3, tol=None,
                                        init_alpha=None,
                                        min_init_alpha=1e-7,
                                        reset_alpha=False,
                                        conjugate=True,
                                        reset_conjugate=False,
                                        line_search_mode='exhaustive')
        self.fit_quad = fit_quad
        self.A = A
        self.b = b
        self.c = c
        self.points = points
        self.cost_values = cost_values
    else:
        # Subsequent calls: reset the fit and load the fresh samples
        # into the cached shared variables.
        self.A.set_value(.001 * np.identity(self.A.get_value().shape[0],
                                            dtype=self.A.dtype))
        self.b.set_value(self.b.get_value() * 0.)
        self.c.set_value(self.c.get_value() * 0.)
        self.points.set_value(points)
        self.cost_values.set_value(
            cost_values.astype(self.cost_values.dtype))

    self.fit_quad.minimize()

    print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

    if self.use_solver:
        # Closed-form minimum of the fitted quadratic via least squares.
        if self.psd:
            Av = self.A.get_value()
            mat_v = np.dot(Av.T, Av)
        else:
            mat_v = self.A.get_value()
        bv = self.b.get_value()

        # minimize for x^T A x + b^T x + c
        # -> solve 2 A x + b = 0
        # Ax = - b / 2
        print "********** mat_v", mat_v.min(), mat_v.max()
        x, ignored_residuals, ignored_rank, ignored_singular_values = \
            np.linalg.lstsq(mat_v, - 0.5 * bv)
        print "********** soln: ", x.min(), x.mean(), x.max()
        print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max()
        assert x.ndim == 1, x.shape
        # Map the subspace solution back to full parameter space.
        prod = np.dot(basis, x)
        norm = np.sqrt(np.square(prod).sum())
        print "*************** Moving params by ", norm
        vector = root + prod
        model.set_param_vector(vector)
    else:  # use minimizer
        # Iterative minimization of the quadratic, with the jump norm
        # clipped to self.max_jump_norm.
        if not hasattr(self, 'fit_params'):
            self.vector = sharedX(points.get_value().mean(axis=0))
            vector = self.vector
            obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

            def constrain(d):
                # Rescale the proposed update so its norm never
                # exceeds self.max_jump_norm.
                assert vector in d
                n = d[vector]
                norm = T.sqrt(T.square(n).sum())
                desired_norm = T.clip(norm, 0., self.max_jump_norm)
                d[vector] = n * desired_norm / norm

            self.fit_params = BatchGradientDescent(
                obj, params=[vector],
                max_iter=self.num_basis_vectors,
                verbose=3, tol=None,
                param_constrainers=[constrain],
                init_alpha=None, min_init_alpha=1e-3,
                reset_alpha=False, conjugate=True,
                reset_conjugate=False,
                line_search_mode='exhaustive')
        else:
            self.vector.set_value(
                points.mean(axis=0).astype(self.vector.dtype))
        self.fit_params.minimize()
        model.set_param_vector(root + np.dot(basis,
                                             self.vector.get_value()))
class Monitor(object): """ A class for monitoring Models while they are being trained. A monitor object records the number of minibatches and number of examples the model has trained, as well as any number of "channels" that track quantities of interest (examples: the objective function, measures of hidden unit activity, reconstruction error, sum of squared second derivatives, average norm of the weight vectors, etc.) Parameters ---------- model : `pylearn2.models.model.Model` """ def __init__(self, model): self.training_succeeded = False self.model = model self.channels = OrderedDict() self._num_batches_seen = 0 self._examples_seen = 0 self._epochs_seen = 0 self._datasets = [] self._iteration_mode = [] self._batch_size = [] self._num_batches = [] self._dirty = True self._rng_seed = [] self.names_to_del = ['theano_function_mode'] self.t0 = time.time() self.theano_function_mode = None # Initialize self._nested_data_specs, self._data_specs_mapping, # and self._flat_data_specs self._build_data_specs() def _build_data_specs(self): """ Computes a nested data_specs for input and all channels Also computes the mapping to flatten it. This function is called from redo_theano. """ # Ask the model what it needs m_space, m_source = self.model.get_monitoring_data_specs() input_spaces = [m_space] input_sources = [m_source] for channel in self.channels.values(): space = channel.data_specs[0] assert isinstance(space, Space) input_spaces.append(space) input_sources.append(channel.data_specs[1]) nested_space = CompositeSpace(input_spaces) nested_source = tuple(input_sources) self._nested_data_specs = (nested_space, nested_source) self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs) flat_space = self._data_specs_mapping.flatten(nested_space, return_tuple=True) flat_source = self._data_specs_mapping.flatten(nested_source, return_tuple=True) self._flat_data_specs = (CompositeSpace(flat_space), flat_source) def set_theano_function_mode(self, mode): """ .. 
todo:: WRITEME Parameters ---------- mode : theano.compile.Mode Theano functions for the monitoring channels will be compiled and run using this mode. """ if self.theano_function_mode != mode: self._dirty = True self.theano_function_mode = mode def add_dataset(self, dataset, mode='sequential', batch_size=None, num_batches=None, seed=None): """ Determines the data used to calculate the values of each channel. Parameters ---------- dataset : object A `pylearn2.datasets.Dataset` object. mode : str or object, optional Iteration mode; see the docstring of the `iterator` method on `pylearn2.datasets.Dataset` for details. batch_size : int, optional The size of an individual batch. Optional if `mode` is 'sequential' and `num_batches` is specified (batch size will be calculated based on full dataset size). num_batches : int, optional The total number of batches. Unnecessary if `mode` is 'sequential' and `batch_size` is specified (number of batches will be calculated based on full dataset size). seed : int, optional Optional. The seed to be used for random iteration modes. 
""" # The user can ommit using lists if only one dataset is set if not isinstance(dataset, list): dataset = [dataset] if not isinstance(mode, list): mode = [mode] if not isinstance(batch_size, list): batch_size = [batch_size] if not isinstance(num_batches, list): num_batches = [num_batches] if seed is None: seed = [None] * len(dataset) if not isinstance(seed, list): seed = [seed] if len(mode) != len(dataset): raise ValueError("Received " + str(len(dataset)) + " dataset but " + str(len(mode)) + " modes.") if any([len(l) != len(dataset) for l in [batch_size, seed]]): raise ValueError("make sure each dataset has its iteration " + "batch size and number of batches.") for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size, num_batches, seed): try: it = d.iterator(mode=m, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) except ValueError as exc: reraise_as(ValueError("invalid iteration parameters in " + "Monitor.add_dataset: " + str(exc))) if it.stochastic: # Must be a seed, not a random number generator. If it were a # random number generator, different iterators using it would # update its state, so we would not get the same iterator # each time. Also, must not be None, because this makes the # iterator pick a seed based on the clock if sd is None: raise TypeError("Monitor requires a seed when using " + "stochastic iteration modes.") if not isinstance(sd, (list, tuple, int)): raise TypeError("Monitor requires a seed (not a random " + "number generator) when using " + "stochastic iteration modes.") else: # The iterator should catch this, but let's double-check assert sd is None if not d in self._datasets: self._datasets.append(d) self._iteration_mode.append(m) self._batch_size.append(b) self._num_batches.append(n) self._rng_seed.append(sd) def __call__(self): """ Runs the model on the monitoring dataset in order to add one data point to each of the channels. 
""" # If the channels have changed at all, we need to recompile the theano # functions used to compute them if self._dirty: self.redo_theano() datasets = self._datasets # Set all channels' val_shared to 0 self.begin_record_entry() for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode, self._batch_size, self._num_batches, self.accum, self._rng_seed, self.num_examples): if isinstance(d, basestring): d = yaml_parse.load(d) raise NotImplementedError() # need to put d back into self._datasets myiterator = d.iterator(mode=i, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) # If self._flat_data_specs is empty, no channel needs data, # so we do not need to call the iterator in order to average # the monitored values across different batches, we only # have to call them once. if len(self._flat_data_specs[1]) == 0: X = () self.run_prereqs(X, d) a(*X) else: actual_ne = 0 for X in myiterator: # X is a flat (not nested) tuple self.run_prereqs(X, d) a(*X) actual_ne += self._flat_data_specs[0].np_batch_size(X) # end for X if actual_ne != ne: raise RuntimeError("At compile time, your iterator said " "it had %d examples total, but at " "runtime it gave us %d." 
% (ne, actual_ne)) # end for d log.info("Monitoring step:") log.info("\tEpochs seen: %d" % self._epochs_seen) log.info("\tBatches seen: %d" % self._num_batches_seen) log.info("\tExamples seen: %d" % self._examples_seen) t = time.time() - self.t0 for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key): channel = self.channels[channel_name] channel.time_record.append(t) channel.batch_record.append(self._num_batches_seen) channel.example_record.append(self._examples_seen) channel.epoch_record.append(self._epochs_seen) val = channel.val_shared.get_value() channel.val_record.append(val) # TODO: use logging infrastructure so that user can configure # formatting if abs(val) < 1e4: val_str = str(val) else: val_str = '%.3e' % val log.info("\t%s: %s" % (channel_name, val_str)) def run_prereqs(self, data, dataset): """ Runs all "prerequistie functions" on a batch of data. Always called right before computing the monitoring channels on that batch. Parameters ---------- data : tuple or Variable a member of the Space used as input to the monitoring functions dataset : Dataset the Dataset the data was drawn from """ if dataset not in self.prereqs: return for prereq in self.prereqs[dataset]: prereq(*data) def get_batches_seen(self): """ Returns the number of batches the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly). """ return self._num_batches_seen def get_epochs_seen(self): """ .. todo:: WRITEME Returns ------- epochs_seen : int The number of epochs the model has been trained on. One "epoch" is one pass through Dataset.iterator. """ return self._epochs_seen def get_examples_seen(self): """ .. 
todo:: WRITEME Returns ------- examples_seen : int The number of examples the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly) """ return self._examples_seen def report_batch(self, num_examples): """ Call this whenever the model has learned on another batch of examples. Report how many examples were learned on. Parameters ---------- num_examples : int The number of examples learned on in this minibatch. """ self._examples_seen += num_examples self._num_batches_seen += 1 def report_epoch(self): """ Call this whenever the model has completed another "epoch" of learning. We regard one pass through Dataset.iterator as one epoch. """ self._epochs_seen += 1 def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is called any time we need to evaluate the channels and the channel definitions have changed since last we called it, or if the theano functions are unavailable for any other reason (first time they are needed after construction or deserialization, etc.) All channels are compiled as part of the same theano function so that the theano optimizations can eliminate subexpressions that are shared between multiple channels. """ self._dirty = False # Recompute the data specs, since the channels may have changed. 
self._build_data_specs() init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry' ) updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data # the model acts on batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]] theano_args = self._flat_data_specs[0].make_theano_batch(batch_names) # Get a symbolic expression of the batch size # We do it here, rather than for each channel, because channels with an # empty data_specs do not use data, and are unable to extract the batch # size. The case where the whole data specs is empty is not supported. 
batch_size = self._flat_data_specs[0].batch_size(theano_args) # Also get a nested representation, for joint iteration # with each of channel.graph_input nested_theano_args = self._data_specs_mapping.nest(theano_args) if not isinstance(nested_theano_args, tuple): nested_theano_args = (nested_theano_args,) assert len(nested_theano_args) == (len(self.channels) + 1) log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including ' + 'channel ' + key + '\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, data_specs=self._flat_data_specs, return_tuple=True) for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] self.num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it] self.num_examples = [float(i.num_examples) for i in it] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for i, channel in enumerate(self.channels.values()): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] inv_cur_num_examples = as_floatX(1./self.num_examples[index]) u = updates[index] # Flatten channel.graph_input and the appropriate part of # nested_theano_args, to iterate jointly over them. c_mapping = DataSpecsMapping(channel.data_specs) channel_inputs = c_mapping.flatten(channel.graph_input, return_tuple=True) inputs = c_mapping.flatten(nested_theano_args[i + 1], return_tuple=True) for (channel_X, X) in safe_izip(channel_inputs, inputs): assert channel_X not in g or g[channel_X] is X assert channel_X.type == X.type, (channel_X.type, X.type) g[channel_X] = X if batch_size == 0: # No channel does need any data, so there is not need to # average results, and we will call the accum functions only # once. 
                    # NOTE(review): this excerpt begins mid-method — the `def`
                    # and the `if` guarding this branch start above the visible
                    # region. Presumably this is the accum-graph-building tail
                    # of Monitor's theano-recompilation routine — confirm
                    # against the full file.
                    # TODO: better handling of channels not needing data when
                    # some other channels need data.
                    assert len(self._flat_data_specs[1]) == 0
                    val = channel.val
                else:
                    if n == 0:
                        raise ValueError("Iterating over 0 examples results in " +
                                         "divide by 0")
                    # Weight the channel value by the batch size so that the
                    # accumulated sum over batches divides out to a mean.
                    val = (channel.val * T.cast(batch_size, config.floatX) *
                           inv_cur_num_examples)
                # Accumulate into the channel's shared variable.
                u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions: a dtype mismatch between a
            # shared variable and the expression driving it would make the
            # compiled update invalid, so fail early with a clear message.
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            # Compile one accumulation function per (givens, updates) pair,
            # i.e. one per monitoring dataset.
            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    # Record the graph inputs/outputs for reproducibility
                    # checking when a recording Theano mode is in use.
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=self.theano_function_mode,
                                           name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        # Anything created since `init_names` was captured (above this
        # excerpt) is a compiled/Theano artifact and must not be pickled.
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`

        Returns
        -------
        d : dict
            A copy of ``self.__dict__`` with datasets replaced by their yaml
            source and unpicklable fields removed.
        """
        # Patch old pickled monitors: older versions stored a single
        # `_dataset` attribute instead of the `_datasets` list.
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        # Temporarily swap the dataset objects for their yaml source (or
        # leave them as-is if they are already strings); restore after the
        # shallow copy so `self` is unchanged by pickling.
        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files: migrate the legacy single-dataset attribute
        # to the list-valued `_datasets`.
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt

        Raises
        ------
        ValueError
            If a default data_specs cannot be inferred, if `val` depends on
            an input not listed in `ipt`, if the dataset is ambiguous or
            unknown, or if a channel with this name already exists.
        """
        # Promote Python scalars to Theano variables so `val` is always
        # symbolic.
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt,)

            # Infer a default data_specs from the shape of `ipt` and the
            # dataset's own specs: no input, empty input, single 'features'
            # input, or the ('features', 'targets') pair.
            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                            len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt,)
        # Every non-shared, non-constant input of `val`'s graph must be one
        # of the declared inputs; anything else would be unfed at runtime.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            # Record the channel's inputs and value for reproducibility
            # checking when a recording Theano mode is in use.
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        # Default the dataset only when unambiguous (exactly one registered).
        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(ValueError("The dataset specified is not one of the " +
                                  "monitor's datasets"))

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        # Mark the compiled functions stale so they get rebuilt.
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term fix.
        I think it requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.

        Returns
        -------
        rval : Monitor
            The model's (possibly freshly installed) monitor.
        """
        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        batch_size : int
            The size of the batches used for monitoring.
        num_batches : int, optional
            The number of batches used for monitoring.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        mode : str, optional
            Iteration mode used when adding the dataset(s).
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()` (but not for the
            extra_costs).
        """
        if dataset is None:
            return
        # Normalize `dataset` to a dict mapping name prefixes to Datasets;
        # a bare Dataset gets the empty-string (no-prefix) name.
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        # The main cost is stored under the reserved empty-string key.
        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        # Fixed seed so stochastic iteration modes are reproducible.
        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
def setup(self, model, dataset): """ Compiles the theano functions needed for the train method. Parameters ---------- model : a Model instance dataset : Dataset """ if self.cost is None: self.cost = model.get_default_cost() inf_params = [param for param in model.get_params() if contains_inf(param.get_value())] if len(inf_params) > 0: raise ValueError("These params are Inf: " + str(inf_params)) if any([contains_nan(param.get_value()) for param in model.get_params()]): nan_params = [param for param in model.get_params() if contains_nan(param.get_value())] raise ValueError("These params are NaN: " + str(nan_params)) self.model = model self._synchronize_batch_size(model) model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() # test if force batch size and batch size has_force_batch_size = getattr(model, "force_batch_size", False) train_dataset_is_uneven = dataset.get_num_examples() % self.batch_size != 0 has_monitoring_datasets = self.monitoring_dataset is not None and self.monitoring_dataset.values() > 0 if has_monitoring_datasets: monitoring_datasets_are_uneven = any( d.get_num_examples() % self.batch_size != 0 for d in self.monitoring_dataset.values() ) else: monitoring_datasets_are_uneven = False # or True it doesn't matter if has_force_batch_size and train_dataset_is_uneven and not has_uniform_batch_size(self.train_iteration_mode): raise ValueError( "Dataset size is not a multiple of batch size." "You should set train_iteration_mode (and " "maybe monitor_iteration_mode) to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential" ) if ( has_force_batch_size and has_monitoring_datasets and monitoring_datasets_are_uneven and not has_uniform_batch_size(self.monitor_iteration_mode) ): raise ValueError( "Dataset size is not a multiple of batch size." 
"You should set monitor_iteration_mode to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential" ) data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = "%s[%s]" % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, **fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? 
cost_value.name = "objective" # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost learning_rate = self.learning_rate if self.monitoring_dataset is not None: if self.monitoring_batch_size is None and self.monitoring_batches is None: self.monitoring_batch_size = self.batch_size self.monitoring_batches = self.batches_per_iter self.monitor.setup( dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.monitoring_batch_size, num_batches=self.monitoring_batches, extra_costs=self.monitoring_costs, mode=self.monitor_iteration_mode, ) dataset_name = self.monitoring_dataset.keys()[0] monitoring_dataset = self.monitoring_dataset[dataset_name] # TODO: have Monitor support non-data-dependent channels self.monitor.add_channel( name="learning_rate", ipt=None, val=learning_rate, data_specs=(NullSpace(), ""), dataset=monitoring_dataset, ) if self.learning_rule: self.learning_rule.add_channels_to_monitor(self.monitor, monitoring_dataset) params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = "sgd_params[%d]" % i grads, updates = self.cost.get_gradients(model, nested_args, **fixed_var_descr.fixed_vars) if not isinstance(grads, OrderedDict): raise TypeError( str(type(self.cost)) + ".get_gradients returned " + "something with" + str(type(grads)) + "as its " + "first member. Expected OrderedDict." ) for param in grads: assert param in params for param in params: assert param in grads for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = "grad(%(costname)s, %(paramname)s)" % { "costname": cost_value.name, "paramname": param.name, } assert grads[param].dtype == param.dtype lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError( "Tried to scale the learning rate on " + str(key) + " which is not an optimization parameter." 
) log.info("Parameter and initial learning rate summary:") for param in params: param_name = param.name if param_name is None: param_name = "anon_param" lr = learning_rate.get_value() * lr_scalers.get(param, 1.0) log.info("\t" + param_name + ": " + str(lr)) if self.learning_rule: updates.update(self.learning_rule.get_updates(learning_rate, grads, lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict( safe_zip( params, [param - learning_rate * lr_scalers.get(param, 1.0) * grads[param] for param in params] ) ) ) for param in params: if updates[param].name is None: updates[param].name = "sgd_update(" + param.name + ")" model.modify_updates(updates) for param in params: update = updates[param] if update.name is None: update.name = "censor(sgd_update(" + param.name + "))" for update_val in get_debug_values(update): if contains_inf(update_val): raise ValueError("debug value of %s contains infs" % update.name) if contains_nan(update_val): raise ValueError("debug value of %s contains nans" % update.name) with log_timing(log, "Compiling sgd_update"): self.sgd_update = function( theano_args, updates=updates, name="sgd_update", on_unused_input="ignore", mode=self.theano_function_mode, ) self.params = params