def run(replay, log=None):
    if not replay:
        log = StringIO()
    else:
        log = StringIO(log)
    record = Record(replay=replay, file_object=log)
    disturb_mem.disturb_mem()
    mode = RecordMode(record=record)
    b = sharedX(np.zeros((2, )), name="b")
    channels = OrderedDict()
    disturb_mem.disturb_mem()
    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min
    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
    ]):
        disturb_mem.disturb_mem()
        s = sharedX(0.0, name="s_" + str(i))
        updates.append((s, val))
    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != "b":
            if var.name[0] != "s" or len(var.name) != 2:
                var.name = None
    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input="ignore", name="f")
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + "\n")
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()
    if not replay:
        return log.getvalue()
def run(replay, log=None):
    if not replay:
        log = StringIO()
    else:
        log = StringIO(log)
    record = Record(replay=replay, file_object=log)
    disturb_mem.disturb_mem()
    mode = RecordMode(record=record)
    b = sharedX(np.zeros((2,)), name='b')
    channels = OrderedDict()
    disturb_mem.disturb_mem()
    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min
    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
    ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_' + str(i))
        updates.append((s, val))
    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != 'b':
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None
    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input='ignore', name='f')
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + '\n')
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()
    if not replay:
        return log.getvalue()
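# Hedged driver sketch for the two in-memory variants above: the first call
# records each compiled variable's descriptor into a StringIO log and returns
# its contents; the second call replays against that log and fails loudly if
# Theano compilation was not deterministic.
recorded = run(replay=0)
run(replay=1, log=recorded)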
def run(replay):
    disturb_mem.disturb_mem()
    mode = RecordMode(file_path="nondeterminism_4.txt", replay=replay)
    b = sharedX(np.zeros((1, )), name='b')
    channels = OrderedDict()
    disturb_mem.disturb_mem()
    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min
    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
    ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_' + str(i))
        updates.append((s, val))
    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != 'b':
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None
    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input='ignore', name='f')
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + '\n')
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()
    mode.record.f.close()
def run(replay):
    disturb_mem.disturb_mem()
    mode = RecordMode(file_path="nondeterminism_4.txt", replay=replay)
    b = sharedX(np.zeros((2,)), name='b')
    channels = OrderedDict()
    disturb_mem.disturb_mem()
    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min
    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
    ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_' + str(i))
        updates.append((s, val))
    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != 'b':
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None
    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input='ignore', name='f')
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + '\n')
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()
    mode.record.f.close()
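# Hedged driver sketch for the file-backed variants above (assumes write
# access to the working directory, since RecordMode logs to
# "nondeterminism_4.txt"): record on the first pass, replay on the second,
# then remove the log file.
import os
run(replay=0)
run(replay=1)
os.remove("nondeterminism_4.txt")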
def __init__(self, objective, params, inputs=None, param_constrainers=None, max_iter=-1, lr_scalers=None, verbose=0, tol=None, init_alpha=None, min_init_alpha=1e-3, reset_alpha=True, conjugate=False, reset_conjugate=True, gradients=None, gradient_updates=None, line_search_mode=None, accumulate=False, theano_function_mode=None): self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [param for param in params] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX(param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: logger.info('batch gradient class compiling gradient function') t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates=updates) else: self._compute_grad = function( inputs, updates=updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() logger.info('done. 
Took {0}'.format(t2 - t1)) if self.verbose: logger.info('batch gradient class compiling objective function') if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: logger.info('done') self.param_to_cache = OrderedDict() alpha = T.scalar(name='alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function( [], updates=cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt( sum([ T.sqr(elem).sum() for elem in self.param_to_grad_shared.values() ])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = \ self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size self._normalize_grad = \ function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = \ sharedX(elem.get_value(), 'old_'+elem.name) self._store_old_grad = \ function([norm], updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm) for g_ in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered] def dot_product(x, y): return sum([(x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y)]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) # beta_pr is the Polak-Ribiere formula for beta. # According to wikipedia, the beta to use for NCG is "a matter of # heuristics or taste" but max(0, beta_pr) is "a popular choice... # which provides direction reset automatically." 
(ie, it is meant # to revert to steepest descent when you have traveled far enough # that the objective function is behaving non-quadratically enough # that the conjugate gradient formulas aren't working anymore) # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method assert grad not in grad_to_old_grad make_conjugate_updates = \ [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line( 'BatchGradientDescent._make_conjugate var ' + var_descriptor(v) + '\n') mode.record.handle_line( 'BatchGradientDescent._make_conjugate update ' + var_descriptor(u) + '\n') self._make_conjugate = \ function([], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line( 'BatchGradientDescent._make_conjugate output ' + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
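# Numeric sketch (plain NumPy stand-in, not part of the class above) of the
# Polak-Ribiere rule the conjugate branch builds symbolically:
#     beta_pr = g . (g - g_old) / (g_old . g_old)
# clipped at zero so the search direction resets to steepest descent.
import numpy as np
g = np.array([1.0, -2.0, 0.5])      # hypothetical current normalized gradient
g_old = np.array([0.8, -1.5, 0.7])  # hypothetical previous gradient
beta_pr = (g.dot(g) - g.dot(g_old)) / (1e-7 + g_old.dot(g_old))
beta = max(beta_pr, 0.0)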
def add_channel(self, name, ipt, val, dataset=None, prereqs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name: str The display name in the monitor. ipt: tensor_like The symbolic tensor which should be clamped to the data. (or a (features,targets) list/tuple containing two symbolic tensors) val: tensor_like The value (function of `ipt`) to be tracked. dataset: A Dataset instance specifying which dataset to compute this channel on. prereqs: list of callables that take two numpy tensors (X and y, where y will be None if no labels are used) each prereq must be called exactly once per each new batch of data drawn *from dataset* before the channel value is computed if two channels provide a prereq with exactly the same id, that prereq will only be called once """ if isinstance(val, (float, int)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if not isinstance(ipt, (list, tuple)): tmp = [ ipt ] else: tmp = ipt inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and not isinstance(elem, theano.gof.graph.Constant): if elem not in tmp: raise ValueError("Unspecified input: "+str(elem)) mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel '+name+'\n') if isinstance(ipt, (list, tuple)): for elem in ipt: mode.record.handle_line('Includes input var '+var_descriptor(elem)+'\n') else: mode.record.handle_line(name+' input var is '+var_descriptor(ipt)+'\n') mode.record.handle_line('channel '+name+' is '+var_descriptor(val)+'\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: raise ValueError("The dataset specified is not " + \ "one of the monitor's datasets") if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) if isinstance(ipt, (list, tuple)): if dataset is not None: if not dataset.has_targets(): raise ValueError("Tried to create a channel ("+name \ +") that uses targets, but monitoring dataset has no targets") self.require_label = True assert len(ipt) == 2 self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs) self._dirty = True
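# Hedged usage sketch for the add_channel signature above. The monitor and
# monitoring_dataset objects are assumed to exist elsewhere (e.g. via
# Monitor.get_monitor and add_dataset); only add_channel itself is defined here.
import theano.tensor as T
X = T.matrix('example_X')
monitor.add_channel(name='example_mean_sq_input',
                    ipt=X,
                    val=T.sqr(X).mean(),
                    dataset=monitoring_dataset)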
def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is needed so that if new channels are added, Theano's optimizations make sure (to the extent that they can) that the new channels and old channels don't have any redundant calculations. It is also needed to regenerate Theano functions after pickling and unpickling, since Theano functions should not be pickled. """ self._dirty = False init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function(inputs=[], updates=updates, mode=self.theano_function_mode, name = 'Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() #Get the appropriate kind of theano variable to represent the data the model #acts on X = self.model.get_input_space().make_theano_batch(name = "monitoring_X") if config.compute_test_value != 'off': m = self.model.get_test_batch_size() test_value = self.model.get_input_space().get_origin_batch(m) X.tag.test_value = np.cast[X.type.dtype](test_value) if self.require_label: Y = self.model.get_output_space().make_theano_batch(name = "monitoring_Y") log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including channel '+key+'\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \ for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for channel in self.channels.values(): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] n = num_examples[index] u = updates[index] if isinstance(channel.graph_input, (list, tuple)): g[channel.graph_input[0]] = X g[channel.graph_input[1]] = Y else: g[channel.graph_input] = X if n == 0: raise ValueError("Iterating over 0 examples results in divide by 0") if self.topo: batch_index = d.get_topo_batch_axis() else: batch_index = 0 val = channel.val * T.cast(X.shape[batch_index], config.floatX) / n u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' \ + key.name + ' has dtype ' + key.dtype + \ ' but is driven by an expression with type ' + \ up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key '+var_descriptor(elem)+'\n') mode.record.handle_line('g val '+var_descriptor(g[elem])+'\n') for elem in u: mode.record.handle_line('u key '+var_descriptor(elem)+'\n') mode.record.handle_line('u val '+var_descriptor(u[elem])+'\n') function_name = 'Monitor.accum[%d]' % idx if self.require_label: if mode 
is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just monitor the model # parameters, or some shared variable updated by the training algorithm, so we # need to ignore the unused input error self.accum.append(function([X, Y], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) else: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling unsupervised accum\n') self.accum.append(function([X], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output '+var_descriptor(elem)+'\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del([name for name in final_names if name not in init_names])
def add_channel(self, name, ipt, val, dataset=None, prereqs=None, data_specs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name : str The display name in the monitor. ipt : tensor_like The symbolic tensor which should be clamped to the data. \ (or a list/tuple containing symbolic tensors, following the \ data_specs) val : tensor_like The value (function of `ipt`) to be tracked. dataset : pylearn2.datasets.Dataset Which dataset to compute this channel on prereqs : list of callables that take a list of numpy tensors Each prereq must be called exactly once per each new batch of \ data drawn *from dataset* before the channel value is computed \ if two channels provide a prereq with exactly the same id, that \ prereq will only be called once data_specs : (space, source) pair Identifies the order, format and semantics of ipt """ if isinstance(val, (float, int, long)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if data_specs is None: warnings.warn("parameter 'data_specs' should be provided when " + "calling add_channel. We will build a default one.", stacklevel=2) if isinstance(ipt, list): ipt = tuple(ipt) if ipt is not None and not isinstance(ipt, tuple): ipt = (ipt, ) if ipt is None: data_specs = (NullSpace(), '') elif len(ipt) == 0: data_specs = (CompositeSpace([]), ()) elif hasattr(dataset, 'get_data_specs'): dataset_space, dataset_source = dataset.get_data_specs() if (len(ipt) == 1 and dataset_source is not None and (not isinstance(dataset_source, tuple) or len(dataset_source) == 1) and 'features' in dataset_source): data_specs = (dataset_space, dataset_source) elif (len(ipt) == 2 and dataset_source == ('features', 'targets')): data_specs = (dataset_space, dataset_source) else: raise ValueError("Cannot infer default data_specs for " + "the following input points and " + "dataset: ipt = %s, dataset = %s" % (ipt, dataset)) data_specs[0].validate(ipt) mapping = DataSpecsMapping(data_specs) flat_ipt = mapping.flatten(ipt) if not isinstance(flat_ipt, tuple): flat_ipt = (flat_ipt, ) inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and \ not isinstance(elem, theano.gof.graph.Constant): if elem not in flat_ipt: raise ValueError("Unspecified input: " + str(elem) + ". This may be due to an incorrect " + "implementation of a cost's " + "get_data_specs() method, or of a " + "model's get_monitoring_data_specs() " + "method.") mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel ' + name + '\n') assert isinstance(flat_ipt, tuple) if len(flat_ipt) != 1: for elem in flat_ipt: mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n') else: mode.record.handle_line(name + ' input var is ' + var_descriptor(flat_ipt[0]) + '\n') mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: raise ValueError("The dataset specified is not one of the " + "monitor's datasets") if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) self.channels[name] = MonitorChannel(ipt, val, name, data_specs, dataset, prereqs) self._dirty = True
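# Hedged sketch for the data_specs-aware signature above. VectorSpace, the
# monitor, and monitoring_dataset are assumed imports/objects; passing
# data_specs explicitly avoids the default-inference warning emitted above.
import theano.tensor as T
from pylearn2.space import VectorSpace
X = T.matrix('example_X')
monitor.add_channel(name='example_mean_sq_input',
                    ipt=X,
                    val=T.sqr(X).mean(),
                    dataset=monitoring_dataset,
                    data_specs=(VectorSpace(dim=784), 'features'))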
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and the channel
    definitions have changed since last we called it, or if the theano
    functions are unavailable for any other reason (first time they are
    needed after construction or deserialization, etc.)

    All channels are compiled as part of the same theano function so that
    the theano optimizations can eliminate subexpressions that are shared
    between multiple channels.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    init_names = dir(self)
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry')
    updates = OrderedDict()
    givens = OrderedDict()

    # Get the appropriate kind of theano variable to represent the data
    # the model acts on
    batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
    theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with an
    # empty data_specs do not use data, and are unable to extract the batch
    # size. The case where the whole data specs is empty is not supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args, )
    assert len(nested_theano_args) == (len(self.channels) + 1)

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    it = [
        d.iterator(mode=i, num_batches=n, batch_size=b,
                   data_specs=self._flat_data_specs,
                   return_tuple=True)
        for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)
    ]
    self.num_examples = [
        np.cast[config.floatX](float(i.num_examples)) for i in it
    ]

    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        d = self._datasets[index]
        g = givens[index]
        cur_num_examples = self.num_examples[index]
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel needs any data, so there is no need to
            # average results, and we will call the accum functions only
            # once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            if cur_num_examples == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            val = (channel.val * T.cast(batch_size, config.floatX)
                   / cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error
            self.accum.append(
                function(theano_args,
                         givens=g,
                         updates=u,
                         mode=self.theano_function_mode,
                         name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
    final_names = dir(self)
    self.register_names_to_del(
        [name for name in final_names if name not in init_names])
def add_channel(self, name, ipt, val, dataset=None, prereqs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name: str The display name in the monitor. ipt: tensor_like The symbolic tensor which should be clamped to the data. (or a (features,targets) list/tuple containing two symbolic tensors) val: tensor_like The value (function of `ipt`) to be tracked. dataset: A Dataset instance specifying which dataset to compute this channel on. prereqs: list of callables that take two numpy tensors (X and y, where y will be None if no labels are used) each prereq must be called exactly once per each new batch of data drawn *from dataset* before the channel value is computed if two channels provide a prereq with exactly the same id, that prereq will only be called once """ if isinstance(val, (float, int, long)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if not isinstance(ipt, (list, tuple)): tmp = [ipt] else: tmp = ipt inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and not isinstance( elem, theano.gof.graph.Constant): if elem not in tmp: raise ValueError("Unspecified input: " + str(elem)) mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel ' + name + '\n') if isinstance(ipt, (list, tuple)): for elem in ipt: mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n') else: mode.record.handle_line(name + ' input var is ' + var_descriptor(ipt) + '\n') mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: raise ValueError("The dataset specified is not " + \ "one of the monitor's datasets") if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) if isinstance(ipt, (list, tuple)): if dataset is not None: if not dataset.has_targets(): raise ValueError("Tried to create a channel ("+name \ +") that uses targets, but monitoring dataset has no targets") self.require_label = True assert len(ipt) == 2 self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs) self._dirty = True
def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is needed so that if new channels are added, Theano's optimizations make sure (to the extent that they can) that the new channels and old channels don't have any redundant calculations. It is also needed to regenerate Theano functions after pickling and unpickling, since Theano functions should not be pickled. """ self._dirty = False init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data the model # acts on X = self.model.get_input_space().make_theano_batch(name="monitoring_X") if config.compute_test_value != 'off': m = self.model.get_test_batch_size() test_value = self.model.get_input_space().get_origin_batch(m) X.tag.test_value = np.cast[X.type.dtype](test_value) if self.require_label: Y = self.model.get_output_space().make_theano_batch( name="monitoring_Y") log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line( 'compiling monitor including channel ' + key + '\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \ for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] self.num_examples = [ np.cast[config.floatX](float(i.num_examples)) for i in it ] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for channel in self.channels.values(): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] cur_num_examples = self.num_examples[index] u = updates[index] if isinstance(channel.graph_input, (list, tuple)): channel_X, channel_Y = channel.graph_input assert channel_X not in g or g[channel_X] is X assert channel_Y not in g or g[channel_Y] is Y g[channel_X] = X g[channel_Y] = Y else: channel_X = channel.graph_input assert channel_X not in g or g[channel_X] is X g[channel_X] = X if n == 0: raise ValueError( "Iterating over 0 examples results in divide by 0") if self.topo: batch_index = d.get_topo_batch_axis() else: batch_index = 0 val = channel.val * T.cast(X.shape[batch_index], config.floatX) / cur_num_examples u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' \ + key.name + ' has dtype ' + key.dtype + \ ' but is driven by an expression with type ' + \ up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key ' + var_descriptor(elem) + '\n') mode.record.handle_line('g val ' + 
var_descriptor(g[elem]) + '\n') for elem in u: mode.record.handle_line('u key ' + var_descriptor(elem) + '\n') mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n') function_name = 'Monitor.accum[%d]' % idx if self.require_label: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just monitor the model # parameters, or some shared variable updated by the training algorithm, so we # need to ignore the unused input error self.accum.append( function([X, Y], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) else: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line( 'compiling unsupervised accum\n') self.accum.append( function([X], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del( [name for name in final_names if name not in init_names])
def __init__(self, objective, params, inputs = None, param_constrainers = None, max_iter = -1, lr_scalers = None, verbose = 0, tol = None, init_alpha = None, min_init_alpha = 1e-3, reset_alpha = True, conjugate = False, reset_conjugate = True, gradients = None, gradient_updates = None, line_search_mode = None, accumulate = False, theano_function_mode=None): """ objective: a theano expression to be minimized should be a function of params and, if provided, inputs params: A list of theano shared variables. These are the optimization variables inputs: (Optional) A list of theano variables to serve as inputs to the graph. param_constrainers: (Optional) A list of callables to be called on all updates dictionaries to be applied to params. This is how you implement constrained optimization. reset_alpha: If True, reverts to using init_alpha after each call. If False, the final set of alphas is used at the start of the next call to minimize. conjugate: If True, tries to pick conjugate gradient directions. For the directions to be truly conjugate, you must use line_search_mode = 'exhaustive' and the objective function must be quadratic. Using line_search_mode = 'exhaustive' on a non-quadratic objective function implements nonlinear conjugate gradient descent. reset_conjugate: has no effect unless conjugate == True if reset_conjugate == True, reverts to direction of steepest descent for the first step in each call to minimize. otherwise, tries to make the new search direction conjugate to the last one (even though the objective function might be totally different on each call to minimize) gradients: if None, compute the gradients of obj using T.grad otherwise, a dictionary mapping from params to expressions for their gradients (this allows you to use approximate gradients computed with something other than T.grad) gradient_updates: a dictionary of shared variable updates to run each time the gradient is computed Calling the ``minimize'' method with values for for ``inputs'' will update ``params'' to minimize ``objective''. """ self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [ param for param in params ] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX( param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: print 'batch gradient class compiling gradient function' t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates = updates) else: self._compute_grad = function(inputs, updates = updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() print 'done. 
Took ',t2-t1 if self.verbose: print 'batch gradient class compiling objective function' if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: print 'done' self.param_to_cache = OrderedDict() alpha = T.scalar(name = 'alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name) self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm) for g in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered] def dot_product(x, y): return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) """ beta_pr is the Polak-Ribiere formula for beta. According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste" but max(0, beta_pr) is "a popular choice... which provides direction reset automatically." 
(ie, it is meant to revert to steepest descent when you have traveled far enough that the objective function is behaving non-quadratically enough that the conjugate gradient formulas aren't working anymore) http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method """ assert grad not in grad_to_old_grad make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \ + var_descriptor(v) + '\n') mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \ + var_descriptor(u) + '\n') self._make_conjugate = function([], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \ + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
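# Hedged usage sketch for the constructor documented above: minimize a small
# quadratic ||x - 1||^2 over one shared variable using conjugate directions.
# The toy objective and max_iter value are assumptions, not part of the class.
import numpy as np
import theano.tensor as T
from pylearn2.utils import sharedX
x = sharedX(np.zeros(3), name='x')
objective = T.sqr(x - 1.).sum()
optimizer = BatchGradientDescent(objective=objective, params=[x],
                                 conjugate=True,
                                 line_search_mode='exhaustive',
                                 max_iter=10)
optimizer.minimize()  # no ``inputs`` were declared, so minimize takes no arguments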
def __init__(self, objective, params, inputs=None, param_constrainers=None, max_iter=-1, lr_scalers=None, verbose=0, tol=None, init_alpha=None, min_init_alpha=1e-3, reset_alpha=True, conjugate=False, reset_conjugate=True, gradients=None, gradient_updates=None, line_search_mode=None, accumulate=False, theano_function_mode=None): self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [param for param in params] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX(param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: logger.info('batch gradient class compiling gradient function') t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates=updates) else: self._compute_grad = function( inputs, updates=updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() logger.info('done. 
Took {0}'.format(t2-t1)) if self.verbose: logger.info('batch gradient class compiling objective function') if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: logger.info('done') self.param_to_cache = OrderedDict() alpha = T.scalar(name='alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function( [], updates=cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function( [alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = \ self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size self._normalize_grad = \ function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = \ sharedX(elem.get_value(), 'old_'+elem.name) self._store_old_grad = \ function([norm], updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm) for g_ in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered] def dot_product(x, y): return sum([(x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y)]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) # beta_pr is the Polak-Ribiere formula for beta. # According to wikipedia, the beta to use for NCG is "a matter of # heuristics or taste" but max(0, beta_pr) is "a popular choice... # which provides direction reset automatically." 
(ie, it is meant # to revert to steepest descent when you have traveled far enough # that the objective function is behaving non-quadratically enough # that the conjugate gradient formulas aren't working anymore) # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method assert grad not in grad_to_old_grad make_conjugate_updates = \ [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line( 'BatchGradientDescent._make_conjugate var ' + var_descriptor(v) + '\n') mode.record.handle_line( 'BatchGradientDescent._make_conjugate update ' + var_descriptor(u) + '\n') self._make_conjugate = \ function([], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line( 'BatchGradientDescent._make_conjugate output ' + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
def __init__(self, objective, params, inputs=None, param_constrainers=None, max_iter=-1, lr_scalers=None, verbose=0, tol=None, init_alpha=None, min_init_alpha=1e-3, reset_alpha=True, conjugate=False, reset_conjugate=True, gradients=None, gradient_updates=None, line_search_mode=None, accumulate=False, theano_function_mode=None): """ objective: a theano expression to be minimized should be a function of params and, if provided, inputs params: A list of theano shared variables. These are the optimization variables inputs: (Optional) A list of theano variables to serve as inputs to the graph. param_constrainers: (Optional) A list of callables to be called on all updates dictionaries to be applied to params. This is how you implement constrained optimization. reset_alpha: If True, reverts to using init_alpha after each call. If False, the final set of alphas is used at the start of the next call to minimize. conjugate: If True, tries to pick conjugate gradient directions. For the directions to be truly conjugate, you must use line_search_mode = 'exhaustive' and the objective function must be quadratic. Using line_search_mode = 'exhaustive' on a non-quadratic objective function implements nonlinear conjugate gradient descent. reset_conjugate: has no effect unless conjugate == True if reset_conjugate == True, reverts to direction of steepest descent for the first step in each call to minimize. otherwise, tries to make the new search direction conjugate to the last one (even though the objective function might be totally different on each call to minimize) gradients: if None, compute the gradients of obj using T.grad otherwise, a dictionary mapping from params to expressions for their gradients (this allows you to use approximate gradients computed with something other than T.grad) gradient_updates: a dictionary of shared variable updates to run each time the gradient is computed Calling the ``minimize'' method with values for for ``inputs'' will update ``params'' to minimize ``objective''. """ self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [param for param in params] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX(param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: print 'batch gradient class compiling gradient function' t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates=updates) else: self._compute_grad = function( inputs, updates=updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() print 'done. 
Took ', t2 - t1 if self.verbose: print 'batch gradient class compiling objective function' if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: print 'done' self.param_to_cache = OrderedDict() alpha = T.scalar(name='alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function( [], updates=cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt( sum([ T.sqr(elem).sum() for elem in self.param_to_grad_shared.values() ])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + ( 1. - self.new_weight) * self.ave_grad_size self._normalize_grad = function( [], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_' + elem.name) self._store_old_grad = function( [norm], updates=OrderedDict([(grad_to_old_grad[g], g * norm) for g in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered] def dot_product(x, y): return sum([(x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y)]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) """ beta_pr is the Polak-Ribiere formula for beta. According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste" but max(0, beta_pr) is "a popular choice... which provides direction reset automatically." 
(ie, it is meant to revert to steepest descent when you have traveled far enough that the objective function is behaving non-quadratically enough that the conjugate gradient formulas aren't working anymore) http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method """ assert grad not in grad_to_old_grad make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \ + var_descriptor(v) + '\n') mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \ + var_descriptor(u) + '\n') self._make_conjugate = function( [], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \ + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
def add_channel(self, name, ipt, val, dataset=None, prereqs=None, data_specs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name : str The display name in the monitor. ipt : tensor_like The symbolic tensor which should be clamped to the data. (or a list/tuple containing symbolic tensors, following the data_specs) val : tensor_like The value (function of `ipt`) to be tracked. dataset : pylearn2.datasets.Dataset Which dataset to compute this channel on prereqs : list of callables that take a list of numpy tensors Each prereq must be called exactly once per each new batch of data drawn *from dataset* before the channel value is computed if two channels provide a prereq with exactly the same id, that prereq will only be called once data_specs : (space, source) pair Identifies the order, format and semantics of ipt """ if isinstance(val, (float, int, long)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if data_specs is None: warnings.warn("parameter 'data_specs' should be provided when " + "calling add_channel. We will build a default one.", stacklevel=2) if isinstance(ipt, list): ipt = tuple(ipt) if ipt is not None and not isinstance(ipt, tuple): ipt = (ipt,) if ipt is None: data_specs = (NullSpace(), '') elif len(ipt) == 0: data_specs = (CompositeSpace([]), ()) elif hasattr(dataset, 'get_data_specs'): dataset_space, dataset_source = dataset.get_data_specs() if (len(ipt) == 1 and dataset_source is not None and (not isinstance(dataset_source, tuple) or len(dataset_source) == 1) and 'features' in dataset_source): data_specs = (dataset_space, dataset_source) elif (len(ipt) == 2 and dataset_source == ('features', 'targets')): data_specs = (dataset_space, dataset_source) else: raise ValueError("Cannot infer default data_specs for " + "the following input points and " + "dataset: ipt = %s, dataset = %s" % (ipt, dataset)) data_specs[0].validate(ipt) mapping = DataSpecsMapping(data_specs) flat_ipt = mapping.flatten(ipt) if not isinstance(flat_ipt, tuple): flat_ipt = (flat_ipt,) inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and \ not isinstance(elem, theano.gof.graph.Constant): if elem not in flat_ipt: raise ValueError("Unspecified input: " + str(elem) + ". This may be due to an incorrect " + "implementation of a cost's " + "get_data_specs() method, or of a " + "model's get_monitoring_data_specs() " + "method.") mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel '+name+'\n') assert isinstance(flat_ipt, tuple) if len(flat_ipt) != 1: for elem in flat_ipt: mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n') else: mode.record.handle_line(name + ' input var is ' + var_descriptor(flat_ipt[0]) + '\n') mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: reraise_as(ValueError("The dataset specified is not one of the " + "monitor's datasets")) if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) self.channels[name] = MonitorChannel(ipt, val, name, data_specs, dataset, prereqs) self._dirty = True
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and the
    channel definitions have changed since last we called it, or if the
    theano functions are unavailable for any other reason (first time they
    are needed after construction or deserialization, etc.)

    All channels are compiled as part of the same theano function so that
    the theano optimizations can eliminate subexpressions that are shared
    between multiple channels.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    init_names = dir(self)
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry')

    updates = OrderedDict()
    givens = OrderedDict()

    # Get the appropriate kind of theano variable to represent the data
    # the model acts on.
    batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
    theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

    # Get a symbolic expression of the batch size.
    # We do it here, rather than for each channel, because channels with an
    # empty data_specs do not use data, and are unable to extract the batch
    # size. The case where the whole data specs is empty is not supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input.
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args,)
    assert len(nested_theano_args) == (len(self.channels) + 1)

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                     data_specs=self._flat_data_specs,
                     return_tuple=True)
          for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                      self._num_batches, self._batch_size)]
    # Note: the original had a redundant second assignment of
    # self.num_examples that immediately overwrote a floatX-cast version;
    # only the effective assignment is kept here.
    self.num_examples = [float(i.num_examples) for i in it]

    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        d = self._datasets[index]
        g = givens[index]
        inv_cur_num_examples = as_floatX(1. / self.num_examples[index])
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel needs any data, so there is no need to average
            # results, and we will call the accum functions only once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            if n == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            val = (channel.val * T.cast(batch_size, config.floatX) *
                   inv_cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check the type of the update expressions.
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error.
            self.accum.append(function(theano_args,
                                       givens=g,
                                       updates=u,
                                       mode=self.theano_function_mode,
                                       name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    final_names = dir(self)
    self.register_names_to_del([name for name in final_names
                                if name not in init_names])
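# Hedged sketch (plain NumPy, not pylearn2 code) of the accumulation scheme
# compiled above: begin_record_entry zeroes each channel's shared
# accumulator, and every accum call adds per-batch value * batch_size /
# num_examples, so after one full pass the accumulator holds the
# example-weighted mean even when batches have different sizes. The function
# below is hypothetical and only mirrors that arithmetic, assuming the
# channel value for a batch is the mean of a per-example quantity.
import numpy as np


def accumulate_channel_mean(batches):
    """batches: list of 1-D arrays of per-example channel values."""
    num_examples = float(sum(len(b) for b in batches))
    val_shared = 0.0  # what begin_record_entry resets to zero
    for b in batches:
        batch_val = np.mean(b)  # the per-batch channel value
        val_shared += batch_val * len(b) / num_examples
    return val_shared  # equals the mean over all examples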