def get_updates(self, grads):
    """
    Build the symbolic update rules for one optimization step.

    For every parameter two zero-initialised shared accumulators are created
    and appended to ``self.parameters``: one tracks the gradient, the other
    the squared gradient. The step counter ``self.i`` is advanced as part of
    the returned updates.

    Parameters
    ----------
    grads : dict or iterable of (parameter, gradient) pairs
        Maps each shared parameter to its symbolic gradient.

    Returns
    -------
    OrderedDict
        Maps every accumulator, every parameter and ``self.i`` to its new
        symbolic value.
    """
    grads = OrderedDict(grads)
    updates = OrderedDict()

    step = self.i + 1.
    # Correction factors for the learning rate, both driven by the step count.
    # NOTE(review): resembles the first published Adam formulation - confirm.
    correction1 = 1. - (1. - self.b1) ** step
    correction2 = 1. - (1. - self.b2) ** step
    corrected_lr = self.learning_rate * (T.sqrt(correction2) / correction1)

    for param, grad in grads.items():
        first_moment = theano.shared(param.get_value() * 0.)
        self.parameters.append(first_moment)
        second_moment = theano.shared(param.get_value() * 0.)
        self.parameters.append(second_moment)

        # Mixing weight of the fresh gradient, scaled by lmbda ** (step - 1).
        mix = 1. - (1. - self.b1) * self.lmbda ** (step - 1)
        new_first = mix * grad + (1. - mix) * first_moment
        new_second = self.b2 * T.sqr(grad) + (1. - self.b2) * second_moment
        direction = new_first / (T.sqrt(new_second) + self.epsilon)

        updates[first_moment] = new_first
        updates[second_moment] = new_second
        updates[param] = param - (corrected_lr * direction)

    updates[self.i] = step
    return updates
def get_gradients(self, model, data, ** kwargs):
    """
    Return gradients of a free-energy difference between data and samples.

    The negative sample is obtained by running ``model.gibbs_vhv`` for
    ``self.k`` scan steps, starting from ``data``. Both the data and the
    final sample are treated as constants when differentiating.

    Parameters
    ----------
    model : object
        Must provide ``gibbs_vhv``, ``free_energy`` and ``get_params``.
    data : theano variable
        Batch of visible units used as the positive phase.
    kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its symbolic gradient.
    updates : OrderedDict
        The updates produced by ``theano.scan``.
    """
    pos_v = data

    # gibbs_vhv yields six outputs per step; only the last one (the visible
    # samples) is fed back as the recurrent state and used afterwards.
    (_, _, _, _, _, vis_samples), scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, None, None, pos_v],
        non_sequences=None,
        n_steps=self.k)
    neg_v = vis_samples[-1]

    cost = -(- model.free_energy(pos_v).mean()
             + model.free_energy(neg_v).mean())

    params = list(model.get_params())
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, neg_v])
    gradients = OrderedDict(izip(params, grads))

    # The scan updates must be forwarded to the caller.
    updates = OrderedDict()
    updates.update(scan_updates)
    return gradients, updates
def get_monitoring_channels(self, model, X, Y=None, ** kwargs):
    """
    Collect the monitoring channels of every cost in ``self.costs``.

    Besides the channels each cost reports itself, the value of each cost is
    added under the key ``'term_<index>'``, followed by ``'_<name>'`` when
    the value carries a non-None ``name`` attribute.

    Parameters
    ----------
    model : object
        Forwarded to every cost.
    X : object
        Input batch, forwarded to every cost.
    Y : object, optional
        Targets. Passed to a cost's call only when that cost is supervised.
    kwargs : dict
        Forwarded to every cost.

    Returns
    -------
    OrderedDict
        Channel name -> channel value.

    Raises
    ------
    ValueError
        If ``Y`` is None while ``self.supervised`` is true.
    TypeError
        Re-raised (after printing a hint) when a cost's
        ``get_monitoring_channels`` raises it.
    """
    if self.supervised and Y is None:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    channels = OrderedDict()
    for index, cost in enumerate(self.costs):
        try:
            channels.update(
                cost.get_monitoring_channels(model, X, Y, **kwargs))
        except TypeError:
            # Name the offending cost type before propagating the error.
            print('SumOfCosts.get_monitoring_channels encountered TypeError while calling '
                  + str(type(cost)) + '.get_monitoring_channels')
            raise

        # Unsupervised costs must not receive the targets.
        targets_for_cost = Y if cost.supervised else None
        value = cost(model, X, targets_for_cost, ** kwargs)
        if value is None:
            continue

        suffix = ''
        if hasattr(value, 'name') and value.name is not None:
            suffix = '_' + value.name
        channels['term_' + str(index) + suffix] = value

    return channels
def get_params(self):
    """
    Gather the trainable shared variables of every model in ``self.models``.

    Entries of ``self.models`` that are not ``Model`` instances are skipped.
    A parameter that was already collected from an earlier model is not
    added a second time. Each key is the model's ``_classname``, the model
    index and the parameter's own name, joined as
    ``<classname>_<index>_<name>``.

    Returns
    -------
    dict(str: SharedVariable)
        Ordered dictionary of {string_name: theano shared variable} holding
        the parameters to be trained.
    """
    collected = OrderedDict()
    index = 0
    for model in self.models:
        if not isinstance(model, Model):
            continue
        for local_name, param in model.get_params().items():
            # skip parameters that an earlier model already contributed
            if param in list(collected.values()):
                continue
            collected[model._classname + '_%d_' % index + local_name] = param
        index += 1
    return collected
def get_gradients(self, model, data, ** kwargs):
    """
    Combine the gradients and updates of every sub-cost into one result.

    ``data`` is nested with the mapping returned by
    ``get_composite_specs_and_mapping`` so that each cost receives its own
    part. Each cost's gradients are scaled by the matching entry of
    ``self.coeffs`` and summed per parameter.

    Parameters
    ----------
    model : object
        Its ``get_params()`` defines the only legal gradient keys.
    data : object
        Flat batch data, nested per cost through the composite mapping.
    kwargs : dict
        Forwarded unchanged to each cost's ``get_gradients``.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of the individual gradients.
    updates : OrderedDict
        Union of the updates reported by the individual costs.

    Raises
    ------
    ValueError
        If a cost reports a gradient for a variable that is not a model
        parameter.
    """
    _, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)
    indiv_results = [cost.get_gradients(model, cost_data, ** kwargs)
                     for cost, cost_data in safe_zip(self.costs, nested_data)]

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, (cost_grads, cost_updates) in zip(self.coeffs, indiv_results):
        # Validate all keys first so nothing is accumulated from a bad cost.
        for param in cost_grads:
            if param not in params:
                raise ValueError("A shared variable (" + str(param) +
                                 ") that is not a parameter appeared "
                                 "in a cost gradient dictionary.")

        for param in cost_grads:
            assert param.ndim == cost_grads[param].ndim
            weighted = coeff * cost_grads[param]
            if param in grads:
                grads[param] = grads[param] + weighted
            else:
                grads[param] = weighted
            assert grads[param].ndim == param.ndim

        # Two costs may not update the same state, nor a model parameter.
        assert not any([state in updates for state in cost_updates])
        assert not any([state in params for state in cost_updates])
        updates.update(cost_updates)

    return grads, updates
def __init__(self, valid=None, invalid=None, valid_equivalent=None):
    '''
    Set up the bookkeeping used to check whether variables can be
    expressed without using variables in `invalid`.

    Parameters
    ----------
    valid : iterable, optional
        Nodes that are valid to have in the graph computing outputs.
    invalid : iterable, optional
        Nodes that are NOT valid to have in the graph computing outputs.
    valid_equivalent : dict, optional
        Maps some invalid variables to valid ones that can be used
        instead. It is copied; its keys are added to the invalid set and
        its values to the valid set.
    '''
    if valid_equivalent is None:
        valid_equivalent = OrderedDict()

    self.valid = set([] if valid is None else valid)
    self.invalid = set([] if invalid is None else invalid)

    # Keep a private copy so later changes by the caller are not seen here.
    self.valid_equivalent = valid_equivalent.copy()
    self.valid.update(valid_equivalent.values())
    self.invalid.update(valid_equivalent.keys())
def get_gradients(self, model, data, ** kwargs):
    """
    Return the gradients of ``self.expr``, optionally rescaled.

    When ``self.gradient_clipping`` is set, all gradients are scaled by a
    common factor so that their joint L2 norm does not exceed
    ``self.max_magnitude``. If the squared norm is NaN or infinite, each
    gradient is replaced by ``0.1 * param``.

    Parameters
    ----------
    model : object
        Provides ``get_params()``.
    data : object
        Forwarded to ``self.expr``.
    kwargs : dict
        Forwarded to ``self.expr``.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> symbolic gradient.
    updates : OrderedDict
        Always empty.
    """
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    raw_grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, raw_grads))

    if self.gradient_clipping:
        squared_norm = sum(((g ** 2).sum() for g in gradients.values()), 0.)
        # Checked on the squared norm, before the square root is taken.
        not_finite = T.or_(T.isnan(squared_norm), T.isinf(squared_norm))
        norm = T.sqrt(squared_norm)
        scale = T.switch(T.ge(norm, self.max_magnitude),
                         self.max_magnitude / norm,
                         1.)
        gradients = OrderedDict(
            (param, T.switch(not_finite, .1 * param, grad * scale))
            for param, grad in gradients.items())

    return gradients, OrderedDict()
def get_gradients(self, model, data, ** kwargs):
    """
    Return gradients of a free-energy difference using a stored chain.

    A zero-initialised shared variable of shape
    ``(self.chain_num, model.n_vis)`` is the chain start. ``self.k`` scan
    steps of ``model.gibbs_vhv`` are run from it, and the returned updates
    write the chain end back into that shared variable.

    Parameters
    ----------
    model : object
        Must provide ``n_vis``, ``gibbs_vhv``, ``free_energy`` and
        ``get_params``.
    data : theano variable
        Batch of visible units used as the positive phase.
    kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> symbolic gradient.
    updates : OrderedDict
        The scan updates plus the chain-state update.
    """
    initial_state = numpy.zeros(shape=(self.chain_num, model.n_vis),
                                dtype=theano.config.floatX)
    chain_start = theano.shared(initial_state, name='chain_start',
                                borrow=True)

    # Only the sixth output (visible samples) is recurrent.
    (_, _, _, _, _, vis_samples), scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, None, None, chain_start],
        non_sequences=None,
        n_steps=self.k)
    chain_end = vis_samples[-1]
    # Carry the chain state over to the next call of the compiled function.
    scan_updates[chain_start] = chain_end

    pos_v = data
    cost = -(- model.free_energy(pos_v).mean()
             + model.free_energy(chain_end).mean())

    params = list(model.get_params())
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, chain_end])
    gradients = OrderedDict(izip(params, grads))

    updates = OrderedDict()
    updates.update(scan_updates)
    return gradients, updates
def get_gradients(self, model, data, ** kwargs):
    """
    Return gradients of a free-energy difference using a stored chain.

    A zero-initialised shared variable of shape
    ``(self.chain_num, model.n_vis)`` is the chain start. ``self.k`` steps
    of ``model.gibbs_vhv`` are unrolled from it, and the returned updates
    write the chain end back into that shared variable.

    Parameters
    ----------
    model : object
        Must provide ``n_vis``, ``gibbs_vhv``, ``free_energy`` and
        ``get_params``.
    data : theano variable
        Batch of visible units used as the positive phase.
    kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> symbolic gradient.
    updates : OrderedDict
        Maps the chain-state shared variable to the new chain end.
    """
    # Create the chain buffer in floatX. numpy.zeros defaults to float64,
    # which would not match the type of the update when floatX is float32.
    chain_start = theano.shared(
        numpy.zeros(shape=(self.chain_num, model.n_vis),
                    dtype=theano.config.floatX),
        name=None, borrow=True)

    v_samples = chain_start
    for _ in range(self.k):
        # the last output of gibbs_vhv is fed back as the next chain state
        v_samples = model.gibbs_vhv(v_samples)[-1]
    chain_end = v_samples

    pos_v = data
    cost = -(- model.free_energy(pos_v).mean()
             + model.free_energy(chain_end).mean())

    params = list(model.get_params())
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[chain_end])
    gradients = OrderedDict(izip(params, grads))

    # Carry the chain state over to the next call of the compiled function.
    updates = OrderedDict()
    updates[chain_start] = chain_end
    return gradients, updates
def orderings(self):
    """
    Return dict d s.t. d[node] is a list of nodes that must be evaluated
    before node itself can be evaluated.

    The result merges what every feature exposing an ``orderings`` method
    reports; duplicate prerequisites are removed while keeping their first
    position.

    :note: This only calls the orderings() fct on all features.
      It does not take care of computing dependencies by itself.

    Raises
    ------
    TypeError
        If a feature returns something other than an ``OrderedDict``, or a
        prerequisite collection that is neither a ``list`` nor an
        ``OrderedSet``.
    """
    assert isinstance(self._features, list)

    merged = OrderedDict()
    for feature in self._features:
        if not hasattr(feature, 'orderings'):
            continue

        reported = feature.orderings(self)
        if not isinstance(reported, OrderedDict):
            raise TypeError("Non-deterministic return value from " +
                            str(feature.orderings) +
                            ". Nondeterministic object is " +
                            str(reported))

        for node, prereqs in reported.items():
            if not isinstance(prereqs, (list, OrderedSet)):
                raise TypeError(
                    "prereqs must be a type with a "
                    "deterministic iteration order, or toposort "
                    " will be non-deterministic.")
            merged.setdefault(node, []).extend(prereqs)

    # eliminate duplicate prereqs
    for node in list(merged.keys()):
        merged[node] = list(OrderedSet(merged[node]))
    return merged
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Collect the monitoring channels of every cost in ``self.costs``.

    ``data`` is validated against the space from ``get_data_specs`` and then
    nested so that cost ``i`` receives ``nested_data[i]``. Besides the
    channels each cost reports itself, the value of each cost's ``expr`` is
    added under ``'term_<index>'``, followed by ``'_<name>'`` when the value
    carries a non-None ``name`` attribute.

    Parameters
    ----------
    model : object
        Forwarded to every cost.
    data : object
        Flat batch data matching ``self.get_data_specs(model)``.
    kwargs : dict
        Forwarded to every cost.

    Returns
    -------
    OrderedDict
        Channel name -> channel value.

    Raises
    ------
    TypeError
        Re-raised (after printing a hint) when a cost's
        ``get_monitoring_channels`` or the merge of its channels raises it.
    """
    self.get_data_specs(model)[0].validate(data)

    _, mapping = self.get_composite_specs_and_mapping(model)
    per_cost_data = mapping.nest(data)

    collected = OrderedDict()
    for index, cost in enumerate(self.costs):
        this_data = per_cost_data[index]

        try:
            collected.update(
                cost.get_monitoring_channels(model, this_data, **kwargs))
        except TypeError:
            # Name the offending cost type before propagating the error.
            print("SumOfCosts.get_monitoring_channels encountered "
                  "TypeError while calling " + str(type(cost)) +
                  ".get_monitoring_channels")
            raise

        term = cost.expr(model, this_data, **kwargs)
        if term is None:
            continue

        suffix = ""
        if hasattr(term, "name") and term.name is not None:
            suffix = "_" + term.name
        collected["term_" + str(index) + suffix] = term

    return collected
def get_monitoring_channels(self, model, data, ** kwargs):
    """
    Collect the monitoring channels of every cost in ``self.costs``.

    ``data`` is validated against the space from ``get_data_specs`` and then
    nested so that cost ``i`` receives ``nested_data[i]``. Besides the
    channels each cost reports itself, the value of each cost's ``expr`` is
    added under ``'term_<index>'``, followed by ``'_<name>'`` when the value
    carries a non-None ``name`` attribute.

    Parameters
    ----------
    model : object
        Forwarded to every cost.
    data : object
        Flat batch data matching ``self.get_data_specs(model)``.
    kwargs : dict
        Forwarded to every cost.

    Returns
    -------
    OrderedDict
        Channel name -> channel value.

    Raises
    ------
    TypeError
        Re-raised (after being logged) when a cost's
        ``get_monitoring_channels`` or the merge of its channels raises it.
    """
    self.get_data_specs(model)[0].validate(data)

    _, mapping = self.get_composite_specs_and_mapping(model)
    per_cost_data = mapping.nest(data)

    collected = OrderedDict()
    for index, cost in enumerate(self.costs):
        this_data = per_cost_data[index]

        try:
            collected.update(
                cost.get_monitoring_channels(model, this_data, **kwargs))
        except TypeError:
            # Log which cost type failed before propagating the error.
            logger.error(
                'SumOfCosts.get_monitoring_channels encountered TypeError while calling {0}.get_monitoring_channels'
                .format(type(cost)))
            raise

        term = cost.expr(model, this_data, ** kwargs)
        if term is None:
            continue

        suffix = ''
        if hasattr(term, 'name') and term.name is not None:
            suffix = '_' + term.name
        collected['term_' + str(index) + suffix] = term

    return collected
def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None):
    """
    Return the monitoring channels of this layer.

    The result contains the min / mean / max of the per-kernel L2 norms of
    the single 4-dimensional parameter of ``self.transformer``, followed by
    the channels from the parent class and from ``self.nonlin`` (both
    computed from ``state`` and ``targets``).

    Parameters
    ----------
    state_below : object, optional
        Not used by this implementation.
    state : object, optional
        Layer output, forwarded to the state-based channel helpers.
    targets : object, optional
        Forwarded to the state-based channel helpers.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic value.
    """
    W, = self.transformer.get_params()
    assert W.ndim == 4

    # one norm per slice along the first axis of W
    kernel_norms = T.sqrt(T.sqr(W).sum(axis=(1, 2, 3)))

    channels = OrderedDict()
    channels['kernel_norms_min'] = kernel_norms.min()
    channels['kernel_norms_mean'] = kernel_norms.mean()
    channels['kernel_norms_max'] = kernel_norms.max()

    channels.update(
        super(CorrMMElemwise, self).get_monitoring_channels_from_state(
            state, targets))
    channels.update(
        self.nonlin.get_monitoring_channels_from_state(
            state, targets, cost_fn=self.cost))
    return channels
def get_updates(self, grads):
    """
    Build the symbolic update rules from running averages of squared
    gradients and squared steps.

    For every parameter two zero-initialised shared accumulators are created
    and appended to ``self.parameters``: the running mean of squared
    gradients and the running mean of squared parameter steps.
    NOTE(review): the rule looks like AdaDelta - confirm against the class.

    Parameters
    ----------
    grads : dict or iterable of (parameter, gradient) pairs
        Maps each shared parameter to its symbolic gradient.

    Returns
    -------
    OrderedDict
        Maps both accumulators and the parameter itself to their new
        symbolic values, for every parameter.
    """
    grads = OrderedDict(grads)
    updates = OrderedDict()

    for param, grad in grads.items():
        # Unnamed parameters (name is None) get unnamed accumulators;
        # concatenating None to the prefix would raise a TypeError.
        if param.name is None:
            grad_acc_name = dx_acc_name = None
        else:
            grad_acc_name = 'mean_square_grad_' + param.name
            dx_acc_name = 'mean_square_dx_' + param.name

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = theano.shared(
            theano._asarray(param.get_value() * 0.,
                            dtype=theano.config.floatX),
            name=grad_acc_name, borrow=False)
        self.parameters.append(mean_square_grad)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = theano.shared(
            theano._asarray(param.get_value() * 0.,
                            dtype=theano.config.floatX),
            name=dx_acc_name, borrow=False)
        self.parameters.append(mean_square_dx)

        # Accumulate gradient
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(grad))

        # Compute update: ratio of the two RMS values times the gradient
        rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
        delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

        # Accumulate updates
        new_mean_square_dx = (self.decay * mean_square_dx +
                              (1 - self.decay) * T.sqr(delta_x_t))

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t

    return updates
class OrderedSet(object):
    """
    An implementation of OrderedSet based on the keys of an OrderedDict.

    Elements are kept in insertion order; adding an element that is already
    present leaves its position unchanged.
    """

    def __init__(self, iterable=None):
        """Create the set, optionally filled from `iterable`."""
        self.data = OrderedDict()
        if iterable is not None:
            self.update(iterable)

    def update(self, container):
        """Add every element of `container` after checking it is deterministic."""
        check_deterministic(container)
        for elem in container:
            self.add(elem)

    def add(self, key):
        """Add `key`; a no-op for the ordering if it is already present."""
        self.data[key] = None

    def __len__(self):
        return len(self.data)

    def __contains__(self, key):
        return key in self.data

    def discard(self, key):
        """Remove `key` if present; do nothing otherwise."""
        if key in self.data:
            del self.data[key]

    def remove(self, key):
        """Remove `key`; raise KeyError(key) when it is not present."""
        if key in self.data:
            del self.data[key]
        else:
            raise KeyError(key)

    def __iter__(self):
        return self.data.__iter__()

    def __reversed__(self):
        return self.data.__reversed__()

    def pop(self, last=True):
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError()

    def __eq__(self, other):
        # Note that we implement only the comparison to another
        # `OrderedSet`, and not to a regular `set`, because otherwise we
        # could have a non-symmetric equality relation like:
        #       my_ordered_set == my_set and my_set != my_ordered_set
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        elif isinstance(other, set):
            # Raise exception to avoid confusion.
            raise TypeError(
                'Cannot compare an `OrderedSet` to a `set` because '
                'this comparison cannot be made symmetric: please '
                'manually cast your `OrderedSet` into `set` before '
                'performing this comparison.')
        else:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__`; without this method
        # `a != b` would compare identities and could disagree with `a == b`.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
def _write_monitor_epochs(fn, collapsed, services, mode, n_epochs=10):
    """Call `fn` once per epoch and forward each monitored value to its service."""
    names = list(collapsed.keys())
    for epoch in range(n_epochs):
        t = time.time()
        log.debug(epoch)
        vals = fn()
        m = OrderedDict(zip(names, vals))
        for name, service in services.items():
            if name in m:
                service.write(m[name], mode)
        log.debug('----- ' + make_time_units_string(time.time() - t))


def main():
    """
    Demo: monitor statistics of a noisy shared matrix for train and valid.

    An 88x100 zero matrix named 'W' gets uniform noise added on every
    function call. Its mean (train and valid) and variance are wrapped in
    monitors, collapsed per mode, compiled into two theano functions and
    evaluated for 10 epochs each; every value is written to its monitor's
    out service.
    """
    weights = theano.shared(
        T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(),
        name='W')
    updates = [(weights, add_uniform(input=weights, noise_level=.02))]

    # Only the mean and the variance are monitored; other statistics
    # returned by get_stats are not needed here.
    stats = get_stats(weights)
    variance = stats.pop('var')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', variance,
                          out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    # Each collapsed item is indexed as (name, expression, service).
    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict(
        [(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict(
        [(item[0], item[2]) for item in train_collapsed_raw])

    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict(
        [(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict(
        [(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    # list(...) so that real lists are passed on Python 3 as well, where
    # dict.values() is only a view.
    f = theano.function(inputs=[],
                        outputs=list(train_collapsed.values()),
                        updates=updates)
    f2 = theano.function(inputs=[],
                         outputs=list(valid_collapsed.values()),
                         updates=updates)
    log.debug('done')

    t1 = time.time()
    _write_monitor_epochs(f, train_collapsed, train_services, TRAIN)
    _write_monitor_epochs(f2, valid_collapsed, valid_services, VALID)
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def get_layer_monitoring_channels(self, state_below=None, state=None, targets=NotImplementedError):
    """
    Return the monitoring channels of this layer.

    When ``self.no_affine`` is set nothing is monitored. Otherwise the
    result holds min / mean / max of the row and column norms of
    ``self.W_cluster`` (2-d) and ``self.W_class`` (3-d). When a state is
    available, statistics of the per-row maximum of the class
    probabilities are added; when targets are available, ``nll``,
    ``perplexity`` and ``entropy`` are added too.

    Parameters
    ----------
    state_below : object, optional
        Layer input; used to compute ``state`` through ``self.fprop`` when
        ``state`` is not given.
    state : tuple, optional
        Pair ``(probclass, probcluster)``.
    targets : object, optional
        Targets. The default is the legacy sentinel ``NotImplementedError``
        and is treated exactly like ``None`` (no targets).

    Returns
    -------
    OrderedDict
        Channel name -> symbolic value.
    """
    # The legacy default is a class object, which made the
    # `targets is not None` test below succeed even without targets.
    if targets is NotImplementedError:
        targets = None

    if self.no_affine:
        return OrderedDict()

    W_class = self.W_class
    W_cluster = self.W_cluster
    assert W_class.ndim == 3
    assert W_cluster.ndim == 2

    sq_W = T.sqr(W_cluster)
    sq_W_class = T.sqr(W_class)

    row_norms = T.sqrt(sq_W.sum(axis=1))
    col_norms = T.sqrt(sq_W.sum(axis=0))
    row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
    col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

    rval = OrderedDict([
        ('row_norms_min', row_norms.min()),
        ('row_norms_mean', row_norms.mean()),
        ('row_norms_max', row_norms.max()),
        ('col_norms_min', col_norms.min()),
        ('col_norms_mean', col_norms.mean()),
        ('col_norms_max', col_norms.max()),
        ('class_row_norms_min', row_norms_class.min()),
        ('class_row_norms_mean', row_norms_class.mean()),
        ('class_row_norms_max', row_norms_class.max()),
        ('class_col_norms_min', col_norms_class.min()),
        ('class_col_norms_mean', col_norms_class.mean()),
        ('class_col_norms_max', col_norms_class.max()),
    ])

    if (state_below is not None) or (state is not None):
        if state is None:
            state = self.fprop(state_below, targets)

        probclass, probcluster = state
        mx = probclass.max(axis=1)
        rval.update(OrderedDict([
            ('mean_max_class', mx.mean()),
            ('max_max_class', mx.max()),
            ('min_max_class', mx.min()),
        ]))

        if targets is not None:
            rval['nll'] = self.cost(Y=targets,
                                    Y_hat=(probclass, probcluster))
            # Both derived channels are rescalings of the nll.
            rval['perplexity'] = 10 ** (
                rval['nll'] / np.log(10).astype('float32'))
            rval['entropy'] = rval['nll'] / np.log(2).astype('float32')

    return rval
def get_updates(self, grads):
    """
    Build gradient-descent updates with a learning rate that shrinks over
    iterations.

    The effective rate is
    ``learning_rate / (1 + decrease_constant * current_iteration)``, cast to
    ``theano.config.floatX``. The iteration counter is incremented as part
    of the returned updates.

    Parameters
    ----------
    grads : dict or iterable of (parameter, gradient) pairs
        Maps each shared parameter to its symbolic gradient.

    Returns
    -------
    OrderedDict
        Maps every parameter and ``self.current_iteration`` to its new value.
    """
    grads = OrderedDict(grads)
    updates = OrderedDict()

    for param, grad in grads.items():
        shrink = 1 + (self.decrease_constant * self.current_iteration)
        step_size = T.cast(self.learning_rate / shrink,
                           dtype=theano.config.floatX)
        updates[param] = param - step_size * grad

    updates[self.current_iteration] = self.current_iteration + 1
    return updates
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the updates for learning with gradient descent + momentum.

    Two theano functions are compiled. The first evaluates the cost and
    stores the current gradients in shared buffers; the second applies a
    momentum step using those stored gradients.

    Parameters
    ----------
    learning_rate : theano scalar variable
        Learning rate coefficient. It is the only input of the compiled
        update function.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the compiled gradient function.
    cost : theano variable
        First output of the compiled gradient function.
    errors : theano variable
        Second output of the compiled gradient function.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Parameters without an entry use 1.

    Returns
    -------
    f_grad_shared : theano function
        Takes ``inp``; returns ``[cost, errors, gnorm, pnorm]`` and stores
        the gradients in the shared buffers.
    f_update : theano function
        Takes ``learning_rate``; applies the momentum update.
    """
    if lr_scalers is None:
        lr_scalers = {}

    # One shared buffer per parameter, kept in the iteration order of
    # `grads` so buffers and gradients can never be paired up wrongly.
    gshared = OrderedDict(
        (p, sharedX(p.get_value() * 0., name='%s_grad' % p.name))
        for p in grads.keys())
    gsup = [(gshared[p], g) for p, g in grads.items()]

    def get_norms(xs):
        return T.sqrt(sum((x ** 2).sum() for x in xs))

    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    for param, grad in gshared.items():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad

        inc = updates[vel]
        if self.nesterov_momentum:
            # apply the momentum term once more on top of the new velocity
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate], [],
                               updates=updates,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of ``self._cost`` and the sampler's updates.

    ``self.sampler.particles`` is treated as a constant when
    differentiating.

    Parameters
    ----------
    model : object
        Provides ``get_params()``.
    data : object
        Forwarded to ``self._cost``.
    kwargs : dict
        Forwarded to ``self._cost``.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> symbolic gradient.
    updates : OrderedDict
        The updates returned by ``self.sampler.updates()``.
    """
    objective = self._cost(model, data, **kwargs)
    params = list(model.get_params())

    param_grads = T.grad(objective, params,
                         disconnected_inputs="ignore",
                         consider_constant=[self.sampler.particles])
    gradients = OrderedDict(izip(params, param_grads))

    updates = OrderedDict()
    updates.update(self.sampler.updates())
    return gradients, updates
def on_attach(self, fgraph):
    """
    When attaching to a new fgraph, check that

    1) This DestroyHandler wasn't already attached to some fgraph
       (its data structures are only set up to serve one)
    2) The FunctionGraph doesn't already have a DestroyHandler.
       This would result in it validating everything twice, causing
       compilation to be slower.

    After the checks the FunctionGraph gets a ``destroy_handler`` attribute
    pointing at this instance, and this instance's bookkeeping containers
    are (re)created empty.

    Raises
    ------
    Exception
        If this handler is already bound to a FunctionGraph.
    toolbox.AlreadyThere
        If ``fgraph`` already exposes ``destroyers`` or ``destroy_handler``.

    TODO: WRITEME: what does this do besides the checks?
    """
    # ---- Do the checking ----
    # NOTE: the order of these checks is significant and kept as is.
    already_there = self.fgraph is fgraph
    if self.fgraph is not None:
        raise Exception(
            "A DestroyHandler instance can only serve one"
            " FunctionGraph. (Matthew 6:24)")
    for attr in ('destroyers', 'destroy_handler'):
        already_there = hasattr(fgraph, attr) or already_there

    if already_there:
        # FunctionGraph.attach_feature catches AlreadyThere and cancels
        # the attachment
        raise toolbox.AlreadyThere(
            "DestroyHandler feature is already present"
            " or in conflict with another plugin.")

    # ---- Annotate the FunctionGraph ----
    self.unpickle(fgraph)
    fgraph.destroy_handler = self

    self.fgraph = fgraph
    # set of Apply instances with non-null destroy_map
    self.destroyers = OrderedSet()
    # variable -> variable used in calculation
    self.view_i = OrderedDict()
    # variable -> set of variables that use this one as a direct input
    self.view_o = OrderedDict()
    # clients: how many times does an apply use a given variable
    # variable -> apply -> ninputs
    self.clients = OrderedDict()
    self.stale_droot = True
    self.debug_all_apps = OrderedSet()

    if self.do_imports_on_attach:
        toolbox.Bookkeeper.on_attach(self, fgraph)
def __init__(self, recurrent=[], recurrent_dim=[], self_recurrent=1, clip_gradient = True, clip_bound = 5, init_U=InitCell('ortho'), **kwargs):
    """
    Set up the recurrent connections of the layer.

    Parameters
    ----------
    recurrent : list or single item
        Keys of the additional recurrent sources.
    recurrent_dim : list or single item
        Dimension of each entry of `recurrent`, by position. When empty,
        every recurrent source is registered with dimension None.
    self_recurrent : int or bool
        When true, the layer is registered as recurrent on itself, keyed by
        ``self.name`` with dimension ``self.nout``.
    clip_gradient : bool
        Stored as ``self.clip_gradient``.
    clip_bound : number
        Stored as ``self.clip_bound``.
    init_U : InitCell
        Stored as ``self.init_U``.
    kwargs : dict
        Forwarded to the parent constructor.
    """
    super(RecurrentLayer, self).__init__(**kwargs)

    connections = OrderedDict()
    if self_recurrent:
        connections[self.name] = self.nout
    self.recurrent = connections

    dims = tolist(recurrent_dim)
    has_dims = len(dims) != 0
    for position, source in enumerate(tolist(recurrent)):
        connections[source] = dims[position] if has_dims else None

    self.clip_gradient = clip_gradient
    self.clip_bound = clip_bound
    self.init_U = init_U
def __init__(self, model):
    """
    Makes a monitor for `model`. Assumes the model has not been
    trained at all yet.

    Parameters
    ----------
    model : pylearn2.models.model.Model instance
        The model to monitor. Its input space is queried once to decide
        whether examples are handled in topological form.
    """
    self.training_succeeded = False
    self.model = model
    self.channels = OrderedDict()

    # progress counters
    for counter in ('_num_batches_seen', '_examples_seen', '_epochs_seen'):
        setattr(self, counter, 0)

    # per-dataset settings, one entry per registered dataset
    self._datasets = []
    self._iteration_mode = []
    self._batch_size = []
    self._num_batches = []
    self._rng_seed = []

    self._dirty = True
    self.names_to_del = ['theano_function_mode']
    self.t0 = time.time()

    # Determine whether the model should use topological or vector form of
    # examples. If the model acts on a space with more than the batch index
    # and channel dimension, the model has topological dimensions, so the
    # topological view of the data should be used.
    batch = model.get_input_space().make_theano_batch(
        name='monitoring_input')
    is_sparse = isinstance(batch.type, theano.sparse.SparseType)
    self.topo = (not is_sparse) and len(batch.type.broadcastable) > 2

    self.require_label = False
    self.theano_function_mode = None
def __init__(self, model):
    """
    Makes a monitor for `model`. Assumes the model has not been
    trained at all yet.

    Parameters
    ----------
    model : pylearn2.models.model.Model instance
        The model to monitor; stored as ``self.model``.
    """
    self.training_succeeded = False
    self.model = model
    self.channels = OrderedDict()

    # progress counters
    for counter in ('_num_batches_seen', '_examples_seen', '_epochs_seen'):
        setattr(self, counter, 0)

    # per-dataset settings, one entry per registered dataset
    self._datasets = []
    self._iteration_mode = []
    self._batch_size = []
    self._num_batches = []
    self._rng_seed = []

    self._dirty = True
    self.names_to_del = ['theano_function_mode']
    self.t0 = time.time()
    self.theano_function_mode = None

    # Initialize self._nested_data_specs, self._data_specs_mapping,
    # and self._flat_data_specs
    self._build_data_specs()
def __init__(self, parent=[], parent_dim=[], nout=None, init_W=InitCell('randn'), init_b=InitCell('zeros'), cons=0., name=None, lr_scaler=None, **kwargs):
    """
    Store the layer configuration and register its parents.

    Parameters
    ----------
    parent : list or single item
        Keys of the parent layers.
    parent_dim : list or single item
        Dimension of each parent, by position. When empty, every parent is
        registered with dimension None.
    nout : int, optional
        Stored as ``self.nout``.
    init_W, init_b : InitCell
        Stored as ``self.init_W`` / ``self.init_b``.
    cons : float
        Stored as ``self.cons``.
    name : str, optional
        Layer name; defaults to the lower-cased class name.
    lr_scaler : object, optional
        Stored as ``self.lr_scaler``.
    kwargs : dict
        Forwarded to the parent constructor.

    Raises
    ------
    AssertionError
        If both `parent` and `parent_dim` are non-empty but differ in
        length.
    """
    super(StemCell, self).__init__(**kwargs)
    if name is None:
        # `__name__` (not `name__`) is the attribute holding the class name
        name = self.__class__.__name__.lower()
    self.name = name
    self.nout = nout
    self.init_W = init_W
    self.init_b = init_b
    self.cons = cons

    # Normalise both arguments first so a single (non-list) parent is
    # counted as one entry when the lengths are compared.
    parent = tolist(parent)
    parent_dim = tolist(parent_dim)
    has_dims = len(parent_dim) != 0
    if has_dims and len(parent) != 0 and len(parent) != len(parent_dim):
        raise AssertionError("You probably had a mistake providing, "
                             "write number of values. It will end, "
                             "up with a model containing a bug.")

    self.parent = OrderedDict()
    for i, par in enumerate(parent):
        self.parent[par] = parent_dim[i] if has_dims else None

    self.params = OrderedDict()
    self.lr_scaler = lr_scaler
def __init__(self, dim, layer_name, irange, indices=None, init_bias=0., svd=True, nonlinearity=tensor.tanh):
    """
    Store the constructor arguments as instance attributes.

    Every argument is written into the instance ``__dict__`` under its own
    name. In addition ``rnn_friendly`` is set to True and
    ``_scan_updates`` to an empty ``OrderedDict`` before the parent
    constructor runs.

    Parameters
    ----------
    dim, layer_name, irange : object
        Stored as attributes of the same name.
    indices : object, optional
        Stored as ``self.indices``.
    init_bias : float
        Stored as ``self.init_bias``.
    svd : bool
        Stored as ``self.svd``.
    nonlinearity : callable
        Stored as ``self.nonlinearity``.
    """
    self.rnn_friendly = True
    self._scan_updates = OrderedDict()
    # Written straight into __dict__, one entry per constructor argument.
    self.__dict__.update(
        dim=dim,
        layer_name=layer_name,
        irange=irange,
        indices=indices,
        init_bias=init_bias,
        svd=svd,
        nonlinearity=nonlinearity,
    )
    super(Recurrent, self).__init__()
def build_train_fn(self,):
    """
    Compile the cost function and the SGD training function.

    Creates the symbolic learning rate (and momentum, when
    ``self.momentum`` is set), differentiates ``self.costs[0]`` with respect
    to ``self.params`` and builds the parameter updates. The new updates are
    merged on top of ``self.updates_old``.

    Sets
    ----
    self.lr_theano, self.mom_theano : theano scalars
        Extra inputs of the training function (momentum only if enabled).
    self.grad_inputs : list
        ``self.inputs`` followed by the scalars above.
    self.gparams : list
        Gradients of ``self.costs[0]``.
    self.calc_cost : theano function
        Maps ``self.inputs`` to ``self.costs`` without updating anything.
    self.updates_old : dict
        Previous updates merged with the SGD updates.
    self.f : theano function
        Maps ``self.grad_inputs`` to ``self.costs`` and applies the updates.
    """
    self.lr_theano = T.scalar('lr')
    self.grad_inputs = self.inputs + [self.lr_theano]
    if self.momentum:
        self.mom_theano = T.scalar('mom')
        self.grad_inputs = self.grad_inputs + [self.mom_theano]

    self.gparams = T.grad(self.costs[0], self.params,
                          consider_constant=self.consider_constant)

    if not self.momentum:
        print('Building SGD optimization graph without momentum')
        updates = OrderedDict(
            (param, param - self.lr_theano * gparam)
            for param, gparam in zip(self.params, self.gparams))
    else:
        print('Building SGD optimization graph with momentum')
        updates = OrderedDict()
        for param, param_mom, gparam in zip(self.params,
                                            self.params_mom,
                                            self.gparams):
            param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
            updates[param_mom] = param_inc
            updates[param] = param + param_inc

    self.calc_cost = theano.function(self.inputs, self.costs)

    if self.updates_old:
        # Work on a copy, to avoid updating the model's own dict in the
        # (unlikely) case that the updates dict belongs to the model class.
        merged_updates = copy.copy(self.updates_old)
    else:
        merged_updates = OrderedDict()
    merged_updates.update(updates)
    self.updates_old = merged_updates

    self.f = theano.function(self.grad_inputs, self.costs,
                             updates=self.updates_old)
def get_gradients(self, model, data, ** kwargs):
    """
    Return the weighted sum of the gradients of the model's two costs.

    ``model.cost_from_X(data)`` yields two costs, named ``cost_cd`` and
    ``cost_ci`` here. Their gradients are scaled by ``self.coeff_cd`` and
    ``self.coeff_ci`` and summed per parameter. When
    ``self.zero_ci_grad_for_cd`` is set, ``M`` and ``m`` of the last layer
    are treated as constants while differentiating ``cost_cd``.

    Parameters
    ----------
    model : object
        Provides ``cost_from_X``, ``get_params`` and ``layers``.
    data : object
        Forwarded to ``model.cost_from_X``.
    kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of both gradients.
    updates : OrderedDict
        Always empty (neither cost contributes updates).

    Raises
    ------
    ValueError
        If a gradient key is not a model parameter.
    """
    cost_cd, cost_ci = model.cost_from_X(data)

    params_dict = model.get_params()
    param_list = list(params_dict)

    constants_for_cd = []
    if self.zero_ci_grad_for_cd:
        # how to get this in less explicit way, i.e. using only dict?
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        top_layer = model.layers[-1]
        assert top_layer.M in params_dict
        assert top_layer.m in params_dict
        constants_for_cd = [top_layer.M, top_layer.m]

    grads_cd = T.grad(cost_cd, param_list,
                      disconnected_inputs='ignore',
                      consider_constant=constants_for_cd)
    grads_ci = T.grad(cost_ci, param_list,
                      disconnected_inputs='ignore')

    # (gradients, updates) pairs, in the same order as the coefficients
    indiv_results = [
        (OrderedDict(izip(param_list, grads_cd)), OrderedDict()),
        (OrderedDict(izip(param_list, grads_ci)), OrderedDict()),
    ]
    coefficients = [self.coeff_cd, self.coeff_ci]

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, (cost_grads, cost_updates) in zip(coefficients, indiv_results):
        for param in cost_grads:
            if param not in params:
                raise ValueError("A shared variable ("+str(param)+") that is not a parameter appeared in a cost gradient dictionary.")

        for param in cost_grads:
            assert param.ndim == cost_grads[param].ndim
            weighted = coeff * cost_grads[param]
            if param in grads:
                grads[param] = grads[param] + weighted
            else:
                grads[param] = weighted
            assert grads[param].ndim == param.ndim

        assert not any([state in updates for state in cost_updates])
        assert not any([state in params for state in cost_updates])
        updates.update(cost_updates)

    return grads, updates
class StemCell(NonlinCell):
    """
    Base layer that owns one weight matrix per parent and an optional bias.

    Parameters
    ----------
    name : str or None
        Layer name; when None the lower-cased class name is used.
    parent : list or single item
        Keys of the parent layers.
    parent_dim : list or single item
        Dimension of each parent, by position. When empty, every parent is
        registered with dimension None.
    nout : int, optional
        Output dimension, used for the parameter shapes.
    init_W, init_b : InitCell
        Initialisers queried through ``.get(shape)`` in `initialize`.
    cons : float
        Stored as ``self.cons``.
    use_bias : int or bool
        Whether `initialize` creates a bias parameter.
    lr_scaler : object, optional
        Stored as ``self.lr_scaler``.
    kwargs : dict
        Forwarded to the parent constructor.
    """
    def __init__(self, name, parent=[], parent_dim=[], nout=None, init_W=InitCell('randn'), init_b=InitCell('zeros'), cons=0., use_bias=1, lr_scaler=None, **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            # `__name__` (not `name__`) is the attribute holding the class name
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons

        # Normalise both arguments first so a single (non-list) parent is
        # counted as one entry when the lengths are compared.
        parent = tolist(parent)
        parent_dim = tolist(parent_dim)
        has_dims = len(parent_dim) != 0
        if has_dims and len(parent) != 0 and len(parent) != len(parent_dim):
            raise AssertionError("You probably had a mistake providing, "
                                 "write number of values. It will end, "
                                 "up with a model containing a bug.")

        self.parent = OrderedDict()
        for i, par in enumerate(parent):
            self.parent[par] = parent_dim[i] if has_dims else None

        self.lr_scaler = lr_scaler
        self.use_bias = use_bias

    def fprop(self):
        """Forward pass; must be provided by subclasses."""
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def initialize(self):
        """
        Create the layer's parameters.

        Returns
        -------
        OrderedDict
            ``'W_<parent>__<name>'`` -> weights of shape
            ``(parent_dim, nout)`` for every parent, plus ``'b_<name>'``
            when ``self.use_bias`` is set.
        """
        params = OrderedDict()
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            params[W_name] = self.init_W.get(W_shape)
        if self.use_bias:
            params['b_' + self.name] = self.init_b.get(self.nout)
        return params
def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None):
    """
    Return the state-dependent monitoring channels of this layer.

    Without any state information the result is empty. Otherwise it holds
    mean / max / min of the per-row maximum of ``state``; when targets are
    given, ``misclass`` and ``nll`` are added.

    Parameters
    ----------
    state_below : object, optional
        Layer input; used to compute ``state`` through ``self.fprop`` when
        ``state`` is not given.
    state : object, optional
        Layer output with one row per example.
    targets : object, optional
        Targets; assumed to be binary one-hot rows.

    Returns
    -------
    OrderedDict
        Channel name -> value.
    """
    channels = OrderedDict()

    # every channel below requires state information
    if state_below is None and state is None:
        return channels

    if state is None:
        state = self.fprop(state_below)

    row_max = state.max(axis=1)
    channels.update(OrderedDict([
        ('mean_max_class', row_max.mean()),
        ('max_max_class', row_max.max()),
        ('min_max_class', row_max.min())]))

    if targets is not None:
        predicted = self.target_convert(T.argmax(state, axis=1))
        # Assume target is in [0,1] as binary one-hot
        actual = self.target_convert(T.argmax(targets, axis=1))
        error_rate = T.neq(actual, predicted).mean()
        channels['misclass'] = T.cast(error_rate, config.floatX)
        channels['nll'] = self.cost(Y_hat=state, Y=targets)

    return channels
def __init__(self, dataset, model=None, epochs=10, batch_size=100, min_batch_size=1, save_freq=None, stop_threshold=None, stop_patience=None, learning_rate=1e-6, lr_decay=None, lr_decay_factor=None, decay=0.95, max_scaling=1e5, grad_clip=None, hard_clip=False):
    """
    Initialize RMSProp.

    All arguments (except ``self``) are forwarded by keyword to the
    Optimizer constructor.

    Parameters
    ----------
    dataset : Dataset
        The Dataset to use when training the Model.
    model : Model
        The Model to train. Needed if the Optimizer isn't being passed to a
        Model's .train() method.
    epochs : int
        How many training iterations over the dataset to go.
    batch_size : int
        How many examples from the training dataset to use in parallel.
    min_batch_size : int
        The minimum number of examples required at a time (for things like
        time series, this would be > 1).
    save_freq : int
        How many epochs to train between each new save of the Model's
        parameters.
    stop_threshold : float
        The factor by how much the best validation training score needs to
        improve to determine early stopping.
    stop_patience : int
        The patience or number of epochs to wait after the stop_threshold
        has been reached before stopping.
    learning_rate : float
        The multiplicative amount to adjust parameters based on their
        gradient values.
    lr_decay : str
        The type of decay function to use for changing the learning rate
        over epochs. See `opendeep.utils.decay` for options.
    lr_decay_factor : float
        The amount to use for the decay function when changing the learning
        rate over epochs. See `opendeep.utils.decay` for its effect for
        given decay functions.
    decay : float, optional
        Decay constant similar to that used in AdaDelta and Momentum
        methods.
    max_scaling : float, optional
        Restrict the RMSProp gradient scaling coefficient to values below
        `max_scaling`. Must be > 0; ``self.epsilon`` is set to its inverse.
    grad_clip : float, optional
        Whether to clip gradients. This will clip with a maximum of
        grad_clip or the parameter norm.
    hard_clip : bool
        Whether to use a hard cutoff or rescaling for clipping gradients.
    """
    # Snapshot the arguments before any other local name is created, so
    # that exactly the constructor arguments reach the Optimizer constructor.
    initial_parameters = dict(locals())
    del initial_parameters['self']
    super(RMSProp, self).__init__(**initial_parameters)

    assert max_scaling > 0., "Max_scaling needs to be > 0."

    self.decay = decay
    self.max_scaling = max_scaling
    self.epsilon = 1. / self.max_scaling
    self.mean_square_grads = OrderedDict()
def __init__(self, nh, nc, ne, de, cs, em, init, featdim): ''' nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary de :: dimension of the word embeddings cs :: word window context size ''' # parameters of the model self.featdim = featdim tmp_emb = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de)) if init: for row in xrange(ne + 1): if em[row] is not None: tmp_emb[row] = em[row] self.emb = theano.shared(tmp_emb.astype( theano.config.floatX)) # add one for PADDING at the end # weights for LSTM n_in = de * cs print "de,cs", de, cs # print "n_i",n_i n_hidden = n_i = n_c = n_o = n_f = nh n_y = nc print "n_y", n_y print "n_hidden, n_i, n_c, n_o,nh", n_hidden, n_i, n_c, n_o, nh self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_i)).astype(dtype)) self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_i)).astype(dtype)) self.W_ci = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_i)).astype(dtype)) self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5, size=n_i))) self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_f)).astype(dtype)) self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_f)).astype(dtype)) self.W_cf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_f)).astype(dtype)) self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f))) self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_c)).astype(dtype)) self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_c)).astype(dtype)) self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype)) self.W_xo = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_o)).astype(dtype)) self.W_ho = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_o)).astype(dtype)) self.W_co = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_o)).astype(dtype)) self.b_o = theano.shared(numpy.cast[dtype](uniform(-0.5, .5, size=n_o))) self.W_hy = theano.shared( 0.2 * uniform(-1.0, 1.0, (n_hidden + featdim, n_y)).astype(dtype)) self.b_y = 
theano.shared(numpy.zeros(n_y, dtype=dtype)) self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype)) self.h0 = T.tanh(self.c0) # bundle weights self.params = [self.emb, self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf, \ self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_xo, self.W_ho, \ self.W_co, self.b_o, self.W_hy, self.b_y, self.c0] self.names = ['embeddings', 'W_xi', 'W_hi', 'W_ci', 'b_i', 'W_xf', 'W_hf', 'W_cf', 'b_f', \ 'W_xc', 'W_hc', 'b_c', 'W_xo', 'W_ho', 'W_co', 'b_o', 'W_hy', 'b_y', 'c0'] idxs = T.imatrix( ) # as many columns as context window size/lines as words in the sentence # print idxs.shape() x = self.emb[idxs].reshape((idxs.shape[0], de * cs)) # print type(x), x.shape(), "details of x" f = T.matrix('f') f.reshape((idxs.shape[0], featdim)) # print type(f), f.shape(), "details of f" y = T.iscalar('y') # label # print type(y), y.shape(), "details of y" def recurrence(x_t, feat_t, h_tm1, c_tm1): i_t = sigma( theano.dot(x_t, self.W_xi) + theano.dot(h_tm1, self.W_hi) + theano.dot(c_tm1, self.W_ci) + self.b_i) f_t = sigma( theano.dot(x_t, self.W_xf) + theano.dot(h_tm1, self.W_hf) + theano.dot(c_tm1, self.W_cf) + self.b_f) c_t = f_t * c_tm1 + i_t * T.tanh( theano.dot(x_t, self.W_xc) + theano.dot(h_tm1, self.W_hc) + self.b_c) o_t = sigma( theano.dot(x_t, self.W_xo) + theano.dot(h_tm1, self.W_ho) + theano.dot(c_t, self.W_co) + self.b_o) h_t = o_t * T.tanh(c_t) if self.featdim > 0: all_t = T.concatenate([h_t, feat_t]) else: all_t = h_t # print "all_t", type(all_t), T.shape(all_t) s_t = softmax(theano.dot(all_t, self.W_hy) + self.b_y) # print T.shape(h_t), T.shape(c_t), T.shape(s_t) return [h_t, c_t, s_t] # Initialization occurs in outputs_info # scan gives -- result, updates [h, _, s], _ = theano.scan(fn=recurrence, sequences=[x, f], outputs_info=[self.h0, self.c0, None], n_steps=x.shape[0]) p_y_given_x_lastword = s[-1, 0, :] p_y_given_x_sentence = s[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) # cost and gradients 
and learning rate lr = T.scalar('lr') nll = -T.mean(T.log(p_y_given_x_lastword)[y]) gradients = T.grad(nll, self.params) updates = OrderedDict( (p, p - lr * g) for p, g in zip(self.params, gradients)) # theano functions self.classify = theano.function(inputs=[idxs, f], outputs=y_pred) self.train = theano.function(inputs=[idxs, f, y, lr], outputs=nll, updates=updates) self.normalize = theano.function( inputs=[], updates={ self.emb: self.emb / T.sqrt( (self.emb**2).sum(axis=1)).dimshuffle(0, 'x') })
def main():
    """
    Smoke-test the monitor plumbing: build a noisy shared matrix, attach
    monitors for its mean and variance, then run ten "train" steps and ten
    "valid" steps, writing every monitored value to its output service.
    """
    weights = theano.shared(
        T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(),
        name='W')
    updates = [(weights, add_uniform(input=weights, noise_level=.02))]

    # Statistics are popped in a fixed order; only the variance and the
    # mean are monitored, the others are simply removed from the dict.
    stats = get_stats(weights)
    for unused_key in ('l1', 'l2', 'min', 'max'):
        stats.pop(unused_key)
    variance = stats.pop('var')
    stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', variance,
                          out_service=FileService('outs/var.txt'))

    monitors = [
        MonitorsChannel('W', monitors=mean_monitor),
        MonitorsChannel('stats', monitors=[var_monitor]),
    ]

    def split_collapsed(raw):
        # Each item is indexed as (name, expression, service, ...).
        expressions = OrderedDict()
        for item in raw:
            expressions[item[0]] = item[1]
        services = OrderedDict()
        for item in raw:
            services[item[0]] = item[2]
        return expressions, services

    train_collapsed, train_services = split_collapsed(
        collapse_channels(monitors, train=True))
    valid_collapsed, valid_services = split_collapsed(
        collapse_channels(monitors, valid=True))

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=train_collapsed.values(),
                        updates=updates)
    f2 = theano.function(inputs=[], outputs=valid_collapsed.values(),
                         updates=updates)
    log.debug('done')

    def run_epochs(step_fn, expressions, services, tag):
        # Ten passes: evaluate, then hand each named value to its service.
        for epoch in range(10):
            started = time.time()
            log.debug(epoch)
            values = step_fn()
            by_name = OrderedDict(zip(expressions.keys(), values))
            for name, service in services.items():
                if name in by_name:
                    service.write(by_name[name], tag)
            log.debug('----- ' +
                      make_time_units_string(time.time() - started))

    overall_start = time.time()
    run_epochs(f, train_collapsed, train_services, TRAIN)
    run_epochs(f2, valid_collapsed, valid_services, VALID)
    log.debug("TOTAL TIME " +
              make_time_units_string(time.time() - overall_start))
def __init__(self, model, dataset, iterator_class=SequentialIterator,
             config=None, defaults=_defaults, rng=None, n_epoch=None,
             batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None,
             learning_rate=None, lr_decay=None, lr_factor=None,
             momentum=None, momentum_decay=None, momentum_factor=None,
             nesterov_momentum=None, flag_para_load=None):
    """
    Configure stochastic gradient descent for `model` and compile its
    training function `f_learn`.

    Each hyper-parameter is taken from the explicit keyword argument when
    that argument is truthy, otherwise from `self.args` (the combination of
    `config` and `defaults` built by the superclass constructor).

    NOTE(review): because the fallback uses `or`, an explicit falsy value
    (0, 0.0, False) is treated as "not given" and replaced by the
    configured value.

    Parameters
    ----------
    model
        Model to train; must provide get_params, get_train_cost,
        get_updates, get_inputs, get_lr_scalers and get_monitors.
    dataset
        Dataset to train on (stored, not read here).
    iterator_class
        Iterator class used to walk the dataset (stored, not read here).
    config, defaults
        Passed to the superclass constructor.
    rng
        Random source for the iterator; when None the stdlib `random`
        module is seeded with 123 and used.
    n_epoch
        Training epochs - how many times to iterate over the whole dataset.
    batch_size, minimum_batch_size
        Dataset iteration batch sizes.
    save_frequency
        Number of epochs between saving model parameters.
    early_stop_threshold, early_stop_length
        Early stopping threshold and patience.
    learning_rate, lr_decay, lr_factor
        Learning rate and its optional decay function / factor.
    momentum, momentum_decay, momentum_factor
        Momentum and its optional decay function / factor.
    nesterov_momentum
        Stored as `self.nesterov_momentum`.
    flag_para_load
        Accepted but not used by this constructor.

    Raises
    ------
    AssertionError
        If the compiled f_learn has a number of non-shared inputs other
        than 1 (unsupervised) or 2 (supervised).
    """
    # superclass init: config and defaults are combined into self.args
    super(SGD, self).__init__(config=config, defaults=defaults)

    self.model = model
    self.dataset = dataset
    self.iterator = iterator_class

    # Training epochs - how many times to iterate over the whole dataset
    self.n_epoch = n_epoch or self.args.get('n_epoch')

    # Dataset iteration batch sizes - number of examples in each calculation
    self.batch_size = batch_size or self.args.get('batch_size')
    self.minimum_batch_size = (minimum_batch_size or
                               self.args.get('minimum_batch_size'))

    # Number of epochs between saving model parameters
    self.save_frequency = save_frequency or self.args.get('save_frequency')

    # Early stopping threshold and patience - by how much does the cost
    # have to improve over a number of epochs
    self.early_stop_threshold = (early_stop_threshold or
                                 self.args.get('early_stop_threshold'))
    self.early_stop_length = (early_stop_length or
                              self.args.get('early_stop_length'))

    # Learning rate - how drastic of a step do the parameters change
    lr = learning_rate or self.args.get('learning_rate')
    self.learning_rate = sharedX(lr, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    lr_decay = lr_decay or self.args.get('lr_decay')
    if lr_decay:
        self.learning_rate_decay = get_decay_function(
            lr_decay, self.learning_rate, self.learning_rate.get_value(),
            lr_factor or self.args.get('lr_factor'))

    # Momentum - smoothing over the parameter changes (see Hinton)
    self.momentum = sharedX(momentum or self.args.get('momentum'),
                            'momentum')
    # Honour the explicit `momentum_decay` argument as well as the
    # configured one, mirroring the learning-rate decay handling above
    # (previously an explicit argument was ignored unless the config also
    # set 'momentum_decay').
    momentum_decay = momentum_decay or self.args.get('momentum_decay')
    if momentum_decay:
        self.momentum_decay = get_decay_function(
            momentum_decay, self.momentum, self.momentum.get_value(),
            momentum_factor or self.args.get('momentum_factor'))
    self.nesterov_momentum = (nesterov_momentum or
                              self.args.get('nesterov_momentum'))

    # RNG for working on random iterator
    if rng is None:
        random.seed(123)
        self.rng = random
    else:
        self.rng = rng

    self.params = self.model.get_params()

    # Now create the training cost function for the model to use while
    # training - update parameters
    log.info("%s params: %s", str(type(self.model)), str(self.params))
    gradient = grad(self.model.get_train_cost(), self.params)
    grads = OrderedDict(zip(self.params, gradient))

    # Optimizer-specific parameter updates; subclasses customise
    # get_updates (e.g. AdaDelta).
    gradient_updates = self.get_updates(grads)

    # Combine the updates from the model also if applicable
    train_updates = model.get_updates()
    if train_updates:
        train_updates.update(gradient_updates)
    else:
        train_updates = gradient_updates

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...',
             str(type(self.model)))
    t = time.time()
    self.f_learn = function(inputs=model.get_inputs(),
                            updates=train_updates,
                            outputs=self.model.get_train_cost(),
                            name='f_learn')
    log.info('f_learn compilation took %s',
             make_time_units_string(time.time() - t))

    # Determine if this function is unsupervised or not by looking at the
    # number of non-shared inputs to f_learn: one input means unsupervised,
    # two means supervised.
    # This workaround was provided by Pascal Lamblin on the theano-users
    # google group.
    num_inputs = len(
        [i for i in self.f_learn.maker.inputs if not i.shared])
    if num_inputs == 1:
        log.debug("Model is unsupervised: 1 input to f_learn.")
        self.unsupervised = True
    elif num_inputs == 2:
        log.debug("Model is supervised: 2 inputs to f_learn.")
        self.unsupervised = False
    else:
        log.error(
            "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised.",
            str(type(self.model)), str(num_inputs))
        # Both values must be supplied as ONE tuple to `%`; the previous
        # code formatted with a single string and raised TypeError instead
        # of the intended AssertionError.
        raise AssertionError(
            "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised."
            % (str(type(self.model)), str(num_inputs)))

    # grab the function(s) to use to monitor different model values during
    # training
    self.monitors = self.model.get_monitors()
def __init__(self, nh, nc, ne, de, cs, decay):
    '''
    Build the GRU-style tagger parameters and compile the predict / train /
    grad Theano functions. Embeddings are NOT owned by this object: they
    are passed in per batch (`embs`), and `self.grad` returns the gradient
    of the loss with respect to them.

    nh :: dimension of the hidden layer
    nc :: number of classes
    ne :: number of word embeddings in the vocabulary (not used here)
    de :: dimension of the word embeddings
    cs :: word window context size
    decay :: decay factor of the RMSProp running average of squared
             gradients
    '''
    # Layer sizes: both gates and the candidate share the hidden width.
    n_in = de * cs
    n_hidden = n_i = n_c = n_f = nh
    n_y = nc

    # Forward-pass weights. The order of the uniform() calls determines the
    # random initial values, so it is left untouched.
    # (A backward-direction set of weights was sketched here previously as
    # disabled code; it was never used and has been removed.)
    self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_i)).astype(dtype))
    self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_i)).astype(dtype))
    self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5, size=n_i)))
    self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_f)).astype(dtype))
    self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_f)).astype(dtype))
    self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f)))
    self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_c)).astype(dtype))
    self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_c)).astype(dtype))
    self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype))
    self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype))
    self.h0 = T.tanh(self.c0)
    self.W_hy = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_y)).astype(dtype))
    self.b_y = theano.shared(numpy.zeros(n_y, dtype=dtype))

    # Trainable parameters and their labels, position by position.
    # `names` used to list 15 entries (including W_xo, W_ho, b_o and c0,
    # none of which are in `params`), so pairing the two lists labelled
    # W_hy as 'W_xo' and b_y as 'W_ho'. It now mirrors `params` exactly.
    self.params = [self.W_xi, self.W_hi, self.b_i,
                   self.W_xf, self.W_hf, self.b_f,
                   self.W_xc, self.W_hc, self.b_c,
                   self.W_hy, self.b_y]
    self.names = ['W_xi', 'W_hi', 'b_i',
                  'W_xf', 'W_hf', 'b_f',
                  'W_xc', 'W_hc', 'b_c',
                  'W_hy', 'b_y']

    # RMSProp cache: one zero-initialised accumulator per parameter holding
    # the running average of its squared gradient.
    self.allcache = [
        theano.shared(W.get_value() * numpy.asarray(0., dtype=dtype))
        for W in self.params
    ]

    # Symbolic inputs for a batch of sentences.
    embs = T.ftensor3('embs')
    mask = T.ivector('mask')
    # idxs: per sentence, as many columns as context window size / lines as
    # words in the sentence.
    idxs = T.itensor3()
    # Look up each sentence's context windows in that sentence's own
    # embedding matrix and flatten every window to de * cs values.
    x, _ = theano.scan(
        lambda idx, emb: emb[idx].reshape((idx.shape[0], de * cs)),
        sequences=[idxs, embs])
    y = T.imatrix('y')

    def recurrence(x_t, h_tm1):
        i_t = sigma(theano.dot(x_t, self.W_xi) +
                    theano.dot(h_tm1, self.W_hi) + self.b_i)
        f_t = sigma(theano.dot(x_t, self.W_xf) +
                    theano.dot(h_tm1, self.W_hf) + self.b_f)
        # f_t gates the previous hidden state inside the candidate.
        c_t = T.tanh(theano.dot(x_t, self.W_xc) +
                     theano.dot(h_tm1 * f_t, self.W_hc) + self.b_c)
        # i_t interpolates between the old state and the candidate.
        h_t = (T.ones_like(i_t) - i_t) * h_tm1 + i_t * c_t
        s_t = softmax(theano.dot(h_t, self.W_hy) + self.b_y)[0]
        return [h_t, s_t]

    # loss for each sentence, m is mask
    def sent_model(x_sent, m, y_sent):
        [_, s], _ = theano.scan(fn=recurrence,
                                sequences=[x_sent],
                                outputs_info=[self.h0, None])
        # Only the first m steps contribute to the loss.
        max_y, _ = theano.scan(lambda v, l: T.log(v)[l],
                               sequences=[s[:m], y_sent[:m]])
        return -T.mean(max_y)

    # prediction for each sentence; the mask m is accepted but unused, so
    # predictions are produced for every step.
    def pred_model(x_sent, m):
        [_, s], _ = theano.scan(fn=recurrence,
                                sequences=[x_sent],
                                outputs_info=[self.h0, None])
        return T.argmax(s, axis=1)

    nll_all, _ = theano.scan(fn=sent_model, sequences=[x, mask, y])
    nll_all = T.mean(nll_all)
    y_pred, _ = theano.scan(fn=pred_model, sequences=[x, mask])

    # cost, gradients and learning rate
    lr = T.scalar('lr')
    gradients = T.grad(nll_all, self.params)

    # rmsprop: refresh the running averages, then scale each step by the
    # root of its refreshed average.
    allcache = [
        decay * cacheW + (1 - decay) * gradient**2
        for cacheW, gradient in zip(self.allcache, gradients)
    ]
    param_updates = [(p, p - lr * g / T.sqrt(cache + 1e-6))
                     for p, g, cache in zip(self.params, gradients, allcache)]
    cache_updates = [(w, new_w)
                     for w, new_w in zip(self.allcache, allcache)]
    updates = OrderedDict(param_updates + cache_updates)

    # gradients for input context vectors
    emb_update = T.grad(nll_all, embs)

    # theano functions
    self.predict = theano.function(inputs=[idxs, embs, mask],
                                   outputs=y_pred,
                                   allow_input_downcast=True)
    self.train = theano.function(inputs=[idxs, embs, y, lr, mask],
                                 outputs=nll_all,
                                 updates=updates,
                                 allow_input_downcast=True)
    # returns the gradient of the loss w.r.t. the supplied embeddings
    self.grad = theano.function(inputs=[idxs, embs, y, mask],
                                outputs=emb_update,
                                allow_input_downcast=True)
def build_updates(cost, params, clip_c=0, clip_idx=None, shrink_grad=None, choice=None): updates = OrderedDict() grads = T.grad(cost, params) def apply_clip(g): g2 = 0. g2 += (g**2).sum() new_grad = T.switch(g2 > (clip_c**2), g / T.sqrt(g2) * clip_c, g) return new_grad if clip_c > 0. and clip_idx is not None: for idx in clip_idx: grads[idx] = apply_clip(grads[idx]) if shrink_grad is not None: for idx in shrink_grad: grads[idx] *= 0.001 def get_updates_adadelta(grads, params, decay=0.95): decay = constantX(decay) print 'build updates with adadelta' for param, grad in zip(params, grads): # mean_squared_grad := E[g^2]_{t-1} mean_square_grad = sharedX( numpy.zeros(param.get_value().shape, dtype=floatX)) # mean_square_dx := E[(\Delta x)^2]_{t-1} mean_square_dx = sharedX( numpy.zeros(param.get_value().shape, dtype=floatX)) if param.name is not None: mean_square_grad.name = 'mean_square_grad_' + param.name mean_square_dx.name = 'mean_square_dx_' + param.name # Accumulate gradient new_mean_squared_grad = \ decay * mean_square_grad +\ (1. - decay) * T.sqr(grad) # Compute update epsilon = constantX(1e-7) rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon) rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon) delta_x_t = -rms_dx_tm1 / rms_grad_t * grad # Accumulate updates new_mean_square_dx = \ decay * mean_square_dx + \ (1. 
- decay) * T.sqr(delta_x_t) # Apply update updates[mean_square_grad] = new_mean_squared_grad updates[mean_square_dx] = new_mean_square_dx updates[param] = param + delta_x_t def get_updates_grads_momentum(gparams, params, lr=0.1, momentum=0.5): print 'building updates with momentum' # build momentum gparams_mom = [] for param in params: gparam_mom = theano.shared( numpy.zeros(param.get_value(borrow=True).shape, dtype=floatX)) gparams_mom.append(gparam_mom) for gparam, gparam_mom, param in zip(gparams, gparams_mom, params): inc = momentum * gparam_mom - (constantX(1) - momentum) * lr * gparam updates[gparam_mom] = inc updates[param] = param + inc def get_updates_rmsprop(grads, params, lr=0.1, decay=0.95): for param, grad in zip(params, grads): mean_square_grad = sharedX( numpy.zeros(param.get_value().shape, dtype=floatX)) new_mean_squared_grad = (decay * mean_square_grad + (1. - decay) * T.sqr(grad)) rms_grad_t = T.sqrt(new_mean_squared_grad) delta_x_t = constantX(-1) * lr * grad / rms_grad_t updates[mean_square_grad] = new_mean_squared_grad updates[param] = param + delta_x_t get_updates_adadelta(grads, params) #get_updates_grads_momentum(grads, params) #get_updates_rmsprop(grads, params) return updates
import numpy as np
from theano.compat.python2x import OrderedDict
from kdl_template import *

# NOTE(review): `theano` is referenced below but not imported by name here;
# presumably the star import above provides it - verify.

# Seeded random state so the script is deterministic.
random_state = np.random.RandomState(1999)

# Container for the computational graph.
graph = OrderedDict()

# Sizes: examples per minibatch, then input / hidden / output unit counts.
minibatch_size = 20
n_in = 5
n_hid = 10
n_out = 5

# Sine waves offset in phase: d1 varies along time, d2 along the input
# dimension; broadcasting adds them into a (n_timesteps, n_in) grid.
n_timesteps = 50
d1 = 3 * np.arange(n_timesteps) / (2 * np.pi)
d2 = 3 * np.arange(n_in) / (2 * np.pi)
all_sines = np.sin(d1[:, None] + d2)
# Insert a minibatch axis and repeat the same sequence for every example,
# giving (n_timesteps, minibatch_size, n_in).
all_sines = np.tile(all_sines[:, None, :], (1, minibatch_size, 1))

# Inputs are every step but the last; targets are the same series shifted
# by one step. Masks are all ones with shape (time, minibatch).
X = all_sines[:-1].astype(theano.config.floatX)
y = all_sines[1:].astype(theano.config.floatX)
X_mask = np.ones_like(X[:, :, 0])
y_mask = np.ones_like(y[:, :, 0])
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and
    the channel definitions have changed since last we called it,
    or if the theano functions are unavailable for any other reason
    (first time they are needed after construction or
    deserialization, etc.)

    All channels are compiled as part of the same theano function
    so that the theano optimizations can eliminate subexpressions
    that are shared between multiple channels.

    Sets `self.prereqs`, `self.begin_record_entry`, `self.num_examples`
    and `self.accum` (one accumulation function per dataset), and
    registers every attribute name created during the call with
    `register_names_to_del`.

    Raises
    ------
    ValueError
        If a channel that consumes data is attached to a dataset whose
        iterator reports zero examples (averaging would divide by zero).
    TypeError
        If a channel's shared variable and the expression driving it have
        different dtypes.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    # Snapshot of attribute names, used at the end to find what was added.
    init_names = dir(self)

    # Collect, per dataset, the de-duplicated prerequisites of its channels.
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    # begin_record_entry resets every channel accumulator to zero.
    reset_updates = OrderedDict()
    for channel in self.channels.values():
        reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=reset_updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry')

    # Get the appropriate kind of theano variable to represent the data
    # the model acts on
    theano_args = self._flat_data_specs[0].make_theano_batch(
        ['monitoring_%s' % s for s in self._flat_data_specs[1]])

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with an
    # empty data_specs do not use data, and are unable to extract the batch
    # size. The case where the whole data specs is empty is not supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args, )
    assert len(nested_theano_args) == (len(self.channels) + 1)

    mode = self.theano_function_mode
    recording = mode is not None and hasattr(mode, 'record')

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        if recording:
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    # One iterator per dataset, used here only to learn the example counts.
    it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                     data_specs=self._flat_data_specs,
                     return_tuple=True)
          for d, i, n, b in safe_izip(self._datasets,
                                      self._iteration_mode,
                                      self._num_batches,
                                      self._batch_size)]
    self.num_examples = [
        np.cast[config.floatX](float(i.num_examples)) for i in it
    ]

    # One givens dict and one updates dict per dataset.
    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        g = givens[index]
        cur_num_examples = self.num_examples[index]
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel does need any data, so there is not need to
            # average results, and we will call the accum functions only
            # once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            # Guard the division below using THIS channel's dataset size.
            # The previous test read `n`, a name that only existed because
            # Python 2 leaks list-comprehension variables: it held the
            # num_batches of the last dataset, not an example count.
            if cur_num_examples == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            # Weight each batch by its share of the dataset so that the
            # accumulated value is the mean over all examples.
            val = (channel.val * T.cast(batch_size, config.floatX) /
                   cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            if recording:
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if recording:
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error
            self.accum.append(
                function(theano_args,
                         givens=g,
                         updates=u,
                         mode=self.theano_function_mode,
                         name=function_name))
        for a in self.accum:
            if recording:
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    # Register every attribute created by this call for later deletion.
    final_names = dir(self)
    self.register_names_to_del(
        [name for name in final_names if name not in init_names])
def __init__(self, network):
    """
    Compile `self.f`, a no-input Theano function whose updates run
    `num_iterations` steps of spiking inference through every layer of
    `network` and store the results in the network's shared variables.

    For each layer the membrane state is leaky-integrated, units above
    their threshold emit a spike and are reset to zero, and the summed
    spike counts become the input of the next layer.
    """
    self.trial_num = 0

    settings = network.parameters
    batch_size = settings.batch_size
    num_iterations = settings.num_iterations
    keep_spikes = settings.keep_spikes
    norm_infer = settings.norm_infer
    firing_decay = settings.firing_decay
    time_data = settings.time_data

    layer_input = network.X
    updates = OrderedDict()

    for layer in range(network.n_layers):
        n_units = settings.M[layer]
        Q = network.Q[layer]
        theta = network.theta[layer]
        W = network.W[layer]

        # Accumulated spike counts for this layer.
        Y = T.alloc(0., batch_size, n_units)

        # Warm-start from the previous trial's state only for time data
        # after the first trial. trial_num was set to 0 at the top of this
        # method, so at construction this test selects the zero state.
        if time_data and self.trial_num != 0:
            Ys = network.Ys_tm1[layer]
            aas = network.aas_tm1[layer]
        else:
            Ys = T.zeros_like(Y)
            aas = T.zeros_like(Y)

        if keep_spikes:
            spike_train = T.alloc(0., batch_size, n_units, num_iterations)

        Q_norm = (Q * Q).sum(axis=0, keepdims=True)
        drive = layer_input.dot(Q)
        threshold = theta.dimshuffle('x', 0)

        eta = .1

        for tt in xrange(num_iterations):
            # Leaky integration; the three modes differ in the leak term
            # and in which activity feeds back through W.
            if norm_infer:
                Ys = (1. - eta * Q_norm) * Ys + eta * (drive - aas.dot(W))
            elif firing_decay:
                Ys = (1. - eta) * Ys + eta * (drive - Y.dot(W))
            else:
                Ys = (1. - eta) * Ys + eta * (drive - aas.dot(W))

            # Clear the activity of this time step, then fire (set to 1)
            # every unit whose state exceeds its threshold.
            aas = 0. * aas
            aas = T.switch(Ys > threshold, 1., aas)

            if keep_spikes:
                spike_train = T.set_subtensor(spike_train[:, :, tt], aas)

            # Update total activity, then reset the units that fired.
            Y += aas
            Ys = T.switch(Ys > threshold, 0., Ys)

        # Setting input of next layer to spikes of current one
        layer_input = Y

        updates[network.Y[layer]] = Y
        if keep_spikes:
            if time_data:
                # Keep the previous spike train before overwriting it.
                updates[network.spike_train_tm1[layer]] = \
                    network.spike_train[layer]
            updates[network.spike_train[layer]] = spike_train
        if time_data:
            updates[network.Ys_tm1[layer]] = Ys
            updates[network.aas_tm1[layer]] = aas

    self.f = theano.function([], [], updates=updates)
def _build_computation_graph(self):
    """
    Assemble the network (five convolution/pooling layers, two dropout
    Dense layers, one softmax layer), collect its parameters and noise
    switches on `self`, define the cost and monitors, and compile `f_run`.
    """
    ###################### BUILD NETWORK ##########################
    # whether or not to mirror the input images before feeding them into
    # the network
    if self.flag_datalayer:
        layer_1_input = mirror_images(
            input=self.x,
            image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
            cropsize=227,
            rand=self.rand,
            flag_rand=self.rand_crop)
    else:
        layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

    # The five convolutional pooling layers, one row each:
    # (log message, input dims without batch, filter shape, convstride,
    #  padsize, group, poolsize, poolstride, bias_init, use LRN)
    conv_specs = [
        ("convpool layer 1...", (3, 227, 227), (96, 3, 11, 11),
         4, 0, 1, 3, 2, 0.0, True),
        ("convpool layer 2...", (96, 27, 27), (256, 96, 5, 5),
         1, 2, 2, 3, 2, 0.1, True),
        ("convpool layer 3...", (256, 13, 13), (384, 256, 3, 3),
         1, 1, 1, 1, 0, 0.0, False),
        ("convpool layer 4...", (384, 13, 13), (384, 384, 3, 3),
         1, 1, 2, 1, 0, 0.1, False),
        ("convpool layer 5...", (384, 13, 13), (256, 384, 3, 3),
         1, 1, 2, 3, 2, 0.0, False),
    ]

    previous_layer = None
    for (message, input_dims, filter_shape, convstride, padsize, group,
         poolsize, poolstride, bias_init, use_lrn) in conv_specs:
        log.debug(message)
        if previous_layer is None:
            layer_input = layer_1_input
        else:
            layer_input = previous_layer.get_outputs()
        conv_layer = ConvPoolLayer(
            inputs_hook=((self.batch_size,) + input_dims, layer_input),
            filter_shape=filter_shape,
            convstride=convstride,
            padsize=padsize,
            group=group,
            poolsize=poolsize,
            poolstride=poolstride,
            bias_init=bias_init,
            local_response_normalization=use_lrn)
        # Add this layer's parameters!
        self.params += conv_layer.get_params()
        previous_layer = conv_layer

    # Now onto the fully-connected layers!
    fc_config = {
        'activation': 'rectifier',   # type of activation function to use for output
        'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
        'weights_mean': 0.0,         # mean for gaussian weights init
        'weights_std': 0.005,        # standard deviation for gaussian weights init
        'bias_init': 0.0             # how to initialize the bias parameter
    }

    log.debug("fully connected layer 1 (model layer 6)...")
    # Flatten the last conv output to 2 dimensions for the dense layer.
    fc_layer6_input = T.flatten(previous_layer.get_outputs(), 2)
    fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    # Parameters, then the dropout noise switch.
    self.params += fc_layer6.get_params()
    self.noise_switches += fc_layer6.get_noise_switch()

    log.debug("fully connected layer 2 (model layer 7)...")
    fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    self.params += fc_layer7.get_params()
    self.noise_switches += fc_layer7.get_noise_switch()

    # last layer is a softmax prediction output layer
    softmax_config = {
        'weights_init': 'gaussian',
        'weights_mean': 0.0,
        'weights_std': 0.005,
        'bias_init': 0.0
    }
    log.debug("softmax classification layer (model layer 8)...")
    softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                               fc_layer7.get_outputs()),
                                  output_size=1000,
                                  **softmax_config)
    self.params += softmax_layer8.get_params()

    # finally the softmax output from the whole thing!
    self.output = softmax_layer8.get_outputs()
    self.targets = softmax_layer8.get_targets()

    #####################
    # Cost and monitors #
    #####################
    # The cost and the error expressions are each requested twice from the
    # same layer: once for training, once for monitoring.
    self.train_cost = softmax_layer8.negative_log_likelihood()
    cost = softmax_layer8.negative_log_likelihood()
    errors = softmax_layer8.errors()
    train_errors = softmax_layer8.errors()

    self.monitors = OrderedDict()
    self.monitors['cost'] = cost
    self.monitors['errors'] = errors
    self.monitors['dropout_errors'] = train_errors

    #########################
    # Compile the functions #
    #########################
    log.debug("Compiling functions!")
    compile_start = time.time()
    log.debug("f_run...")
    # use the actual argmax from the classification
    self.f_run = function(inputs=[self.x],
                          outputs=softmax_layer8.get_argmax_prediction())
    log.debug("compilation took %s",
              make_time_units_string(time.time() - compile_start))
def orderings(self, fgraph):
    """
    Compute the ordering constraints induced by destructive operations.

    Parameters
    ----------
    fgraph : object
        Not used by this implementation; all state is read from `self`.

    Returns
    -------
    OrderedDict
        Maps each destroying Apply node to the OrderedSet of *other* Apply
        nodes that use a variable in the impact set of the root it
        destroys.  Nodes with no such clients are omitted.  Empty when
        `self.destroyers` is empty.

    Raises
    ------
    InconsistencyError
        a) when attempting to destroy an indestructible variable, or
        b) when attempting to destroy a value multiple times, or
        c) when an Apply destroys (illegally) one of its own inputs by
           aliasing.
    """
    constraints = OrderedDict()

    # Nothing is destroyed in place, so no extra ordering is required.
    if not self.destroyers:
        return constraints

    # BUILD DATA STRUCTURES
    # Multiple destructions are detected while these are (re)built.
    droot, impact, _root_destroyer = self.refresh_droot_impact()

    # Constants and variables tagged as indestructible must never appear
    # among the destroyed variables.
    forbidden = []
    for var in droot:
        if (getattr(var.tag, 'indestructible', False) or
                isinstance(var, graph.Constant)):
            forbidden.append(var)
    if forbidden:
        raise InconsistencyError(
            "Attempting to destroy indestructible variables: %s" %
            forbidden)

    # Add the clients of destroyed variables as computational dependencies.
    for app in self.destroyers:
        # One entry per destroyed input.
        for _output_idx, input_idx_list in app.op.destroy_map.items():
            destroyed_idx = input_idx_list[0]
            destroyed_variable = app.inputs[destroyed_idx]
            root_impact = impact[droot[destroyed_variable]]

            # We generally want every client of anything that depends on
            # the root to be a prerequisite of app.  But app is itself one
            # such client: it is always a client of destroyed_variable.
            # The tricky case is when it is also a client of *another*
            # variable viewing the same root.  That is generally illegal
            # (e.g. add_inplace(x, x.T)), but some in-place ops cope with
            # it (e.g. add_inplace(x, x)) and declare so through one of
            # two attributes:
            #
            # destroyhandler_tolerate_same: a list of pairs
            #   [(idx0, idx1), (idx0, idx2), ...]
            # where idx0 is the index of a destroyed input and idx1 the
            # index of a different input at which exactly the same
            # variable is permitted to appear.  For example
            # add_inplace.tolerate_same might be [(0, 1)].
            #
            # destroyhandler_tolerate_aliased: the same sort of list;
            # [(idx0, idx1)] tells the handler to IGNORE any aliasing
            # between destroyed input idx0 and input idx1.  This is
            # generally a bad idea, but safe when e.g. the op reads idx1
            # before modifying idx0, or the two are guaranteed not to
            # overlap (different rows of a matrix).
            #
            # CHECK FOR INPUT ALIASING
            # OPT: pre-compute this on import
            same_pairs = getattr(app.op, 'destroyhandler_tolerate_same', [])
            assert isinstance(same_pairs, list)
            tolerated = OrderedSet(
                second for first, second in same_pairs
                if first == destroyed_idx)
            tolerated.add(destroyed_idx)

            aliased_pairs = getattr(
                app.op, 'destroyhandler_tolerate_aliased', [])
            assert isinstance(aliased_pairs, list)
            ignored = OrderedSet(
                second for first, second in aliased_pairs
                if first == destroyed_idx)

            for position, candidate in enumerate(app.inputs):
                if position in ignored:
                    continue
                if candidate not in root_impact:
                    continue
                # Aliased input: only acceptable when it sits at a
                # tolerated position AND is the destroyed variable itself.
                if position in tolerated and candidate is destroyed_variable:
                    continue
                raise InconsistencyError(
                    "Input aliasing: %s (%i, %i)" %
                    (app, destroyed_idx, position))

            # Add the rule: app must be preceded by all other Apply
            # instances that depend on the destroyed input.
            root_clients = OrderedSet()
            for var in root_impact:
                usage = self.clients[var]
                # A recorded client must never have a zero use count.
                assert not [node for node, count in usage.items()
                            if not count]
                root_clients.update(
                    [node for node, count in usage.items() if count])
            root_clients.remove(app)
            if root_clients:
                constraints[app] = root_clients

    return constraints
class DestroyHandler(toolbox.Bookkeeper):
    """
    The DestroyHandler class detects when a graph is impossible to evaluate
    because of aliasing and destructive operations.

    Several data structures are used to do this.

    An Op can use its view_map property to declare that an output may be
    aliased to an input. If that output is destroyed, the input is also
    considered to be destroyed. The view_maps of several Ops can feed into
    one another and form a directed graph. The consequence of destroying any
    variable in such a graph is that all variables in the graph must be
    considered to be destroyed, because they could all be referring to the
    same underlying storage.

    In the current implementation, that graph is a tree, and the root of
    that tree is called the foundation.

    TODO: why "in the current implementation" ? is there another
    implementation planned?
    TODO: why is the graph a tree? isn't it possible that one variable could
    be aliased to many variables? for example, don't switch and ifelse have
    to do this?

    The original DestroyHandler (if 0'ed out above) computed several data
    structures from scratch each time it was asked to validate the graph.
    Because this happens potentially thousands of times and each graph to
    validate is extremely similar to the previous one, computing the data
    structures from scratch repeatedly was wasteful and resulted in high
    compile times for large graphs.

    This implementation computes the data structures once at initialization
    and then incrementally updates them.

    It is a work in progress. The following data structures have been
    converted to use the incremental strategy:
        <none>

    The following data structures remain to be converted:
        <unknown>
    """
    pickle_rm_attr = ["destroyers"]

    def __init__(self, do_imports_on_attach=True):
        """
        Parameters
        ----------
        do_imports_on_attach : bool
            When true, `on_attach` finishes by calling
            `toolbox.Bookkeeper.on_attach(self, fgraph)`.
        """
        self.fgraph = None
        self.do_imports_on_attach = do_imports_on_attach

        # Maps every variable in the graph to its "foundation" (deepest
        # ancestor in view chain).
        # TODO: change name to var_to_vroot
        self.droot = OrderedDict()

        # Maps a variable to all variables that are indirect or direct views
        # of it (including itself); essentially the inverse of droot.
        # TODO: do all variables appear in this dict, or only those that are
        #       foundations?
        # TODO: do only destroyed variables go in here? one old docstring
        #       said so
        # TODO: rename to x_to_views after reverse engineering what x is
        self.impact = OrderedDict()

        # If a var is destroyed, then this dict will map droot[var] to the
        # apply node that destroyed var.
        # TODO: rename to vroot_to_destroyer
        self.root_destroyer = OrderedDict()

    def on_attach(self, fgraph):
        """
        When attaching to a new fgraph, check that
            1) This DestroyHandler wasn't already attached to some fgraph
               (its data structures are only set up to serve one)
            2) The FunctionGraph doesn't already have a DestroyHandler.
               This would result in it validating everything twice, causing
               compilation to be slower.

        Give the FunctionGraph instance:
            1) A new method "destroyers(var)"
                TODO: what does this do exactly?
            2) A new attribute, "destroy_handler"
        TODO: WRITEME: what does this do besides the checks?

        Raises
        ------
        toolbox.AlreadyThere
            If this handler is already attached to `fgraph`, or `fgraph`
            already carries a `destroyers` / `destroy_handler` attribute.
        Exception
            If this handler is already serving a *different* fgraph.
        """
        # ------ Do the checking ------
        # Being attached to this very graph is a duplicate attachment, not a
        # conflict.  The original code flagged it but then unconditionally
        # raised the generic Exception below, so AlreadyThere was never
        # reachable for this case.
        already_there = self.fgraph is fgraph
        if not already_there and self.fgraph is not None:
            raise Exception("A DestroyHandler instance can only serve one"
                            " FunctionGraph. (Matthew 6:24)")
        if any(hasattr(fgraph, attr)
               for attr in ('destroyers', 'destroy_handler')):
            already_there = True

        if already_there:
            # FunctionGraph.attach_feature catches AlreadyThere and cancels
            # the attachment
            raise toolbox.AlreadyThere(
                "DestroyHandler feature is already present"
                " or in conflict with another plugin.")

        # ------ Annotate the FunctionGraph ------
        self.unpickle(fgraph)
        fgraph.destroy_handler = self

        self.fgraph = fgraph
        # set of Apply instances with non-null destroy_map
        self.destroyers = OrderedSet()
        # variable -> variable used in calculation
        self.view_i = OrderedDict()
        # variable -> set of variables that use this one as a direct input
        self.view_o = OrderedDict()
        # clients: how many times does an apply use a given variable
        # variable -> apply -> ninputs
        self.clients = OrderedDict()
        self.stale_droot = True

        self.debug_all_apps = OrderedSet()
        if self.do_imports_on_attach:
            toolbox.Bookkeeper.on_attach(self, fgraph)

    def unpickle(self, fgraph):
        """
        Install the `destroyers(var)` callable on `fgraph`.

        The installed function returns a one-element list holding the Apply
        node recorded as destroyer of `var`'s root, or an empty list when
        the lookup fails.
        """
        def get_destroyers_of(r):
            droot, impact, root_destroyer = self.refresh_droot_impact()
            try:
                return [root_destroyer[droot[r]]]
            except Exception:
                # Deliberately best-effort: any failed lookup (e.g. `r` is
                # not destroyed) is reported as "no destroyer".
                return []

        fgraph.destroyers = get_destroyers_of

    def refresh_droot_impact(self):
        """
        Makes sure self.droot, self.impact, and self.root_destroyer are
        up to date, and returns them.
        (see the comments on these attributes in __init__)

        Raises
        ------
        NotImplementedError
            If a destroy_map entry does not list exactly one input.
        InconsistencyError
            If the same root is destroyed by more than one entry.
        """
        if self.stale_droot:
            # destroyed view + nonview variables -> foundation
            droot = OrderedDict()
            # destroyed nonview variable -> it + all views of it
            impact = OrderedDict()
            # root -> destroyer apply
            root_destroyer = OrderedDict()

            for app in self.destroyers:
                for _output_idx, input_idx_list in \
                        app.op.destroy_map.items():
                    if len(input_idx_list) != 1:
                        raise NotImplementedError()
                    destroyed_input = app.inputs[input_idx_list[0]]
                    input_root = getroot(destroyed_input, self.view_i)
                    if input_root in droot:
                        raise InconsistencyError(
                            "Multiple destroyers of %s" % input_root)
                    droot[input_root] = input_root
                    root_destroyer[input_root] = app
                    input_impact = get_impact(input_root, self.view_o)
                    for v in input_impact:
                        assert v not in droot
                        droot[v] = input_root

                    impact[input_root] = input_impact
                    impact[input_root].add(input_root)

            self.droot = droot
            self.impact = impact
            self.root_destroyer = root_destroyer
            self.stale_droot = False
        return self.droot, self.impact, self.root_destroyer

    def on_detach(self, fgraph):
        """
        Undo `on_attach`: drop the per-graph bookkeeping and remove the
        `destroyers` / `destroy_handler` attributes from the fgraph.

        Raises
        ------
        Exception
            If `fgraph` is not the graph this handler is attached to.
        """
        if fgraph is not self.fgraph:
            raise Exception("detaching wrong fgraph", fgraph)
        del self.destroyers
        del self.view_i
        del self.view_o
        del self.clients
        del self.stale_droot
        # on_attach stores the handler as `destroy_handler`; the original
        # checked the non-existent `destroyer_handler` and therefore raised
        # AttributeError on every detach.
        assert self.fgraph.destroy_handler is self
        delattr(self.fgraph, 'destroyers')
        delattr(self.fgraph, 'destroy_handler')
        self.fgraph = None

    def on_import(self, fgraph, app, reason):
        """
        Add Apply instance to set which must be computed.

        Raises
        ------
        ProtocolError
            If `app` was already imported.
        NotImplementedError
            If a view_map entry lists more than one input.
        """
        if app in self.debug_all_apps:
            raise ProtocolError("double import")
        self.debug_all_apps.add(app)

        # If it's a destructive op, add it to our watch list
        if getattr(app.op, 'destroy_map', OrderedDict()):
            self.destroyers.add(app)

        # add this symbol to the forward and backward maps
        for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                         OrderedDict()).items():
            if len(i_idx_list) > 1:
                raise NotImplementedError(
                    'destroying this output invalidates multiple inputs',
                    (app.op))
            o = app.outputs[o_idx]
            i = app.inputs[i_idx_list[0]]
            self.view_i[o] = i
            self.view_o.setdefault(i, OrderedSet()).add(o)

        # update self.clients: count one use per occurrence of each input
        for inp in app.inputs:
            self.clients.setdefault(inp, OrderedDict()).setdefault(app, 0)
            self.clients[inp][app] += 1

        # make sure every output has a (possibly empty) client table
        for out in app.outputs:
            self.clients.setdefault(out, OrderedDict())

        self.stale_droot = True

    def on_prune(self, fgraph, app, reason):
        """
        Remove Apply instance from set which must be computed.

        Raises
        ------
        ProtocolError
            If `app` was never imported.
        NotImplementedError
            If a view_map entry lists more than one input.
        """
        if app not in self.debug_all_apps:
            raise ProtocolError("prune without import")
        self.debug_all_apps.remove(app)

        # UPDATE self.clients
        # Inputs are de-duplicated first: each distinct input has a single
        # entry for app, whatever its use count.
        for inp in OrderedSet(app.inputs):
            del self.clients[inp][app]

        if getattr(app.op, 'destroy_map', OrderedDict()):
            self.destroyers.remove(app)

        # Note: leaving empty client dictionaries in the struct.
        # Why? It's a pain to remove them. I think they aren't doing any
        # harm, they will be deleted on_detach().

        # UPDATE self.view_i, self.view_o
        for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                         OrderedDict()).items():
            if len(i_idx_list) > 1:
                # destroying this output invalidates multiple inputs
                raise NotImplementedError()
            o = app.outputs[o_idx]
            i = app.inputs[i_idx_list[0]]

            del self.view_i[o]

            self.view_o[i].remove(o)
            if not self.view_o[i]:
                del self.view_o[i]

        self.stale_droot = True

    def on_change_input(self, fgraph, app, i, old_r, new_r, reason):
        """
        app.inputs[i] changed from old_r to new_r.

        Raises
        ------
        ProtocolError
            If `app` was never imported, or app.inputs[i] is not `new_r`
            for a viewed input.
        NotImplementedError
            If a view_map entry lists more than one input.
        """
        if app == 'output':
            # app == 'output' is special key that means FunctionGraph is
            # redefining which nodes are being considered 'outputs' of the
            # graph.
            pass
        else:
            if app not in self.debug_all_apps:
                raise ProtocolError("change without import")

            # UPDATE self.clients
            self.clients[old_r][app] -= 1
            if self.clients[old_r][app] == 0:
                del self.clients[old_r][app]

            self.clients.setdefault(new_r, OrderedDict()).setdefault(app, 0)
            self.clients[new_r][app] += 1

            # UPDATE self.view_i, self.view_o
            for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                             OrderedDict()).items():
                if len(i_idx_list) > 1:
                    # destroying this output invalidates multiple inputs
                    raise NotImplementedError()
                i_idx = i_idx_list[0]
                output = app.outputs[o_idx]
                if i_idx == i:
                    if app.inputs[i_idx] is not new_r:
                        raise ProtocolError("wrong new_r on change")

                    self.view_i[output] = new_r

                    self.view_o[old_r].remove(output)
                    if not self.view_o[old_r]:
                        del self.view_o[old_r]

                    self.view_o.setdefault(new_r, OrderedSet()).add(output)

        self.stale_droot = True

    def validate(self, fgraph):
        """
        Return True.

        Raise InconsistencyError when
        a) orderings() raises an error
        b) orderings cannot be topologically sorted.
        """
        if self.destroyers:
            ords = self.orderings(fgraph)

            if _contains_cycle(fgraph, ords):
                raise InconsistencyError("Dependency graph contains cycles")
        else:
            # James's Conjecture:
            # If there are no destructive ops, then there can be no cycles.

            # FB: This isn't always true. It can happen that an
            # optimization introduces a node that depends on itself. This
            # is very rare and should not happen in general. It will be
            # caught later. The error will be far from the source. But
            # relying on this conjecture should speed up compilation most
            # of the time. The user should not create such a dependency
            # unless they mess too much with the internals.
            pass
        return True

    def orderings(self, fgraph):
        """
        Return orderings induced by destructive operations.

        The result is an OrderedDict mapping each destroying Apply node to
        the OrderedSet of other Apply nodes that use a variable in the
        impact set of the root it destroys; it is empty when there are no
        destroyers.  `fgraph` is not used.

        Raise InconsistencyError when
        a) attempting to destroy indestructible variable, or
        b) attempting to destroy a value multiple times, or
        c) an Apply destroys (illegally) one of its own inputs by aliasing
        """
        rval = OrderedDict()

        if self.destroyers:
            # BUILD DATA STRUCTURES
            # CHECK for multiple destructions during construction of
            # variables
            droot, impact, _root_destroyer = self.refresh_droot_impact()

            # check for destruction of constants
            illegal_destroy = [
                r for r in droot
                if getattr(r.tag, 'indestructible', False) or
                isinstance(r, graph.Constant)]
            if illegal_destroy:
                raise InconsistencyError(
                    "Attempting to destroy indestructible variables: %s" %
                    illegal_destroy)

            # add destroyed variable clients as computational dependencies
            for app in self.destroyers:
                # for each destroyed input...
                for _output_idx, input_idx_list in \
                        app.op.destroy_map.items():
                    destroyed_idx = input_idx_list[0]
                    destroyed_variable = app.inputs[destroyed_idx]
                    root = droot[destroyed_variable]
                    root_impact = impact[root]
                    # we generally want to put all clients of things which
                    # depend on root as pre-requisites of app.
                    # But, app is itself one such client!
                    # App will always be a client of the node we're
                    # destroying (destroyed_variable), but the tricky thing
                    # is when it is also a client of *another variable*
                    # viewing on the root.  Generally this is illegal,
                    # (e.g., add_inplace(x, x.T)).  In some special cases
                    # though, the in-place op will actually be able to work
                    # properly with multiple destroyed inputs (e.g.,
                    # add_inplace(x, x)).  An Op that can still work in
                    # this case should declare so via the
                    # 'destroyhandler_tolerate_same' attribute or
                    # 'destroyhandler_tolerate_aliased' attribute.
                    #
                    # destroyhandler_tolerate_same should be a list of
                    # pairs of the form
                    # [(idx0, idx1), (idx0, idx2), ...]
                    # The first element of each pair is the input index of
                    # a destroyed variable.
                    # The second element of each pair is the index of a
                    # different input where we will permit exactly the same
                    # variable to appear.
                    # For example, add_inplace.tolerate_same might be
                    # [(0,1)] if the destroyed input is also allowed to
                    # appear as the second argument.
                    #
                    # destroyhandler_tolerate_aliased is the same sort of
                    # list of pairs.
                    # op.destroyhandler_tolerate_aliased = [(idx0, idx1)]
                    # tells the destroyhandler to IGNORE an aliasing
                    # between a destroyed input idx0 and another input
                    # idx1.
                    # This is generally a bad idea, but it is safe in some
                    # cases, such as
                    # - the op reads from the aliased idx1 before modifying
                    #   idx0
                    # - the idx0 and idx1 are guaranteed not to overlap
                    #   (e.g. they are pointed at different rows of a
                    #   matrix).
                    #

                    # CHECK FOR INPUT ALIASING
                    # OPT: pre-compute this on import
                    tolerate_same = getattr(
                        app.op, 'destroyhandler_tolerate_same', [])
                    assert isinstance(tolerate_same, list)
                    tolerated = OrderedSet(
                        idx1 for idx0, idx1 in tolerate_same
                        if idx0 == destroyed_idx)
                    tolerated.add(destroyed_idx)
                    tolerate_aliased = getattr(
                        app.op, 'destroyhandler_tolerate_aliased', [])
                    assert isinstance(tolerate_aliased, list)
                    ignored = OrderedSet(
                        idx1 for idx0, idx1 in tolerate_aliased
                        if idx0 == destroyed_idx)
                    for i, inp in enumerate(app.inputs):
                        if i in ignored:
                            continue
                        # An aliased input is only acceptable at a
                        # tolerated position AND when it is the destroyed
                        # variable itself.
                        if inp in root_impact and (
                                i not in tolerated or
                                inp is not destroyed_variable):
                            raise InconsistencyError(
                                "Input aliasing: %s (%i, %i)" %
                                (app, destroyed_idx, i))

                    # add the rule: app must be preceded by all other Apply
                    # instances that depend on destroyed_input
                    root_clients = OrderedSet()
                    for r in root_impact:
                        # a recorded client never has a zero use count
                        assert not [a for a, c in self.clients[r].items()
                                    if not c]
                        root_clients.update(
                            [a for a, c in self.clients[r].items() if c])
                    root_clients.remove(app)
                    if root_clients:
                        rval[app] = root_clients

        return rval
def get_lr_scalers(self):
    """
    Collect the learning-rate scalers of this object.

    Returns
    -------
    OrderedDict
        Empty when `self.encoder` is None; otherwise populated by passing
        the encoder's own `get_lr_scalers()` result to `safe_update`.
    """
    scalers = OrderedDict()

    # Without an encoder there is nothing to merge in.
    if self.encoder is None:
        return scalers

    # NOTE(review): safe_update is defined elsewhere; presumably it merges
    # the second mapping into the first -- confirm against its definition.
    safe_update(scalers, self.encoder.get_lr_scalers())
    return scalers
class Monitor(object): """ A class for monitoring Models while they are being trained. A monitor object records the number of minibatches and number of examples the model has trained, as well as any number of "channels" that track quantities of interest (examples: the objective function, measures of hidden unit activity, reconstruction error, sum of squared second derivatives, average norm of the weight vectors, etc.) """ def __init__(self, model): """ Makes a monitor for `model`. Assumes the model has not been trained at all yet. Parameters ---------- model : pylearn2.models.model.Model instance """ self.training_succeeded = False self.model = model self.channels = OrderedDict() self._num_batches_seen = 0 self._examples_seen = 0 self._epochs_seen = 0 self._datasets = [] self._iteration_mode = [] self._batch_size = [] self._num_batches = [] self._dirty = True self._rng_seed = [] self.names_to_del = ['theano_function_mode'] self.t0 = time.time() self.theano_function_mode = None # Initialize self._nested_data_specs, self._data_specs_mapping, # and self._flat_data_specs self._build_data_specs() def _build_data_specs(self): """ Computes a nested data_specs for input and all channels Also computes the mapping to flatten it. This function is called from redo_theano. 
""" # Ask the model what it needs m_space, m_source = self.model.get_monitoring_data_specs() input_spaces = [m_space] input_sources = [m_source] for channel in self.channels.values(): space = channel.data_specs[0] assert isinstance(space, Space) input_spaces.append(space) input_sources.append(channel.data_specs[1]) nested_space = CompositeSpace(input_spaces) nested_source = tuple(input_sources) self._nested_data_specs = (nested_space, nested_source) self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs) flat_space = self._data_specs_mapping.flatten(nested_space, return_tuple=True) flat_source = self._data_specs_mapping.flatten(nested_source, return_tuple=True) self._flat_data_specs = (CompositeSpace(flat_space), flat_source) def set_theano_function_mode(self, mode): """ Parameters ---------- mode : theano.compile.Mode Theano functions for the monitoring channels will be compiled and run using this mode. """ if self.theano_function_mode != mode: self._dirty = True self.theano_function_mode = mode def add_dataset(self, dataset, mode='sequential', batch_size=None, num_batches=None, seed=None): """ Determines the data used to calculate the values of each channel. Parameters ---------- dataset : object A `pylearn2.datasets.Dataset` object. mode : str or object, optional Iteration mode; see the docstring of the `iterator` method \ on `pylearn2.datasets.Dataset` for details. batch_size : int, optional The size of an individual batch. Optional if `mode` is \ 'sequential' and `num_batches` is specified (batch size \ will be calculated based on full dataset size). num_batches : int, optional The total number of batches. Unnecessary if `mode` is \ 'sequential' and `batch_size` is specified (number of \ batches will be calculated based on full dataset size). seed : int, optional Optional. The seed to be used for random iteration modes. 
""" # The user can ommit using lists if only one dataset is set if not isinstance(dataset, list): dataset = [dataset] if not isinstance(mode, list): mode = [mode] if not isinstance(batch_size, list): batch_size = [batch_size] if not isinstance(num_batches, list): num_batches = [num_batches] if seed is None: seed = [None] * len(dataset) if not isinstance(seed, list): seed = [seed] if len(mode) != len(dataset): raise ValueError("Received " + str(len(dataset)) + " dataset but " + str(len(mode)) + " modes.") if any([len(l) != len(dataset) for l in [batch_size, seed]]): raise ValueError("make sure each dataset has its iteration " + "batch size and number of batches.") for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size, num_batches, seed): try: it = d.iterator(mode=m, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) except ValueError as exc: raise ValueError("invalid iteration parameters in " + "Monitor.add_dataset: " + str(exc)) if it.stochastic: # Must be a seed, not a random number generator. If it were a # random number generator, different iterators using it would # update its state, so we would not get the same iterator # each time. Also, must not be None, because this makes the # iterator pick a seed based on the clock if sd is None: raise TypeError("Monitor requires a seed when using " + "stochastic iteration modes.") if not isinstance(sd, (list, tuple, int)): raise TypeError("Monitor requires a seed (not a random " + "number generator) when using " + "stochastic iteration modes.") else: # The iterator should catch this, but let's double-check assert sd is None if not d in self._datasets: self._datasets.append(d) self._iteration_mode.append(m) self._batch_size.append(b) self._num_batches.append(n) self._rng_seed.append(sd) def __call__(self): """ Runs the model on the monitoring dataset in order to add one data point to each of the channels. 
""" # If the channels have changed at all, we need to recompile the theano # functions used to compute them if self._dirty: self.redo_theano() datasets = self._datasets # Set all channels' val_shared to 0 self.begin_record_entry() for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode, self._batch_size, self._num_batches, self.accum, self._rng_seed, self.num_examples): if isinstance(d, basestring): d = yaml_parse.load(d) raise NotImplementedError() # need to put d back into self._datasets myiterator = d.iterator(mode=i, batch_size=b, num_batches=n, data_specs=self._flat_data_specs, return_tuple=True, rng=sd) # If self._flat_data_specs is empty, no channel needs data, # so we do not need to call the iterator in order to average # the monitored values across different batches, we only # have to call them once. if len(self._flat_data_specs[1]) == 0: X = () self.run_prereqs(X, d) a(*X) else: actual_ne = 0 for X in myiterator: # X is a flat (not nested) tuple self.run_prereqs(X, d) a(*X) actual_ne += self._flat_data_specs[0].np_batch_size(X) # end for X if actual_ne != ne: raise RuntimeError("At compile time, your iterator said " "it had " + str(ne) + " examples total, but at " "runtime it gave us " + str(actual_ne) + ".") # end for d log.info("Monitoring step:") log.info("\tEpochs seen: %d" % self._epochs_seen) log.info("\tBatches seen: %d" % self._num_batches_seen) log.info("\tExamples seen: %d" % self._examples_seen) t = time.time() - self.t0 for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key): channel = self.channels[channel_name] channel.time_record.append(t) channel.batch_record.append(self._num_batches_seen) channel.example_record.append(self._examples_seen) channel.epoch_record.append(self._epochs_seen) val = channel.val_shared.get_value() channel.val_record.append(val) # TODO: use logging infrastructure so that user can configure # formatting if abs(val) < 1e4: val_str = str(val) else: val_str = '%.3e' % val 
log.info("\t%s: %s" % (channel_name, val_str)) def run_prereqs(self, data, dataset): """ Runs all "prerequistie functions" on a batch of data. Always called right before computing the monitoring channels on that batch. Parameters ---------- data : tuple or Variable a member of the Space used as input to the monitoring functions dataset : Dataset the Dataset the data was drawn from """ if dataset not in self.prereqs: return for prereq in self.prereqs[dataset]: prereq(*data) def get_batches_seen(self): """ Returns the number of batches the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly). """ return self._num_batches_seen def get_epochs_seen(self): """ Returns ------- epochs_seen : int The number of epochs the model has been trained on. One "epoch" is one pass through Dataset.iterator. """ return self._epochs_seen def get_examples_seen(self): """ Returns ------- examples_seen : int The number of examples the model has learned on (assuming that the learning code has been calling Monitor.report_batch correctly) """ return self._examples_seen def report_batch(self, num_examples): """ Call this whenever the model has learned on another batch of examples. Report how many examples were learned on. Parameters ---------- num_examples : int The number of examples learned on in this minibatch. """ self._examples_seen += num_examples self._num_batches_seen += 1 def report_epoch(self): """ Call this whenever the model has completed another "epoch" of learning. We regard one pass through Dataset.iterator as one epoch. """ self._epochs_seen += 1 def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is called any time we need to evaluate the channels and the channel definitions have changed since last we called it, or if the theano functions are unavailable for any other reason (first time they are needed after construction or deserialization, etc.) 
All channels are compiled as part of the same theano function so that the theano optimizations can eliminate subexpressions that are shared between multiple channels. """ self._dirty = False # Recompute the data specs, since the channels may have changed. self._build_data_specs() init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data # the model acts on theano_args = self._flat_data_specs[0].make_theano_batch( ['monitoring_%s' % s for s in self._flat_data_specs[1]]) # Get a symbolic expression of the batch size # We do it here, rather than for each channel, because channels with an # empty data_specs do not use data, and are unable to extract the batch # size. The case where the whole data specs is empty is not supported. 
batch_size = self._flat_data_specs[0].batch_size(theano_args) # Also get a nested representation, for joint iteration # with each of channel.graph_input nested_theano_args = self._data_specs_mapping.nest(theano_args) if not isinstance(nested_theano_args, tuple): nested_theano_args = (nested_theano_args, ) assert len(nested_theano_args) == (len(self.channels) + 1) log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including ' + 'channel ' + key + '\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, data_specs=self._flat_data_specs, return_tuple=True) \ for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] self.num_examples = [ np.cast[config.floatX](float(i.num_examples)) for i in it ] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for i, channel in enumerate(self.channels.values()): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] cur_num_examples = self.num_examples[index] u = updates[index] # Flatten channel.graph_input and the appropriate part of # nested_theano_args, to iterate jointly over them. c_mapping = DataSpecsMapping(channel.data_specs) channel_inputs = c_mapping.flatten(channel.graph_input, return_tuple=True) inputs = c_mapping.flatten(nested_theano_args[i + 1], return_tuple=True) for (channel_X, X) in safe_izip(channel_inputs, inputs): assert channel_X not in g or g[channel_X] is X assert channel_X.type == X.type, (channel_X.type, X.type) g[channel_X] = X if batch_size == 0: # No channel does need any data, so there is not need to # average results, and we will call the accum functions only # once. # TODO: better handling of channels not needing data when # some other channels need data. 
assert len(self._flat_data_specs[1]) == 0 val = channel.val else: if n == 0: raise ValueError("Iterating over 0 examples results in " + "divide by 0") val = (channel.val * T.cast(batch_size, config.floatX) / cur_num_examples) u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' + key.name + ' has dtype ' + key.dtype + ' but is driven by an expression ' + 'with type ' + up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key ' + var_descriptor(elem) + '\n') mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n') for elem in u: mode.record.handle_line('u key ' + var_descriptor(elem) + '\n') mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n') function_name = 'Monitor.accum[%d]' % idx if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just # monitor the model parameters, or some shared variable updated # by the training algorithm, so we need to ignore the unused # input error self.accum.append( function(theano_args, givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del( [name for name in final_names if name not in init_names]) def register_names_to_del(self, names): """ Register names of fields that should be deleted before pickling. 
Parameters ---------- names : list A list of attribute names as strings. """ for name in names: if name not in self.names_to_del: self.names_to_del.append(name) def __getstate__(self): """ In order to avoid pickling a copy of the dataset whenever a monitor is saved, the __getstate__ method replaces the dataset field with the dataset's yaml source. This is not a perfect solution because it won't work with job resuming, which would require saving the state of the dataset's random number generator. Like in the Model class, we also need to avoid saving any Theano functions, so we delete everything that can be regenerated with `redo_theano` by deleting the fields in `self.names_to_del` """ # Patch old pickled monitors if not hasattr(self, '_datasets'): self._datasets = [self._dataset] del self._dataset temp = self._datasets if self._datasets: self._datasets = [] for dataset in temp: if isinstance(dataset, basestring): self._datasets.append(dataset) else: try: self._datasets.append(dataset.yaml_src) except AttributeError: warnings.warn('Trained model saved without ' + 'indicating yaml_src') d = copy.copy(self.__dict__) self._datasets = temp for name in self.names_to_del: if name in d: del d[name] return d def __setstate__(self, d): """ Sets the object to have the state described by `d`. Parameters ---------- d : dict A dictionary mapping string names of fields to values for these fields. """ # patch old pkl files if '_dataset' in d: d['_datasets'] = [d['_dataset']] del d['_dataset'] self.__dict__.update(d) def add_channel(self, name, ipt, val, dataset=None, prereqs=None, data_specs=None): """ Asks the monitor to start tracking a new value. Can be called even after the monitor is already in use. Parameters ---------- name : str The display name in the monitor. ipt : tensor_like The symbolic tensor which should be clamped to the data. \ (or a list/tuple containing symbolic tensors, following the \ data_specs) val : tensor_like The value (function of `ipt`) to be tracked. 
dataset : pylearn2.datasets.Dataset Which dataset to compute this channel on prereqs : list of callables that take a list of numpy tensors Each prereq must be called exactly once per each new batch of \ data drawn *from dataset* before the channel value is computed \ if two channels provide a prereq with exactly the same id, that \ prereq will only be called once data_specs : (space, source) pair Identifies the order, format and semantics of ipt """ if isinstance(val, (float, int, long)): val = np.cast[theano.config.floatX](val) val = T.as_tensor_variable(val) if data_specs is None: warnings.warn("parameter 'data_specs' should be provided when " + "calling add_channel. We will build a default one.", stacklevel=2) if isinstance(ipt, list): ipt = tuple(ipt) if ipt is not None and not isinstance(ipt, tuple): ipt = (ipt, ) if ipt is None: data_specs = (NullSpace(), '') elif len(ipt) == 0: data_specs = (CompositeSpace([]), ()) elif hasattr(dataset, 'get_data_specs'): dataset_space, dataset_source = dataset.get_data_specs() if (len(ipt) == 1 and dataset_source is not None and (not isinstance(dataset_source, tuple) or len(dataset_source) == 1) and 'features' in dataset_source): data_specs = (dataset_space, dataset_source) elif (len(ipt) == 2 and dataset_source == ('features', 'targets')): data_specs = (dataset_space, dataset_source) else: raise ValueError("Cannot infer default data_specs for " + "the following input points and " + "dataset: ipt = %s, dataset = %s" % (ipt, dataset)) data_specs[0].validate(ipt) mapping = DataSpecsMapping(data_specs) flat_ipt = mapping.flatten(ipt) if not isinstance(flat_ipt, tuple): flat_ipt = (flat_ipt, ) inputs = theano.gof.graph.inputs([val]) for elem in inputs: if not hasattr(elem, 'get_value') and not isinstance( elem, theano.gof.graph.Constant): if elem not in flat_ipt: raise ValueError("Unspecified input: " + str(elem) + ". 
This may be due to an incorrect " + "implementation of a cost's " + "get_data_specs() method, or of a " + "model's get_monitoring_data_specs() " + "method.") mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('Adding monitor channel ' + name + '\n') assert isinstance(flat_ipt, tuple) if len(flat_ipt) != 1: for elem in flat_ipt: mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n') else: mode.record.handle_line(name + ' input var is ' + var_descriptor(flat_ipt[0]) + '\n') mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n') if dataset is None: if len(self._datasets) == 1: dataset = self._datasets[0] elif len(self._datasets) == 0: raise ValueError(_err_no_data) else: raise ValueError(_err_ambig_data) try: self._datasets.index(dataset) except ValueError: raise ValueError("The dataset specified is not one of the " + "monitor's datasets") if name in self.channels: raise ValueError("Tried to create the same channel twice (%s)" % name) self.channels[name] = MonitorChannel(ipt, val, name, data_specs, dataset, prereqs) self._dirty = True def _sanity_check(self): """ Sometimes we serialize models and then load them somewhere else but still try to use their Monitor, and the Monitor is in a mangled state. I've added some calls to _sanity_check to try to catch when that happens. Not sure what to do for a long term fix. I think it requires making theano graphs serializable first. """ for name in self.channels: channel = self.channels[name] assert hasattr(channel, 'prereqs') @classmethod def get_monitor(cls, model): """ Returns a model's monitor. If the model doesn't have a monitor yet, installs one and returns that. Parameters ---------- model : object An object that implements the `Model` interface specified in \ `pylearn2.models`. 
""" if hasattr(model, 'monitor'): rval = model.monitor rval._sanity_check() else: rval = Monitor(model) model.monitor = rval return rval # TODO: find out if this method is used anywhere, remove if not. @property def batch_size(self): """ Returns ------- batch_size : int The size of the batches used for monitoring """ return self._batch_size # TODO: find out if this method is used anywhere, remove if not. @property def num_batches(self): """ Returns ------- num_batches : int The number of batches used for monitoring """ return self._num_batches def setup(self, dataset, cost, batch_size, num_batches=None, extra_costs=None, mode='sequential', obj_prereqs=None, cost_monitoring_args=None): """ Sets up the monitor for a cost minimization problem. Adds channels defined by both the model and the cost for the specified dataset(s), as well as a channel called 'objective' defined by the costs' __call__ method. Parameters ---------- dataset : pylearn2.datasets.Dataset Dataset or dictionary mapping string names to Datasets. If \ string names are used, then for every dataset, each channel \ defined by the model or cost will be replicated with that \ dataset's name followed by an underscore as the prefix. For \ example, if your cost defines a channel called 'misclass', and \ datasets is {'train' : train_dataset, 'valid' : valid_dataset} \ you will get channels called 'train_misclass' and 'valid_misclass'. cost : pylearn2.costs.Cost The cost being optimized by training. The value of the cost will appear as the `objective` channel. Its `get_monitoring_channels` method will also be used to supply other channels. extra_costs : OrderedDict, optional A dictionary mapping channel names to Cost objects. Their value will appear as the specified channel name. They will also provide more monitoring channels via their `get_monitoring_channels` method. obj_prereqs : None, or list of functions Functions to pass as prerequisites to the `objective` channel. 
cost_monitoring_args : dict Dictionary of kwargs that will be passed to \ `cost.get_monitoring_channels()` (but not for the extra_costs). """ if dataset is None: return if isinstance(dataset, Dataset): dataset = {'': dataset} else: assert isinstance(dataset, dict) assert all(isinstance(key, str) for key in dataset) assert all(isinstance(dataset[key], Dataset) for key in dataset) if extra_costs is None: costs = {} else: costs = extra_costs assert '' not in costs costs[''] = cost if cost_monitoring_args is None: cost_monitoring_args = {} model = self.model # Build a composite data_specs containing the specs for all costs, # then the specs of the model cost_names = sorted(costs.keys()) spaces = [] sources = [] for c in cost_names: c_space, c_source = costs[c].get_data_specs(model) spaces.append(c_space) sources.append(c_source) # Ask the model for the data_specs needed m_space, m_source = model.get_monitoring_data_specs() spaces.append(m_space) sources.append(m_source) nested_space = CompositeSpace(spaces) nested_sources = tuple(sources) # Flatten this data_specs, so we build only one symbolic Theano # variable for each of the unique (space, source) pairs. 
mapping = DataSpecsMapping((nested_space, nested_sources)) space_tuple = mapping.flatten(nested_space, return_tuple=True) source_tuple = mapping.flatten(nested_sources, return_tuple=True) ipt = tuple( space.make_theano_batch(name='monitor_%s' % source, batch_size=None) for (space, source) in safe_zip(space_tuple, source_tuple)) # Build a nested tuple from ipt, to dispatch the appropriate parts # of the ipt batch to each cost nested_ipt = mapping.nest(ipt) custom_channels = {} for i, cost_name in enumerate(cost_names): if cost_name == '': prefix = '' else: prefix = cost_name + '_' cost = costs[cost_name] cost_ipt = nested_ipt[i] raw_channels = cost.get_monitoring_channels(model, cost_ipt) channels = {} for name in raw_channels: # We need three things: the value itself (raw_channels[name]), # the input variables (cost_ipt), and the data_specs for # these input variables ((spaces[i], sources[i])) channels[prefix + name] = (raw_channels[name], cost_ipt, (spaces[i], sources[i])) custom_channels.update(channels) # Use the last inputs from nested_ipt for the model model_channels = model.get_monitoring_channels(nested_ipt[-1]) channels = {} for name in model_channels: # Note: some code used to consider that model_channels[name] # could be a a (channel, prereqs) pair, this is not supported. channels[name] = (model_channels[name], nested_ipt[-1], (spaces[-1], sources[-1])) custom_channels.update(channels) if is_stochastic(mode): seed = [[2013, 02, 22]] else: seed = None for dataset_name in dataset: cur_dataset = dataset[dataset_name] self.add_dataset(dataset=cur_dataset, mode=mode, batch_size=batch_size, num_batches=num_batches, seed=seed) if dataset_name == '': dprefix = '' else: dprefix = dataset_name + '_' # These channel name 'objective' must not vary, since callbacks # that respond to the values in the monitor use the name to find # it. 
for i, cost_name in enumerate(cost_names): cost = costs[cost_name] cost_ipt = nested_ipt[i] cost_value = cost.expr(model, cost_ipt) if cost_value is not None: if cost_name == '': name = dprefix + 'objective' prereqs = obj_prereqs else: name = dprefix + cost_name prereqs = None cost.get_data_specs(model)[0].validate(cost_ipt) self.add_channel(name=name, ipt=cost_ipt, val=cost_value, data_specs=cost.get_data_specs(model), dataset=cur_dataset, prereqs=prereqs) for key in custom_channels: val, ipt, data_specs = custom_channels[key] data_specs[0].validate(ipt) self.add_channel(name=dprefix + key, ipt=ipt, val=val, data_specs=data_specs, dataset=cur_dataset)
def train(self, monitor_channels=None, plot=None):
    """
    Compile the training/monitoring functions and run the epoch loop.

    Gradients of ``self.loss_expression`` are taken with respect to the base
    variables of the model's parameters, optionally clipped, turned into
    updates by ``self.get_updates`` and combined with the model's own updates.
    Epochs are run through ``self._perform_one_epoch`` until it reports that
    training should stop. A KeyboardInterrupt stops the loop; the parameters
    are still saved afterwards.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables
        to compile and evaluate on the data. When omitted and `plot` has
        channels, the plot's channels are used.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).

    Raises
    ------
    AssertionError
        If ``self.model`` is falsy.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model param "
                             "and calling optimizer.train().")

    #########################
    # gradients and updates #
    #########################
    # grab the model parameters to use during training
    self.params = self.model.get_params()

    # Differentiate with respect to the base variables behind every parameter.
    params = set()
    for param in self.params.values():
        params.update(base_variables(param))
    params = list(params)
    gradients = grad(cost=self.loss_expression, wrt=params)
    # map each parameter to its gradient
    gradients = OrderedDict(zip(params, gradients))

    # clip gradients if we want.
    gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

    # The learning rule (overridable through get_updates) turns gradients into updates.
    gradient_updates = self.get_updates(gradients)

    # Combine with the model's own updates in a *fresh* dict. Merging into the
    # object returned by model.get_updates() would leak the gradient updates
    # into the monitor functions compiled below if the model hands back the
    # same dict on every call.
    model_updates = self.model.get_updates()
    updates = OrderedDict(model_updates) if model_updates else OrderedDict()
    updates.update(gradient_updates)

    log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels

    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict((name, expression) for name, expression, _ in train_collapsed)
        self.valid_monitors_dict = OrderedDict((name, expression) for name, expression, _ in valid_collapsed)
        self.test_monitors_dict = OrderedDict((name, expression) for name, expression, _ in test_collapsed)
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in train_collapsed)
        self.valid_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in valid_collapsed)
        self.test_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in test_collapsed)

    #######################################
    # compile train and monitor functions #
    #######################################
    # Copy before extending so the list handed back for the model's inputs is
    # never grown in place with the loss targets.
    function_input = list(raise_to_list(self.model.get_inputs()) or [])
    if self.loss_targets is not None:
        function_input += self.loss_targets

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', self.model._classname)
    t = time.time()
    f_learn = function(inputs=function_input,
                       updates=updates,
                       outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                       name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # valid/test monitoring only happens when both data and monitors exist
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)

    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)

    self.STOP = False
    self.epoch_counter = 0
    # reset any decay params
    for decay_param in self.get_decay_params():
        decay_param.reset()

    self.times = []
    self.best_cost = numpy.inf
    self.best_params = None
    self.patience = 0

    t = time.time()

    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch(f_learn, plot)
        except KeyboardInterrupt:
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params (restoring the best ones seen, if any were recorded)
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        self.model.set_param_values(self.best_params, borrow=False)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
class StemCell(NonlinCell):
    """
    Cell that keeps an ordered mapping of parents to their dimensions and an
    ordered dictionary of named parameters.

    Parameters
    ----------
    parent : list
        Parent identifiers; normalized with `tolist`. They are used as keys
        of `self.parent` and concatenated into weight names, so they are
        presumably strings (TODO confirm).
    parent_dim : list
        Dimension of each parent, in the same order as `parent`. May be
        empty, in which case every parent maps to None.
    nout : int, optional
        Second dimension of each weight shape and the shape given to the bias
        initializer.
    init_W, init_b : InitCell
        Initializers used by `initialize` for weights and bias.
    cons : float
        Stored as `self.cons`.
    name : str, optional
        Cell name; defaults to the lower-cased class name.
    lr_scaler : optional
        Stored as `self.lr_scaler`.
    """
    def __init__(self, parent=[], parent_dim=[], nout=None,
                 init_W=InitCell('randn'), init_b=InitCell('zeros'),
                 cons=0., name=None, lr_scaler=None, **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            # Was `self.__class__.name__`, which raised AttributeError.
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        # Normalize both arguments before comparing lengths, so a single
        # (non-list) parent counts as one entry instead of by its own len().
        parents = tolist(parent)
        parent_dim = tolist(parent_dim)
        has_dims = len(parent_dim) != 0 and len(parents) != 0
        if has_dims and len(parents) != len(parent_dim):
            raise AssertionError(
                "You probably had a mistake providing, "
                "write number of values. It will end, "
                "up with a model containing a bug.")
        for i, par in enumerate(parents):
            self.parent[par] = parent_dim[i] if has_dims else None
        self.params = OrderedDict()
        self.lr_scaler = lr_scaler

    def get_params(self):
        """Return the ordered dictionary mapping parameter names to parameters."""
        return self.params

    def fprop(self, x=None):
        """Forward propagation; must be provided by subclasses."""
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def alloc(self, x):
        """Register parameter `x` under its `name` attribute."""
        self.params[x.name] = x

    def initialize(self):
        """Allocate one weight per parent and a single bias for this cell."""
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            self.alloc(self.init_W.get(W_shape, W_name))
        self.alloc(self.init_b.get(self.nout, 'b_' + self.name))

    def add_noisy_params(self, key=['W'], weight_noise=0.075):
        """
        Build `self.noisy_params` with a noisy version of selected parameters.

        A parameter is selected when the part of its name before the first
        underscore is in `key`. Noise is produced by `add_noise` using
        `self.theano_rng`, which is not set in this class and is expected to
        be provided elsewhere.
        """
        self.noisy_params = OrderedDict()
        for param_name, param in self.params.items():
            if param_name.split('_')[0] in key:
                self.noisy_params[param_name] = add_noise(
                    param, weight_noise, self.theano_rng)

    def del_noisy_params(self):
        """Remove the `noisy_params` attribute created by `add_noisy_params`."""
        del self.noisy_params
def __init__(self, iterable=None):
    """
    Create the backing ordered dictionary and optionally fill it.

    Parameters
    ----------
    iterable : optional
        When not None, it is forwarded to `self.update` after `self.data`
        has been created.
    """
    self.data = OrderedDict()
    if iterable is None:
        return
    self.update(iterable)
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
            When omitted, the loss is taken from ``model.get_loss()``.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method. When falsy, initialization stops after the arguments are logged.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
            A falsy value is replaced by 1000000.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early
            stopping. A falsy value is replaced by infinity.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
            A falsy value is replaced by 1.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff
            or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        # NOTE: locals() is captured here on purpose; do not introduce new local
        # variables above this line or they will end up in self.args.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)

        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))

        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return

        # Otherwise, things are proceeding as normal. Carry on...
        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)

        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            # Ask the model once, so targets and expression come from the same call.
            model_loss = model.get_loss()
            assert model_loss is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model_loss, tuple):
                self.loss_targets = raise_to_list(model_loss[0])
                self.loss_expression = model_loss[1]
            else:
                self.loss_targets = None
                self.loss_expression = model_loss

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        # NOTE(review): when self.loss_targets is a non-empty list this aliases it,
        # so targets that are also model inputs are removed from self.loss_targets
        # as well. train() later appends self.loss_targets to the function inputs,
        # so this presumably avoids passing the same variable twice - confirm
        # before changing.
        model_targets = self.loss_targets or []
        for model_input in model_inputs:
            if model_input in model_targets:
                model_targets.remove(model_input)
        n_model_targets = len(model_targets)
        # Value comparison; `is 0` only worked through small-int caching.
        self.unsupervised = (n_model_targets == 0)

        # make sure the number of inputs/targets matches up with the dataset properties.
        # Train data is always checked; valid/test only when their inputs exist.
        for subset, always_checked in (('train', True), ('valid', False), ('test', False)):
            subset_inputs = getattr(dataset, subset + '_inputs')
            if subset_inputs is None and not always_checked:
                continue
            n_subset_inputs = len(raise_to_list(subset_inputs))
            assert n_model_inputs == n_subset_inputs, \
                "Dataset has %d %s inputs, while model expects %d" % \
                (n_subset_inputs, subset, n_model_inputs)
            if not self.unsupervised:
                n_subset_targets = len(raise_to_list(getattr(dataset, subset + '_targets')) or [])
                assert n_model_targets == n_subset_targets, \
                    "Dataset has %d %s targets, while model expects %d" % \
                    (n_subset_targets, subset, n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed)
        learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new values after a single iteration
            of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in iteritems(gradients):
            # per-parameter scaler defaults to 1 when the model defines none
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, plot=None):
        """
        Compile the training/monitoring functions and run the epoch loop.

        Gradients of ``self.loss_expression`` are taken with respect to the base
        variables of the model's parameters, optionally clipped, turned into
        updates by ``self.get_updates`` and combined with the model's own updates.
        Epochs are run through ``self._perform_one_epoch`` until it reports that
        training should stop. A KeyboardInterrupt stops the loop; the parameters
        are still saved afterwards.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables
            to compile and evaluate on the data. When omitted and `plot` has
            channels, the plot's channels are used.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).

        Raises
        ------
        AssertionError
            If ``self.model`` is falsy.
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()

        # Differentiate with respect to the base variables behind every parameter.
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # map each parameter to its gradient
        gradients = OrderedDict(zip(params, gradients))

        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # The learning rule (overridable through get_updates) turns gradients into updates.
        gradient_updates = self.get_updates(gradients)

        # Combine with the model's own updates in a *fresh* dict. Merging into the
        # object returned by model.get_updates() would leak the gradient updates
        # into the monitor functions compiled below if the model hands back the
        # same dict on every call.
        model_updates = self.model.get_updates()
        updates = OrderedDict(model_updates) if model_updates else OrderedDict()
        updates.update(gradient_updates)

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels

        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict((name, expression) for name, expression, _ in train_collapsed)
            self.valid_monitors_dict = OrderedDict((name, expression) for name, expression, _ in valid_collapsed)
            self.test_monitors_dict = OrderedDict((name, expression) for name, expression, _ in test_collapsed)
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in train_collapsed)
            self.valid_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in valid_collapsed)
            self.test_monitors_outservice_dict = OrderedDict((name, out) for name, _, out in test_collapsed)

        #######################################
        # compile train and monitor functions #
        #######################################
        # Copy before extending so the list handed back for the model's inputs is
        # never grown in place with the loss targets.
        function_input = list(raise_to_list(self.model.get_inputs()) or [])
        if self.loss_targets is not None:
            function_input += self.loss_targets

        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # valid/test monitoring only happens when both data and monitors exist
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)

        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params (restoring the best ones seen, if any were recorded)
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.

        Parameters
        ----------
        f_learn : callable
            The compiled training function; its first output is the training
            cost, the remaining outputs are the train monitor values.
        plot : Plot, optional
            When given, receives the averaged monitor values for this epoch.

        Returns
        -------
        bool
            True when training should stop (epoch limit or patience reached).
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}

        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
        ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
            ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors (if different from the train cost)
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = self.model.get_param_values(borrow=False)
        elif not numpy.isnan(cost):
            # a NaN cost neither improves nor consumes patience
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))
        log.debug('remaining time: ' +
                  make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            # save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        """
        Evaluate the monitors of one data subset and report their batch means.

        Nothing happens when `inputs` is None or `monitors_dict` is empty.

        Parameters
        ----------
        subset : str
            Label used in the log line and passed to each out-service's `write`.
        inputs, targets
            The subset's data; each is passed through `raise_to_list`.
        monitors_dict : dict
            Monitor names; their order is matched with the outputs of
            `monitor_function`.
        monitor_function : callable
            Compiled function called on every minibatch.
        monitors_outservice_dict : dict
            Maps monitor names to out-services (falsy services are skipped).
        plot : Plot, optional
            When truthy, receives the averaged monitor values.
        """
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(subset_input, self.batch_size, self.min_batch_size) for subset_input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        The model's list is copied before the learning rate decay is added, so
        calling this method repeatedly never grows a list owned by the model.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - the model's
            decay parameters followed by the learning rate decay, if one is set.
        """
        decay_params = list(self.model.get_decay_params() or [])
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
def build_computation_graph(self):
    """
    Build the symbolic GSN graphs, the cost and monitors, and compile the Theano functions.

    Two chains are built with `self.build_gsn`: one with `add_noise=self.add_noise`, whose
    per-step costs are summed into the training cost, and one with `add_noise=False`, which
    provides the output, the hiddens and the 'recon_cost' / 'mse' monitors. When
    `self.hiddens_flag` is set, both chains are built from `self.hiddens` with `reverse=True`.

    Side effects: sets `self.show_cost`, `self.monitor`, `self.network_state_input`,
    `self.network_state_output`, `self.f_noise`, `self.f_sample` and, only when
    `self.hiddens_flag` is falsy, `self.f_run`.

    Returns
    -------
    tuple
        `(cost, monitors, output, hiddens)`: the summed cost over the noisy chain, an
        OrderedDict mapping monitor names to expressions, the last element of the non-noisy
        chain, and the hiddens returned by the non-noisy `build_gsn` call.
    """
    #################
    # Build the GSN #
    #################
    log.debug("Building GSN graphs...")
    # GSN for training - with noise specified in initialization
    # if there is no hiddens_hook, build the GSN normally using the input X
    if not self.hiddens_flag:
        p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)
    # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
    # generative from the hiddens
    else:
        p_X_chain, _, = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

    # GSN for prediction - same as above but no noise
    # deal with hiddens_hook exactly as above.
    if not self.hiddens_flag:
        p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
    else:
        p_X_chain_recon, recon_hiddens = self.build_gsn(
            hiddens=self.hiddens, add_noise=False, reverse=True)

    ####################
    # Costs and output #
    ####################
    log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
    # use the noisy ones for training cost: one cost term per element of the noisy chain
    costs = [
        self.cost_function(output=rX, target=self.X, **self.cost_args)
        for rX in p_X_chain
    ]
    self.show_cost = costs[-1]  # for a monitor to show progress
    cost = numpy.sum(costs)  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

    # use the non-noisy graph for prediction
    gsn_costs_recon = [
        self.cost_function(output=rX, target=self.X, **self.cost_args)
        for rX in p_X_chain_recon
    ]
    # another monitor, same as self.show_cost but on the non-noisy graph.
    self.monitor = gsn_costs_recon[-1]
    # this should be considered the main output of the computation, the sample after the
    # last walkback from the non-noisy graph.
    output = p_X_chain_recon[-1]
    # these should be considered the model's hidden representation - the hidden representation after
    # the last walkback from the non-noisy graph.
    hiddens = recon_hiddens

    # mean squared error between the last chain element and the input, averaged over axis 0
    # first and then over what remains (noisy chain -> train_mse, non-noisy chain -> mse)
    train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
    train_mse = T.mean(train_mse)

    mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
    mse = T.mean(mse)

    monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                            ('recon_cost', self.monitor),
                            ('mse', mse),
                            ('train_mse', train_mse)])

    ############
    # Sampling #
    ############
    # the input to the sampling function: one matrix for the visible layer plus one per hidden layer
    X_sample = T.matrix("X_sampling")
    self.network_state_input = [X_sample] + [
        T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)
    ]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    # NOTE(review): update_layers presumably mutates network_state_output in place and appends
    # to visible_pX_chain (it is indexed with [-1] below) - confirm against update_layers.
    log.debug("Performing one walkback in network state sampling.")
    self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

    #####################################################
    #     Create the run and monitor functions      #
    #####################################################
    log.debug("Compiling functions...")
    t = time.time()

    # doesn't make sense to have this if there is a hiddens_hook
    if not self.hiddens_flag:
        # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
        # computation graph
        log.debug("f_run...")
        self.f_run = function(inputs=[self.X], outputs=output, name='gsn_f_run')

    # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
    # input to f_run)
    log.debug("f_noise...")
    self.f_noise = function(inputs=[self.X], outputs=self.input_noise(self.X), name='gsn_f_noise')

    # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
    # or visualization)
    log.debug("f_sample...")
    if self.layers == 1:
        # single hidden layer: only the visible sample is taken in and only the last visible output returned
        self.f_sample = function(inputs=[X_sample],
                                 outputs=visible_pX_chain[-1],
                                 name='gsn_f_sample_single_layer')
    else:
        # WHY IS THERE A WARNING????
        # because the first odd layers are not used -> directly computed FROM THE EVEN layers
        # unused input = warn
        self.f_sample = function(inputs=self.network_state_input,
                                 outputs=self.network_state_output + visible_pX_chain,
                                 name='gsn_f_sample')

    log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

    return cost, monitors, output, hiddens
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is needed so that if new channels are added, Theano's optimizations
    make sure (to the extent that they can) that the new channels and old
    channels don't have any redundant calculations.

    It is also needed to regenerate Theano functions after pickling and
    unpickling, since Theano functions should not be pickled.

    Side effects: clears `self._dirty`; rebuilds `self.prereqs`,
    `self.begin_record_entry`, `self.num_examples` and `self.accum` (one
    compiled function per dataset); and registers every attribute name
    created during the call via `self.register_names_to_del`.

    Raises
    ------
    ValueError
        If the dataset of a channel iterates over zero examples (the
        accumulated value would be divided by zero).
    TypeError
        If a channel's shared variable and the expression that updates it
        have different dtypes.
    """
    self._dirty = False

    # Snapshot of the attribute names, used at the end to find the ones added here.
    init_names = dir(self)

    # Bound once up front so every later use sees it, even when there are no channels.
    mode = self.theano_function_mode

    # Collect, per dataset, the de-duplicated prerequisites of all channels.
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    # begin_record_entry resets every channel accumulator to zero.
    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry')

    # Get the appropriate kind of theano variable to represent the data the model
    # acts on
    X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
    if config.compute_test_value != 'off':
        m = self.model.get_test_batch_size()
        test_value = self.model.get_input_space().get_origin_batch(m)
        X.tag.test_value = np.cast[X.type.dtype](test_value)
    if self.require_label:
        Y = self.model.get_output_space().make_theano_batch(
            name="monitoring_Y")

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line(
                'compiling monitor including channel ' + key + '\n')
        log.info('\t%s' % key)

    # One iterator per dataset, only used here to read the number of examples.
    it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo)
          for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                      self._num_batches, self._batch_size)]
    self.num_examples = [
        np.cast[config.floatX](float(i.num_examples)) for i in it
    ]

    # One givens / updates dictionary per dataset.
    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for channel in self.channels.values():
        index = self._datasets.index(channel.dataset)
        d = self._datasets[index]
        g = givens[index]
        cur_num_examples = self.num_examples[index]
        u = updates[index]
        if isinstance(channel.graph_input, (list, tuple)):
            # NOTE(review): Y only exists when self.require_label is set; a
            # (X, Y) graph_input without require_label would fail here.
            channel_X, channel_Y = channel.graph_input
            assert channel_X not in g or g[channel_X] is X
            assert channel_Y not in g or g[channel_Y] is Y
            g[channel_X] = X
            g[channel_Y] = Y
        else:
            channel_X = channel.graph_input
            assert channel_X not in g or g[channel_X] is X
            g[channel_X] = X
        # Guard the division below. The previous check tested `n`, a name
        # leaked from the list comprehension above (the last dataset's
        # num_batches), rather than this channel's example count.
        if cur_num_examples == 0:
            raise ValueError(
                "Iterating over 0 examples results in divide by 0")
        if self.topo:
            batch_index = d.get_topo_batch_axis()
        else:
            batch_index = 0
        # Weight the batch value by (batch size / number of examples) so the
        # accumulated total is an example-weighted mean.
        val = channel.val * T.cast(X.shape[batch_index], config.floatX) / cur_num_examples
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable '
                                    + key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression with type ' +
                                    up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' + var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' + var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if self.require_label:
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just monitor the model
                # parameters, or some shared variable updated by the training algorithm, so we
                # need to ignore the unused input error
                self.accum.append(
                    function([X, Y],
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            else:
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line(
                        'compiling unsupervised accum\n')
                self.accum.append(
                    function([X],
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    final_names = dir(self)
    self.register_names_to_del(
        [name for name in final_names if name not in init_names])
def add_noisy_params(self, key=('W',), weight_noise=0.075):
    """
    Build `self.noisy_params` from the entries of `self.params` selected by name.

    A parameter is selected when the part of its name before the first
    underscore is contained in `key`. The selected value is passed to
    `add_noise` together with `weight_noise` and `self.theano_rng`, and the
    result is stored under the same name.

    Parameters
    ----------
    key : sequence of str, optional
        Name prefixes to select. The default is an immutable tuple, so it
        cannot be altered between calls.
    weight_noise : float, optional
        Forwarded unchanged to `add_noise`.
    """
    self.noisy_params = OrderedDict()
    for name, value in self.params.items():
        if name.split('_')[0] in key:
            self.noisy_params[name] = add_noise(
                value, weight_noise, self.theano_rng)
def clip_gradients(gradients, grad_clip=5., hard_clip=False):
    """
    This returns the gradient parameters clipped according to the grad_clip value given.

    As described here: http://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
    Code mostly taken from https://github.com/kastnerkyle/minet/blob/master/minet/net.py

    Based on:
    Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training recurrent neural networks."
    arXiv preprint arXiv:1211.5063 (2012).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their gradients.
    grad_clip : float, optional
        How much to clip gradients (if at all). A falsy value disables clipping.
    hard_clip : bool
        Whether to use hard clipping (keeping gradients at grad_clip level), or soft clipping (rescaling
        based on grad_clip).

    Returns
    -------
    clipgrads : dict
        A dictionary mapping from the model's parameters to their correctly clipped gradients. (If `grad_clip`
        is falsy, this just returns the original `gradients` input parameter).
    """
    if not grad_clip:
        return gradients

    # Materialize the pairs once so they can be traversed several times.
    gradients = list(gradients.items())
    params = [param for param, _ in gradients]
    grads = [grad for _, grad in gradients]

    # Global L2 norm over all gradients. The square root is taken exactly once;
    # taking it twice compared grad_clip against the fourth root of the sum of squares.
    sum_of_squares = sum([T.sqr(grad).sum() for grad in grads])
    # NaN / inf in the sum of squares is equivalent to NaN / inf in the norm.
    not_finite = T.or_(T.isnan(sum_of_squares), T.isinf(sum_of_squares))
    grad_norm = T.sqrt(sum_of_squares)
    scaling_num = grad_clip
    scaling_den = T.maximum(grad_clip, grad_norm)

    if hard_clip:
        # do the NaN/inf trick: fall back to a small multiple of the parameter itself
        grads = [T.switch(not_finite, 0.1 * param, grad)
                 for param, grad in gradients]
        # hard clip gradients above or below grad_clip to be = grad_clip
        # NOTE(review): the switch is gated on the global norm, so once the norm reaches
        # grad_clip every element becomes +/- grad_clip - confirm this is intended rather
        # than an element-wise clip.
        grads = [T.switch(T.ge(grad_norm, grad_clip), T.sgn(grad) * grad_clip, grad)
                 for grad in grads]
    else:
        # NaN/inf trick combined with scaling: the ratio is 1 while the norm is below
        # grad_clip and grad_clip / norm otherwise.
        grads = [T.switch(not_finite, 0.1 * param, grad * (scaling_num / scaling_den))
                 for param, grad in gradients]

    return OrderedDict(zip(params, grads))
def init_params(params):
    """
    Wrap every entry of `params` in a Theano shared variable.

    Parameters
    ----------
    params : dict
        Mapping from parameter name to its initial value.

    Returns
    -------
    OrderedDict
        Mapping from the same names, in the iteration order of `params`, to
        `theano.shared` variables named after their key.
    """
    return OrderedDict(
        (name, theano.shared(value, name=name))
        for name, value in params.items())
class AlexNet(Model):
    """
    This is the base model for AlexNet, Alex Krizhevsky's efficient deep convolutional net described in:
    'ImageNet Classification with Deep Convolutional Neural Networks'
    Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton
    http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf

    Most of the code here is adapted from the authors listed in the license above, from the paper:
    'Theano-based large-scale visual recognition with multiple GPUs'
    Weiguang Ding & Ruoyan Wang, Fei Mao, Graham Taylor
    http://arxiv.org/pdf/1412.2302.pdf

    Copyright (c) 2014, Weiguang Ding, Ruoyan Wang, Fei Mao and Graham Taylor
    All rights reserved.

    Redistribution and use in source and binary forms, with or without modification, are permitted provided that
    the following conditions are met:

    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
    following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
    following disclaimer in the documentation and/or other materials provided with the distribution.

    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or
    promote products derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
    OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
    DAMAGE.
    """
    defaults = {  # data stuff
        "use_data_layer": False,
        "rand_crop": True,
        "batch_size": 256,  # convolutional nets are particular about the batch size
        "output_path": '/outputs/alexnet/'
    }

    def __init__(self, config=None, defaults=defaults, inputs_hook=None, hiddens_hook=None, params_hook=None,
                 use_data_layer=None, rand_crop=None, batch_size=None):
        """
        Combine the configuration, create the symbolic inputs and build the network.

        Parameters
        ----------
        config : optional
            Passed to `Model.__init__` together with `defaults`; the combined
            settings are read back from `self.args`.
        defaults : dict, optional
            Default settings (the class-level `defaults` unless overridden).
        inputs_hook, hiddens_hook, params_hook : optional
            Not supported for this model.
        use_data_layer, rand_crop, batch_size : optional
            Explicit overrides for the settings of the same name. `None`
            means "use the value from `self.args`"; any other value,
            including `False`, is honoured.

        Raises
        ------
        NotImplementedError
            If any of the three hooks is truthy.
        """
        # init Model to combine the defaults and config dictionaries.
        super(AlexNet, self).__init__(config, defaults)
        # all configuration parameters are now in self.args

        if inputs_hook or hiddens_hook or params_hook:
            log.critical(
                "Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!"
            )
            raise NotImplementedError()

        # Explicit arguments win over self.args. The comparison is against None (not
        # truthiness) so that e.g. rand_crop=False can override a default of True.
        if use_data_layer is not None:
            self.flag_datalayer = use_data_layer
        else:
            self.flag_datalayer = self.args.get('use_data_layer')
        if batch_size is not None:
            self.batch_size = batch_size
        else:
            self.batch_size = self.args.get('batch_size')
        if rand_crop is not None:
            self.rand_crop = rand_crop
        else:
            self.rand_crop = self.args.get('rand_crop')

        ####################
        # Theano variables #
        ####################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.rand = T.fvector('rand')

        ##########
        # params #
        ##########
        self.params = []

        # make the network!
        self.build_computation_graph()

    def build_computation_graph(self):
        """
        Build the 5 conv/pool + 2 fully-connected + softmax network and compile its functions.

        Fills `self.params` with the parameters of every layer, and sets
        `self.output`, `self.train_cost`, `self.monitors`, `self.f_predict`
        and `self.f_monitors`. The fully-connected and softmax layers exist
        twice: once fed with dropout (training cost, 'dropout_errors') and
        once without, the dropout copies receiving the other copy's
        parameters through `params_hook`.
        """
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in c01b format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27),
                                                     convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13),
                                                     convpool_layer2.get_outputs()),
                                        filter_shape=(384, 256, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=1,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13),
                                                     convpool_layer3.get_outputs()),
                                        filter_shape=(384, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.1,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13),
                                                     convpool_layer4.get_outputs()),
                                        filter_shape=(256, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input), output_size=4096, config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()

        # now apply dropout to the output for training
        dropout_layer6 = dropout(fc_layer6.get_outputs(), corruption_level=0.5)

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()), output_size=4096, config=fc_config)
        # the training copy consumes the dropped-out activations and is given fc_layer7's parameters
        fc_layer7_train = BasicLayer(inputs_hook=(4096, dropout_layer6),
                                     output_size=4096,
                                     params_hook=fc_layer7.get_params(),
                                     config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer7_train.get_params()

        # apply dropout again for training
        dropout_layer7 = dropout(fc_layer7_train.get_outputs(), corruption_level=0.5)

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                      output_size=1000,
                                      config=softmax_config)
        softmax_layer8_train = SoftmaxLayer(inputs_hook=(4096, dropout_layer7),
                                            output_size=1000,
                                            params_hook=softmax_layer8.get_params(),
                                            config=softmax_config)
        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()

        #####################
        # Cost and monitors #
        #####################
        # the training cost goes through the dropout path; the monitored cost does not
        self.train_cost = softmax_layer8_train.negative_log_likelihood(self.y)
        cost = softmax_layer8.negative_log_likelihood(self.y)
        errors = softmax_layer8.errors(self.y)
        train_errors = softmax_layer8_train.errors(self.y)

        self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_predict...")
        # use the actual argmax from the classification
        self.f_predict = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("f_monitors")
        self.f_monitors = function(inputs=[self.x, self.y], outputs=self.monitors.values())
        log.debug("compilation took %s" % make_time_units_string(time.time() - t))

    def get_inputs(self):
        """
        This should return the input(s) to the model's computation graph. This is called by the Optimizer when
        creating the theano train function on the cost expression returned by get_train_cost().

        This should normally return the same theano variable list that is used in the inputs= argument to the
        f_predict function.
        ------------------

        :return: Theano variables representing the input(s) to the training function.
        :rtype: List(theano variable)
        """
        return [self.x]

    def get_outputs(self):
        """
        This method will return the model's output variable expression from the computational graph. This should
        be what is given for the outputs= part of the 'f_predict' function from self.predict().

        This will be used for creating hooks to link models together, where these outputs can be strung as the
        inputs or hiddens to another model :)
        ------------------

        :return: theano expression of the outputs from this model's computation
        :rtype: theano tensor (expression)
        """
        return self.output

    def predict(self, input):
        """
        This method will return the model's output (run through the function), given an input. In the case that
        input_hooks or hidden_hooks are used, the function should use them appropriately and assume they are the
        input.

        Try to avoid re-compiling the theano function created for predict - check a hasattr(self, 'f_predict') or
        something similar first. I recommend creating your theano f_predict in a create_computation_graph method
        to be called after the class initializes.
        ------------------

        :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
        :type input: tensor

        :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
        :rtype: tensor
        """
        if not hasattr(self, 'f_predict'):
            log.error(
                "Missing self.f_predict - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        # NOTE(review): `input` is star-unpacked, so it has to be a sequence holding the
        # arguments of f_predict (a single x batch) - confirm against callers.
        return self.f_predict(*input)

    def get_train_cost(self):
        """
        This returns the expression that represents the cost given an input, which is used for the Optimizer
        during training. The reason we can't just compile a f_train theano function is because updates need to be
        calculated for the parameters during gradient descent - and these updates are created in the Optimizer
        object.
        ------------------

        :return: theano expression of the model's training cost, from which parameter gradients will be computed.
        :rtype: theano tensor
        """
        return self.train_cost

    def get_monitors(self):
        """
        This returns a dictionary of (monitor_name: monitor_function) of variables (monitors) whose values we care
        about during training. For every monitor returned by this method, the function will be run on the
        train/validation/test dataset and its value will be reported.

        Again, please avoid recompiling the monitor functions every time - check your hasattr to see if they
        already exist!
        ------------------

        :return: Dictionary of String: theano_function for each monitor variable we care about in the model.
        :rtype: Dictionary
        """
        if not hasattr(self, 'f_monitors'):
            log.error(
                "Missing self.f_monitors - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        # a single compiled function computes every monitor, so it is keyed by the joined names
        names = ', '.join(self.monitors.keys())
        return {names: self.f_monitors}

    def get_params(self):
        """
        This returns the list of theano shared variables that will be trained by the Optimizer.
        These parameters are used in the gradient.
        ------------------

        :return: flattened list of theano shared variables to be trained
        :rtype: List(shared_variables)
        """
        return self.params
def __init__(self, nh, nc, ne, de, cs, em, init):
    """
    Create the parameters and compile the Theano functions of the recurrent tagger.

    nh :: dimension of the hidden layer
    nc :: number of classes
    ne :: number of word embeddings in the vocabulary
    de :: dimension of the word embeddings
    cs :: word window context size
    em :: indexable of embedding rows, read for indices 0..ne; a row that is
          None keeps its random initialisation (only read when `init` is truthy)
    init :: when truthy, copy the non-None rows of `em` into the embedding matrix

    The previous step's softmax output is fed back into the hidden layer
    through `Ws`. Sets the shared parameters, `self.params`, `self.names` and
    the compiled functions `self.classify`, `self.train` and `self.normalize`.
    """
    floatX = theano.config.floatX

    def scaled_uniform(shape):
        # uniform values in [-0.2, 0.2)
        return 0.2 * numpy.random.uniform(-1.0, 1.0, shape)

    # parameters of the model (random draws happen in this exact order)
    embeddings = scaled_uniform((ne + 1, de))
    if init:
        for index in xrange(ne + 1):
            if em[index] is not None:
                embeddings[index] = em[index]
    self.emb = theano.shared(embeddings.astype(floatX))
    self.Wx = theano.shared(scaled_uniform((de * cs, nh)).astype(floatX))
    self.Ws = theano.shared(scaled_uniform((nc, nh)).astype(floatX))
    self.W = theano.shared(scaled_uniform((nh, nc)).astype(floatX))
    self.bh = theano.shared(numpy.zeros(nh, dtype=floatX))
    self.b = theano.shared(numpy.zeros(nc, dtype=floatX))
    self.s0 = theano.shared(numpy.zeros(nc, dtype=floatX))

    # bundle
    self.params = [
        self.emb, self.Wx, self.Ws, self.W, self.bh, self.b, self.s0
    ]
    self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 's0']

    # as many columns as context window size/lines as words in the sentence
    idxs = T.imatrix()
    window_vectors = self.emb[idxs].reshape((idxs.shape[0], de * cs))
    y = T.iscalar('y')  # label

    def recurrence(x_t, s_tm1):
        hidden_input = T.dot(x_t, self.Wx) + T.dot(s_tm1, self.Ws) + self.bh
        h_t = T.nnet.sigmoid(hidden_input)
        s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)[0]
        return [h_t, s_t]

    scan_outputs, _ = theano.scan(fn=recurrence,
                                  sequences=window_vectors,
                                  outputs_info=[None, self.s0],
                                  n_steps=window_vectors.shape[0])
    class_probs = scan_outputs[1]

    last_word_probs = class_probs[-1, :]
    y_pred = T.argmax(class_probs, axis=1)

    # cost and gradients and learning rate
    lr = T.scalar('lr')
    nll = -T.mean(T.log(last_word_probs)[y])
    gradients = T.grad(nll, self.params)
    updates = OrderedDict()
    for param, gradient in zip(self.params, gradients):
        updates[param] = param - lr * gradient

    # theano functions
    self.classify = theano.function(inputs=[idxs], outputs=y_pred)

    self.train = theano.function(inputs=[idxs, y, lr],
                                 outputs=nll,
                                 updates=updates)

    # rescale every embedding row to unit L2 norm
    row_norms = T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
    self.normalize = theano.function(inputs=[],
                                     updates={self.emb: self.emb / row_norms})
def get_monitoring_channels(self, model, X, Y = None, drop_mask = None, drop_mask_Y = None, **kwargs):
    """
    Build the monitoring channels for this inpainting cost.

    The cost itself is evaluated through `self(...)` with `return_locals=True`,
    and the channels are assembled from the returned dictionary of locals.

    Parameters
    ----------
    model : object
        Must provide `get_all_layers()`, and `mf(X)` when `self.supervised`
        (plus `inference_procedure.multi_infer(X)` when
        `self.monitor_multi_inference`).
    X : theano variable
        Input batch.
    Y : theano variable, optional
        Targets; required (asserted) when `self.supervised`.
        NOTE(review): argmax is taken over axis 1, so presumably one-hot - confirm.
    drop_mask, drop_mask_Y : theano variable, optional
        Forwarded to `self(...)`; both are replaced by the values found in
        the returned locals.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    OrderedDict
        Channel name -> theano expression: the non-None cost terms, optional
        per-step costs and pixel differences, per-layer channels of the final
        state, and error rates when supervised.

    Raises
    ------
    NotImplementedError
        If `drop_mask` has fewer dimensions than `X` and `X` is not 4-D.
    """
    if self.supervised:
        assert Y is not None

    rval = OrderedDict()

    # TODO: shouldn't self() handle this?
    # A mask with fewer dimensions than X gets a broadcastable 4th axis (4-D X only).
    if drop_mask is not None and drop_mask.ndim < X.ndim:
        if self.mask_gen is not None:
            assert self.mask_gen.sync_channels
        if X.ndim != 4:
            raise NotImplementedError()
        drop_mask = drop_mask.dimshuffle(0,1,2,'x')

    # return_locals=True yields a dictionary that is indexed by name below
    scratch = self(model, X, Y, drop_mask = drop_mask, drop_mask_Y = drop_mask_Y, return_locals = True)

    history = scratch['history']
    new_history = scratch['new_history']
    new_drop_mask = scratch['new_drop_mask']
    new_drop_mask_Y = None
    drop_mask = scratch['drop_mask']
    if self.supervised:
        drop_mask_Y = scratch['drop_mask_Y']
        new_drop_mask_Y = scratch['new_drop_mask_Y']

    # One channel per cost term that is present; ii numbers only the present terms.
    ii = 0
    for name in ['inpaint_cost', 'l1_act_cost', 'toronto_act_cost', 'reweighted_act_cost']:
        var = scratch[name]
        if var is not None:
            rval['total_inpaint_cost_term_'+str(ii)+'_'+name] = var
            ii = ii + 1

    if self.monitor_each_step:
        # cost after every step, plus the largest change of V_hat between consecutive steps
        for ii, packed in enumerate(safe_izip(history, new_history)):
            state, new_state = packed
            rval['all_inpaint_costs_after_' + str(ii)] = self.cost_from_states(state,
                    new_state,
                    model, X, Y, drop_mask, drop_mask_Y,
                    new_drop_mask, new_drop_mask_Y)

            if ii > 0:
                prev_state = history[ii-1]
                V_hat = state['V_hat']
                prev_V_hat = prev_state['V_hat']
                rval['max_pixel_diff[%d]'%ii] = abs(V_hat-prev_V_hat).max()

    final_state = history[-1]

    #empirical beta code--should be moved to gaussian visible layer, should support topo data
    #V_hat = final_state['V_hat']
    #err = X - V_hat
    #masked_err = err * drop_mask
    #sum_sqr_err = T.sqr(masked_err).sum(axis=0)
    #recons_count = T.cast(drop_mask.sum(axis=0), 'float32')

    # empirical_beta = recons_count / sum_sqr_err
    # assert empirical_beta.ndim == 1

    #rval['empirical_beta_min'] = empirical_beta.min()
    #rval['empirical_beta_mean'] = empirical_beta.mean()
    #rval['empirical_beta_max'] = empirical_beta.max()

    # Per-layer channels of the final state: V_hat is paired with the first layer and the
    # entries of H_hat with the remaining ones.
    layers = model.get_all_layers()
    states = [ final_state['V_hat'] ] + final_state['H_hat']

    for layer, state in safe_izip(layers, states):
        d = layer.get_monitoring_channels_from_state(state)
        for key in d:
            mod_key = 'final_inpaint_' + layer.layer_name + '_' + key
            assert mod_key not in rval
            rval[mod_key] = d[key]

    if self.supervised:
        # misclassification rate of the inpainted labels, weighted by drop_mask_Y
        inpaint_Y_hat = history[-1]['H_hat'][-1]
        err = T.neq(T.argmax(inpaint_Y_hat, axis=1), T.argmax(Y, axis=1))
        assert err.ndim == 1
        assert drop_mask_Y.ndim == 1
        err = T.dot(err, drop_mask_Y) / drop_mask_Y.sum()
        if err.dtype != inpaint_Y_hat.dtype:
            err = T.cast(err, inpaint_Y_hat.dtype)

        rval['inpaint_err'] = err

        # misclassification rate of model.mf(X)
        Y_hat = model.mf(X)[-1]

        # from here on Y holds class indices (cast to Y_hat's dtype), not the original targets
        Y = T.argmax(Y, axis=1)
        Y = T.cast(Y, Y_hat.dtype)

        argmax = T.argmax(Y_hat,axis=1)
        if argmax.dtype != Y_hat.dtype:
            argmax = T.cast(argmax, Y_hat.dtype)
        err = T.neq(Y , argmax).mean()
        if err.dtype != Y_hat.dtype:
            err = T.cast(err, Y_hat.dtype)

        rval['err'] = err

        if self.monitor_multi_inference:
            # same error rate, with predictions from the model's multi_infer procedure
            Y_hat = model.inference_procedure.multi_infer(X)

            argmax = T.argmax(Y_hat,axis=1)
            if argmax.dtype != Y_hat.dtype:
                argmax = T.cast(argmax, Y_hat.dtype)
            err = T.neq(Y , argmax).mean()
            if err.dtype != Y_hat.dtype:
                err = T.cast(err, Y_hat.dtype)

            rval['multi_err'] = err

    return rval
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)
    """

    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        # The five lists below are parallel: entry k of each describes
        # how monitoring dataset k is iterated.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._rng_seed = []
        # True whenever the compiled theano functions are out of date.
        self._dirty = True
        # Attributes that __getstate__ strips before pickling.
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form
        # of examples. If the model acts on a space with more than the batch
        # index and channel dimension, the model has topological dimensions,
        # so the topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(
            name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        """
        Sets the mode used to compile the monitor's theano functions.

        The monitor is marked dirty (so that the functions are recompiled
        on the next call) only if `mode` differs from the current mode.

        Parameters
        ----------
        mode : object
            Passed as the `mode` argument when compiling theano functions.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Every argument may be given either as a single value or as a list
        with one entry per dataset.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, list or tuple, optional
            Seed handed to the iterator as `rng`. Required (and must be
            a seed rather than a random number generator) when the
            iteration mode is stochastic; must be None otherwise.

        Raises
        ------
        ValueError
            If the argument lists have inconsistent lengths or the
            dataset rejects the iteration parameters.
        TypeError
            If a stochastic iteration mode is used without a valid seed.
        """
        # The user can omit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            # Build an iterator now so that invalid parameters are
            # reported here rather than at monitoring time.
            try:
                it = d.iterator(mode=m, batch_size=b,
                                num_batches=n,
                                topo=self.topo,
                                targets=self.require_label,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators
                # using it would update its state, so we would not get the
                # same iterator each time
                # Also, must not be None, because this makes the iterator
                # pick a seed based on the clock
                if sd is None:
                    raise TypeError(
                        "Monitor requires a seed when using stochastic iteration modes."
                    )
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError(
                        "Monitor requires a seed (not a random number generator) when using stochastic iteration modes."
                    )
            else:
                # the iterator should catch this, but let's double-check
                assert sd is None
            # A dataset that is already registered keeps its original
            # iteration settings.
            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.

        Raises
        ------
        RuntimeError
            If a dataset yields a different number of examples than its
            iterator announced when the theano functions were compiled.
        NotImplementedError
            If a dataset is stored as a yaml string (reloading it is not
            supported yet).
        """
        # If the channels have changed at all, we need to recompile the
        # theano functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X, y, d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError(
                    "At compile time, your iterator said it had " + str(ne) +
                    " examples total, but at runtime it gave us " +
                    str(actual_ne) + ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        """
        Calls every prerequisite registered for `dataset` on a batch.

        Parameters
        ----------
        X : object
            The batch of inputs, passed as first argument to each prereq.
        y : object
            The batch of targets (None when no labels are used), passed
            as second argument to each prereq.
        dataset : object
            The dataset the batch was drawn from. Nothing is called if no
            prereqs are registered for it.
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X, y)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch
        correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns the number of epochs reported through
        Monitor.report_epoch.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch
        correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            Number of examples in the batch that was just learned on.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the
        new channels and old channels don't have any redundant
        calculations.

        It is also needed to regenerate Theano functions after pickling
        and unpickling, since Theano functions should not be pickled.

        Every attribute created here is registered with
        `register_names_to_del` so that it is dropped before pickling.

        Raises
        ------
        ValueError
            If a channel's dataset iterator reports zero examples.
        TypeError
            If a channel's shared variable and the expression that
            updates it have different dtypes.
        """
        self._dirty = False
        mode = self.theano_function_mode

        init_names = dir(self)

        # Collect, per dataset, the prereqs of all channels (each prereq
        # only once).
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to 0.
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=reset_updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        X = self.model.get_input_space().make_theano_batch(
            name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)

        # Ask each dataset's iterator how many examples it will visit; the
        # channel value is a weighted average over all those examples.
        iterators = [ds.iterator(mode=it_mode, num_batches=nb, batch_size=bs,
                                 topo=self.topo)
                     for ds, it_mode, nb, bs in safe_izip(
                         self._datasets, self._iteration_mode,
                         self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(it.num_examples))
            for it in iterators]

        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            # Guard the division below using this channel's own dataset.
            if cur_num_examples == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            # Each batch contributes its value weighted by the fraction of
            # the examples it contains.
            val = (channel.val *
                   T.cast(X.shape[batch_index], config.floatX) /
                   cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError(
                            'Monitoring channel shared variable '
                            + key.name + ' has dtype ' + key.dtype +
                            ' but is driven by an expression with type ' +
                            up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line(
                            'g key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line(
                            'g val ' + var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line(
                            'u key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line(
                            'u val ' + var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line(
                            'compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they
                    # might just monitor the model parameters, or some
                    # shared variable updated by the training algorithm, so
                    # we need to ignore the unused input error
                    # NOTE(review): presumably the `function` wrapper used
                    # here takes care of that -- confirm.
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line(
                            'accum output ' + var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """
        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            # Temporarily swap the datasets for their yaml sources so that
            # the copied __dict__ holds strings instead of dataset objects.
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn(
                            'Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        # Restore the real datasets on the live object.
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Restores the monitor from a pickled state dictionary.

        Parameters
        ----------
        d : dict
            The state produced by __getstate__. An old-style `_dataset`
            entry is converted (in place) to a one-element `_datasets`
            list.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value. Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a (features,targets) list/tuple containing two symbolic
            tensors)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : object, optional
            A Dataset instance specifying which dataset to compute this
            channel on. May be omitted only if the monitor has exactly
            one dataset.
        prereqs : list of callables, optional
            Callables that take two numpy tensors (X and y, where y will
            be None if no labels are used). Each prereq must be called
            exactly once per each new batch of data drawn *from dataset*
            before the channel value is computed. If two channels provide
            a prereq with exactly the same id, that prereq will only be
            called once.

        Raises
        ------
        ValueError
            If `val` depends on an input other than `ipt`, if the dataset
            is missing, ambiguous or unknown to the monitor, if the
            channel name is already in use, or if targets are requested
            from a dataset that has none.
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ipt]
        else:
            tmp = ipt
        # Every non-shared, non-constant input of the graph must be one of
        # the variables the monitor will clamp to data.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
                    not isinstance(elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: " + str(elem))

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line(
                        'Includes input var ' + var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(
                    name + ' input var is ' + var_descriptor(ipt) + '\n')
            mode.record.handle_line(
                'channel ' + name + ' is ' + var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " +
                             "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel ("+name \
                        +") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """
        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if monitor.foo below are used anywhere, remove if not.
    @property
    def batch_size(self):
        """The list of batch sizes, one entry per monitoring dataset."""
        return self._batch_size

    @property
    def num_batches(self):
        """The list of batch counts, one entry per monitoring dataset."""
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.

        Adds channels defined by both the model and the cost for the
        specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : Dataset or dict
            A Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated with
            that dataset's name followed by an underscore as the prefix.
            For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset}
            you will get channels called 'train_misclass' and
            'valid_misclass'. If None, nothing is done.
        cost : a Cost
            The main cost; its value is tracked as 'objective'.
        batch_size : int
            Batch size used to iterate over each dataset.
        num_batches : int, optional
            Number of batches used to iterate over each dataset.
        extra_costs : dict, optional
            Maps names to additional costs. Each cost's value is tracked
            in a channel named after it and its monitoring channels are
            prefixed with its name and an underscore. The empty string is
            reserved for `cost` and must not be used as a key. The
            dictionary is not modified.
        mode : str or object, optional
            Iteration mode passed to `add_dataset`. A fixed seed is used
            when the mode is stochastic.
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)
        if extra_costs is None:
            costs = {}
        else:
            # Work on a shallow copy so that registering the main cost
            # below does not modify the caller's dictionary.
            costs = copy.copy(extra_costs)
        assert '' not in costs
        costs[''] = cost
        supervised = any(c.supervised for c in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(
                X.dtype)
        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(
                    batch_size).astype(Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X

        # Gather the channels defined by the costs (prefixed with the cost
        # name) and by the model.
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cur_cost = costs[cost_name]
            raw_channels = cur_cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix + name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for cost_name in costs:
                cur_cost = costs[cost_name]
                cost_value = cur_cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name, ipt=ipt,
                                     val=cost_value, dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key, ipt=ipt,
                                 val=custom_channels[key],
                                 dataset=cur_dataset)
def get_gradients(self, model, data, **kwargs): """ .. todo:: WRITEME """ self.get_data_specs(model)[0].validate(data) obj, scratch = self.base_cost.expr(model, data, return_locals=True, **kwargs) if self.supervised: assert isinstance(data, (list, tuple)) assert len(data) == 2 (X, Y) = data else: X = data H_hat = scratch['H_hat'] terms = scratch['terms'] hidden_layers = scratch['hidden_layers'] grads = OrderedDict() assert len(H_hat) == len(terms) assert len(terms) == len(hidden_layers) num_layers = len(hidden_layers) for i in xrange(num_layers): state = H_hat[i] layer = model.hidden_layers[i] term = terms[i] if term == 0.: continue else: print 'term is ',term if i == 0: state_below = X layer_below = model.visible_layer else: layer_below = model.hidden_layers[i-1] state_below = H_hat[i-1] state_below = layer_below.upward_state(state_below) components = flatten(state) real_grads = T.grad(term, components) fake_state = layer.linear_feed_forward_approximation(state_below) fake_components = flatten(fake_state) real_grads = OrderedDict(safe_zip(fake_components, real_grads)) params = list(layer.get_params()) fake_grads = pylearn2.utils.grad( cost=None, consider_constant=flatten(state_below), wrt=params, known_grads=real_grads ) for param, grad in safe_zip(params, fake_grads): if param in grads: grads[param] = grads[param] + grad else: grads[param] = grad return grads, OrderedDict()
def get_monitoring_channels(self, data):
    """
    Return the monitoring channels of the encoder, if there is one.

    Parameters
    ----------
    data : object
        Passed to the encoder as ``state_below``.

    Returns
    -------
    rval : OrderedDict or object
        An empty OrderedDict when ``self.encoder`` is None, otherwise
        whatever ``self.encoder.get_layer_monitoring_channels`` returns.
    """
    if self.encoder is None:
        return OrderedDict()
    return self.encoder.get_layer_monitoring_channels(state_below=data)
class Cost(object):
    """
    Represents a cost that can be called either as a supervised cost or
    an unsupervised cost.
    """

    # If True, the data argument to expr and get_gradients must be a
    # (X, Y) pair, and Y cannot be None.
    supervised = False

    def expr(self, model, data, **kwargs):
        """
        Returns a theano expression for the cost function.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments. Not used by the base class.

        Returns
        -------
        cost : tensor_like or None
            A symbolic expression for a cost function applied to the
            minibatch of data. Optionally, may return None. This
            represents that the cost function is intractable but may be
            optimized via the get_gradients method.

        Raises
        ------
        NotImplementedError
            Always, in the base class; subclasses must override.
        """
        raise NotImplementedError(str(type(self)) + " does not implement "
                                  "expr.")

    def get_gradients(self, model, data, **kwargs):
        """
        Provides the gradients of the cost function with respect to the
        model parameters.

        These are not necessarily those obtained by
        theano.tensor.grad--you may wish to use approximate or even
        intentionally incorrect gradients in some cases.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, forwarded to expr.

        Returns
        -------
        gradients : OrderedDict
            a dictionary mapping from the model's parameters
            to their gradients
            The default implementation is to compute the gradients
            using T.grad applied to the value returned by expr.
            However, subclasses may return other values for the gradient.
            For example, an intractable cost may return a sampling-based
            approximation to its gradient.
        updates : OrderedDict
            a dictionary mapping shared variables to updates that must
            be applied to them each time these gradients are computed.
            This is to facilitate computation of sampling-based
            approximate gradients.
            The parameters should never appear in the updates dictionary.
            This would imply that computing their gradient changes
            their value, thus making the gradient value outdated.

        Raises
        ------
        TypeError
            Re-raised from expr, with the name of this class appended to
            the message and the original traceback preserved.
        NotImplementedError
            If expr returns None, i.e. the cost is intractable and the
            subclass does not override this method.
        """
        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError as e:
            # Append the failing class to the message. The message is
            # stored in e.args (that is what str(e) reads), and a bare
            # `raise` re-raises with the original traceback intact.
            suffix = " while calling " + str(type(self)) + ".expr"
            if e.args:
                message = "%s%s" % (e.args[0], suffix)
                e.args = (message,) + tuple(e.args[1:])
            else:
                message = suffix
                e.args = (message,)
            logger.error(type(self))
            logger.error(message)
            raise

        if cost is None:
            raise NotImplementedError(str(type(self)) +
                                      " represents an intractable cost and "
                                      "does not provide a gradient "
                                      "approximation scheme.")

        params = list(model.get_params())
        # Parameters the cost does not depend on are tolerated.
        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))
        updates = OrderedDict()

        return gradients, updates