def get_adadelta_update(params, grads, rho, eps):
    """
    Build the list of Theano update pairs for one AdaDelta step.

    Parameters
    ----------
    params : list of theano shared variables
        The parameters to update. Each must support `get_value()`.
    grads : list of theano expressions
        Gradient for each entry of `params`, in the same order.
    rho : float
        Decay rate of the two running averages.
    eps : float
        Constant added inside both square roots.

    Returns
    -------
    updates : list of (shared variable, expression) pairs
        Updates for the squared-gradient accumulators, then for the
        squared-step accumulators, then for the parameters themselves.
    """
    def _zeros_like_shared(param):
        # Accumulator with the same shape as `param`, initialised to zero.
        return theano.shared(np.zeros(param.get_value().shape,
                                      dtype=theano.config.floatX),
                             borrow=True)

    # E[g^2]_{t-1}
    E_g_square = [_zeros_like_shared(p) for p in params]

    # E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    E_g_square_next = [rho * e + (1.0 - rho) * (g ** 2)
                       for e, g in izip(E_g_square, grads)]

    # E[dW^2]_{t-1}
    E_dW_square = [_zeros_like_shared(p) for p in params]

    # dW_t = - {sqrt(E[dW^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps)} * g_t
    # The denominator must be the *updated* gradient average E[g^2]_t;
    # using the stale E[g^2]_{t-1} makes the very first step an unscaled
    # gradient step because both accumulators start at zero.
    dW = [-(T.sqrt(ew + eps) / T.sqrt(eg + eps)) * g
          for ew, eg, g in izip(E_dW_square, E_g_square_next, grads)]

    # E[dW^2]_t = rho * E[dW^2]_{t-1} + (1 - rho) * dW_t^2
    E_dW_square_next = [rho * ew + (1.0 - rho) * (d ** 2)
                        for ew, d in izip(E_dW_square, dW)]

    E_g_square_updates = list(izip(E_g_square, E_g_square_next))
    E_dW_square_updates = list(izip(E_dW_square, E_dW_square_next))

    # W_t = W_{t-1} + dW_t
    params_updates = [(p, p + d) for p, d in izip(params, dW)]

    return E_g_square_updates + E_dW_square_updates + params_updates
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values
    as with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    `scales`, `shapes` and `learning_rate` are read from the enclosing
    scope.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Per-parameter running averages used by the manual implementation.
    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        """Return the expected parameter values after one AdaDelta step."""
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            # The parameter value itself is used as the gradient here.
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            # scale * learning_rate is the constant added under both roots.
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    # Two rounds: the expected values must be computed *before* training,
    # since they are derived from the current parameter values.
    for _ in range(2):
        manual = adadelta_manual(model, state)
        sgd.train(dataset=dataset)
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(manual, model.get_params()))
def test_adagrad():
    """
    Check learning_rule.AdaGrad against a hand-written NumPy AdaGrad.

    A dummy model with one learning rate scaler per parameter is trained
    for two steps; after each step the parameters held by the model must
    match the manually computed ones. `scales`, `shapes` and
    `learning_rate` are read from the enclosing scope.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    # A second (zero-weighted) cost is included so that data is actually
    # queried from the training set and the expected number of updates
    # is applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # One accumulator of squared gradients per parameter.
    state = {}
    for shared_param in model.get_params():
        state[shared_param] = {'sg2': np.zeros(shared_param.get_value().shape)}

    def adagrad_manual(model, state):
        """Return the expected parameter values after one AdaGrad step."""
        expected = []
        for scale, param in izip(scales, model.get_params()):
            accum = state[param]
            value = param.get_value()
            # begin adagrad
            accum['sg2'] += value ** 2
            step = - (scale * learning_rate / np.sqrt(accum['sg2']) * value)
            expected.append(value + step)
        return expected

    def _assert_model_matches(expected):
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(expected, model.get_params()))

    # First step.
    expected = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    _assert_model_matches(expected)

    # Second step.
    expected = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    _assert_model_matches(expected)
def set_input_space(self, space):
    """
    Configure the layer for a given input space.

    Note: this function will reset the parameters!

    Sets `self.input_space`, `self.detector_space` and `self.b`, and calls
    `initialize_transformer` and `initialize_output_space`.

    Parameters
    ----------
    space : Conv2DSpace
        The space this layer receives its input in.

    Raises
    ------
    BadInputSpaceError
        If `space` is not a Conv2DSpace.
    ValueError
        If no padding is set and `self.border_mode` is neither 'valid'
        nor 'full'.
    """
    self.input_space = space

    if not isinstance(space, Conv2DSpace):
        raise BadInputSpaceError(self.__class__.__name__ +
                                 ".set_input_space "
                                 "expected a Conv2DSpace, got " +
                                 str(space) + " of type " +
                                 str(type(space)))

    rng = self.get_mlp().rng

    if self.pad != (0, 0):
        output_shape = \
            [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1
             for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape,
                                                 self.kernel_shape,
                                                 self.kernel_stride,
                                                 self.pad)]
    elif self.border_mode == 'valid':
        # `//` keeps the shape integral regardless of the division mode.
        output_shape = [(self.input_space.shape[0] - self.kernel_shape[0])
                        // self.kernel_stride[0] + 1,
                        (self.input_space.shape[1] - self.kernel_shape[1])
                        // self.kernel_stride[1] + 1]
    elif self.border_mode == 'full':
        output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                        // self.kernel_stride[0] - 1,
                        (self.input_space.shape[1] + self.kernel_shape[1])
                        // self.kernel_stride[1] - 1]
    else:
        # Previously this fell through and failed later on an unbound
        # `output_shape`.
        raise ValueError(self.__class__.__name__ + ".set_input_space: "
                         "unsupported border_mode " +
                         repr(self.border_mode))

    logger.debug('In: {0} {1} {2} {3} {4}'.format(self.layer_name,
                                                  self.input_space.shape,
                                                  self.kernel_shape,
                                                  self.kernel_stride,
                                                  self.pad))
    logger.debug('Out: {0} {1}'.format(self.layer_name, output_shape))

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.output_channels,
                                      axes=('b', 'c', 0, 1))

    self.initialize_transformer(rng)

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    # Only tied biases are supported here; the untied branch below is
    # reachable only when assertions are disabled.
    assert self.tied_b
    if self.tied_b:
        # One bias per output channel.
        self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                         self.init_bias)
    else:
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))

    self.initialize_output_space()
def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of the cost with respect to the model parameters.

    The cost expression is obtained from `self.expr` and differentiated
    with `T.grad`. Subclasses may override this to return other values,
    for example approximate or even intentionally incorrect gradients.

    Parameters
    ----------
    model : a pylearn2 Model instance
    data : a batch in cost.get_data_specs() form
    kwargs : dict
        Optional extra arguments, forwarded to `self.expr`.

    Returns
    -------
    gradients : OrderedDict
        Maps each of the model's parameters to its gradient.
    updates : OrderedDict
        Maps shared variables to updates that must be applied each time
        the gradients are computed. This implementation returns it empty.
        The parameters should never appear in this dictionary: that would
        mean computing their gradient changes their value, making the
        gradient outdated.

    Raises
    ------
    TypeError
        If calling `self.expr` raises a TypeError.
    NotImplementedError
        If `self.expr` returns None.
    """
    try:
        cost = self.expr(model=model, data=data, **kwargs)
    except TypeError:
        # Re-raise with the name of the offending cost class in the
        # message.
        reraise_as(TypeError("Error while calling " + str(type(self)) +
                             ".expr"))

    if cost is None:
        raise NotImplementedError(str(type(self)) +
                                  " represents an intractable cost and "
                                  "does not provide a gradient "
                                  "approximation scheme.")

    param_list = list(model.get_params())
    # Parameters the cost does not depend on are ignored rather than
    # reported as an error.
    grad_list = T.grad(cost, param_list, disconnected_inputs='ignore')

    return OrderedDict(izip(param_list, grad_list)), OrderedDict()
def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of the cost with respect to the model parameters.

    `self.expr` is expected to return a pair; its first element is the
    cost that gets differentiated with `T.grad`, its second element is
    not used here.

    Parameters
    ----------
    model : a pylearn2 Model instance
    data : a batch in cost.get_data_specs() form
    kwargs : dict
        Optional extra arguments, forwarded to `self.expr`.

    Returns
    -------
    gradients : OrderedDict
        Maps each of the model's parameters to its gradient.
    updates : OrderedDict
        Maps shared variables to updates that must be applied each time
        the gradients are computed. This implementation returns it empty.
        The parameters should never appear in this dictionary: that would
        mean computing their gradient changes their value, making the
        gradient outdated.

    Raises
    ------
    TypeError
        If calling `self.expr` or unpacking its result raises a TypeError.
    NotImplementedError
        If the cost returned by `self.expr` is None.
    """
    try:
        cost, _unused_mask = self.expr(model=model, data=data, **kwargs)
    except TypeError:
        # Re-raise with the name of the offending cost class in the
        # message.
        reraise_as(TypeError("Error while calling " + str(type(self)) +
                             ".expr"))

    if cost is None:
        raise NotImplementedError(str(type(self)) +
                                  " represents an intractable cost and "
                                  "does not provide a gradient "
                                  "approximation scheme.")

    param_list = list(model.get_params())
    # Parameters the cost does not depend on are ignored rather than
    # reported as an error.
    grad_list = T.grad(cost, param_list, disconnected_inputs='ignore')

    return OrderedDict(izip(param_list, grad_list)), OrderedDict()
def set_input_space(self, space):
    """
    Configure the layer for a given input space.

    Note: this function will reset the parameters!

    Sets `self.input_space`, `self.detector_space` and `self.b`, and calls
    `initialize_transformer` and `initialize_output_space`.

    Parameters
    ----------
    space : Conv2DSpace
        The space this layer receives its input in.

    Raises
    ------
    BadInputSpaceError
        If `space` is not a Conv2DSpace.
    ValueError
        If no padding is set and `self.border_mode` is neither 'valid'
        nor 'full'.
    """
    self.input_space = space

    if not isinstance(space, Conv2DSpace):
        raise BadInputSpaceError(self.__class__.__name__ +
                                 ".set_input_space "
                                 "expected a Conv2DSpace, got " +
                                 str(space) + " of type " +
                                 str(type(space)))

    rng = self.get_mlp().rng

    if self.pad != (0, 0):
        output_shape = \
            [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1
             for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape,
                                                 self.kernel_shape,
                                                 self.kernel_stride,
                                                 self.pad)]
    elif self.border_mode == 'valid':
        # `//` keeps the shape integral regardless of the division mode.
        output_shape = [(self.input_space.shape[0] - self.kernel_shape[0])
                        // self.kernel_stride[0] + 1,
                        (self.input_space.shape[1] - self.kernel_shape[1])
                        // self.kernel_stride[1] + 1]
    elif self.border_mode == 'full':
        output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                        // self.kernel_stride[0] - 1,
                        (self.input_space.shape[1] + self.kernel_shape[1])
                        // self.kernel_stride[1] - 1]
    else:
        # Previously this fell through and failed later on an unbound
        # `output_shape`.
        raise ValueError(self.__class__.__name__ + ".set_input_space: "
                         "unsupported border_mode " +
                         repr(self.border_mode))

    logger.debug('In: {0} {1} {2} {3} {4}'.format(self.layer_name,
                                                  self.input_space.shape,
                                                  self.kernel_shape,
                                                  self.kernel_stride,
                                                  self.pad))
    logger.debug('Out: {0} {1}'.format(self.layer_name, output_shape))

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.output_channels,
                                      axes=('b', 'c', 0, 1))

    self.initialize_transformer(rng)

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    if self.tied_b:
        # Tied biases: one bias per output channel.
        self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                         self.init_bias)
    else:
        # Untied biases: shaped like the detector space's origin.
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))

    self.initialize_output_space()
def build_stacked_ae(nvis, nhids, act_enc, act_dec,
                     tied_weights=False, irange=1e-3, rng=None,
                     corruptor=None, contracting=False):
    """
    Allocate a stack of autoencoders.

    Parameters
    ----------
    nvis : int
        Number of visible units of the first layer.
    nhids : sequence of int
        Number of hidden units of each layer; its length sets the number
        of layers.
    act_enc, act_dec, tied_weights, irange, corruptor, contracting
        Per-layer settings. Each one is either a single value, which is
        reused for every layer, or a non-string object with a `__len__`
        whose length must equal `len(nhids)`.
    rng : object, optional
        Passed through `make_np_rng`.

    Returns
    -------
    StackedBlocks
        One autoencoder per entry of `nhids`: a ContractiveAutoencoder
        when `contracting` is set for the layer, a DenoisingAutoencoder
        when a corruptor is given, a plain Autoencoder otherwise.

    Raises
    ------
    ValueError
        If a layer requests both a corruptor and the contractive variant.
    """
    rng = make_np_rng(rng, which_method='randn')
    # Accept any sequence (e.g. a tuple), not only lists.
    nhids = list(nhids)
    layers = []
    final = {}

    # Explicit mapping instead of looking names up through locals().
    per_layer = {
        'corruptor': corruptor,
        'contracting': contracting,
        'act_enc': act_enc,
        'act_dec': act_dec,
        'tied_weights': tied_weights,
        'irange': irange,
    }
    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    for name, value in per_layer.items():
        if not isinstance(value, str) and hasattr(value, '__len__'):
            assert len(nhids) == len(value), \
                "%s must have one entry per layer" % name
            final[name] = value
        else:
            final[name] = [value] * len(nhids)

    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(nhids,
               nviss,
               final['act_enc'],
               final['act_dec'],
               final['corruptor'],
               final['contracting'],
               final['tied_weights'],
               final['irange'])

    # Create each layer. Loop names are distinct from the function
    # arguments so that nothing is shadowed.
    for (layer_nhid, layer_nvis, enc, dec, corr, cae, tied, ir) in seq:
        args = (layer_nvis, layer_nhid, enc, dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)
def test_momentum():
    """
    Check learning_rule.Momentum against hand-written SGD with momentum.

    A dummy model with one learning rate scaler per parameter is trained
    for two steps; after each step the model's parameters must match the
    manually computed ones.
    """
    # A second (zero-weighted) cost is included so that data is actually
    # queried from the training set and the expected number of updates
    # is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001
    momentum = 0.5

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def _assert_model_matches(expected):
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(expected, model.get_params()))

    # First step: the increment is just the scaled learning rate.
    start = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [value + step for value, step in izip(start, inc)]
    sgd.train(dataset=dataset)
    _assert_model_matches(manual)

    # Second step: the previous increment is carried over, weighted by
    # the momentum coefficient.
    manual = [value - learning_rate * scale + step * momentum
              for value, scale, step in izip(manual, scales, inc)]
    sgd.train(dataset=dataset)
    _assert_model_matches(manual)
def __init__(self, autoencoders):
    """
    Compose a sequence of autoencoders into one deep autoencoder.

    Parameters
    ----------
    autoencoders : iterable of autoencoder objects
        Must be non-empty. The output space dimension of each one must
        equal the input space dimension of the next.
    """
    super(DeepComposedAutoencoder, self).__init__()
    self.fn = None
    self.cpu_only = False

    # Materialise first so that any iterable (not just a sliceable
    # sequence) can be passed; this is also the private copy that is kept.
    autoencoders = list(autoencoders)

    # Consecutive layers must be dimensionally compatible.
    assert all(pre.get_output_space().dim == post.get_input_space().dim
               for pre, post in izip(autoencoders[:-1], autoencoders[1:]))

    self.autoencoders = autoencoders
    self.input_space = autoencoders[0].get_input_space()
    self.output_space = autoencoders[-1].get_output_space()
def adagrad_manual(model, state):
    """
    Compute the parameter values expected after one AdaGrad step.

    `scales` and `learning_rate` are read from the enclosing scope.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict whose 'sg2' entry accumulates the
        squared gradients; it is updated in place.

    Returns
    -------
    list
        The expected new value for each parameter, in parameter order.
    """
    expected = []
    for scale, param in izip(scales, model.get_params()):
        accum = state[param]
        value = param.get_value()
        # begin adagrad
        # The parameter value itself is used as the gradient here.
        accum['sg2'] += value**2
        step = -(scale * learning_rate / np.sqrt(accum['sg2']) * value)
        expected.append(value + step)
    return expected
def adagrad_manual(model, state):
    """
    Reference NumPy implementation of a single AdaGrad update.

    Uses `scales` and `learning_rate` from the enclosing scope and
    updates the 'sg2' accumulators stored in `state` in place.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict holding its 'sg2' accumulator.

    Returns
    -------
    list
        Expected parameter values after the update, in parameter order.
    """
    new_values = []
    for lr_scale, shared_param in izip(scales, model.get_params()):
        current = shared_param.get_value()
        param_state = state[shared_param]
        # begin adagrad
        # Accumulate the squared gradient (the parameter value is used as
        # the gradient).
        param_state['sg2'] += current ** 2
        delta = - (lr_scale * learning_rate /
                   np.sqrt(param_state['sg2']) * current)
        new_values += [current + delta]
    return new_values
def get_gradients(self, model, data, **kwargs):
    """
    Overrides Cost.get_gradients so that a custom theano Op is injected.

    One OverwriteOp is created and called per model parameter, so there
    is a separate call back for each of them. Consider rewriting your
    model to have a single parameter.

    Parameters
    ----------
    model : object exposing `get_params()`
    data : the batch handed to each OverwriteOp
    kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    gradients : OrderedDict
        Maps each parameter to the output of its OverwriteOp.
    updates : OrderedDict
        Always empty.
    """
    srng = RandomStreams(seed=232)

    gradients = OrderedDict()
    for param in list(model.get_params()):
        op = OverwriteOp(self.grad, model)
        # The op is fed a uniform random tensor shaped like the parameter.
        noise = srng.uniform(size=param.shape, dtype=theano.config.floatX)
        gradients[param] = op(noise, data)

    return gradients, OrderedDict()
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values
    as with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Running average of squared gradients, one per parameter.
    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        """Return the expected parameter values after one RMSProp step."""
        rval = []
        # Lower bound on the RMS so that the step is never scaled up by
        # more than max_scaling.
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            # The parameter value itself is used as the gradient here.
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = - scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    # The expected values must be computed before training, since they
    # are derived from the current parameter values.
    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values
    as with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Running average of squared gradients, one per parameter.
    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        """Return the expected parameter values after one RMSProp step."""
        rval = []
        # Lower bound on the RMS so that the step is never scaled up by
        # more than max_scaling.
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            # The parameter value itself is used as the gradient here.
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = -scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    # The expected values must be computed before training, since they
    # are derived from the current parameter values.
    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
def rmsprop_manual(model, state):
    """
    Compute the parameter values expected after one RMSProp step.

    `scales`, `learning_rate`, `decay` and `max_scaling` are read from
    the enclosing scope.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict whose 'g2' entry holds the running
        average of squared gradients; the entry is replaced on each call.

    Returns
    -------
    list
        The expected new value for each parameter, in parameter order.
    """
    rval = []
    # Lower bound on the RMS so that the step is never scaled up by more
    # than max_scaling.
    epsilon = 1. / max_scaling
    for scale, param in izip(scales, model.get_params()):
        pstate = state[param]
        param_val = param.get_value()
        # begin rmsprop
        # The parameter value itself is used as the gradient here.
        pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
        rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
        dx_t = - scale * learning_rate / rms_g_t * param_val
        rval += [param_val + dx_t]
    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate the cost while holding the second output of `_cost`
    constant.

    Parameters
    ----------
    model : object exposing `get_params()`
    data : the batch forwarded to `self._cost`
    kwargs : dict
        Extra arguments forwarded to `self._cost`.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient.
    updates : OrderedDict
        Always empty.
    """
    cost, neg_v = self._cost(model, data, **kwargs)

    param_list = list(model.get_params())

    # neg_v is treated as a constant, so no gradient flows through it.
    # Parameters disconnected from the cost are ignored.
    grad_list = T.grad(cost,
                       param_list,
                       disconnected_inputs='ignore',
                       consider_constant=[neg_v])

    return OrderedDict(izip(param_list, grad_list)), OrderedDict()
def adadelta_manual(model, state):
    """
    Compute the parameter values expected after one AdaDelta step.

    `scales`, `learning_rate` and `decay` are read from the enclosing
    scope.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict with the running averages 'g2'
        (squared gradients) and 'dx2' (squared steps); both entries are
        replaced on each call.

    Returns
    -------
    list
        The expected new value for each parameter, in parameter order.
    """
    expected = []
    for scale, param in izip(scales, model.get_params()):
        averages = state[param]
        value = param.get_value()
        # begin adadelta
        # The parameter value itself is used as the gradient here, and
        # scale * learning_rate is the constant added under both roots.
        averages['g2'] = decay * averages['g2'] + (1 - decay) * value ** 2
        rms_grad = np.sqrt(averages['g2'] + scale * learning_rate)
        rms_prev_step = np.sqrt(averages['dx2'] + scale * learning_rate)
        step = -rms_prev_step / rms_grad * value
        averages['dx2'] = decay * averages['dx2'] + (1 - decay) * step ** 2
        expected.append(value + step)
    return expected
def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of the cost produced by `self._cost`.

    `self._cost` returns the cost together with a second expression,
    which is held constant during differentiation.

    Parameters
    ----------
    model : object exposing `get_params()`
    data : the batch forwarded to `self._cost`
    kwargs : dict
        Extra arguments forwarded to `self._cost`.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient.
    updates : OrderedDict
        Always empty.
    """
    cost_expr, held_constant = self._cost(model, data, **kwargs)
    model_params = list(model.get_params())

    # Disconnected parameters are ignored instead of raising.
    model_grads = T.grad(cost_expr, model_params,
                         disconnected_inputs='ignore',
                         consider_constant=[held_constant])

    gradients = OrderedDict(izip(model_params, model_grads))
    return gradients, OrderedDict()
def rmsprop_manual(model, state):
    """
    Compute the parameter values expected after one RMSProp step.

    `scales`, `learning_rate`, `decay` and `max_scaling` are read from
    the enclosing scope.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict whose 'g2' entry holds the running
        average of squared gradients; the entry is replaced on each call.

    Returns
    -------
    list
        The expected new value for each parameter, in parameter order.
    """
    rval = []
    # Lower bound on the RMS so that the step is never scaled up by more
    # than max_scaling.
    epsilon = 1. / max_scaling
    for scale, param in izip(scales, model.get_params()):
        pstate = state[param]
        param_val = param.get_value()
        # begin rmsprop
        # The parameter value itself is used as the gradient here.
        pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
        rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
        dx_t = -scale * learning_rate / rms_g_t * param_val
        rval += [param_val + dx_t]
    return rval
def adadelta_manual(model, state):
    """
    Reference NumPy implementation of a single AdaDelta update.

    Uses `scales`, `learning_rate` and `decay` from the enclosing scope
    and replaces the 'g2' and 'dx2' running averages stored in `state`.

    Parameters
    ----------
    model : object exposing `get_params()`
    state : dict
        Maps each parameter to a dict holding 'g2' and 'dx2'.

    Returns
    -------
    list
        Expected parameter values after the update, in parameter order.
    """
    new_values = []
    for lr_scale, shared_param in izip(scales, model.get_params()):
        current = shared_param.get_value()
        param_state = state[shared_param]
        # lr_scale * learning_rate is the constant added under both
        # square roots.
        offset = lr_scale * learning_rate
        # begin adadelta
        # The parameter value itself is used as the gradient here.
        param_state['g2'] = (decay * param_state['g2'] +
                             (1 - decay) * current**2)
        rms_g_t = np.sqrt(param_state['g2'] + offset)
        rms_dx_tm1 = np.sqrt(param_state['dx2'] + offset)
        delta = -rms_dx_tm1 / rms_g_t * current
        param_state['dx2'] = (decay * param_state['dx2'] +
                              (1 - decay) * delta**2)
        new_values += [current + delta]
    return new_values
def build_stacked_ae(nvis, nhids, act_enc, act_dec,
                     tied_weights=False, irange=1e-3, rng=None,
                     corruptor=None, contracting=False):
    """
    Allocate a stack of autoencoders.

    Parameters
    ----------
    nvis : int
        Number of visible units of the first layer.
    nhids : sequence of int
        Number of hidden units of each layer; its length sets the number
        of layers.
    act_enc, act_dec, tied_weights, irange, corruptor, contracting
        Per-layer settings. Each one is either a single value, which is
        reused for every layer, or a non-string object with a `__len__`
        whose length must equal `len(nhids)`.
    rng : object, optional
        Passed through `make_np_rng`.

    Returns
    -------
    StackedBlocks
        One autoencoder per entry of `nhids`: a ContractiveAutoencoder
        when `contracting` is set for the layer, a DenoisingAutoencoder
        when a corruptor is given, a plain Autoencoder otherwise.

    Raises
    ------
    ValueError
        If a layer requests both a corruptor and the contractive variant.
    """
    rng = make_np_rng(rng, which_method='randn')
    # Accept any sequence (e.g. a tuple), not only lists.
    nhids = list(nhids)
    n_layers = len(nhids)

    # Explicit (name, value) pairs instead of looking names up through
    # locals().
    settings = (('corruptor', corruptor),
                ('contracting', contracting),
                ('act_enc', act_enc),
                ('act_dec', act_dec),
                ('tied_weights', tied_weights),
                ('irange', irange))

    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    final = {}
    for name, value in settings:
        if not isinstance(value, str) and hasattr(value, '__len__'):
            assert n_layers == len(value), \
                "%s must have one entry per layer" % name
            final[name] = value
        else:
            final[name] = [value] * n_layers

    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(nhids, nviss,
               final['act_enc'],
               final['act_dec'],
               final['corruptor'],
               final['contracting'],
               final['tied_weights'],
               final['irange'])

    # Create each layer. Loop names are distinct from the function
    # arguments so that nothing is shadowed.
    layers = []
    for (n_hid, n_vis, enc, dec, corr, cae, tied, ir) in seq:
        args = (n_vis, n_hid, enc, dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate the cost while holding the sampler's particles constant.

    Parameters
    ----------
    model : object exposing `get_params()`
    data : the batch forwarded to `self._cost`
    kwargs : dict
        Extra arguments forwarded to `self._cost`.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient.
    updates : OrderedDict
        The updates reported by `self.sampler.updates()`.
    """
    cost = self._cost(model, data, **kwargs)

    param_list = list(model.get_params())

    # The particles are treated as constants, so no gradient flows
    # through them. Parameters disconnected from the cost are ignored.
    grad_list = T.grad(cost,
                       param_list,
                       disconnected_inputs='ignore',
                       consider_constant=[self.sampler.particles])
    gradients = OrderedDict(izip(param_list, grad_list))

    updates = OrderedDict()
    updates.update(self.sampler.updates())

    return gradients, updates
def get_gradients(self, model, data, **kwargs):
    """
    Return the cost gradients together with the sampler's updates.

    The cost comes from `self._cost`; `self.sampler.particles` is held
    constant during differentiation.

    Parameters
    ----------
    model : object exposing `get_params()`
    data : the batch forwarded to `self._cost`
    kwargs : dict
        Extra arguments forwarded to `self._cost`.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient.
    updates : OrderedDict
        The updates reported by `self.sampler.updates()`.
    """
    cost_expr = self._cost(model, data, **kwargs)
    model_params = list(model.get_params())

    # Disconnected parameters are ignored instead of raising.
    model_grads = T.grad(cost_expr, model_params,
                         disconnected_inputs='ignore',
                         consider_constant=[self.sampler.particles])
    gradients = OrderedDict(izip(model_params, model_grads))

    # Start from an empty mapping and merge in whatever the sampler
    # needs applied.
    updates = OrderedDict()
    sampler_updates = self.sampler.updates()
    updates.update(sampler_updates)
    return gradients, updates
def test_adagrad():
    """
    Check learning_rule.AdaGrad against a hand-written NumPy AdaGrad.

    The fixtures come from `prepare_adagrad_test()`; `scales` and
    `learning_rate` are read from the enclosing scope. The model is
    trained for two steps and compared with the manual result after each.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    _cost, model, dataset, sgd, state = prepare_adagrad_test()

    def adagrad_manual(model, state):
        """Return the expected parameter values after one AdaGrad step."""
        expected = []
        for scale, param in izip(scales, model.get_params()):
            accum = state[param]
            value = param.get_value()
            # begin adagrad
            accum['sg2'] += value ** 2
            step = - (scale * learning_rate / np.sqrt(accum['sg2']) * value)
            expected.append(value + step)
        return expected

    def _assert_model_matches(expected):
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(expected, model.get_params()))

    # First step.
    expected = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    _assert_model_matches(expected)

    # Second step.
    expected = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    _assert_model_matches(expected)
def test_nesterov_momentum():
    """
    Check learning_rule.Momentum with nesterov_momentum=True against a
    hand-written SGD with Nesterov momentum.

    A dummy model with one learning rate scaler per parameter is trained
    for two steps; after each step the model's parameters must match the
    manually computed ones. `scales`, `shapes` and `learning_rate` are
    read from the enclosing scope.
    """
    # A second (zero-weighted) cost is included so that data is actually
    # queried from the training set and the expected number of updates
    # is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def _advance(velocity):
        # The same expression yields both the next velocity and the
        # applied update.
        return [-learning_rate * scale + v * momentum
                for scale, v in izip(scales, velocity)]

    def _assert_model_matches(expected):
        assert all(
            np.allclose(manual_param, sgd_param.get_value())
            for manual_param, sgd_param
            in izip(expected, model.get_params()))

    # First step.
    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = _advance(vel)
    manual = [value + update for value, update in izip(manual, updates)]
    sgd.train(dataset=dataset)
    _assert_model_matches(manual)

    # Second step: advance the velocity, then look ahead again.
    vel = _advance(vel)
    updates = _advance(vel)
    manual = [value + update for value, update in izip(manual, updates)]
    sgd.train(dataset=dataset)
    _assert_model_matches(manual)
def test_nesterov_momentum():
    """
    Compare learning_rule.Momentum (with nesterov_momentum=True) to a
    manual implementation of SGD with Nesterov momentum.

    Uses a dummy model with a learning rate scaler for each parameter
    and checks the parameter values after each of two training steps.
    `scales`, `shapes` and `learning_rate` are read from the enclosing
    scope.
    """
    # The extra zero-weighted cost makes sure data is actually queried
    # from the training set and the expected number of updates is
    # applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    rule = Momentum(momentum, nesterov_momentum=True)
    sgd = SGD(cost=cost, learning_rate=learning_rate,
              learning_rule=rule, batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def _train_and_compare(expected):
        # Run one training step and compare against the manual values.
        sgd.train(dataset=dataset)
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(expected, model.get_params()))

    manual = [param.get_value() for param in model.get_params()]

    # Step 1: initial velocity, then the look-ahead update.
    vel = [-learning_rate * scale for scale in scales]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]
    _train_and_compare(manual)

    # Step 2: the velocity picks up momentum before the look-ahead.
    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]
    _train_and_compare(manual)
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values
    as with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Per-parameter running averages used by the manual implementation.
    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        """Return the expected parameter values after one AdaDelta step."""
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            # The parameter value itself is used as the gradient here.
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            # scale * learning_rate is the constant added under both roots.
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = - rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    # Two rounds: the expected values must be computed *before* training,
    # since they are derived from the current parameter values.
    for _ in range(2):
        manual = adadelta_manual(model, state)
        sgd.train(dataset=dataset)
        assert all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(manual, model.get_params()))
def feature_sign_search(dictionary, signals, sparsity, max_iter=1000,
                        solution=None):
    r"""
    Solve L1-penalized quadratic minimization problems with
    feature-sign search.

    Employs the feature sign search algorithm of Lee et al (2006)
    to solve an L1-penalized quadratic optimization problem as a
    sequence of unconstrained quadratic minimization problems over
    subsets of the variables, with candidates for non-zero elements
    chosen by means of a gradient-based criterion.

    Parameters
    ----------
    dictionary : array_like, 2-dimensional
        The dictionary of basis functions from which to form the
        sparse linear combination. Each column constitutes a basis
        vector for the sparse code. There should be as many rows as
        input dimensions in the signal.
    signals : array_like, 1- or 2-dimensional
        The signal(s) to be decomposed as a sparse linear combination
        of the columns of the dictionary. If 2-dimensional, each
        different signal (training case) should be a row of this matrix.
    sparsity : float
        The coefficient on the L1 penalty term of the cost function.
    max_iter : int, optional
        The maximum number of iterations to run, per code vector, if
        the optimization has still not converged. Default is 1000.
    solution : ndarray, 1- or 2-dimensional, optional
        Pre-allocated vector or matrix used to store the solution(s).
        If provided, it should have the same rank as `signals`. If
        2-dimensional, it should have as many rows as `signals`.

    Returns
    -------
    solution : ndarray, 1- or 2-dimensional
        Matrix where each row contains the solution corresponding to a
        row of `signals`. If an array was passed in as the argument
        `solution`, it will be updated in place and the same object will
        be returned.

    Notes
    -----
    It might seem more natural, from a linear-algebraic point of
    view, to think of both `signals` and `solution` as matrices with
    training examples contained as column vectors; then the overall
    cost function being minimized is

    .. math::
        (Y - AX)^2 + \gamma \sum_{i,j} |X_{ij}|

    with :math:`A` representing the dictionary, :math:`Y` being
    `signals.T` and :math:`X` being `solution.T`. However, in order to
    maintain the convention of training examples being indexed along
    the first dimension in the case of 2-dimensional `signals` input
    (as well as provide faster computation via memory locality in the
    case of C-contiguous inputs), this function expects and returns
    input with training examples as rows of a matrix.

    References
    ----------
    .. [1] H. Lee, A. Battle, R. Raina, and A. Y. Ng. "Efficient
       sparse coding algorithms". Advances in Neural Information
       Processing Systems 19, 2007.
    """
    dictionary = np.asarray(dictionary)
    _feature_sign_checkargs(dictionary, signals, sparsity, max_iter, solution)

    # Make the code a bit simpler by always forcing the
    # 2-dimensional case.
    signals_ndim = signals.ndim
    signals = np.atleast_2d(signals)
    if solution is None:
        solution = np.zeros((signals.shape[0], dictionary.shape[1]),
                            dtype=signals.dtype)
        orig_sol = None
    else:
        # Remember the caller's object so it can be handed back as is.
        orig_sol = solution
        solution = np.atleast_2d(solution)

    # Solve each minimization in sequence; each row of `solution` is
    # passed to the single-signal solver as its output buffer.
    for row, (signal, sol) in enumerate(izip(signals, solution)):
        _, iters = _feature_sign_search_single(dictionary, signal, sparsity,
                                               max_iter, sol)
        if iters >= max_iter:
            # Report the index of the training case, not the iteration
            # count (which is always >= max_iter at this point).
            log.warning(
                "maximum number of iterations reached when "
                "optimizing code for training case %d; solution "
                "may not be optimal" % row
            )

    # Attempt to return the exact same object reference.
    if orig_sol is not None and orig_sol.ndim == 1:
        solution = orig_sol
    # Return a vector with the same rank as the input `signals`.
    elif orig_sol is None and signals_ndim == 1:
        solution = solution.squeeze()
    return solution
def pooling_matrix(groups, per_group, strides=None, dtype=None, sparse=None):
    """
    Construct a pooling matrix, optionally with overlapping pools
    arranged in a 1 or 2D topology.

    Parameters
    ----------
    groups : int or tuple
        The grid dimensions of a 1- or 2-dimensional pooling grid.
    per_group : int or tuple
        The grid dimensions of a single 1- or 2-dimensional feature
        pool. Must be same length as `groups`.
    strides : int or tuple, optional
        The stride of the pools along each dimension. A value of
        `None` is equivalent to setting equal to `per_group`, i.e.
        no overlap.
    dtype : dtype object or str, optional
        The dtype of the resulting pooling matrix. Defaults to
        `theano.config.floatX`.
    sparse : str, optional
        If `None`, the function will return a dense matrix (a rank-2
        `numpy.ndarray`). Specifying 'csc' or 'csr' in this argument
        will cause the function to return a `scipy.sparse.csc_matrix`
        or a `scipy.sparse.csr_matrix`, instead.

    Returns
    -------
    pools : ndarray or sparse matrix
        Either a dense 2-dimensional NumPy array or one of
        `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`,
        depending on the value of the `sparse` argument. In any case,
        the shape is `(n_pools, n_filters)` and the value of
        `pools[i, j]` is 1 if feature `j` is in pool `i`, and 0
        otherwise.

    Raises
    ------
    TypeError
        If `groups`, `per_group` or `strides` is neither an int nor an
        iterable of ints.
    ValueError
        If a shape argument is empty, the shape arguments differ in
        length or have more than 2 dimensions, a stride exceeds the
        corresponding `per_group` dimension, or `sparse` is not one of
        `None`, 'csr', 'csc'.
    """
    def _validate_shape(shape, param_name):
        """Normalize an int or iterable of ints to a non-empty int tuple."""
        try:
            # Keep the converted values so that integral floats or NumPy
            # scalars inside a tuple become real ints for the index
            # arithmetic below.
            shape = tuple(int(val) for val in shape)
        except (ValueError, TypeError):
            try:
                shape = (int(shape),)
            except (ValueError, TypeError):
                reraise_as(TypeError("%s must be int or int tuple"
                                     % param_name))
        if not shape:
            raise ValueError("%s must not be empty" % param_name)
        return shape

    # Error-check arguments and fill in row_stride and col_stride
    # if either argument is absent.
    groups = _validate_shape(groups, 'groups')
    per_group = _validate_shape(per_group, 'per_group')
    if strides is not None:
        strides = _validate_shape(strides, 'strides')
    else:
        strides = per_group
    if len(groups) != len(per_group):
        raise ValueError('groups and per_group must have the same length')
    elif len(per_group) != len(strides):
        raise ValueError('per_group and strides must have the same length')
    if len(groups) > 2 or len(per_group) > 2:
        raise ValueError('only <= 2-dimensional pooling grids are supported')
    if not all(stride <= dim for stride, dim in zip(strides, per_group)):
        raise ValueError('strides must each be <= per_group dimensions')
    if len(groups) == 2:
        group_rows, group_cols = groups
        rows_per_group, cols_per_group = per_group
        row_stride, col_stride = strides
    else:
        # A 1-dimensional grid is treated as a 2-dimensional one with a
        # single column.
        group_rows, group_cols = groups[0], 1
        rows_per_group, cols_per_group = per_group[0], 1
        row_stride, col_stride = strides[0], 1
    if sparse is not None and sparse not in ('csc', 'csr'):
        raise ValueError("sparse must be one of (None, 'csr', 'csc')")
    # The total number of filters along either dimension is the
    # number of groups times the stride, plus whatever dangles
    # off the last filter (the added term is zero if there are no
    # overlapping pools).
    filter_rows = group_rows * row_stride + (rows_per_group - row_stride)
    filter_cols = group_cols * col_stride + (cols_per_group - col_stride)
    if dtype is None:
        dtype = theano.config.floatX
    # If the return type is dense we can treat it as a 4-tensor and
    # then reshape. If not we need some index math, which is written in
    # terms of the same 4-tensor shape.
    shape = (group_rows, group_cols, filter_rows, filter_cols)
    matrix_shape = group_rows * group_cols, filter_rows * filter_cols
    if sparse is not None:
        # Use a dictionary-of-keys matrix at construction time,
        # since they are efficient for arbitrary assignment.
        # TODO: I think CSC/CSR are fast to construct if you know the total
        # number of elements, which should be easy to calculate.
        pools = scipy.sparse.dok_matrix(matrix_shape, dtype=dtype)
    else:
        pools = np.zeros(shape, dtype=dtype)
    for g_row in range(group_rows):
        for g_col in range(group_cols):
            # The start and end points of the contiguous block of 1's.
            row_start = row_stride * g_row
            row_end = row_start + rows_per_group
            col_start = col_stride * g_col
            col_end = col_start + cols_per_group
            if sparse is not None:
                # The group (matrix row) to which this pool belongs.
                matrix_row = g_row * shape[1] + g_col
                for f_row in range(row_start, row_end):
                    # Flattened (row-major) column span of this filter row.
                    matrix_cols = slice(f_row * shape[3] + col_start,
                                        f_row * shape[3] + col_end)
                    pools[matrix_row, matrix_cols] = 1.
            else:
                # If the matrix is a dense 4-tensor then we can get
                # away with doing an entire pool in one assignment.
                pools[g_row, g_col,
                      row_start:row_end,
                      col_start:col_end] = 1
    if sparse is not None:
        # Call either .tocsr() or .tocsc()
        pools = getattr(pools, 'to' + sparse)()
    else:
        pools = pools.reshape(matrix_shape)
    return pools
def feature_sign_search(dictionary, signals, sparsity, max_iter=1000,
                        solution=None):
    r"""
    Solve L1-penalized quadratic minimization problems with
    feature-sign search.

    Employs the feature sign search algorithm of Lee et al (2006)
    to solve an L1-penalized quadratic optimization problem as a
    sequence of unconstrained quadratic minimization problems over
    subsets of the variables, with candidates for non-zero elements
    chosen by means of a gradient-based criterion.

    Parameters
    ----------
    dictionary : array_like, 2-dimensional
        The dictionary of basis functions from which to form the
        sparse linear combination. Each column constitutes a basis
        vector for the sparse code. There should be as many rows as
        input dimensions in the signal.
    signals : array_like, 1- or 2-dimensional
        The signal(s) to be decomposed as a sparse linear combination
        of the columns of the dictionary. If 2-dimensional, each
        different signal (training case) should be a row of this matrix.
    sparsity : float
        The coefficient on the L1 penalty term of the cost function.
    max_iter : int, optional
        The maximum number of iterations to run, per code vector, if
        the optimization has still not converged. Default is 1000.
    solution : ndarray, 1- or 2-dimensional, optional
        Pre-allocated vector or matrix used to store the solution(s).
        If provided, it should have the same rank as `signals`. If
        2-dimensional, it should have as many rows as `signals`.

    Returns
    -------
    solution : ndarray, 1- or 2-dimensional
        Matrix where each row contains the solution corresponding to a
        row of `signals`. If an array was passed in as the argument
        `solution`, it will be updated in place and the same object
        will be returned.

    Notes
    -----
    It might seem more natural, from a linear-algebraic point of
    view, to think of both `signals` and `solution` as matrices with
    training examples contained as column vectors; then the overall
    cost function being minimized is

    .. math::
        (Y - AX)^2 + \gamma \sum_{i,j} |X_{ij}|

    with :math:`A` representing the dictionary, :math:`Y` being
    `signals.T` and :math:`X` being `solution.T`. However, in order to
    maintain the convention of training examples being indexed along
    the first dimension in the case of 2-dimensional `signals` input
    (as well as provide faster computation via memory locality in the
    case of C-contiguous inputs), this function expects and returns
    input with training examples as rows of a matrix.

    References
    ----------
    .. [1] H. Lee, A. Battle, R. Raina, and A. Y. Ng. "Efficient
       sparse coding algorithms". Advances in Neural Information
       Processing Systems 19, 2007.
    """
    dictionary = np.asarray(dictionary)
    _feature_sign_checkargs(dictionary, signals, sparsity, max_iter, solution)
    # Make the code a bit simpler by always forcing the
    # 2-dimensional case.
    signals_ndim = signals.ndim
    signals = np.atleast_2d(signals)
    if solution is None:
        solution = np.zeros((signals.shape[0], dictionary.shape[1]),
                            dtype=signals.dtype)
        orig_sol = None
    else:
        orig_sol = solution
        solution = np.atleast_2d(solution)
    # Solve each minimization in sequence. Each `sol` is one row of
    # `solution`, handed to the single-signal solver as its output buffer.
    for row, (signal, sol) in enumerate(izip(signals, solution)):
        _, iters = _feature_sign_search_single(dictionary, signal, sparsity,
                                               max_iter, sol)
        if iters >= max_iter:
            # Report the index of the training case, not the iteration
            # count (which at this point merely equals the limit).
            log.warning("maximum number of iterations reached when "
                        "optimizing code for training case %d; solution "
                        "may not be optimal", row)
    # Attempt to return the exact same object reference.
    if orig_sol is not None and orig_sol.ndim == 1:
        solution = orig_sol
    # Return a vector with the same rank as the input `signals`.
    # Indexing the single row (rather than squeezing) keeps the result
    # 1-dimensional even when the dictionary has only one column.
    elif orig_sol is None and signals_ndim == 1:
        solution = solution[0]
    return solution
def safe_izip(*args):
    """
    Zip the given sequences with `izip` after checking their lengths.

    Every argument must support `len()`. An `AssertionError` is raised
    when the lengths are not all equal; otherwise the `izip` iterator
    over the arguments is returned.
    """
    lengths = [len(seq) for seq in args]
    # All lengths agree exactly when there is at most one distinct value
    # (zero arguments are trivially consistent).
    assert len(set(lengths)) <= 1
    return izip(*args)
def pooling_matrix(groups, per_group, strides=None, dtype=None, sparse=None):
    """
    Construct a pooling matrix, optionally with overlapping pools
    arranged in a 1 or 2D topology.

    Parameters
    ----------
    groups : int or tuple
        The grid dimensions of a 1- or 2-dimensional pooling grid.
    per_group : int or tuple
        The grid dimensions of a single 1- or 2-dimensional feature
        pool. Must be same length as `groups`.
    strides : int or tuple, optional
        The stride of the pools along each dimension. A value of
        `None` is equivalent to setting equal to `per_group`, i.e.
        no overlap.
    dtype : dtype object or str, optional
        The dtype of the resulting pooling matrix. Defaults to
        `theano.config.floatX`.
    sparse : str, optional
        If `None`, the function will return a dense matrix (a rank-2
        `numpy.ndarray`). Specifying 'csc' or 'csr' in this argument
        will cause the function to return a `scipy.sparse.csc_matrix`
        or a `scipy.sparse.csr_matrix`, instead.

    Returns
    -------
    pools : ndarray or sparse matrix
        Either a dense 2-dimensional NumPy array or one of
        `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`,
        depending on the value of the `sparse` argument. In any case,
        the shape is `(n_pools, n_filters)` and the value of
        `pools[i, j]` is 1 if feature `j` is in pool `i`, and 0
        otherwise.

    Raises
    ------
    TypeError
        If `groups`, `per_group` or `strides` is neither an int nor an
        iterable of ints.
    ValueError
        If a shape argument is empty, the shape arguments differ in
        length or have more than 2 dimensions, a stride exceeds the
        corresponding `per_group` dimension, or `sparse` is not one of
        `None`, 'csr', 'csc'.
    """
    def _validate_shape(shape, param_name):
        """Normalize an int or iterable of ints to a non-empty int tuple."""
        try:
            # Keep the converted values so that integral floats or NumPy
            # scalars inside a tuple become real ints for the index
            # arithmetic below.
            shape = tuple(int(val) for val in shape)
        except (ValueError, TypeError):
            try:
                shape = (int(shape),)
            except (ValueError, TypeError):
                reraise_as(TypeError("%s must be int or int tuple"
                                     % param_name))
        if not shape:
            raise ValueError("%s must not be empty" % param_name)
        return shape

    # Error-check arguments and fill in row_stride and col_stride
    # if either argument is absent.
    groups = _validate_shape(groups, 'groups')
    per_group = _validate_shape(per_group, 'per_group')
    if strides is not None:
        strides = _validate_shape(strides, 'strides')
    else:
        strides = per_group
    if len(groups) != len(per_group):
        raise ValueError('groups and per_group must have the same length')
    elif len(per_group) != len(strides):
        raise ValueError('per_group and strides must have the same length')
    if len(groups) > 2 or len(per_group) > 2:
        raise ValueError('only <= 2-dimensional pooling grids are supported')
    if not all(stride <= dim for stride, dim in zip(strides, per_group)):
        raise ValueError('strides must each be <= per_group dimensions')
    if len(groups) == 2:
        group_rows, group_cols = groups
        rows_per_group, cols_per_group = per_group
        row_stride, col_stride = strides
    else:
        # A 1-dimensional grid is treated as a 2-dimensional one with a
        # single column.
        group_rows, group_cols = groups[0], 1
        rows_per_group, cols_per_group = per_group[0], 1
        row_stride, col_stride = strides[0], 1
    if sparse is not None and sparse not in ('csc', 'csr'):
        raise ValueError("sparse must be one of (None, 'csr', 'csc')")
    # The total number of filters along either dimension is the
    # number of groups times the stride, plus whatever dangles
    # off the last filter (the added term is zero if there are no
    # overlapping pools).
    filter_rows = group_rows * row_stride + (rows_per_group - row_stride)
    filter_cols = group_cols * col_stride + (cols_per_group - col_stride)
    if dtype is None:
        dtype = theano.config.floatX
    # If the return type is dense we can treat it as a 4-tensor and
    # then reshape. If not we need some index math, which is written in
    # terms of the same 4-tensor shape.
    shape = (group_rows, group_cols, filter_rows, filter_cols)
    matrix_shape = group_rows * group_cols, filter_rows * filter_cols
    if sparse is not None:
        # Use a dictionary-of-keys matrix at construction time,
        # since they are efficient for arbitrary assignment.
        # TODO: I think CSC/CSR are fast to construct if you know the total
        # number of elements, which should be easy to calculate.
        pools = scipy.sparse.dok_matrix(matrix_shape, dtype=dtype)
    else:
        pools = np.zeros(shape, dtype=dtype)
    for g_row in range(group_rows):
        for g_col in range(group_cols):
            # The start and end points of the contiguous block of 1's.
            row_start = row_stride * g_row
            row_end = row_start + rows_per_group
            col_start = col_stride * g_col
            col_end = col_start + cols_per_group
            if sparse is not None:
                # The group (matrix row) to which this pool belongs.
                matrix_row = g_row * shape[1] + g_col
                for f_row in range(row_start, row_end):
                    # Flattened (row-major) column span of this filter row.
                    matrix_cols = slice(f_row * shape[3] + col_start,
                                        f_row * shape[3] + col_end)
                    pools[matrix_row, matrix_cols] = 1.
            else:
                # If the matrix is a dense 4-tensor then we can get
                # away with doing an entire pool in one assignment.
                pools[g_row, g_col,
                      row_start:row_end,
                      col_start:col_end] = 1
    if sparse is not None:
        # Call either .tocsr() or .tocsc()
        pools = getattr(pools, 'to' + sparse)()
    else:
        pools = pools.reshape(matrix_shape)
    return pools
def get_gradients(self, model, data, **kwargs):
    """
    Build the symbolic gradients of a multi-branch ("tree") cost.

    `self.expr` is expected to return one cost per leaf branch. For
    every branch the parameters along its path through `model.layers`
    are collected, the gradient of that branch's cost with respect to
    those parameters is taken, and gradients of parameters that are
    shared between consecutive branches (detected by equal `name`) are
    summed.

    NOTE(review): the `2*j` / `2*j+1` indexing assumes every layer
    contributes exactly two parameters (apparently a weight and a
    bias) -- confirm against the layer classes.

    Parameters
    ----------
    model : object
        Must expose `layers` (a sequence whose first entry has
        `get_params()` and whose later entries have a `layers`
        sequence of sub-layers) and `get_params()`.
    data : object
        Passed through unchanged to `self.expr`.
    **kwargs
        Passed through unchanged to `self.expr`.

    Returns
    -------
    gradients : OrderedDict
        Maps each entry of `model.get_params()` to the gradient at the
        same position in the flattened gradient list.
    updates : OrderedDict
        Always empty.

    Raises
    ------
    TypeError
        Re-raised (via `reraise_as`) when `self.expr` raises `TypeError`.
    NotImplementedError
        If `self.expr` returns `None`.
    ValueError
        If the number of flattened parameters differs from
        `len(model.get_params())`.
    """
    try:
        cost = self.expr(model=model, data=data, **kwargs)
    except TypeError:
        # If anybody knows how to add type(self) to the exception message
        # but still preserve the stack trace, please do so.
        message = "Error while calling " + str(type(self)) + ".expr"
        reraise_as(TypeError(message))

    if cost is None:
        raise NotImplementedError(str(type(self)) +
                                  " represents an intractable cost and "
                                  "does not provide a gradient "
                                  "approximation scheme.")

    n_leafnodes = len(cost)
    n_model_layer = len(model.layers)

    # params[k] lists, in layer order, the parameters on branch k's path.
    # Every branch starts with the parameters of the first layer.
    params = []
    for i in xrange(n_leafnodes):
        params_branch = []
        layer_params = model.layers[0].get_params()
        for param in layer_params:
            params_branch.append(param)
        params.append(params_branch)

    # Intermediate composite layers: each sub-layer serves a contiguous
    # run of `step` leaf branches.
    for i in xrange(1, n_model_layer - constants.NUM_REGLAYER, 1):
        CompsiteLayer = model.layers[i]
        n_curnodes = len(CompsiteLayer.layers)
        step = n_leafnodes // n_curnodes
        for k in xrange(n_leafnodes):
            # Floor division keeps the list index an integer even when
            # true division is in effect.
            layer_params = CompsiteLayer.layers[k // step].get_params()
            for param in layer_params:
                params[k].append(param)

    # The last NUM_REGLAYER layers have one sub-layer per leaf branch.
    for i in xrange(-constants.NUM_REGLAYER, 0, 1):
        CompsiteLayer = model.layers[i]
        for k in xrange(n_leafnodes):
            layer_params = CompsiteLayer.layers[k].get_params()
            for param in layer_params:
                params[k].append(param)

    # One gradient list per branch, aligned with params[k].
    grads = []
    for i in xrange(n_leafnodes):
        grads.append(theano.tensor.grad(cost[i], params[i],
                                        disconnected_inputs='ignore'))

    flat_grads = []
    flat_params = []
    # For the shared layers, walk the branches in order and merge runs of
    # consecutive branches whose parameter at this layer has the same
    # name, summing their gradients. A name change closes the current run.
    for j in xrange(n_model_layer - constants.NUM_REGLAYER):
        weight = params[0][2*j]
        bais = params[0][2*j+1]
        tmp_grad_weigth = grads[0][2*j]
        tmp_grad_bais = grads[0][2*j+1]
        for i in xrange(1, n_leafnodes, 1):
            if weight.name == params[i][2*j].name:
                weight += params[i][2*j]
                weight.name = params[i][2*j].name
                bais += params[i][2*j+1]
                bais.name = params[i][2*j+1].name
                tmp_grad_weigth += grads[i][2*j]
                tmp_grad_bais += grads[i][2*j+1]
            else:
                flat_params.append(weight)
                flat_params.append(bais)
                flat_grads.append(tmp_grad_weigth)
                flat_grads.append(tmp_grad_bais)
                weight = params[i][2*j]
                bais = params[i][2*j+1]
                tmp_grad_weigth = grads[i][2*j]
                tmp_grad_bais = grads[i][2*j+1]
        # Flush the run that was still open when the branches ran out.
        flat_params.append(weight)
        flat_params.append(bais)
        flat_grads.append(tmp_grad_weigth)
        flat_grads.append(tmp_grad_bais)

    # Per-branch layers are not merged: negative j indexes each branch's
    # parameter list from its end.
    for j in xrange(-constants.NUM_REGLAYER, 0, 1):
        for i in xrange(0, n_leafnodes, 1):
            flat_params.append(params[i][2*j])
            flat_params.append(params[i][2*j+1])
            flat_grads.append(grads[i][2*j])
            flat_grads.append(grads[i][2*j+1])

    params_model = model.get_params()
    if len(flat_params) != len(params_model):
        raise ValueError("the length of the flat_params of tree cnn "
                         "does not meet the list of model params")
    for flat_i, p_i in zip(flat_params, params_model):
        flat_i.name = p_i.name

    # Gradients are matched to model parameters purely by position.
    gradients = OrderedDict(izip(params_model, flat_grads))
    updates = OrderedDict()
    return gradients, updates