def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values
    as with a hand-crafted AdaDelta implementation, given a dummy model and
    a learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # is applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    # `shapes`, `scales`, and `learning_rate` are module-level fixtures
    # shared by the learning-rule tests in this file.
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    # Running averages of the squared gradient ('g2') and of the squared
    # update ('dx2'), one pair per parameter.
    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # Begin AdaDelta. The gradient of SumOfOneHalfParamsSquared
            # with respect to each parameter is the parameter itself, so
            # `param_val` doubles as the gradient below, and
            # `scale * learning_rate` plays the role of Zeiler's epsilon.
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    # Two passes: the manual update and SGD must stay in lockstep.
    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
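
# For reference, a minimal standalone NumPy sketch of the textbook AdaDelta
# update that the test above verifies (Zeiler 2012, Algorithm 1). This is an
# illustration, not pylearn2's implementation: it takes the gradient `grad`
# from the caller and uses a plain `eps` constant where the test folds
# `scale * learning_rate` into the RMS terms.
import numpy as np

def adadelta_step(param, grad, g2, dx2, decay=0.95, eps=1e-6):
    """Return (new_param, new_g2, new_dx2) after one AdaDelta update."""
    g2 = decay * g2 + (1 - decay) * grad ** 2    # E[g^2]_t
    rms_g = np.sqrt(g2 + eps)                    # RMS[g]_t
    rms_dx = np.sqrt(dx2 + eps)                  # RMS[dx]_{t-1}
    dx = -rms_dx / rms_g * grad                  # parameter step dx_t
    dx2 = decay * dx2 + (1 - decay) * dx ** 2    # E[dx^2]_t
    return param + dx, g2, dx2

# e.g. one step on a scalar parameter whose gradient equals the parameter,
# mirroring the SumOfOneHalfParamsSquared cost used in the test:
#   p, g2, dx2 = adadelta_step(np.array(2.0), np.array(2.0), 0.0, 0.0)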
def __init__(
        self,
        layers,
        random_state=None,
        learning_rule='sgd',
        learning_rate=0.01,
        learning_momentum=0.9,
        dropout=False,
        batch_size=1,
        n_iter=None,
        n_stable=50,
        f_stable=0.001,
        valid_set=None,
        valid_size=0.0,
        verbose=False,
        **params):
    self.layers = []
    for i, layer in enumerate(layers):
        assert isinstance(layer, Layer),\
            "Specify each layer as an instance of a `sknn.mlp.Layer` object."

        # Layer names are optional; if not specified, generate one.
        if layer.name is None:
            label = "hidden" if i < len(layers) - 1 else "output"
            layer.name = "%s%i" % (label, i)

        # sklearn may pass layers in as additional named parameters; remove them.
        if layer.name in params:
            del params[layer.name]

        self.layers.append(layer)

    # Don't support any additional parameters that are not in the constructor.
    # These are specified only so `get_params()` can return named layers, for
    # double-underscore syntax to work.
    assert len(params) == 0,\
        "The specified additional parameters are unknown."

    self.random_state = random_state
    self.learning_rule = learning_rule
    self.learning_rate = learning_rate
    self.learning_momentum = learning_momentum
    self.dropout = dropout if type(dropout) is float else (
        0.5 if dropout else 0.0)
    self.batch_size = batch_size
    self.n_iter = n_iter
    self.n_stable = n_stable
    self.f_stable = f_stable
    self.valid_set = valid_set
    self.valid_size = valid_size
    self.verbose = verbose

    self.unit_counts = None
    self.input_space = None
    self.mlp = None
    self.weights = None
    self.vs = None
    self.ds = None
    self.trainer = None
    self.f = None
    self.train_set = None
    self.best_valid_error = float("inf")

    self.cost = "Dropout" if dropout else None

    # Map the string identifier onto a learning rule object.
    if learning_rule == 'sgd':
        self._learning_rule = None
    # elif learning_rule == 'adagrad':
    #     self._learning_rule = AdaGrad()
    elif learning_rule == 'adadelta':
        self._learning_rule = AdaDelta()
    elif learning_rule == 'momentum':
        self._learning_rule = Momentum(learning_momentum)
    elif learning_rule == 'nesterov':
        self._learning_rule = Momentum(learning_momentum,
                                       nesterov_momentum=True)
    elif learning_rule == 'rmsprop':
        self._learning_rule = RMSProp()
    else:
        raise NotImplementedError(
            "Learning rule type `%s` is not supported." % learning_rule)

    self._setup()
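
# A hedged usage sketch for the constructor above, assuming the public
# `Classifier` and `Layer` classes exposed by scikit-neuralnetwork's
# `sknn.mlp` module; the layer sizes and iteration count are illustrative
# values, not defaults taken from this source.
#
# from sknn.mlp import Classifier, Layer
#
# nn = Classifier(
#     layers=[
#         Layer("Rectifier", units=100),   # auto-named "hidden0" by __init__
#         Layer("Softmax")],               # auto-named "output1"
#     learning_rule='adadelta',            # dispatches to AdaDelta() above
#     n_iter=10)
#
# Passing an unrecognized string such as learning_rule='adam' would raise
# the NotImplementedError from the dispatch chain above.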