def get_decay_params(self):
    # noise scheduling
    noise_schedule = get_decay_function(self.noise_decay,
                                        self.input_noise_level,
                                        self.args.get('input_noise_level'),
                                        self.noise_annealing)
    return [noise_schedule]
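# Hedged usage sketch (added for illustration, not from the original source): the decay objects
# returned by get_decay_params() are typically stepped once per epoch so the noise level shrinks
# over training. The .decay() method name is an assumption about the opendeep.utils.decay
# interface, and train_one_epoch is a placeholder callable, not a library function.
def _step_noise_schedules(model, n_epochs, train_one_epoch):
    """Illustrative helper: anneal the model's noise level once per training epoch."""
    schedules = model.get_decay_params()
    for _ in range(n_epochs):
        train_one_epoch()
        for schedule in schedules:
            schedule.decay()  # assumed DecayFunction API: advance the schedule by one step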
def __init__(self, model, dataset, config=None, defaults=defaults,
             n_epoch=None, batch_size=None, minimum_batch_size=None,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=None, lr_decay=None, lr_factor=None,
             momentum=None, momentum_decay=None, momentum_factor=None,
             nesterov_momentum=None):
    # superclass init
    super(SGD, self).__init__(model, dataset, config=config, defaults=defaults,
                              n_epoch=n_epoch, batch_size=batch_size,
                              minimum_batch_size=minimum_batch_size,
                              save_frequency=save_frequency,
                              early_stop_length=early_stop_length,
                              early_stop_threshold=early_stop_threshold,
                              learning_rate=learning_rate, lr_decay=lr_decay,
                              lr_factor=lr_factor, momentum=momentum,
                              momentum_decay=momentum_decay,
                              momentum_factor=momentum_factor,
                              nesterov_momentum=nesterov_momentum)
    # everything is in self! yay!

    # Momentum - smoothing over the parameter changes (see Hinton)
    if self.momentum:
        self.momentum = sharedX(self.momentum, 'momentum')
        if self.momentum_decay is not None and \
                self.momentum_decay is not False and \
                self.momentum_factor is not None:
            self.momentum_decay = get_decay_function(self.momentum_decay,
                                                     self.momentum,
                                                     self.momentum.get_value(),
                                                     self.momentum_factor)
        else:
            self.momentum_decay = False
    else:
        self.momentum = 1
def __init__(self, dataset, loss, model=None, epochs=10, batch_size=100, min_batch_size=1, save_freq=None, stop_threshold=None, stop_patience=None, learning_rate=.1, lr_decay="exponential", lr_decay_factor=.995, momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True, grad_clip=None, hard_clip=False): """ Initialize SGD. Parameters ---------- dataset : Dataset The :class:`opendeep.data.Dataset` to use when training the Model. loss : Loss The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result. model : Model The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a Model's .train() method. epochs : int how many training iterations over the dataset to go. batch_size : int How many examples from the training dataset to use in parallel. min_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_freq : int How many epochs to train between each new save of the Model's parameters. stop_threshold : float The factor by how much the best validation training score needs to improve to determine early stopping. stop_patience : int The patience or number of epochs to wait after the stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The type of decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for options. lr_decay_factor : float The amount to use for the decay function when changing the learning rate over epochs. See `opendeep.utils.decay` for its effect for given decay functions. momentum : float The momentum to use during gradient updates. momentum_decay : str The type of decay function to use for changing the momentum over epochs. See `opendeep.utils.decay` for options. momentum_factor : float The amount to use for the decay function when changing the momentum over epochs. See `opendeep.utils.decay` for its effect for given decay functions. nesterov_momentum : bool Whether or not to use Nesterov momentum. grad_clip : float, optional Whether to clip gradients. This will clip with a maximum of grad_clip or the parameter norm. hard_clip : bool Whether to use a hard cutoff or rescaling for clipping gradients. """ # superclass init initial_parameters = locals().copy() initial_parameters.pop('self') super(SGD, self).__init__(**initial_parameters) # Momentum - smoothing over the parameter changes (see Hinton) if momentum: self.momentum = sharedX(momentum, 'momentum') if momentum_decay is not None and \ momentum_decay is not False and \ momentum_factor is not None: self.momentum_decay = get_decay_function( momentum_decay, self.momentum, self.momentum.get_value(), momentum_factor) else: self.momentum_decay = False else: self.momentum = 0 self.momentum_decay = False self.nesterov_momentum = nesterov_momentum
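# Hedged usage sketch (added for illustration): constructing this SGD optimizer with the keyword
# arguments documented in the signature above. The dataset/loss/model objects are placeholders
# standing in for concrete opendeep classes, and optimizer.train() is assumed from the Optimizer
# interface referenced elsewhere in this codebase (model.train(optimizer) is the alternative route).
def _example_sgd_training(model, dataset, loss):
    optimizer = SGD(dataset=dataset,
                    loss=loss,
                    model=model,
                    epochs=50,
                    batch_size=100,
                    learning_rate=0.1,
                    lr_decay="exponential",
                    lr_decay_factor=0.995,
                    momentum=0.5,
                    nesterov_momentum=True)
    optimizer.train()  # assumed training entry point on the Optimizer base class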
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/lstm/', input_size=None, hidden_size=None, output_size=None, activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, direction='forward', clip_recurrent_grads=False): """ Initialize a simple recurrent network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden units. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. inner_hidden_activation : str or callable The activation to perform for the hidden gates. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. 
r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. direction : str The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or 'bidirectional'. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(LSTM, self).__init__(**initial_parameters) ################## # specifications # ################## backward = direction.lower() == 'backward' bidirectional = direction.lower() == 'bidirectional' ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) self.inner_hidden_activation_func = get_activation_function(inner_hidden_activation) # output activation function! activation_func = get_activation_function(activation) # Cost function cost_function = get_cost_function(cost_function) cost_args = cost_args or dict() # Now deal with noise if we added it: if noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. 
# this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is 3D tensor, use it as-is.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                # already in the expected (timesteps, batch_size, data_dim) form - nothing to do
                pass

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                self.input_size = sum(self.input_size)

            else:
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
# #################################################### if self.params_hook is not None: if not bidirectional: (W_x_c, W_x_i, W_x_f, W_x_o, U_h_c, U_h_i, U_h_f, U_h_o, W_h_y, b_c, b_i, b_f, b_o, b_y) = self.params_hook recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o] else: (W_x_c, W_x_i, W_x_f, W_x_o, U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b, W_h_y, b_c, b_i, b_f, b_o, b_y) = self.params_hook recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b] # otherwise, construct our params else: # all input-to-hidden weights W_x_c, W_x_i, W_x_f, W_x_o = [ get_weights(weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W_x_%s" % sub, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for sub in ['c', 'i', 'f', 'o'] ] # all hidden-to-hidden weights U_h_c, U_h_i, U_h_f, U_h_o = [ get_weights(weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['c', 'i', 'f', 'o'] ] # hidden-to-output weights W_h_y = get_weights(weights_init=weights_init, shape=(self.hidden_size, self.output_size), name="W_h_y", # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # biases b_c, b_i, b_f, b_o = [ get_bias(shape=(self.hidden_size,), name="b_%s" % sub, init_values=r_bias_init) for sub in ['c', 'i', 'f', 'o'] ] # output bias b_y = get_bias(shape=(self.output_size,), name="b_y", init_values=bias_init) # clip gradients if we are doing that recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_c, U_h_i, U_h_f, U_h_o = [theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params] # bidirectional params if bidirectional: # all hidden-to-hidden weights U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [ get_weights(weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s_b" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['c', 'i', 'f', 'o'] ] recurrent_params += [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [theano.gradient.grad_clip(p, -clip, clip) for p in [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]] # put all the parameters into our list, and make sure it is in the same order as when we try to load # them from a params_hook!!! self.params = [W_x_c, W_x_i, W_x_f, W_x_o] + recurrent_params + [W_h_y, b_c, b_i, b_f, b_o, b_y] # make h_init the right sized tensor if not self.hiddens_hook: h_init = T.zeros_like(T.dot(xs[0], W_x_c)) c_init = T.zeros_like(T.dot(xs[0], W_x_c)) ############### # computation # ############### # move some computation outside of scan to speed it up! x_c = T.dot(xs, W_x_c) + b_c x_i = T.dot(xs, W_x_i) + b_i x_f = T.dot(xs, W_x_f) + b_f x_o = T.dot(xs, W_x_o) + b_o # now do the recurrent stuff (self.hiddens, _), self.updates = theano.scan( fn=self.recurrent_step, sequences=[x_c, x_i, x_f, x_o], outputs_info=[h_init, c_init], non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o], go_backwards=backward, name="lstm_scan", strict=True ) # if bidirectional, do the same in reverse! 
if bidirectional: (hiddens_b, _), updates_b = theano.scan( fn=self.recurrent_step, sequences=[x_c, x_i, x_f, x_o], outputs_info=[h_init, c_init], non_sequences=[U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b], go_backwards=not backward, name="lstm_scan_back", strict=True ) # flip the hiddens to be the right direction hiddens_b = hiddens_b[::-1] # update stuff self.updates.update(updates_b) self.hiddens += hiddens_b # add noise (like dropout) if we wanted it! if noise: self.hiddens = T.switch(self.noise_switch, noise_func(input=self.hiddens), self.hiddens) # now compute the outputs from the leftover (top level) hiddens self.output = activation_func( T.dot(self.hiddens, W_h_y) + b_y ) # now to define the cost of the model - use the cost function to compare our output with the target value. self.cost = cost_function(output=self.output, target=ys, **cost_args) log.info("Initialized an LSTM!")
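# Hedged sketch (added for illustration): the self.recurrent_step function that theano.scan calls
# above is not shown in this excerpt. This standalone function is a standard LSTM cell written to
# be consistent with that scan signature (sequences=[x_c, x_i, x_f, x_o],
# outputs_info=[h_init, c_init], non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o]); it is an assumption
# about the step body, not the original implementation. T is theano.tensor, and in the class the
# activations would come from self.inner_hidden_activation_func / self.hidden_activation_func.
def lstm_step_sketch(x_c_t, x_i_t, x_f_t, x_o_t, h_tm1, c_tm1,
                     U_h_c, U_h_i, U_h_f, U_h_o,
                     gate_activation, hidden_activation):
    i_t = gate_activation(x_i_t + T.dot(h_tm1, U_h_i))         # input gate
    f_t = gate_activation(x_f_t + T.dot(h_tm1, U_h_f))         # forget gate
    o_t = gate_activation(x_o_t + T.dot(h_tm1, U_h_o))         # output gate
    c_tilde = hidden_activation(x_c_t + T.dot(h_tm1, U_h_c))   # candidate cell state
    c_t = f_t * c_tm1 + i_t * c_tilde                          # new cell state
    h_t = o_t * hidden_activation(c_t)                         # new hidden state
    return h_t, c_t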
def __init__(self, model, dataset, n_epoch=10, batch_size=100, minimum_batch_size=1, save_frequency=None, early_stop_threshold=None, early_stop_length=None, learning_rate=.1, lr_decay="exponential", lr_factor=.995, momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True): """ Initialize SGD. Parameters ---------- model : Model The Model to train. dataset : Dataset The Dataset to use when training the Model. n_epoch : int how many training iterations over the dataset to go. batch_size : int How many examples from the training dataset to use in parallel. minimum_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_frequency : int How many epochs to train between each new save of the Model's parameters. early_stop_threshold : float The factor by how much the best validation training score needs to improve to determine early stopping. early_stop_length : int The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The type of decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for options. lr_factor : float The amount to use for the decay function when changing the learning rate over epochs. See `opendeep.utils.decay` for its effect for given decay functions. momentum : float The momentum to use during gradient updates. momentum_decay : str The type of decay function to use for changing the momentum over epochs. See `opendeep.utils.decay` for options. momentum_factor : float The amount to use for the decay function when changing the momentum over epochs. See `opendeep.utils.decay` for its effect for given decay functions. nesterov_momentum : bool Whether or not to use Nesterov momentum. """ # superclass init initial_parameters = locals().copy() initial_parameters.pop('self') super(SGD, self).__init__(**initial_parameters) # Momentum - smoothing over the parameter changes (see Hinton) if momentum: self.momentum = sharedX(momentum, 'momentum') if momentum_decay is not None and \ momentum_decay is not False and \ momentum_factor is not None: self.momentum_decay = get_decay_function(momentum_decay, self.momentum, self.momentum.get_value(), momentum_factor) else: self.momentum_decay = False else: self.momentum = 0 self.momentum_decay = False self.nesterov_momentum = nesterov_momentum
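# Hedged sketch (added for illustration): the actual parameter updates are built elsewhere (e.g.
# in a get_updates method not shown here). This shows the standard rule that the momentum and
# nesterov_momentum options documented above describe, written with plain numbers so it stands
# alone; it is not necessarily the exact update this class computes.
def momentum_sgd_step(param, velocity, gradient, learning_rate, momentum, nesterov=True):
    """One SGD step with (optionally Nesterov) momentum; returns (new_param, new_velocity)."""
    new_velocity = momentum * velocity - learning_rate * gradient
    if nesterov:
        # Nesterov momentum: apply the momentum "look-ahead" on top of the velocity update
        new_param = param + momentum * new_velocity - learning_rate * gradient
    else:
        new_param = param + new_velocity
    return new_param, new_velocity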
def __init__(self, model, dataset, config=None, defaults=None, n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None, early_stop_threshold=None, early_stop_length=None, learning_rate=None, lr_decay=None, lr_factor=None, **kwargs): # Default values to use for some training parameters _defaults = {"n_epoch": 1000, "batch_size": 100, "minimum_batch_size": 1, "save_frequency": 10, "early_stop_threshold": .9995, "early_stop_length": 30, "learning_rate": 0.001, "lr_decay": "exponential", "lr_factor": 1, # no learning rate decay by default } log.debug("Initializing optimizer %s", str(type(self))) assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!" self.model = model self.dataset = dataset assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!" # set self.args to be the combination of the defaults and the config dictionaries from the subclass in_args = combine_config_and_defaults(config, defaults) self.args = combine_config_and_defaults(in_args, _defaults) # if the args are none, make it a blank dictionary if self.args is None: self.args = {} # now that our required variables are out of the way, do the same thing for everything else passed via kwargs for arg, val in kwargs.items(): if (val is not None or str(arg) not in self.args) and str(arg) != 'kwargs': self.args[str(arg)] = val # flatten kwargs if it was passed as a variable elif str(arg) == 'kwargs': inner_kwargs = kwargs['kwargs'] for key, item in inner_kwargs.items(): if item is not None or str(key) not in self.args: self.args[str(key)] = item # now take care of overriding explicits passed in if n_epoch is not None: self.args['n_epoch'] = n_epoch if batch_size is not None: self.args['batch_size'] = batch_size if minimum_batch_size is not None: self.args['minimum_batch_size'] = minimum_batch_size if save_frequency is not None: self.args['save_frequency'] = save_frequency if early_stop_threshold is not None: self.args['early_stop_threshold'] = early_stop_threshold if early_stop_length is not None: self.args['early_stop_length'] = early_stop_length if learning_rate is not None: self.args['learning_rate'] = learning_rate if lr_decay is not None: self.args['lr_decay'] = lr_decay if lr_factor is not None: self.args['lr_factor'] = lr_factor # Magic! Now self.args contains the combination of all the initialization variables, overridden like so: # _defaults < defaults < config < kwargs (explicits passed to model's __init__) # log the arguments log.debug("optimizer config args: %s", str(self.args)) # Finally, to make things really easy, update the class 'self' with everything in self.args to make # all the parameters accessible via self.<param> self.__dict__.update(self.args) # Learning rate - how drastic of a step do the parameters change self.learning_rate = sharedX(self.learning_rate, 'learning_rate') self.lr_scalers = self.model.get_lr_scalers() if self.lr_decay: self.learning_rate_decay = get_decay_function(self.lr_decay, self.learning_rate, self.learning_rate.get_value(), self.lr_factor) else: self.learning_rate_decay = False
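# Hedged sketch (added for illustration): one common reading of the "exponential" learning rate
# decay with lr_factor set up above - the shared learning_rate is multiplied by the factor once
# per epoch, so lr_factor=1 means no decay (matching the default comment). This stands alone on
# theano's shared-variable API rather than calling opendeep.utils.decay; the helper name is
# illustrative and the exact formula in the library may differ.
def exponential_lr_step(shared_learning_rate, lr_factor):
    """One epoch of multiplicative decay: lr <- lr * lr_factor."""
    shared_learning_rate.set_value(shared_learning_rate.get_value() * lr_factor)

# e.g. a factor of .995 applied for 50 epochs leaves roughly 0.778 of the initial rate.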
def __init__(self, dataset, loss=None, model=None,
             epochs=1000, batch_size=100, min_batch_size=1,
             save_freq=10, stop_threshold=None, stop_patience=50,
             learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
             grad_clip=None, hard_clip=False,
             **kwargs):
    """
    Initialize the Optimizer.

    Parameters
    ----------
    dataset : Dataset
        The :class:`opendeep.data.Dataset` to use when training the Model.
    loss : Loss
        The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
    model : Model
        The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
        Model's .train() method.
    epochs : int
        How many training iterations over the dataset to go.
    batch_size : int
        How many examples from the training dataset to use in parallel.
    min_batch_size : int
        The minimum number of examples required at a time (for things like time series, this would be > 1).
    save_freq : int, optional
        How many epochs to train between each new save of the Model's parameters.
    stop_threshold : float, optional
        The factor by how much the best validation training score needs to improve to determine early stopping.
    stop_patience : int, optional
        The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
    learning_rate : float
        The multiplicative amount to adjust parameters based on their gradient values.
    lr_decay : str
        The decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay`
        for classes of decay and documentation.
    lr_decay_factor : float
        The amount of decay to use for the ``lr_decay`` type of decay.
    grad_clip : float, optional
        Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff
        or rescaling.
    hard_clip : bool
        Whether to use a hard cutoff or rescaling for clipping gradients.
    """
    log.info("Initializing optimizer %s", str(self.__class__.__name__))

    # Deal with early stopping None initializations (no early stopping).
    if not stop_threshold:
        stop_threshold = numpy.inf
    if not save_freq:
        save_freq = 1000000
    if not stop_patience:
        stop_patience = 1

    # Put all init parameters in self.args so we can log the initial configuration.
    self.args = locals().copy()
    self.args.pop('self')
    kwargs = self.args.pop('kwargs')
    self.args = add_kwargs_to_dict(kwargs, self.args)
    # log the arguments
    log.info("Optimizer config args: %s", str(self.args))

    # If the optimizer wasn't initialized with a Model (train() being called from the model class itself),
    # just return. (This seems kinda hacky, but people wanted .train() to happen from Model and there
    # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
    # the best option because other methods besides stochastic ones can exist for optimizers in the future.)
    # TODO: fix this up - feels like a hack just to make model.train() work...
    if not model:
        return
    # Otherwise, things are proceeding as normal. Carry on...

    assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                     "Found %s" % str(model.__class__.__name__)
    assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                         "Found %s" % str(dataset.__class__.__name__)

    # deal with loss expression/targets
    if loss is not None:
        assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                       "Found %s" % str(loss.__class__.__name__)

    if isinstance(loss, Loss):
        self.loss_targets = loss.get_targets()
        self.loss_expression = loss.get_loss()
    else:
        assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
        if isinstance(model.get_loss(), tuple):
            self.loss_targets = raise_to_list(model.get_loss()[0])
            self.loss_expression = model.get_loss()[1]
        else:
            self.loss_targets = None
            self.loss_expression = model.get_loss()

    model_inputs = raise_to_list(model.get_inputs())
    n_model_inputs = len(model_inputs)

    model_targets = self.loss_targets or []
    for input in model_inputs:
        if input in model_targets:
            model_targets.remove(input)
    n_model_targets = len(model_targets)
    # use value equality, not identity (`is`), for the integer comparison here
    self.unsupervised = (n_model_targets == 0)

    # make sure the number of inputs/targets matches up with the dataset properties
    # train
    assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
        "Dataset has %d train inputs, while model expects %d" % \
        (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
    if not self.unsupervised:
        assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
            "Dataset has %d train targets, while model expects %d" % \
            (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
    # valid
    if dataset.valid_inputs is not None:
        assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
            "Dataset has %d valid inputs, while model expects %d" % \
            (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                "Dataset has %d valid targets, while model expects %d" % \
                (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
    # test
    if dataset.test_inputs is not None:
        assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
            "Dataset has %d test inputs, while model expects %d" % \
            (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                "Dataset has %d test targets, while model expects %d" % \
                (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

    # now we are happy, we can add them to `self`
    self.model = model
    self.dataset = dataset
    self.loss = loss

    # Learning rate - how drastic of a step do the parameters change
    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    # whether to scale individual model parameters' learning rates.
    self.lr_scalers = self.model.get_lr_scalers()
    # whether to decay
    if lr_decay:
        self.learning_rate_decay = get_decay_function(lr_decay,
                                                      self.learning_rate,
                                                      learning_rate,
                                                      lr_decay_factor)
    else:
        self.learning_rate_decay = False

    # rest of initial parameters needed for training.
    self.batch_size = batch_size
    self.min_batch_size = min_batch_size
    self.n_epoch = epochs
    self.save_frequency = save_freq
    self.early_stop_threshold = stop_threshold
    self.early_stop_length = stop_patience
    self.grad_clip = grad_clip
    self.hard_clip = hard_clip
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/', input_size=None, hidden_size=None, output_size=None, layers=1, activation='sigmoid', hidden_activation='relu', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, direction='forward', clip_recurrent_grads=False): """ Initialize a simple recurrent network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. layers : int The number of stacked hidden layers to use. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden layers. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. 
See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. direction : str The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or 'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence, computing two sets of hiddens and merging them before running through the final decoder. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. Raises ------ AssertionError When asserting various properties of input parameters. See error messages. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(RNN, self).__init__(**initial_parameters) ################## # specifications # ################## self.direction = direction self.bidirectional = (direction == "bidirectional") self.backward = (direction == "backward") self.layers = layers self.noise = noise self.weights_init = weights_init self.weights_mean = weights_mean self.weights_std = weights_std self.weights_interval = weights_interval self.r_weights_init = r_weights_init self.r_weights_mean = r_weights_mean self.r_weights_std = r_weights_std self.r_weights_interval = r_weights_interval self.bias_init = bias_init self.r_bias_init = r_bias_init ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function( hidden_activation) # output activation function! 
self.activation_func = get_activation_function(activation) # Cost function self.cost_function = get_cost_function(cost_function) self.cost_args = cost_args or dict() # Now deal with noise if we added it: if self.noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) self.noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: self.noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. this is for the cost and gradient functions to be computed later # (not sure if the above statement is accurate such that gradient depends on initial value of switch) self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch") # noise scheduling if noise_decay and noise_level is not None: self.noise_schedule = get_decay_function( noise_decay, noise_level, noise_level.get_value(), noise_decay_amount) ############### # inputs hook # ############### # grab info from the inputs_hook # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension # being the temporal dimension. # input is 3D tensor of (timesteps, batch_size, data_dim) # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D. # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D. if self.inputs_hook is not None: self.input = self.inputs_hook[1] if self.input.ndim == 1: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2]) self.input_size = 1 elif self.input.ndim == 2: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1) elif self.input.ndim == 3: pass elif self.input.ndim > 3: self.input = self.input.flatten(3) self.input_size = sum(self.input_size) else: raise NotImplementedError( "Recurrent input with %d dimensions not supported!" % self.input.ndim) else: # Assume input coming from optimizer is (batches, timesteps, data) # so, we need to reshape to (timesteps, batches, data) xs = T.tensor3("Xs") xs = xs.dimshuffle(1, 0, 2) self.input = xs # The target outputs for supervised training - in the form of (batches, timesteps, output) which is # the same dimension ordering as the expected input from optimizer. # therefore, we need to swap it like we did to input xs. ys = T.tensor3("Ys") ys = ys.dimshuffle(1, 0, 2) self.target = ys ################ # hiddens hook # ################ # set an initial value for the recurrent hiddens from hook if self.hiddens_hook is not None: self.h_init = self.hiddens_hook[1] self.hidden_size = self.hiddens_hook[0] else: # deal with h_init after parameters are made (have to make the same size as hiddens that are computed) self.hidden_size = hidden_size ################## # for generating # ################## # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph( )
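# Hedged sketch (added for illustration): build_computation_graph() is not part of this excerpt.
# The core per-timestep update of a vanilla RNN layer, consistent with hidden_activation_func and
# the recurrent U weights described in the docstring above, is the standard rule below (T is
# theano.tensor, as used throughout this module); treat it as illustrative, not the original
# implementation.
def vanilla_rnn_step(x_t, h_tm1, U_h, hidden_activation_func):
    # x_t is the precomputed input projection for this timestep, i.e. T.dot(x, W_x) + b
    return hidden_activation_func(x_t + T.dot(h_tm1, U_h))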
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gru/', input_size=None, hidden_size=None, output_size=None, activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, forward=True, clip_recurrent_grads=False): """ Initialize a simple recurrent network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden units. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. inner_hidden_activation : str or callable The activation to perform for the hidden gates. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. 
r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. forward : bool The direction this recurrent model should go over its inputs. True means forward, False mean backward. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(GRU, self).__init__(**initial_parameters) ################## # specifications # ################## ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function( hidden_activation) self.inner_hidden_activation_func = get_activation_function( inner_hidden_activation) # output activation function! activation_func = get_activation_function(activation) # Cost function cost_function = get_cost_function(cost_function) cost_args = cost_args or dict() # Now deal with noise if we added it: if noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. 
# this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="gru_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is 3D tensor, use it as-is.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                # already in the expected (timesteps, batch_size, data_dim) form - nothing to do
                pass

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                self.input_size = sum(self.input_size)

            else:
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
# #################################################### if self.params_hook is not None: (W_x_z, W_x_r, W_x_h, U_h_z, U_h_r, U_h_h, W_h_y, b_z, b_r, b_h, b_y) = self.params_hook recurrent_params = [U_h_z, U_h_r, U_h_h] # otherwise, construct our params else: # all input-to-hidden weights W_x_z, W_x_r, W_x_h = [ get_weights( weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W_x_%s" % sub, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for sub in ['z', 'r', 'h'] ] # all hidden-to-hidden weights U_h_z, U_h_r, U_h_h = [ get_weights( weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['z', 'r', 'h'] ] # hidden-to-output weights W_h_y = get_weights( weights_init=weights_init, shape=(self.hidden_size, self.output_size), name="W_h_y", # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # biases b_z, b_r, b_h = [ get_bias(shape=(self.hidden_size, ), name="b_%s" % sub, init_values=r_bias_init) for sub in ['z', 'r', 'h'] ] # output bias b_y = get_bias(shape=(self.output_size, ), name="b_y", init_values=bias_init) # clip gradients if we are doing that recurrent_params = [U_h_z, U_h_r, U_h_h] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_z, U_h_r, U_h_h = [ theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params ] # put all the parameters into our list, and make sure it is in the same order as when we try to load # them from a params_hook!!! self.params = [W_x_z, W_x_r, W_x_h ] + recurrent_params + [W_h_y, b_z, b_r, b_h, b_y] # make h_init the right sized tensor if not self.hiddens_hook: h_init = T.zeros_like(T.dot(xs[0], W_x_h)) ############### # computation # ############### # move some computation outside of scan to speed it up! x_z = T.dot(xs, W_x_z) + b_z x_r = T.dot(xs, W_x_r) + b_r x_h = T.dot(xs, W_x_h) + b_h # now do the recurrent stuff self.hiddens, self.updates = theano.scan( fn=self.recurrent_step, sequences=[x_z, x_r, x_h], outputs_info=[h_init], non_sequences=[U_h_z, U_h_r, U_h_h], go_backwards=not forward, name="gru_scan", strict=True) # add noise (like dropout) if we wanted it! if noise: self.hiddens = T.switch(self.noise_switch, noise_func(input=self.hiddens), self.hiddens) # now compute the outputs from the leftover (top level) hiddens self.output = activation_func(T.dot(self.hiddens, W_h_y) + b_y) # now to define the cost of the model - use the cost function to compare our output with the target value. self.cost = cost_function(output=self.output, target=ys, **cost_args) log.info("Initialized a GRU!")
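# Hedged sketch (added for illustration): the self.recurrent_step function that theano.scan calls
# above is not shown in this excerpt. This standalone function is a standard GRU cell written to
# be consistent with that scan signature (sequences=[x_z, x_r, x_h], outputs_info=[h_init],
# non_sequences=[U_h_z, U_h_r, U_h_h]); it is an assumption about the step body, not the original
# implementation. T is theano.tensor, and in the class the activations would come from
# self.inner_hidden_activation_func / self.hidden_activation_func.
def gru_step_sketch(x_z_t, x_r_t, x_h_t, h_tm1, U_h_z, U_h_r, U_h_h,
                    gate_activation, hidden_activation):
    z_t = gate_activation(x_z_t + T.dot(h_tm1, U_h_z))              # update gate
    r_t = gate_activation(x_r_t + T.dot(h_tm1, U_h_r))              # reset gate
    h_tilde = hidden_activation(x_h_t + T.dot(r_t * h_tm1, U_h_h))  # candidate hidden state
    return (1. - z_t) * h_tm1 + z_t * h_tilde                       # interpolate old and new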
def __init__(self, model, dataset, n_epoch=1000, batch_size=100, minimum_batch_size=1, save_frequency=10, early_stop_threshold=.9995, early_stop_length=30, learning_rate=1e-3, lr_decay='exponential', lr_factor=1, **kwargs): """ Initialize the Optimizer. Parameters ---------- model : Model The Model to train. dataset : Dataset The Dataset to use when training the Model. n_epoch : int how many training iterations over the dataset to go. batch_size : int How many examples from the training dataset to use in parallel. minimum_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_frequency : int How many epochs to train between each new save of the Model's parameters. early_stop_threshold : float The factor by how much the best validation training score needs to improve to determine early stopping. early_stop_length : int The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The type of decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for options. lr_factor : float The amount to use for the decay function when changing the learning rate over epochs. See `opendeep.utils.decay` for its effect for given decay functions. """ log.info("Initializing optimizer %s", str(type(self))) if early_stop_threshold is None: early_stop_threshold = 1. if save_frequency is None: save_frequency = 1000000 if early_stop_length is None: early_stop_length = 100 self.args = locals().copy() self.args.pop('self') kwargs = self.args.pop('kwargs') self.args = add_kwargs_to_dict(kwargs, self.args) # log the arguments log.info("optimizer config args: %s", str(self.args)) assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!" assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!" self.model = model self.dataset = dataset # Learning rate - how drastic of a step do the parameters change self.learning_rate = sharedX(learning_rate, 'learning_rate') self.lr_scalers = self.model.get_lr_scalers() if lr_decay: self.learning_rate_decay = get_decay_function(lr_decay, self.learning_rate, self.learning_rate.get_value(), lr_factor) else: self.learning_rate_decay = False self.noise_switches = raise_to_list(self.model.get_noise_switch()) self.batch_size = batch_size self.minimum_batch_size = minimum_batch_size self.n_epoch = n_epoch self.save_frequency = save_frequency self.early_stop_threshold = early_stop_threshold self.early_stop_length = early_stop_length
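# Hedged sketch (added for illustration): how the early_stop_threshold / early_stop_length
# parameters documented above are typically combined in a training loop. The real stopping logic
# lives in the optimizer's training loop, which is not part of this excerpt, so treat this as a
# reading of the docstring rather than the library's exact rule.
def should_stop_early(valid_costs, threshold=.9995, patience=30):
    """Stop when the best validation cost has not improved by `threshold` for `patience` epochs."""
    best = float('inf')
    epochs_since_improvement = 0
    for cost in valid_costs:
        if cost < best * threshold:  # improved enough to reset the patience counter
            best = cost
            epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1
        if epochs_since_improvement > patience:
            return True
    return False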
def __init__(self, model, dataset, iterator_class=SequentialIterator,
             config=None, defaults=_defaults, rng=None,
             n_epoch=None, batch_size=None, minimum_batch_size=None,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=None, lr_decay=None, lr_factor=None,
             momentum=None, momentum_decay=None, momentum_factor=None, nesterov_momentum=None,
             flag_para_load=None):
    # superclass init
    super(SGD, self).__init__(config=config, defaults=defaults)
    # config and defaults are now combined in self.args! yay!

    self.model = model
    self.dataset = dataset
    self.iterator = iterator_class

    # Training epochs - how many times to iterate over the whole dataset
    self.n_epoch = n_epoch or self.args.get('n_epoch')

    # Dataset iteration batch sizes - number of examples in each calculation
    self.batch_size = batch_size or self.args.get('batch_size')
    self.minimum_batch_size = minimum_batch_size or self.args.get('minimum_batch_size')

    # Number of epochs between saving model parameters
    self.save_frequency = save_frequency or self.args.get('save_frequency')

    # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
    self.early_stop_threshold = early_stop_threshold or self.args.get('early_stop_threshold')
    self.early_stop_length = early_stop_length or self.args.get('early_stop_length')

    # Learning rate - how drastic of a step do the parameters change
    lr = learning_rate or self.args.get('learning_rate')
    self.learning_rate = sharedX(lr, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    if lr_decay or self.args.get('lr_decay'):
        self.learning_rate_decay = get_decay_function(lr_decay or self.args.get('lr_decay'),
                                                      self.learning_rate,
                                                      self.learning_rate.get_value(),
                                                      lr_factor or self.args.get('lr_factor'))

    # Momentum - smoothing over the parameter changes (see Hinton)
    self.momentum = sharedX(momentum or self.args.get('momentum'), 'momentum')
    if self.args.get('momentum_decay'):
        self.momentum_decay = get_decay_function(momentum_decay or self.args.get('momentum_decay'),
                                                 self.momentum,
                                                 self.momentum.get_value(),
                                                 momentum_factor or self.args.get('momentum_factor'))
    self.nesterov_momentum = nesterov_momentum or self.args.get('nesterov_momentum')

    # RNG for working on random iterator
    if rng is None:
        random.seed(123)
        self.rng = random
    else:
        self.rng = rng

    self.params = self.model.get_params()

    # Now create the training cost function for the model to use while training - update parameters
    log.info("%s params: %s", str(type(self.model)), str(self.params))
    # gradient!
    gradient = grad(self.model.get_train_cost(), self.params)
    grads = OrderedDict(zip(self.params, gradient))

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(grads)

    # Combine the updates from the model also if applicable
    train_updates = model.get_updates()
    if train_updates:
        train_updates.update(gradient_updates)
    else:
        train_updates = gradient_updates

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', str(type(self.model)))
    t = time.time()
    self.f_learn = function(inputs=model.get_inputs(),
                            updates=train_updates,
                            outputs=self.model.get_train_cost(),
                            name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # Determine if this function is unsupervised or not by looking at the number of inputs to the f_learn function.
    # If there is only one input, it is unsupervised, otherwise, it is supervised.
    # This workaround was provided by Pascal Lamblin on the theano-users google group
    num_inputs = len([i for i in self.f_learn.maker.inputs if not i.shared])
    if num_inputs == 1:
        log.debug("Model is unsupervised: 1 input to f_learn.")
        self.unsupervised = True
    elif num_inputs == 2:
        log.debug("Model is supervised: 2 inputs to f_learn.")
        self.unsupervised = False
    else:
        log.error("Number of inputs to f_learn on model %s was %s. "
                  "Needs to be 1 for unsupervised or 2 for supervised.",
                  str(type(self.model)), str(num_inputs))
        raise AssertionError("Number of inputs to f_learn on model %s was %s. "
                             "Needs to be 1 for unsupervised or 2 for supervised." %
                             (str(type(self.model)), str(num_inputs)))

    # grab the function(s) to use to monitor different model values during training
    self.monitors = self.model.get_monitors()
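# A minimal sketch (assumption, not the library's actual implementation) of what the
# get_updates(grads) hook called in the constructor above could return for plain SGD with
# classical momentum. `grads` is the OrderedDict mapping each parameter to its gradient;
# self.learning_rate and self.momentum are the shared variables created above. Shown as
# it might appear on the SGD class, purely for illustration.
def get_updates(self, grads):
    updates = OrderedDict()
    for param, gradient in grads.items():
        # one velocity accumulator per parameter, initialized to zeros of the same shape
        velocity = sharedX(param.get_value() * 0., name="vel_" + str(param.name))
        new_velocity = self.momentum * velocity - self.learning_rate * gradient
        updates[velocity] = new_velocity
        updates[param] = param + new_velocity
    return updates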
def __init__(self, inputs=None, noise='dropout', noise_level=0.5,
             noise_decay=False, noise_decay_amount=0.99,
             mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
    """
    Parameters
    ----------
    inputs : tuple(shape, `Theano.TensorType`)
        tuple(shape, `Theano.TensorType`) describing the inputs to use for this layer.
        `shape` will be a monad tuple representing known sizes for each dimension in the `Theano.TensorType`.
        The length of `shape` should be equal to number of dimensions in `Theano.TensorType`, where the shape
        element is an integer representing the size for its dimension, or None if the shape isn't known.
        For example, if you have a matrix with unknown batch size but fixed feature size of 784, `shape` would
        be: (None, 784). The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    noise : str
        What type of noise to use for the output. See opendeep.utils.noise for options. This should be
        appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the standard
        deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the
        string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay
        (known as noise scheduling) effectively helps the model learn larger variance features first, and then
        smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function specified
        in `noise_decay`.
    mrg : random
        A random number generator that is used when adding noise. I recommend using Theano's
        sandbox.rng_mrg.MRG_RandomStreams.
    switch : boolean
        Whether to create a switch to turn noise on during training and off during testing (True). If False,
        noise will be applied at both training and testing times.
    """
    super(Noise, self).__init__(inputs=inputs, outputs=inputs[0],
                                noise=noise, noise_level=noise_level,
                                noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
                                mrg=mrg, switch=switch)
    # self.inputs is a list from superclass initialization, grab the first element
    self.inputs = self.inputs[0][1]

    log.debug('Adding %s noise switch.' % str(noise))
    if noise_level is not None:
        noise_level = sharedX(value=noise_level)
        noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
    else:
        noise_func = get_noise(noise, mrg=mrg)

    # apply the noise as a switch!
    # default to apply noise. this is for the cost and gradient functions to be computed later
    # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
    if switch:
        self.noise_switch = sharedX(value=1, name="noise_switch")

    # noise scheduling
    if noise_decay and noise_level is not None:
        self.noise_schedule = get_decay_function(noise_decay,
                                                 noise_level,
                                                 noise_level.get_value(),
                                                 noise_decay_amount)

    # apply noise to the inputs!
    if switch:
        self.outputs = T.switch(self.noise_switch,
                                noise_func(input=self.inputs),
                                self.inputs)
    else:
        self.outputs = noise_func(input=self.inputs)
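# Usage sketch: because noise_switch is a plain Theano shared variable, noise can be
# turned off for evaluation and back on for training by setting its value directly.
# (The Optimizer normally collects these switches via get_noise_switch(); the manual
# toggle below is only an illustration, and the input tuple follows the docstring above.)
layer = Noise(inputs=[((None, 784), T.matrix('x'))], noise='dropout', noise_level=0.5)
layer.noise_switch.set_value(0.)  # evaluation: the switch passes inputs through unchanged
layer.noise_switch.set_value(1.)  # training: the switch applies dropout to the inputs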
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/',
             input_size=None, hidden_size=None, output_size=None,
             layers=1,
             activation='sigmoid', hidden_activation='relu',
             mrg=RNG_MRG.MRG_RandomStreams(1),
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
             r_bias_init=0.0,
             cost_function='mse', cost_args=None,
             noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
             direction='forward',
             clip_recurrent_grads=False):
    """
    Initialize a simple recurrent network.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere.
        This is used for linking different models together (e.g. setting the Softmax model's input layer to the
        DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the
        shape information (normally the dimensionality of the input i.e. n_in).
    hiddens_hook : Tuple of (shape, variable)
        Routing information for the model to accept its hidden representation from elsewhere. For recurrent
        nets, this will be the initial starting value for hidden layers.
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing this model
        (instead of initializing your own shared variables). This parameter is useful when you want to have two
        versions of the model that use the same parameters.
    outdir : str
        The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be
        saved.
    input_size : int
        The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
    hidden_size : int
        The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is
        optional.
    output_size : int
        The size (dimensionality) of the output.
    layers : int
        The number of stacked hidden layers to use.
    activation : str or callable
        The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
        This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
        See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
        your own function to be used as long as it is callable.
    hidden_activation : str or callable
        The activation to perform for the hidden layers.
        See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
        your own function to be used as long as it is callable.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    weights_init : str
        Determines the method for initializing model weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    r_weights_init : str
        Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
    r_weights_interval : str or float
        If Uniform `r_weights_init`, the +- interval to use.
        See opendeep.utils.nnet for options.
    r_weights_mean : float
        If Gaussian `r_weights_init`, the mean value to use.
    r_weights_std : float
        If Gaussian `r_weights_init`, the standard deviation to use.
    r_bias_init : float
        The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
    cost_function : str or callable
        The function to use when calculating the output cost of the model. See opendeep.utils.cost for options.
        You can also specify your own function, which needs to be callable.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost_function`.
    noise : str
        What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options.
        This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued
        activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the standard
        deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the
        string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay
        (known as noise scheduling) effectively helps the model learn larger variance features first, and then
        smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function specified
        in `noise_decay`.
    direction : str
        The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
        'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence, computing
        two sets of hiddens and merging them before running through the final decoder.
    clip_recurrent_grads : False or float, optional
        Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
        connecting previous hidden states to the current hidden state, and not the weights from current input
        to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
        `+-clip_recurrent_grads`.

    Raises
    ------
    AssertionError
        When asserting various properties of input parameters. See error messages.
    """
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(RNN, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    self.direction = direction
    self.bidirectional = (direction == "bidirectional")
    self.backward = (direction == "backward")
    self.layers = layers
    self.noise = noise

    self.weights_init = weights_init
    self.weights_mean = weights_mean
    self.weights_std = weights_std
    self.weights_interval = weights_interval

    self.r_weights_init = r_weights_init
    self.r_weights_mean = r_weights_mean
    self.r_weights_std = r_weights_std
    self.r_weights_interval = r_weights_interval

    self.bias_init = bias_init
    self.r_bias_init = r_bias_init

    #########################################
    # activation, cost, and noise functions #
    #########################################
    # recurrent hidden activation function!
    self.hidden_activation_func = get_activation_function(hidden_activation)
    # output activation function!
    self.activation_func = get_activation_function(activation)

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    # Now deal with noise if we added it:
    if self.noise:
        log.debug('Adding %s noise switch.' % str(noise))
        if noise_level is not None:
            noise_level = sharedX(value=noise_level)
            self.noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
        else:
            self.noise_func = get_noise(noise, mrg=mrg)
        # apply the noise as a switch!
        # default to apply noise. this is for the cost and gradient functions to be computed later
        # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
        self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

        # noise scheduling
        if noise_decay and noise_level is not None:
            self.noise_schedule = get_decay_function(noise_decay,
                                                     noise_level,
                                                     noise_level.get_value(),
                                                     noise_decay_amount)

    ###############
    # inputs hook #
    ###############
    # grab info from the inputs_hook
    # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
    # being the temporal dimension.
    # input is 3D tensor of (timesteps, batch_size, data_dim)
    # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
    # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
    if self.inputs_hook is not None:
        self.input = self.inputs_hook[1]
        if self.input.ndim == 1:
            self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), 1, 2)
            self.input_size = 1
        elif self.input.ndim == 2:
            self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
        elif self.input.ndim == 3:
            pass
        elif self.input.ndim > 3:
            self.input = self.input.flatten(3)
            self.input_size = sum(self.input_size)
        else:
            raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
    else:
        # Assume input coming from optimizer is (batches, timesteps, data)
        # so, we need to reshape to (timesteps, batches, data)
        xs = T.tensor3("Xs")
        xs = xs.dimshuffle(1, 0, 2)
        self.input = xs

    # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
    # the same dimension ordering as the expected input from optimizer.
    # therefore, we need to swap it like we did to input xs.
    ys = T.tensor3("Ys")
    ys = ys.dimshuffle(1, 0, 2)
    self.target = ys

    ################
    # hiddens hook #
    ################
    # set an initial value for the recurrent hiddens from hook
    if self.hiddens_hook is not None:
        self.h_init = self.hiddens_hook[1]
        self.hidden_size = self.hiddens_hook[0]
    else:
        # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
        self.hidden_size = hidden_size

    ##################
    # for generating #
    ##################
    # symbolic scalar for how many recurrent steps to use during generation from the model
    self.n_steps = T.iscalar("generate_n_steps")

    self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph()
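# Hypothetical end-to-end sketch showing how this RNN is typically paired with the SGD
# optimizer defined earlier in this file. The dataset object and the train() entry point
# are assumptions for illustration (my_sequence_dataset stands in for a real opendeep
# Dataset of 3D sequences); only the keyword arguments shown in the constructors above
# are used.
rnn = RNN(input_size=28, hidden_size=128, output_size=28,
          hidden_activation='tanh', activation='sigmoid',
          cost_function='mse', clip_recurrent_grads=5.)
optimizer = SGD(model=rnn, dataset=my_sequence_dataset,  # my_sequence_dataset: hypothetical Dataset instance
                n_epoch=100, batch_size=50, learning_rate=1e-3,
                lr_decay='exponential', lr_factor=0.995)
optimizer.train()  # assumed entry point for the training loop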
def __init__(self, inputs=None, noise='dropout', noise_level=0.5,
             noise_decay=False, noise_decay_amount=0.99,
             mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
    """
    Parameters
    ----------
    inputs : tuple(shape, `Theano.TensorType`)
        tuple(shape, `Theano.TensorType`) describing the inputs to use for this layer.
        `shape` will be a monad tuple representing known sizes for each dimension in the `Theano.TensorType`.
        The length of `shape` should be equal to number of dimensions in `Theano.TensorType`, where the shape
        element is an integer representing the size for its dimension, or None if the shape isn't known.
        For example, if you have a matrix with unknown batch size but fixed feature size of 784, `shape` would
        be: (None, 784). The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    noise : str
        What type of noise to use for the output. See opendeep.utils.noise for options. This should be
        appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the standard
        deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the
        string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay
        (known as noise scheduling) effectively helps the model learn larger variance features first, and then
        smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function specified
        in `noise_decay`.
    mrg : random
        A random number generator that is used when adding noise. I recommend using Theano's
        sandbox.rng_mrg.MRG_RandomStreams.
    switch : boolean
        Whether to create a switch to turn noise on during training and off during testing (True). If False,
        noise will be applied at both training and testing times.
    """
    super(Noise, self).__init__(inputs=inputs,
                                noise=noise, noise_level=noise_level,
                                noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
                                mrg=mrg, switch=switch)
    # self.inputs is a list from superclass initialization, grab the first element
    self.output_size, self.inputs = self.inputs[0]

    log.debug('Adding %s noise switch.' % str(noise))
    if noise_level is not None:
        noise_level = sharedX(value=noise_level)
        noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
    else:
        noise_func = get_noise(noise, mrg=mrg)

    # apply the noise as a switch!
    # default to apply noise. this is for the cost and gradient functions to be computed later
    # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
    if switch:
        self.noise_switch = sharedX(value=1, name="noise_switch")

    # noise scheduling
    if noise_decay and noise_level is not None:
        self.noise_schedule = get_decay_function(noise_decay,
                                                 noise_level,
                                                 noise_level.get_value(),
                                                 noise_decay_amount)

    # apply noise to the inputs!
    if switch:
        self.outputs = T.switch(self.noise_switch,
                                noise_func(input=self.inputs),
                                self.inputs)
    else:
        self.outputs = noise_func(input=self.inputs)