def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnnrbm/', input_size=None, hidden_size=None, visible_activation='sigmoid', hidden_activation='sigmoid', weights_init='uniform', weights_mean=0, weights_std=5e-3, weights_interval='montreal', bias_init=0, mrg=RNG_MRG.MRG_RandomStreams(1), k=15, rnn_hidden_size=None, rnn_hidden_activation='rectifier', rnn_weights_init='identity', rnn_weights_mean=0, rnn_weights_std=5e-3, rnn_weights_interval='montreal', rnn_bias_init=0, generate_n_steps=200): """ Initialize the RNN-RBM. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the input i.e. input_size). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. hidden_size). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the RBM. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the RBM is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the RBM. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. mrg : random A random number generator that is used when sampling. The RBM is a probabilistic model, so it relies a lot on sampling. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. k : int The k number of steps used for CD-k or PCD-k with Gibbs sampling. Basically, the number of samples generated from the model to train against reconstructing the original input. rnn_hidden_size : int The number of hidden units (dimensionality) to use in the recurrent layer. 
rnn_hidden_activation : str or Callable The activation function to apply to recurrent units. See opendeep.utils.activation for options. rnn_weights_init : str Determines the method for initializing recurrent weights. See opendeep.utils.nnet for options. 'Identity' works well with 'rectifier' `rnn_hidden_activation`. rnn_weights_mean : float If Gaussian `rnn_weights_init`, the mean value to use. rnn_weights_std : float If Gaussian `rnn_weights_init`, the standard deviation to use. rnn_weights_interval : str or float If Uniform `rnn_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. rnn_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. generate_n_steps : int When generating from the model, how many steps to generate. """ # init Model to combine the defaults and config dictionaries with the initial parameters. initial_parameters = locals().copy() initial_parameters.pop('self') super(RNN_RBM, self).__init__(**initial_parameters) ################## # specifications # ################## self.mrg = mrg self.k = k self.generate_n_steps = generate_n_steps # grab info from the inputs_hook, hiddens_hook, or from parameters if self.inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) raise NotImplementedError("Inputs_hook not implemented yet for RNN-RBM") else: # make the input a symbolic matrix - a sequence of inputs self.input = T.matrix('Vs') # set an initial value for the recurrent hiddens self.u0 = T.zeros((rnn_hidden_size,)) # make a symbolic vector for the initial recurrent hiddens value to use during generation for the model self.generate_u0 = T.vector("generate_u0") # either grab the hidden's desired size from the parameter directly, or copy n_in self.hidden_size = hidden_size or self.input_size # deal with hiddens_hook if self.hiddens_hook is not None: raise NotImplementedError("Hiddens_hook not implemented yet for RNN_RBM") # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.visible_activation_func): self.visible_sampling = self.mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.hidden_activation_func): self.hidden_sampling = self.mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary hidden activation not supported yet!") raise NotImplementedError("Non-binary hidden activation not supported yet!") # recurrent hidden activation function! self.rnn_hidden_activation_func = get_activation_function(rnn_hidden_activation) # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") #################################################### # parameters - make sure to deal with params_hook!
# #################################################### if self.params_hook is not None: # make sure the params_hook has W (weights matrix) and bh, bv (bias vectors) assert len(self.params_hook) == 8, \ "Expected 8 params (W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu) for RBM, found {0!s}!".format( len(self.params_hook) ) self.W, self.bv, self.bh, self.Wuh, self.Wuv, self.Wvu, self.Wuu, self.bu = self.params_hook else: # RBM weight params self.W = get_weights(weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W", rng=self.mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # RNN weight params self.Wuh = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.hidden_size), name="Wuh", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wuv = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.input_size), name="Wuv", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wvu = get_weights(weights_init=rnn_weights_init, shape=(self.input_size, rnn_hidden_size), name="Wvu", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wuu = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, rnn_hidden_size), name="Wuu", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) # grab the bias vectors # rbm biases self.bv = get_bias(shape=self.input_size, name="bv", init_values=bias_init) self.bh = get_bias(shape=self.hidden_size, name="bh", init_values=bias_init) # rnn bias self.bu = get_bias(shape=rnn_hidden_size, name="bu", init_values=rnn_bias_init) # Finally have the parameters self.params = [self.W, self.bv, self.bh, self.Wuh, self.Wuv, self.Wvu, self.Wuu, self.bu] # Create the RNN-RBM graph! self.v_sample, self.cost, self.monitors, self.updates_train, self.v_ts, self.updates_generate, self.u_t = \ self._build_rnnrbm() log.info("Initialized an RNN-RBM!")
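# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the per-timestep
# recurrence that _build_rnnrbm() is expected to implement (typically with
# theano.scan), following Boulanger-Lewandowski et al. (2012). The RBM biases
# at step t are conditioned on the previous recurrent state u_{t-1}, and u_t
# is updated from the current visible frame v_t. NumPy is used here only for
# clarity; shapes match the parameters created above (Wuh, Wuv, Wvu, Wuu, bv,
# bh, bu).
import numpy as np

def _rnnrbm_step_sketch(v_t, u_tm1, Wuh, Wuv, Wvu, Wuu, bv, bh, bu,
                        rnn_act=lambda a: np.maximum(a, 0.0)):
    """One step of the RNN-RBM recurrence (rnn_act defaults to a rectifier,
    matching the constructor's default `rnn_hidden_activation`)."""
    bh_t = bh + u_tm1.dot(Wuh)   # hidden bias of the step-t RBM
    bv_t = bv + u_tm1.dot(Wuv)   # visible bias of the step-t RBM
    u_t = rnn_act(bu + v_t.dot(Wvu) + u_tm1.dot(Wuu))  # new recurrent state
    return bv_t, bh_t, u_t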
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/', input_size=None, hidden_size=1000, layers=2, walkbacks=4, visible_activation='sigmoid', hidden_activation='tanh', input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1), tied_weights=True, weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, cost_function='binary_crossentropy', cost_args=None, add_noise=True, noiseless_h1=True, hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4, noise_decay='exponential', noise_annealing=1, image_width=None, image_height=None, **kwargs): """ Initialize a GSN. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's output layer gives a generative recurrent model.) For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. n_hidden). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters - such as a training model with dropout applied to layers and one without for testing, where the parameters are shared between the two. outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than `input_size`, which is known as *overcomplete*. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. layers : int The number of hidden layers to use. walkbacks : int The number of walkbacks to perform (the variable K in Bengio's paper above). 
A walkback is a Gibbs sample from the DAE, which means the model generates inputs in sequence, where each generated input is compared to the original input to create the reconstruction cost for training. For running the model, the very last generated input in the Gibbs chain is used as the output. input_sampling : bool During walkbacks, whether to sample from the generated input to create a new starting point for the next walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the process more stochastic - more likely to find spurious modes in the model's representation. mrg : random A random number generator that is used when adding noise into the network and for sampling from the input. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. tied_weights : bool DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the reconstruction cost of the model. This should be appropriate for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. add_noise : bool Whether to add noise (corrupt) the input before passing it through the computation graph during training. This should most likely be set to the default of True, because this is a *denoising* autoencoder after all. noiseless_h1 : bool Whether to not add noise (corrupt) the hidden layer during computation. hidden_noise : str What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. hidden_noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. input_noise : str What type of noise to use for corrupting the input before computation (if `add_noise`). See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper for binary units, etc. input_noise_level : float The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper, standard deviation for Gaussian, interval for Uniform, etc. noise_decay : str or False Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. 
Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_annealing : float The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified in `noise_decay`. image_width : int If the input should be represented as an image, the width of the input image. If not specified, it will be close to the square factor of the `input_size`. image_height : int If the input should be represented as an image, the height of the input image. If not specified, it will be close to the square factor of the `input_size`. """ # init Model to combine the defaults and config dictionaries with the initial parameters. initial_parameters = locals().copy() initial_parameters.pop('self') super(GSN, self).__init__(**initial_parameters) # when the input should be thought of as an image, either use the specified width and height, # or try to make as square as possible. if image_height is None and image_width is None: (_h, _w) = closest_to_square_factors(self.input_size) self.image_width = _w self.image_height = _h else: self.image_height = image_height self.image_width = image_width ############################ # Theano variables and RNG # ############################ if self.inputs_hook is None: self.X = T.matrix('X') else: # inputs_hook is a (shape, input) tuple self.X = self.inputs_hook[1] ########################## # Network specifications # ########################## # generally, walkbacks should be at least 2*layers if layers % 2 == 0: if walkbacks < 2*layers: log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. ' 'Generaly want 2X walkbacks to layers', str(layers), str(walkbacks)) else: if walkbacks < 2*layers-1: log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. ' 'Generaly want 2X walkbacks to layers', str(layers), str(walkbacks)) self.add_noise = add_noise self.noise_annealing = as_floatX(noise_annealing) # noise schedule parameter self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX) self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg) self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX) self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg) self.walkbacks = walkbacks self.tied_weights = tied_weights self.layers = layers self.noiseless_h1 = noiseless_h1 self.input_sampling = input_sampling self.noise_decay = noise_decay # if there was a hiddens_hook, unpack the hidden layers in the tensor if self.hiddens_hook is not None: hidden_size = self.hiddens_hook[0] self.hiddens_flag = True else: self.hiddens_flag = False # determine the sizes of each layer in a list. # layer sizes, from h0 to hK (h0 is the visible layer) hidden_size = list(raise_to_list(hidden_size)) if len(hidden_size) == 1: self.layer_sizes = [self.input_size] + hidden_size * self.layers else: assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \ "Hiddens %d and layers %d" % (len(hidden_size), self.layers) self.layer_sizes = [self.input_size] + hidden_size if self.hiddens_hook is not None: self.hiddens = self.unpack_hiddens(self.hiddens_hook[1]) ######################### # Activation functions! 
# ######################### # hidden unit activation self.hidden_activation = get_activation_function(hidden_activation) # Visible layer activation self.visible_activation = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.visible_activation): self.visible_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # Cost function self.cost_function = get_cost_function(cost_function) self.cost_args = cost_args or dict() ############### # Parameters! # ############### # make sure to deal with params_hook! if self.params_hook is not None: # if tied weights, expect layers*2 + 1 params if self.tied_weights: assert len(self.params_hook) == 2*layers + 1, \ "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook)) self.weights_list = self.params_hook[:layers] self.bias_list = self.params_hook[layers:] # if untied weights, expect layers*3 + 1 params else: assert len(self.params_hook) == 3*layers + 1, \ "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook)) self.weights_list = self.params_hook[:2*layers] self.bias_list = self.params_hook[2*layers:] # otherwise, construct our params else: # initialize a list of weights and biases based on layer_sizes for the GSN self.weights_list = [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i], self.layer_sizes[i+1]), name="W_{0!s}_{1!s}".format(i, i+1), rng=mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in range(layers)] # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now) if not tied_weights: self.weights_list.extend( [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i+1], self.layer_sizes[i]), name="W_{0!s}_{1!s}".format(i+1, i), rng=mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in reversed(range(layers))] ) # initialize each layer bias to 0's. self.bias_list = [get_bias(shape=(self.layer_sizes[i],), name='b_' + str(i), init_values=bias_init) for i in range(layers+1)] # build the params of the model into a list self.params = self.weights_list + self.bias_list log.debug("gsn params: %s", str(self.params)) # using the properties, build the computational graph self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()
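# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the alternating
# odd/even update order that one GSN walkback performs over the layers, which
# is roughly why the warning above asks for about 2*layers walkbacks
# (information needs that many half-steps to propagate up through the stack
# and back down). The real schedule is built symbolically in
# build_computation_graph() using update_single_layer() defined later; this
# outline only shows the ordering.
def _walkback_schedule_sketch(layers, walkbacks):
    """Yield (walkback, layer_idx) pairs in the order layers would be updated.

    Layer 0 is the visible layer; layers 1..`layers` are the hiddens."""
    for step in range(walkbacks):
        # odd layers first, driven by their even-indexed neighbours...
        for layer_idx in range(1, layers + 1, 2):
            yield step, layer_idx
        # ...then the even layers, including the visible layer 0 (each visible
        # update contributes one term to the reconstruction cost chain).
        for layer_idx in range(0, layers + 1, 2):
            yield step, layer_idx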
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir=None, input_size=None, hidden_size=None, layers=2, walkbacks=4, visible_activation='sigmoid', hidden_activation='tanh', input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1), tied_weights=True, weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0, cost_function='binary_crossentropy', cost_args=None, add_noise=True, noiseless_h1=True, hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4, noise_decay='exponential', noise_annealing=1, image_width=None, image_height=None, rnn_hidden_size=None, rnn_hidden_activation='rectifier', rnn_weights_init='identity', rnn_weights_mean=0, rnn_weights_std=5e-3, rnn_weights_interval='montreal', rnn_bias_init=0, generate_n_steps=200): """ Initialize an RNN-GSN. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's output layer gives a generative recurrent model.) For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. n_hidden). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters - such as a training model with dropout applied to layers and one without for testing, where the parameters are shared between the two. outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than `input_size`, which is known as *overcomplete*. layers : int The number of hidden layers to use. walkbacks : int The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample from the DAE, which means the model generates inputs in sequence, where each generated input is compared to the original input to create the reconstruction cost for training. For running the model, the very last generated input in the Gibbs chain is used as the output. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. 
hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. input_sampling : bool During walkbacks, whether to sample from the generated input to create a new starting point for the next walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the process more stochastic - more likely to find spurious modes in the model's representation. mrg : random A random number generator that is used when adding noise into the network and for sampling from the input. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. tied_weights : bool DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the reconstruction cost of the model. This should be appropriate for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. add_noise : bool Whether to add noise (corrupt) the input before passing it through the computation graph during training. This should most likely be set to the default of True, because this is a *denoising* autoencoder after all. noiseless_h1 : bool Whether to not add noise (corrupt) the hidden layer during computation. hidden_noise : str What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. hidden_noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. input_noise : str What type of noise to use for corrupting the input before computation (if `add_noise`). See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper for binary units, etc. input_noise_level : float The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper, standard deviation for Gaussian, interval for Uniform, etc. noise_decay : str or False Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. 
Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_annealing : float The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified in `noise_decay`. image_width : int If the input should be represented as an image, the width of the input image. If not specified, it will be close to the square factor of the `input_size`. image_height : int If the input should be represented as an image, the height of the input image. If not specified, it will be close to the square factor of the `input_size`. rnn_hidden_size : int The number of hidden units (dimensionality) to use in the recurrent layer. rnn_hidden_activation : str or Callable The activation function to apply to recurrent units. See opendeep.utils.activation for options. rnn_weights_init : str Determines the method for initializing recurrent weights. See opendeep.utils.nnet for options. 'Identity' works well with 'rectifier' `rnn_hidden_activation`. rnn_weights_mean : float If Gaussian `rnn_weights_init`, the mean value to use. rnn_weights_std : float If Gaussian `rnn_weights_init`, the standard deviation to use. rnn_weights_interval : str or float If Uniform `rnn_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. rnn_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. generate_n_steps : int When generating from the model, how many steps to generate. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(RNN_GSN, self).__init__(**initial_parameters) ################## # specifications # ################## self.input_size = input_size self.layers = layers self.walkbacks = walkbacks self.input_sampling = input_sampling self.mrg = mrg self.tied_weights = tied_weights self.noise_decay = noise_decay self.noise_annealing = noise_annealing self.add_noise = add_noise self.noiseless_h1 = noiseless_h1 self.hidden_noise = hidden_noise self.hidden_noise_level = hidden_noise_level self.input_noise = input_noise self.input_noise_level = input_noise_level self.image_width = image_width self.image_height = image_height # grab info from the inputs_hook, hiddens_hook, or from parameters if self.inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) raise NotImplementedError("Inputs_hook not implemented yet for RNN-GSN") else: # make the input a symbolic matrix - a sequence of inputs self.input = T.matrix('Xs') # set an initial value for the recurrent hiddens self.u0 = T.zeros((rnn_hidden_size,)) # make a symbolic vector for the initial recurrent hiddens value to use during generation for the model self.generate_u0 = T.vector("generate_u0") # either grab the hidden's desired size from the parameter directly, or copy n_in self.hidden_size = hidden_size or self.input_size # deal with hiddens_hook if self.hiddens_hook is not None: raise NotImplementedError("Hiddens_hook not implemented yet for RNN-GSN") # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. 
if is_binary(self.visible_activation_func): self.visible_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # recurrent hidden activation function! self.rnn_hidden_activation_func = get_activation_function(rnn_hidden_activation) # Cost function self.cost_function = get_cost_function(cost_function) self.cost_args = cost_args # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") # determine the sizes of each layer in a list. # layer sizes, from h0 to hK (h0 is the visible layer) self.layer_sizes = [self.input_size] + [self.hidden_size] * self.layers #################################################### # parameters - make sure to deal with params_hook! # #################################################### if self.params_hook is not None: # if tied weights, expect (layers*2 + 1) params for GSN and (int(layers+1)/int(2) + 3) for RNN if self.tied_weights: expected_num = (2*self.layers + 1) + (int(self.layers+1)/2 + 3) assert len(self.params_hook) == expected_num, \ "Tied weights: expected {0!s} params, found {1!s}!".format(expected_num, len(self.params_hook)) gsn_len = (2*self.layers + 1) self.weights_list = self.params_hook[:self.layers] self.bias_list = self.params_hook[self.layers:gsn_len] # if untied weights, expect layers*3 + 1 params else: expected_num = (3*self.layers + 1) + (int(self.layers + 1)/2 + 3) assert len(self.params_hook) == expected_num, \ "Untied weights: expected {0!s} params, found {1!s}!".format(expected_num, len(self.params_hook)) gsn_len = (3*self.layers + 1) self.weights_list = self.params_hook[:2*self.layers] self.bias_list = self.params_hook[2*self.layers:gsn_len] rnn_len = gsn_len + int(self.layers + 1) / 2 self.recurrent_to_gsn_weights_list = self.params_hook[gsn_len:rnn_len] self.W_u_u = self.params_hook[rnn_len:rnn_len + 1] self.W_x_u = self.params_hook[rnn_len + 1:rnn_len + 2] self.recurrent_bias = self.params_hook[rnn_len + 2:rnn_len + 3] # otherwise, construct our params else: # initialize a list of weights and biases based on layer_sizes for the GSN self.weights_list = [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i], self.layer_sizes[i + 1]), name="W_{0!s}_{1!s}".format(i, i + 1), # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in range(self.layers)] # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now) if not self.tied_weights: self.weights_list.extend( [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i + 1], self.layer_sizes[i]), name="W_{0!s}_{1!s}".format(i + 1, i), # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in reversed(range(self.layers))] ) # initialize each layer bias to 0's. 
self.bias_list = [get_bias(shape=(self.layer_sizes[i],), name='b_' + str(i), init_values=bias_init) for i in range(self.layers + 1)] self.recurrent_to_gsn_weights_list = [ get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.layer_sizes[layer]), name="W_u_h{0!s}".format(layer), # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) for layer in range(self.layers + 1) if layer % 2 != 0 ] self.W_u_u = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, rnn_hidden_size), name="W_u_u", # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, #if uniform interval=rnn_weights_interval) self.W_x_u = get_weights(weights_init=rnn_weights_init, shape=(self.input_size, rnn_hidden_size), name="W_x_u", # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.recurrent_bias = get_bias(shape=(rnn_hidden_size,), name="b_u", init_values=rnn_bias_init) # build the params of the model into a list self.gsn_params = self.weights_list + self.bias_list self.params = self.gsn_params + \ self.recurrent_to_gsn_weights_list + \ [self.W_u_u, self.W_x_u, self.recurrent_bias] log.debug("rnn-gsn params: %s", str(self.params)) # Create the RNN-GSN graph! self.x_sample, self.cost, self.monitors, self.updates_train, self.x_ts, self.updates_generate, self.u_t = \ self._build_rnngsn() log.info("Initialized an RNN-GSN!")
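# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the per-timestep
# recurrence that _build_rnngsn() is expected to implement. The previous
# recurrent state u_{t-1} is projected into starting values for the odd GSN
# hidden layers (one matrix per odd layer in recurrent_to_gsn_weights_list),
# the GSN then runs its walkbacks on x_t from those hiddens, and u_t is
# updated from the current input frame. NumPy is used only for clarity; the
# exact graph (and any activations applied to the projections) lives in
# _build_rnngsn().
import numpy as np

def _rnngsn_step_sketch(x_t, u_tm1, recurrent_to_gsn_weights, W_x_u, W_u_u, b_u,
                        rnn_act=lambda a: np.maximum(a, 0.0)):
    # starting values for the odd GSN hidden layers at this timestep
    odd_hidden_inits = [u_tm1.dot(W_u_h) for W_u_h in recurrent_to_gsn_weights]
    # recurrent state update from the current input frame
    u_t = rnn_act(b_u + x_t.dot(W_x_u) + u_tm1.dot(W_u_u))
    return odd_hidden_inits, u_t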
def update_single_layer(self, hiddens, p_X_chain, layer_idx, add_noise): # Compute the dot product, whatever layer # If the visible layer X if layer_idx == 0: if self.tied_weights: log.debug('using ' + str(self.weights_list[layer_idx]) + '.T') hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[layer_idx].T) + \ self.bias_list[layer_idx] else: log.debug('using ' + str(self.weights_list[-(layer_idx+1)])) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[-(layer_idx+1)]) + \ self.bias_list[layer_idx] # If the top layer elif layer_idx == len(hiddens) - 1: log.debug('using ' + str(self.weights_list[layer_idx-1])) hiddens[layer_idx] = T.dot(hiddens[layer_idx-1], self.weights_list[layer_idx-1]) + self.bias_list[layer_idx] # Otherwise in-between layers else: if self.tied_weights: log.debug("using %s and %s.T", str(self.weights_list[layer_idx-1]), str(self.weights_list[layer_idx])) # next layer : hiddens[layer_idx+1], assigned weights : W_i # previous layer : hiddens[layer_idx-1], assigned weights : W_(layer_idx-1) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[layer_idx].T) + \ T.dot(hiddens[layer_idx-1], self.weights_list[layer_idx-1]) + \ self.bias_list[layer_idx] else: log.debug("using %s and %s", str(self.weights_list[layer_idx-1]), str(self.weights_list[-(layer_idx+1)])) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[-(layer_idx+1)]) + \ T.dot(hiddens[layer_idx-1], self.weights_list[layer_idx-1]) + \ self.bias_list[layer_idx] # Add pre-activation noise if NOT input layer if layer_idx == 1 and self.noiseless_h1: log.debug('>>NO noise in first hidden layer') add_noise = False # pre activation noise if layer_idx != 0 and add_noise: log.debug('Adding pre-activation gaussian noise for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_noise(hiddens[layer_idx]) # ACTIVATION! if layer_idx == 0: log.debug('Activation for visible layer') hiddens[layer_idx] = self.visible_activation(hiddens[layer_idx]) else: log.debug('Hidden units activation for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_activation(hiddens[layer_idx]) # post activation noise # why is there post activation noise? Because there is already pre-activation noise, # this just doubles the amount of noise between each activation of the hiddens. if layer_idx != 0 and add_noise: log.debug('Adding post-activation gaussian noise for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_noise(hiddens[layer_idx]) # build the reconstruction chain if updating the visible layer X if layer_idx == 0: # if input layer -> append p(X|H...) p_X_chain.append(hiddens[layer_idx]) # sample from p(X|H...) - SAMPLING NEEDS TO BE CORRECT FOR INPUT TYPES # I.E. FOR BINARY MNIST SAMPLING IS BINOMIAL. real-valued inputs should be gaussian if self.input_sampling: if not is_binary(self.visible_activation): # TODO: implement non-binary sampling (gaussian) log.error("Non-binary visible activation sampling not yet supported.") raise NotImplementedError("Non-binary visible activation sampling not yet supported.") log.debug('Sampling from input') sampled = self.visible_sampling(p=hiddens[layer_idx], size=hiddens[layer_idx].shape, dtype=theano.config.floatX) else: log.debug('>>NO input sampling') sampled = hiddens[layer_idx] # add noise to input layer sampled = self.input_noise(sampled) # set input layer hiddens[layer_idx] = sampled
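# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the in-between-layer
# pre-activation computed above for the tied-weights case, written out in
# NumPy. Layer i receives input from both neighbours: the layer above through
# the transpose of its own weight matrix W_i, and the layer below through
# W_{i-1}, plus its bias, before noise and the hidden activation are applied.
import numpy as np

def _middle_layer_preactivation_sketch(h_below, h_above, W_below, W_this, b_this):
    """Pre-activation of hidden layer i from its two neighbours (tied weights)."""
    return h_above.dot(W_this.T) + h_below.dot(W_below) + b_this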
def __init__(self, inputs=None, hiddens=None, params=None, outdir='outputs/rbm/', visible_activation='sigmoid', hidden_activation='sigmoid', weights_init='uniform', weights_mean=0, weights_std=5e-3, weights_interval='montreal', bias_init=0.0, mrg=RNG_MRG.MRG_RandomStreams(1), k=15): """ RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. Parameters ---------- inputs : List of [tuple(shape, `Theano.TensorType`)] The dimensionality of the inputs for this model, and the routing information for the model to accept inputs from elsewhere. `inputs` variable are expected to be of the form (timesteps, batch, data). `shape` will be a monad tuple representing known sizes for each dimension in the `Theano.TensorType`. The length of `shape` should be equal to number of dimensions in `Theano.TensorType`, where the shape element is an integer representing the size for its dimension, or None if the shape isn't known. For example, if you have a matrix with unknown batch size but fixed feature size of 784, `shape` would be: (None, 784). The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)]. hiddens : int or Tuple of (shape, `Theano.TensorType`) Int for the number of hidden units to use, or a tuple of shape, expression to route the starting hidden values from elsewhere. params : Dict(string_name: theano SharedVariable), optional A dictionary of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters - such as siamese networks or pretraining some weights. outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. mrg : random A random number generator that is used when sampling. The RBM is a probabilistic model, so it relies a lot on sampling. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. k : int The k number of steps used for CD-k or PCD-k with Gibbs sampling. Basically, the number of samples generated from the model to train against reconstructing the original input. 
""" # init Model to combine the defaults and config dictionaries with the initial parameters. initial_parameters = locals().copy() initial_parameters.pop('self') super(RBM, self).__init__(**initial_parameters) ################## # specifications # ################## if len(self.inputs) > 1: raise NotImplementedError("Expected 1 input to RBM, found %d. Please merge inputs before passing " "to the model!" % len(self.inputs)) # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input) input_shape, self.input = self.inputs[0] if isinstance(input_shape, int): self.input_size = ((None,) * (self.input.ndim - 1)) + (input_shape,) else: self.input_size = input_shape assert self.input_size is not None, "Need to specify the shape for the last dimension of the input!" # our output space is the same as the input space self.output_size = self.input_size # grab hiddens # have only 1 hiddens assert len(self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(self.hiddens) self.hiddens = self.hiddens[0] if isinstance(self.hiddens, int): hidden_size = self.hiddens hiddens_init = None elif isinstance(self.hiddens, tuple): hidden_shape, hiddens_init = self.hiddens if isinstance(hidden_shape, int): hidden_size = hidden_shape else: hidden_size = hidden_shape[-1] else: raise AssertionError("Hiddens need to be an int or tuple of (shape, theano_expression), found %s" % type(self.hiddens)) # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.visible_activation_func): self.visible_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.hidden_activation_func): self.hidden_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary hidden activation not supported yet!") raise NotImplementedError("Non-binary hidden activation not supported yet!") #################################################### # parameters - make sure to deal with params_hook! 
# #################################################### self.W = self.params.get( "W", get_weights(weights_init=weights_init, shape=(self.input_size[-1], hidden_size), name="W", rng=mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) ) self.b_v = self.params.get( "b_v", get_bias(shape=self.input_size[-1], name="b_v", init_values=bias_init) ) self.b_h = self.params.get( "b_h", get_bias(shape=hidden_size, name="b_h", init_values=bias_init) ) # Finally have the parameters self.params = {"W": self.W, "b_v": self.b_v, "b_h": self.b_h} ############### # computation # ############### # initialize from visibles if we aren't generating from some hiddens if hiddens_init is None: [_, v_chain, _, h_chain], self.updates = theano.scan(fn=self._gibbs_step_vhv, outputs_info=[None, self.input, None, None], n_steps=k) # initialize from hiddens else: [_, v_chain, _, h_chain], self.updates = theano.scan(fn=self._gibbs_step_hvh, outputs_info=[None, None, None, hiddens_init], n_steps=k) self.v_sample = v_chain[-1] self.h_sample = h_chain[-1] mean_v, _, _, _ = self._gibbs_step_vhv(self.v_sample) # the free-energy cost function! # consider v_sample constant when computing gradients on the cost function # this actually keeps v_sample from being considered in the gradient, to set gradient to 0 instead, # use theano.gradient.zero_grad v_sample_constant = theano.gradient.disconnected_grad(self.v_sample) # v_sample_constant = v_sample self.cost = (self.free_energy(self.input) - self.free_energy(v_sample_constant)) / self.input.shape[0] log.debug("Initialized an RBM shape %s", str((self.input_size, hidden_size)))
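# The cost above is the difference of free energies between the data and the k-step Gibbs sample,
# normalized by the batch size. `free_energy` itself is not shown in this excerpt; the following is
# a minimal sketch of what it typically computes for binary (sigmoid) visible and hidden units,
# reusing the `self.W`, `self.b_v`, and `self.b_h` parameters defined above. This is an illustrative
# assumption, not necessarily the library's exact method.
import theano.tensor as T

def free_energy(self, v):
    # F(v) = -v . b_v - sum_j softplus(b_h_j + (v W)_j), summed over the batch;
    # the cost above then divides by the batch size to get a mean.
    vbias_term = T.dot(v, self.b_v).sum()
    hidden_term = T.nnet.softplus(T.dot(v, self.W) + self.b_h).sum()
    return -vbias_term - hidden_term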
def update_single_layer(self, hiddens, p_X_chain, layer_idx, add_noise): # Compute the dot product, whatever layer # If the visible layer X if layer_idx == 0: if self.tied_weights: log.debug('using ' + str(self.weights_list[layer_idx]) + '.T') hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[layer_idx].T) + \ self.bias_list[layer_idx] else: log.debug('using ' + str(self.weights_list[-(layer_idx + 1)])) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[-(layer_idx+1)]) + \ self.bias_list[layer_idx] # If the top layer elif layer_idx == len(hiddens) - 1: log.debug('using ' + str(self.weights_list[layer_idx - 1])) hiddens[layer_idx] = T.dot( hiddens[layer_idx - 1], self.weights_list[layer_idx - 1]) + self.bias_list[layer_idx] # Otherwise in-between layers else: if self.tied_weights: log.debug("using %s and %s.T", str(self.weights_list[layer_idx - 1]), str(self.weights_list[layer_idx])) # next layer : hiddens[layer_idx+1], assigned weights : W_i # previous layer : hiddens[layer_idx-1], assigned weights : W_(layer_idx-1) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[layer_idx].T) + \ T.dot(hiddens[layer_idx-1], self.weights_list[layer_idx-1]) + \ self.bias_list[layer_idx] else: log.debug("using %s and %s", str(self.weights_list[layer_idx - 1]), str(self.weights_list[-(layer_idx + 1)])) hiddens[layer_idx] = T.dot(hiddens[layer_idx+1], self.weights_list[-(layer_idx+1)]) + \ T.dot(hiddens[layer_idx-1], self.weights_list[layer_idx-1]) + \ self.bias_list[layer_idx] # Add pre-activation noise if NOT input layer if layer_idx == 1 and self.noiseless_h1: log.debug('>>NO noise in first hidden layer') add_noise = False # pre activation noise if layer_idx != 0 and add_noise: log.debug('Adding pre-activation gaussian noise for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_noise(hiddens[layer_idx]) # ACTIVATION! if layer_idx == 0: log.debug('Activation for visible layer') hiddens[layer_idx] = self.visible_activation(hiddens[layer_idx]) else: log.debug('Hidden units activation for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_activation(hiddens[layer_idx]) # post activation noise # why is there post activation noise? Because there is already pre-activation noise, # this just doubles the amount of noise between each activation of the hiddens. if layer_idx != 0 and add_noise: log.debug('Adding post-activation gaussian noise for layer %s', str(layer_idx)) hiddens[layer_idx] = self.hidden_noise(hiddens[layer_idx]) # build the reconstruction chain if updating the visible layer X if layer_idx == 0: # if input layer -> append p(X|H...) p_X_chain.append(hiddens[layer_idx]) # sample from p(X|H...) - SAMPLING NEEDS TO BE CORRECT FOR INPUT TYPES # I.E. FOR BINARY MNIST SAMPLING IS BINOMIAL. real-valued inputs should be gaussian if self.input_sampling: if not is_binary(self.visible_activation): # TODO: implement non-binary sampling (gaussian) log.error( "Non-binary visible activation sampling not yet supported." ) raise NotImplementedError( "Non-binary visible activation sampling not yet supported." ) log.debug('Sampling from input') sampled = self.visible_sampling(p=hiddens[layer_idx], size=hiddens[layer_idx].shape, dtype=theano.config.floatX) else: log.debug('>>NO input sampling') sampled = hiddens[layer_idx] # add noise to input layer sampled = self.input_noise(sampled) # set input layer hiddens[layer_idx] = sampled
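# update_single_layer is only one piece of a GSN walkback step. The layers are not all updated at
# once: a walkback step typically updates the odd-numbered layers first and then the even-numbered
# layers (layer 0, the visible layer, is even), so that each layer is computed from freshly updated
# neighbours. The driver below is a hypothetical sketch of that ordering, not the library's exact
# method.
def update_layers(self, hiddens, p_X_chain, add_noise=True):
    # one walkback step: odd layers, then even layers (including the visible layer 0)
    for layer_idx in range(1, len(hiddens), 2):
        self.update_single_layer(hiddens, p_X_chain, layer_idx, add_noise)
    for layer_idx in range(0, len(hiddens), 2):
        self.update_single_layer(hiddens, p_X_chain, layer_idx, add_noise)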
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rbm/', input_size=None, hidden_size=None, visible_activation='sigmoid', hidden_activation='sigmoid', weights_init='uniform', weights_mean=0, weights_std=5e-3, weights_interval='montreal', bias_init=0.0, mrg=RNG_MRG.MRG_RandomStreams(1), k=15, persistent=True): """ RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the input i.e. input_size). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. hidden_size). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the RBM. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the RBM is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the RBM. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. mrg : random A random number generator that is used when sampling. The RBM is a probabilistic model, so it relies a lot on sampling. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. k : int The k number of steps used for CD-k or PCD-k with Gibbs sampling. Basically, the number of samples generated from the model to train against reconstructing the original input. """ # init Model to combine the defaults and config dictionaries with the initial parameters. 
super(RBM, self).__init__(**{arg: val for (arg, val) in locals().items() if arg != 'self'}) ################## # specifications # ################## # grab info from the inputs_hook, hiddens_hook, or from parameters if inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) assert len(inputs_hook) == 2, 'Expected inputs_hook to be tuple!' # make sure inputs_hook is a tuple self.input = inputs_hook[1] else: # make the input a symbolic matrix self.input = T.matrix('V') # either grab the hidden's desired size from the parameter directly, or copy n_in hidden_size = hidden_size or self.input_size # get the number of steps k self.k = k # deal with hiddens_hook if hiddens_hook is not None: # make sure hiddens_hook is a tuple assert len(hiddens_hook) == 2, 'Expected hiddens_hook to be tuple!' hidden_size = hiddens_hook[0] or hidden_size self.hiddens_init = hiddens_hook[1] else: self.hiddens_init = None # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.visible_activation_func): self.visible_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.hidden_activation_func): self.hidden_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary hidden activation not supported yet!") raise NotImplementedError("Non-binary hidden activation not supported yet!") #################################################### # parameters - make sure to deal with params_hook! # #################################################### if params_hook is not None: # make sure the params_hook has W (weights matrix) and bh, bv (bias vectors) assert len(params_hook) == 3, \ "Expected 3 params (W, bv, bh) for RBM, found {0!s}!".format(len(params_hook)) # doesn't matter if bv and bh are vectors or matrices. self.W, self.bv, self.bh = params_hook hidden_size = self.W.shape[1].eval() else: self.W = get_weights(weights_init=weights_init, shape=(self.input_size, hidden_size), name="W", rng=mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # grab the bias vectors self.bv = get_bias(shape=self.input_size, name="bv", init_values=bias_init) self.bh = get_bias(shape=hidden_size, name="bh", init_values=bias_init) # Finally have the parameters self.params = [self.W, self.bv, self.bh] # Create the RBM graph! self.cost, self.monitors, self.updates, self.v_sample, self.h_sample = self._build_rbm() log.debug("Initialized an RBM shape %s", str((self.input_size, hidden_size)))
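# A quick usage sketch for the hook-based constructor above, assuming this class is importable as
# `RBM`, that the Model base class records the `input_size` keyword, and that the data is
# 784-dimensional binary vectors (all of these are illustrative assumptions). The attributes used
# here - `input`, `v_sample`, and `updates` - are all created in the __init__ above, so a k-step
# Gibbs reconstruction function can be compiled straight from them.
import numpy
import theano

rbm = RBM(input_size=784, hidden_size=500, k=15)  # hypothetical sizes
reconstruct = theano.function(inputs=[rbm.input],
                              outputs=rbm.v_sample,
                              updates=rbm.updates)
batch = numpy.random.binomial(n=1, p=0.5, size=(20, 784)).astype(theano.config.floatX)
reconstructions = reconstruct(batch)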
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnnrbm/', input_size=None, hidden_size=None, visible_activation='sigmoid', hidden_activation='sigmoid', weights_init='uniform', weights_mean=0, weights_std=5e-3, weights_interval='montreal', bias_init=0, mrg=RNG_MRG.MRG_RandomStreams(1), k=15, rnn_hidden_size=None, rnn_hidden_activation='rectifier', rnn_weights_init='identity', rnn_weights_mean=0, rnn_weights_std=5e-3, rnn_weights_interval='montreal', rnn_bias_init=0, generate_n_steps=200): """ Initialize the RNN-RBM. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the input i.e. input_size). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. hidden_size). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the RBM. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the RBM is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the RBM. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. mrg : random A random number generator that is used when sampling. The RBM is a probabilistic model, so it relies a lot on sampling. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. k : int The k number of steps used for CD-k or PCD-k with Gibbs sampling. Basically, the number of samples generated from the model to train against reconstructing the original input. rnn_hidden_size : int The number of hidden units (dimensionality) to use in the recurrent layer. 
rnn_hidden_activation : str or Callable The activation function to apply to recurrent units. See opendeep.utils.activation for options. rnn_weights_init : str Determines the method for initializing recurrent weights. See opendeep.utils.nnet for options. 'Identity' works well with 'rectifier' `rnn_hidden_activation`. rnn_weights_mean : float If Gaussian `rnn_weights_init`, the mean value to use. rnn_weights_std : float If Gaussian `rnn_weights_init`, the standard deviation to use. rnn_weights_interval : str or float If Uniform `rnn_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. rnn_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. generate_n_steps : int When generating from the model, how many steps to generate. """ super(RNN_RBM, self).__init__(**{arg: val for (arg, val) in locals().items() if arg is not 'self'}) ################## # specifications # ################## self.mrg = mrg self.k = k self.generate_n_steps = generate_n_steps # grab info from the inputs_hook, hiddens_hook, or from parameters if self.inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) raise NotImplementedError("Inputs_hook not implemented yet for RNN-RBM") else: # make the input a symbolic matrix - a sequence of inputs self.input = T.matrix('Vs') # set an initial value for the recurrent hiddens self.u0 = T.zeros((rnn_hidden_size,)) # make a symbolic vector for the initial recurrent hiddens value to use during generation for the model self.generate_u0 = T.vector("generate_u0") # either grab the hidden's desired size from the parameter directly, or copy n_in self.hidden_size = hidden_size or self.input_size # deal with hiddens_hook if self.hiddens_hook is not None: raise NotImplementedError("Hiddens_hook not implemented yet for RNN_RBM") # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.visible_activation_func): self.visible_sampling = self.mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # make sure the sampling functions are appropriate for the activation functions. if is_binary(self.hidden_activation_func): self.hidden_sampling = self.mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary hidden activation not supported yet!") raise NotImplementedError("Non-binary hidden activation not supported yet!") # recurrent hidden activation function! self.rnn_hidden_activation_func = get_activation_function(rnn_hidden_activation) # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") #################################################### # parameters - make sure to deal with params_hook! 
# #################################################### if self.params_hook is not None: # make sure the params_hook has W (weights matrix) and bh, bv (bias vectors) assert len(self.params_hook) == 8, \ "Expected 8 params (W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu) for RBM, found {0!s}!".format( len(self.params_hook) ) self.W, self.bv, self.bh, self.Wuh, self.Wuv, self.Wvu, self.Wuu, self.bu = self.params_hook else: # RBM weight params self.W = get_weights(weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W", rng=self.mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # RNN weight params self.Wuh = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.hidden_size), name="Wuh", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wuv = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.input_size), name="Wuv", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wvu = get_weights(weights_init=rnn_weights_init, shape=(self.input_size, rnn_hidden_size), name="Wvu", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.Wuu = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, rnn_hidden_size), name="Wuu", rng=self.mrg, # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) # grab the bias vectors # rbm biases self.bv = get_bias(shape=self.input_size, name="bv", init_values=bias_init) self.bh = get_bias(shape=self.hidden_size, name="bh", init_values=bias_init) # rnn bias self.bu = get_bias(shape=rnn_hidden_size, name="bu", init_values=rnn_bias_init) # Finally have the parameters self.params = [self.W, self.bv, self.bh, self.Wuh, self.Wuv, self.Wvu, self.Wuu, self.bu] # Create the RNN-RBM graph! self.v_sample, self.cost, self.monitors, self.updates_train, self.v_ts, self.updates_generate, self.u_t = \ self._build_rnnrbm() log.info("Initialized an RNN-RBM!")
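# _build_rnnrbm is not shown in this excerpt. As a rough sketch of the recurrence it implements
# (following Boulanger-Lewandowski et al., 2012): at each timestep the previous recurrent state
# u_{t-1} produces time-dependent RBM biases, the RBM is run on frame v_t with those biases, and the
# recurrent state is then updated. The helper below is an illustrative assumption using the
# parameters defined above, not the library's exact implementation.
import theano.tensor as T

def _recurrence(self, v_t, u_tm1):
    bv_t = self.bv + T.dot(u_tm1, self.Wuv)  # dynamic visible bias for this timestep
    bh_t = self.bh + T.dot(u_tm1, self.Wuh)  # dynamic hidden bias for this timestep
    u_t = self.rnn_hidden_activation_func(
        self.bu + T.dot(v_t, self.Wvu) + T.dot(u_tm1, self.Wuu))  # new recurrent state
    return u_t, bv_t, bh_t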
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir=None, input_size=None, hidden_size=None, layers=2, walkbacks=4, visible_activation='sigmoid', hidden_activation='tanh', input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1), tied_weights=True, weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0, cost_function='binary_crossentropy', cost_args=None, add_noise=True, noiseless_h1=True, hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4, noise_decay='exponential', noise_annealing=1, image_width=None, image_height=None, rnn_hidden_size=None, rnn_hidden_activation='rectifier', rnn_weights_init='identity', rnn_weights_mean=0, rnn_weights_std=5e-3, rnn_weights_interval='montreal', rnn_bias_init=0, generate_n_steps=200): """ Initialize an RNN-GSN. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's output layer gives a generative recurrent model.) For now, it needs to include the shape information (normally the dimensionality of the hiddens i.e. n_hidden). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters - such as a training model with dropout applied to layers and one without for testing, where the parameters are shared between the two. outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional. The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an unsupervised model. The output is a reconstruction of the input. hidden_size : int The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than `input_size`, which is known as *overcomplete*. layers : int The number of hidden layers to use. walkbacks : int The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample from the DAE, which means the model generates inputs in sequence, where each generated input is compared to the original input to create the reconstruction cost for training. For running the model, the very last generated input in the Gibbs chain is used as the output. visible_activation : str or callable The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer. This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. 
hidden_activation : str or callable The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. input_sampling : bool During walkbacks, whether to sample from the generated input to create a new starting point for the next walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the process more stochastic - more likely to find spurious modes in the model's representation. mrg : random A random number generator that is used when adding noise into the network and for sampling from the input. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. tied_weights : bool DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the reconstruction cost of the model. This should be appropriate for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. add_noise : bool Whether to add noise (corrupt) the input before passing it through the computation graph during training. This should most likely be set to the default of True, because this is a *denoising* autoencoder after all. noiseless_h1 : bool Whether to not add noise (corrupt) the hidden layer during computation. hidden_noise : str What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. hidden_noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. input_noise : str What type of noise to use for corrupting the input before computation (if `add_noise`). See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper for binary units, etc. input_noise_level : float The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper, standard deviation for Gaussian, interval for Uniform, etc. noise_decay : str or False Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. 
Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_annealing : float The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified in `noise_decay`. image_width : int If the input should be represented as an image, the width of the input image. If not specified, it will be close to the square factor of the `input_size`. image_height : int If the input should be represented as an image, the height of the input image. If not specified, it will be close to the square factor of the `input_size`. rnn_hidden_size : int The number of hidden units (dimensionality) to use in the recurrent layer. rnn_hidden_activation : str or Callable The activation function to apply to recurrent units. See opendeep.utils.activation for options. rnn_weights_init : str Determines the method for initializing recurrent weights. See opendeep.utils.nnet for options. 'Identity' works well with 'rectifier' `rnn_hidden_activation`. rnn_weights_mean : float If Gaussian `rnn_weights_init`, the mean value to use. rnn_weights_std : float If Gaussian `rnn_weights_init`, the standard deviation to use. rnn_weights_interval : str or float If Uniform `rnn_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. rnn_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. generate_n_steps : int When generating from the model, how many steps to generate. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(RNN_GSN, self).__init__(**initial_parameters) ################## # specifications # ################## self.layers = layers self.walkbacks = walkbacks self.input_sampling = input_sampling self.mrg = mrg self.tied_weights = tied_weights self.noise_decay = noise_decay self.noise_annealing = noise_annealing self.add_noise = add_noise self.noiseless_h1 = noiseless_h1 self.hidden_noise = hidden_noise self.hidden_noise_level = hidden_noise_level self.input_noise = input_noise self.input_noise_level = input_noise_level self.image_width = image_width self.image_height = image_height # grab info from the inputs_hook, hiddens_hook, or from parameters if self.inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) raise NotImplementedError("Inputs_hook not implemented yet for RNN-GSN") else: # make the input a symbolic matrix - a sequence of inputs self.input = T.matrix('Xs') # set an initial value for the recurrent hiddens self.u0 = T.zeros((rnn_hidden_size,)) # make a symbolic vector for the initial recurrent hiddens value to use during generation for the model self.generate_u0 = T.vector("generate_u0") # either grab the hidden's desired size from the parameter directly, or copy n_in self.hidden_size = hidden_size or self.input_size # deal with hiddens_hook if self.hiddens_hook is not None: raise NotImplementedError("Hiddens_hook not implemented yet for RNN-GSN") # other specifications # visible activation function! self.visible_activation_func = get_activation_function(visible_activation) # make sure the sampling functions are appropriate for the activation functions. 
if is_binary(self.visible_activation_func): self.visible_sampling = mrg.binomial else: # TODO: implement non-binary activation log.error("Non-binary visible activation not supported yet!") raise NotImplementedError("Non-binary visible activation not supported yet!") # hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) # recurrent hidden activation function! self.rnn_hidden_activation_func = get_activation_function(rnn_hidden_activation) # Cost function self.cost_function = get_cost_function(cost_function) self.cost_args = cost_args # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") # determine the sizes of each layer in a list. # layer sizes, from h0 to hK (h0 is the visible layer) self.layer_sizes = [self.input_size] + [self.hidden_size] * self.layers #################################################### # parameters - make sure to deal with params_hook! # #################################################### if self.params_hook is not None: # if tied weights, expect (layers*2 + 1) params for GSN and (int(layers+1)/int(2) + 3) for RNN if self.tied_weights: expected_num = (2*self.layers + 1) + (int(self.layers+1)/2 + 3) assert len(self.params_hook) == expected_num, \ "Tied weights: expected {0!s} params, found {1!s}!".format(expected_num, len(self.params_hook)) gsn_len = (2*self.layers + 1) self.weights_list = self.params_hook[:self.layers] self.bias_list = self.params_hook[self.layers:gsn_len] # if untied weights, expect layers*3 + 1 params else: expected_num = (3*self.layers + 1) + (int(self.layers + 1)/2 + 3) assert len(self.params_hook) == expected_num, \ "Untied weights: expected {0!s} params, found {1!s}!".format(expected_num, len(self.params_hook)) gsn_len = (3*self.layers + 1) self.weights_list = self.params_hook[:2*self.layers] self.bias_list = self.params_hook[2*self.layers:gsn_len] rnn_len = gsn_len + int(self.layers + 1) / 2 self.recurrent_to_gsn_weights_list = self.params_hook[gsn_len:rnn_len] self.W_u_u = self.params_hook[rnn_len:rnn_len + 1] self.W_x_u = self.params_hook[rnn_len + 1:rnn_len + 2] self.recurrent_bias = self.params_hook[rnn_len + 2:rnn_len + 3] # otherwise, construct our params else: # initialize a list of weights and biases based on layer_sizes for the GSN self.weights_list = [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i], self.layer_sizes[i + 1]), name="W_{0!s}_{1!s}".format(i, i + 1), # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in range(self.layers)] # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now) if not self.tied_weights: self.weights_list.extend( [get_weights(weights_init=weights_init, shape=(self.layer_sizes[i + 1], self.layer_sizes[i]), name="W_{0!s}_{1!s}".format(i + 1, i), # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for i in reversed(range(self.layers))] ) # initialize each layer bias to 0's. 
self.bias_list = [get_bias(shape=(self.layer_sizes[i],), name='b_' + str(i), init_values=bias_init) for i in range(self.layers + 1)] self.recurrent_to_gsn_weights_list = [ get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, self.layer_sizes[layer]), name="W_u_h{0!s}".format(layer), # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) for layer in range(self.layers + 1) if layer % 2 != 0 ] self.W_u_u = get_weights(weights_init=rnn_weights_init, shape=(rnn_hidden_size, rnn_hidden_size), name="W_u_u", # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, #if uniform interval=rnn_weights_interval) self.W_x_u = get_weights(weights_init=rnn_weights_init, shape=(self.input_size, rnn_hidden_size), name="W_x_u", # if gaussian mean=rnn_weights_mean, std=rnn_weights_std, # if uniform interval=rnn_weights_interval) self.recurrent_bias = get_bias(shape=(rnn_hidden_size,), name="b_u", init_values=rnn_bias_init) # build the params of the model into a list self.gsn_params = self.weights_list + self.bias_list self.params = self.gsn_params + \ self.recurrent_to_gsn_weights_list + \ [self.W_u_u, self.W_x_u, self.recurrent_bias] log.debug("rnn-gsn params: %s", str(self.params)) # Create the RNN-GSN graph! self.x_sample, self.cost, self.monitors, self.updates_train, self.x_ts, self.updates_generate, self.u_t = \ self._build_rnngsn() log.info("Initialized an RNN-GSN!")
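# _build_rnngsn is likewise not shown here. A minimal sketch of its per-timestep recurrence,
# assuming the standard RNN-GSN wiring with the parameters created above: the recurrent state u_t is
# updated from the current frame x_t, and is then projected through recurrent_to_gsn_weights_list to
# give the starting values of the GSN's odd hidden layers for the next timestep's walkbacks.
# Illustrative only.
import theano.tensor as T

def _recurrent_step(self, x_t, u_tm1):
    u_t = self.rnn_hidden_activation_func(
        self.recurrent_bias + T.dot(x_t, self.W_x_u) + T.dot(u_tm1, self.W_u_u))
    # one starting hidden value per odd GSN layer (the layers recurrent_to_gsn_weights_list covers)
    starting_hiddens = [T.dot(u_t, W) for W in self.recurrent_to_gsn_weights_list]
    return u_t, starting_hiddens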