Exemplo n.º 1
0
    def __init__(
        self,
        inputs_hook=None,
        params_hook=None,
        outdir="outputs/basic",
        input_size=None,
        output_size=None,
        activation="rectifier",
        cost="mse",
        cost_args=None,
        weights_init="uniform",
        weights_mean=0,
        weights_std=5e-3,
        weights_interval="montreal",
        bias_init=0.0,
        noise=None,
        noise_level=None,
        mrg=RNG_MRG.MRG_RandomStreams(1),
        **kwargs
    ):
        """
        Initialize a basic fully-connected layer: output = activation(dot(input, W) + b).

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. input_size).
        params_hook : List(theano shared variable)
            A list of exactly two model parameters ``[W, b]`` (shared theano variables) to use when constructing
            this model instead of initializing new shared variables. This is useful when you want two versions
            of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
            this is optional.
        output_size : int
            The size (dimensionality) of the output from the layer. If falsy, the layer's input size is used.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        cost : str or callable
            The cost function to use when training the layer. This should be appropriate for the output type, i.e.
            mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost` function.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        noise : str
            What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
            for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
            Note that the default object is created once, when the method is defined, so every instance that
            relies on the default shares the same generator.
        **kwargs
            Extra keyword arguments, forwarded to the parent initializer. The key ``out_as_probs`` is also read
            here: when it is explicitly ``False`` the target is an int64 vector, otherwise it is a matrix.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        # NOTE: locals() must be captured before any other local name is bound in this method.
        initial_parameters = locals().copy()
        initial_parameters.pop("self")
        super(Dense, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(inputs_hook) == 2, "Expected inputs_hook to be tuple!"  # make sure inputs_hook is a tuple
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.matrix("X")

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        # Only an explicit False selects the integer-vector target; a missing key yields None and
        # therefore falls through to the matrix target.
        if kwargs.get("out_as_probs") == False:
            self.target = T.vector("Y", dtype="int64")
        else:
            self.target = T.matrix("Y")

        # either grab the output's desired size from the parameter directly, or copy input_size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(activation)
        # cost function!
        cost_func = get_cost_function(cost)
        cost_args = cost_args or dict()

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(params_hook) == 2, "Expected 2 params (W and b) for Dense, found {0!s}!".format(len(params_hook))
            W, b = params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=(self.input_size, self.output_size),
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval,
            )

            # grab the bias vector
            # Use the resolved self.output_size (which falls back to the input size above) so the bias
            # always matches the second dimension of W, even when `output_size` was not passed in.
            b = get_bias(shape=self.output_size, name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if noise:
            log.debug("Adding noise switch.")
            if noise_level is not None:
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch, noise_func(input=self.output), self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output, target=self.target, **cost_args)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size, self.output_size)),
            str(activation),
        )
Exemplo n.º 2
0
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/lstm/',
                 input_size=None, hidden_size=None, output_size=None,
                 activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse', cost_args=None,
                 noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize an LSTM recurrent network.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in). The hooked variable is treated as time-major: 1D and 2D
            inputs are expanded to 3D, 3D inputs are used as-is, and inputs with more than 3 dimensions are
            flattened to 3D.
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters. The list must be in the same order as
            `self.params` (14 entries for a single direction, 18 for 'bidirectional').
        outdir : str
            The location to produce outputs from training or running this model. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden units.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        inner_hidden_activation : str or callable
            The activation to perform for the hidden gates.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
            Note that the default object is created once, when the method is defined, so every instance that
            relies on the default shares the same generator.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the output bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
            Only takes effect when `noise_level` is also given.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        direction : str
            The direction this recurrent model should go over its inputs.
            Can be 'forward', 'backward', or 'bidirectional'.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`. Clipping is only applied to freshly created parameters, not to
            parameters supplied through `params_hook`.
        """
        # NOTE: locals() must be captured before any other local name is bound in this method.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(LSTM, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        backward = direction.lower() == 'backward'
        bidirectional = direction.lower() == 'bidirectional'

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(hidden_activation)
        self.inner_hidden_activation_func = get_activation_function(inner_hidden_activation)

        # output activation function!
        activation_func = get_activation_function(activation)

        # Cost function
        cost_function = get_cost_function(cost_function)
        cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding %s noise switch.' % str(noise))
            if noise_level is not None:
                # wrap the level in a shared variable so a decay schedule can change it later
                noise_level = sharedX(value=noise_level)
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 1D tensor, treat it as (timesteps,) with batch_size 1 and data_dim 1. Convert to 3D.
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                # already in the (timesteps, batch_size, data_dim) layout described above; use it as-is.
                # Without this branch the canonical 3D input fell through to NotImplementedError.
                pass

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                # NOTE(review): flatten(3) merges all trailing dimensions into one, whose length would be
                # the product of their sizes; sum() here looks suspicious - confirm what input_size holds.
                self.input_size = sum(self.input_size)
            else:
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            if not bidirectional:
                (W_x_c, W_x_i, W_x_f, W_x_o,
                 U_h_c, U_h_i, U_h_f, U_h_o,
                 W_h_y, b_c, b_i, b_f, b_o,
                 b_y) = self.params_hook
                recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o]
            else:
                (W_x_c, W_x_i, W_x_f, W_x_o,
                 U_h_c, U_h_i, U_h_f, U_h_o,
                 U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b,
                 W_h_y, b_c, b_i, b_f, b_o,
                 b_y) = self.params_hook
                recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]
        # otherwise, construct our params
        else:
            # all input-to-hidden weights
            W_x_c, W_x_i, W_x_f, W_x_o = [
                get_weights(weights_init=weights_init,
                            shape=(self.input_size, self.hidden_size),
                            name="W_x_%s" % sub,
                            # if gaussian
                            mean=weights_mean,
                            std=weights_std,
                            # if uniform
                            interval=weights_interval)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # all hidden-to-hidden weights
            U_h_c, U_h_i, U_h_f, U_h_o = [
                get_weights(weights_init=r_weights_init,
                            shape=(self.hidden_size, self.hidden_size),
                            name="U_h_%s" % sub,
                            # if gaussian
                            mean=r_weights_mean,
                            std=r_weights_std,
                            # if uniform
                            interval=r_weights_interval)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # hidden-to-output weights
            W_h_y = get_weights(weights_init=weights_init,
                                shape=(self.hidden_size, self.output_size),
                                name="W_h_y",
                                # if gaussian
                                mean=weights_mean,
                                std=weights_std,
                                # if uniform
                                interval=weights_interval)
            # biases
            b_c, b_i, b_f, b_o = [
                get_bias(shape=(self.hidden_size,),
                         name="b_%s" % sub,
                         init_values=r_bias_init)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # output bias
            b_y = get_bias(shape=(self.output_size,),
                           name="b_y",
                           init_values=bias_init)
            # keep the raw recurrent weights for self.params; the (optionally) gradient-clipped
            # versions below are only used inside the computation graph.
            recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o]
            # clip gradients if we are doing that
            if clip_recurrent_grads:
                clip = abs(clip_recurrent_grads)
                U_h_c, U_h_i, U_h_f, U_h_o = [theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params]
            # bidirectional params
            # This must not be nested under the clipping branch: the backward weights are needed by the
            # second scan whenever the model is bidirectional, whether or not gradients are clipped.
            if bidirectional:
                # all hidden-to-hidden weights
                U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [
                    get_weights(weights_init=r_weights_init,
                                shape=(self.hidden_size, self.hidden_size),
                                name="U_h_%s_b" % sub,
                                # if gaussian
                                mean=r_weights_mean,
                                std=r_weights_std,
                                # if uniform
                                interval=r_weights_interval)
                    for sub in ['c', 'i', 'f', 'o']
                ]
                recurrent_params += [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]
                if clip_recurrent_grads:
                    clip = abs(clip_recurrent_grads)
                    U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [theano.gradient.grad_clip(p, -clip, clip) for p in
                                                          [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]]

        # put all the parameters into our list, and make sure it is in the same order as when we try to load
        # them from a params_hook!!!
        self.params = [W_x_c, W_x_i, W_x_f, W_x_o] + recurrent_params + [W_h_y, b_c, b_i, b_f, b_o, b_y]

        # make h_init the right sized tensor
        if self.hiddens_hook is None:
            h_init = T.zeros_like(T.dot(xs[0], W_x_c))

        # the initial cell state always starts at zero, shaped like one timestep of hiddens
        c_init = T.zeros_like(T.dot(xs[0], W_x_c))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_c = T.dot(xs, W_x_c) + b_c
        x_i = T.dot(xs, W_x_i) + b_i
        x_f = T.dot(xs, W_x_f) + b_f
        x_o = T.dot(xs, W_x_o) + b_o

        # now do the recurrent stuff
        (self.hiddens, _), self.updates = theano.scan(
            fn=self.recurrent_step,
            sequences=[x_c, x_i, x_f, x_o],
            outputs_info=[h_init, c_init],
            non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o],
            go_backwards=backward,
            name="lstm_scan",
            strict=True
        )

        # if bidirectional, do the same in reverse!
        if bidirectional:
            (hiddens_b, _), updates_b = theano.scan(
                fn=self.recurrent_step,
                sequences=[x_c, x_i, x_f, x_o],
                outputs_info=[h_init, c_init],
                non_sequences=[U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b],
                go_backwards=not backward,
                name="lstm_scan_back",
                strict=True
            )
            # flip the hiddens to be the right direction
            hiddens_b = hiddens_b[::-1]
            # update stuff
            self.updates.update(updates_b)
            self.hiddens += hiddens_b

        # add noise (like dropout) if we wanted it!
        if noise:
            self.hiddens = T.switch(self.noise_switch,
                                    noise_func(input=self.hiddens),
                                    self.hiddens)

        # now compute the outputs from the leftover (top level) hiddens
        self.output = activation_func(
            T.dot(self.hiddens, W_h_y) + b_y
        )

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_function(output=self.output, target=ys, **cost_args)

        log.info("Initialized an LSTM!")
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
                 input_size=None, hidden_size=1000,
                 layers=2, walkbacks=4,
                 visible_activation='sigmoid', hidden_activation='tanh',
                 input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy', cost_args=None,
                 add_noise=True, noiseless_h1=True,
                 hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4,
                 noise_decay='exponential', noise_annealing=1,
                 image_width=None, image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's
            output layer gives a generative recurrent model.) For now, it needs to include the shape
            information (normally the dimensionality of the hiddens i.e. n_hidden).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional.
            The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an
            unsupervised model. The output is a reconstruction of the input.
        hidden_size : int
            The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than
            `input_size`, which is known as *overcomplete*.
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer.
            This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample
            from the DAE, which means the model generates inputs in sequence, where each generated input is compared
            to the original input to create the reconstruction cost for training. For running the model, the very last
            generated input in the Gibbs chain is used as the output.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the next
            walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the
            process more stochastic - more likely to find spurious modes in the model's representation.
        mrg : random
            A random number generator that is used when adding noise into the network and for sampling from the input.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
            determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise (corrupt) the input before passing it through the computation graph during training.
            This should most likely be set to the default of True, because this is a *denoising* autoencoder after all.
        noiseless_h1 : bool
            Whether to not add noise (corrupt) the hidden layer during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise
            for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper
            for binary units, etc.
        input_noise_level : float
            The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper,
            standard deviation for Gaussian, interval for Uniform, etc.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        image_height : int
            If the input should be represented as an image, the height of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width  = _w
            self.image_height = _h
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]
        
        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers
        if layers % 2 == 0:
            if walkbacks < 2*layers:
                log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                            'Generaly want 2X walkbacks to layers',
                            str(layers), str(walkbacks))
        else:
            if walkbacks < 2*layers-1:
                log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                            'Generaly want 2X walkbacks to layers',
                            str(layers), str(walkbacks))

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter
        self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, unpack the hidden layers in the tensor
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size

        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError("Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2*layers]
                self.bias_list = self.params_hook[2*layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [get_weights(weights_init=weights_init,
                                             shape=(self.layer_sizes[i], self.layer_sizes[i+1]),
                                             name="W_{0!s}_{1!s}".format(i, i+1),
                                             rng=mrg,
                                             # if gaussian
                                             mean=weights_mean,
                                             std=weights_std,
                                             # if uniform
                                             interval=weights_interval)
                                 for i in range(layers)]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend(
                    [get_weights(weights_init=weights_init,
                                 shape=(self.layer_sizes[i+1], self.layer_sizes[i]),
                                 name="W_{0!s}_{1!s}".format(i+1, i),
                                 rng=mrg,
                                 # if gaussian
                                 mean=weights_mean,
                                 std=weights_std,
                                 # if uniform
                                 interval=weights_interval)
                     for i in reversed(range(layers))]
                )
            # initialize each layer bias to 0's.
            self.bias_list = [get_bias(shape=(self.layer_sizes[i],),
                                       name='b_' + str(i),
                                       init_values=bias_init)
                              for i in range(layers+1)]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()
Exemplo n.º 4
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/rnn/',
                 input_size=None,
                 hidden_size=None,
                 output_size=None,
                 layers=1,
                 activation='sigmoid',
                 hidden_activation='relu',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse',
                 cost_args=None,
                 noise='dropout',
                 noise_level=None,
                 noise_decay=False,
                 noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize a simple recurrent network.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters.
        outdir : str
            The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        layers : int
            The number of stacked hidden layers to use.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden layers.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. A falsy value disables noise entirely.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
            If None, the noise function is created without an explicit level.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
            A schedule is only created when `noise` is enabled and `noise_level` is not None.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and merging them before running through the final decoder.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.

        Raises
        ------
        AssertionError
            When asserting various properties of input parameters. See error messages.
        NotImplementedError
            When the variable given through `inputs_hook` has a number of dimensions that is not handled here.
        """
        # Keep this as the very first statement: locals() must contain only the constructor
        # arguments so that they can be forwarded verbatim to the base class.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(RNN, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        self.direction = direction
        # any value other than "bidirectional"/"backward" leaves both flags False.
        self.bidirectional = (direction == "bidirectional")
        self.backward = (direction == "backward")
        self.layers = layers
        self.noise = noise

        self.weights_init = weights_init
        self.weights_mean = weights_mean
        self.weights_std = weights_std
        self.weights_interval = weights_interval

        self.r_weights_init = r_weights_init
        self.r_weights_mean = r_weights_mean
        self.r_weights_std = r_weights_std
        self.r_weights_interval = r_weights_interval

        self.bias_init = bias_init
        self.r_bias_init = r_bias_init

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(
            hidden_activation)

        # output activation function!
        self.activation_func = get_activation_function(activation)

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if self.noise:
            # pass the argument separately so the message is only formatted when debug logging is enabled.
            log.debug('Adding %s noise switch.', str(noise))
            if noise_level is not None:
                # wrap the level in a shared variable so the schedule below can refer to it.
                noise_level = sharedX(value=noise_level)
                self.noise_func = get_noise(noise,
                                            noise_level=noise_level,
                                            mrg=mrg)
            else:
                self.noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1,
                                        name="basiclayer_noise_switch")

            # noise scheduling (needs the shared noise_level created above)
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(
                    noise_decay, noise_level, noise_level.get_value(),
                    noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                # 1D input: add two new axes after the existing one and treat each step as a single value.
                # NOTE(review): the axes are passed as one list here but as a plain int in the 2D branch
                # below - confirm T.unbroadcast accepts both forms.
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'),
                                           [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                # 2D input: insert a new middle axis (the batch axis, per the comment above).
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                # already in the expected 3D layout.
                pass

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                # NOTE(review): flatten(3) merges the trailing axes, so one would expect their sizes to be
                # multiplied rather than summed - confirm what input_size holds for >3D hooks.
                self.input_size = sum(self.input_size)
            else:
                # only reached for the remaining case of ndim < 1.
                raise NotImplementedError(
                    "Recurrent input with %d dimensions not supported!" %
                    self.input.ndim)
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            xs = T.tensor3("Xs")
            xs = xs.dimshuffle(1, 0, 2)
            self.input = xs

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        ys = T.tensor3("Ys")
        ys = ys.dimshuffle(1, 0, 2)
        self.target = ys

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            self.h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        # build the graph last: it relies on the attributes configured above.
        (self.output, self.hiddens, self.updates,
         self.cost, self.params) = self.build_computation_graph()
Exemplo n.º 5
0
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/basic',
                 input_size=None,
                 output_size=None,
                 activation='rectifier',
                 cost='mse',
                 cost_args=None,
                 weights_init='uniform',
                 weights_mean=0,
                 weights_std=5e-3,
                 weights_interval='montreal',
                 bias_init=0.0,
                 noise=None,
                 noise_level=None,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a basic layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. input_size).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
            When given, it must contain exactly two entries: the weights matrix W and the bias vector b.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
            this is optional.
        output_size : int
            The size (dimensionality) of the output from the layer. If not given, the layer's input size is used.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        cost : str or callable
            The cost function to use when training the layer. This should be appropriate for the output type, i.e.
            mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost` function.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        noise : str
            What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
            for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        **kwargs
            Extra options. The only one read here is `out_as_probs`: when it compares equal to False, the
            supervised target is an int64 vector instead of a matrix.

        Raises
        ------
        AssertionError
            If `inputs_hook` does not have exactly two entries, or `params_hook` does not hold exactly two params.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        # Keep this as the very first statement: locals() must contain only the constructor arguments.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(Dense, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            # make sure inputs_hook is a tuple
            assert len(inputs_hook) == 2, 'Expected inputs_hook to be tuple!'
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.matrix('X')

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        # a missing 'out_as_probs' key yields None, which does not equal False, so the matrix target is the default.
        if kwargs.get('out_as_probs') == False:
            self.target = T.vector('Y', dtype='int64')
        else:
            self.target = T.matrix('Y')

        # either grab the output's desired size from the parameter directly, or copy input_size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(activation)
        # cost function!
        cost_func = get_cost_function(cost)
        cost_args = cost_args or dict()

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(params_hook) == 2, \
                "Expected 2 params (W and b) for Dense, found {0!s}!".format(len(params_hook))
            W, b = params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=(self.input_size, self.output_size),
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)

            # grab the bias vector.
            # Use the resolved self.output_size (not the raw `output_size` argument, which is None by
            # default) so the bias always matches the second dimension of W.
            b = get_bias(shape=self.output_size, name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding noise switch.')
            if noise_level is not None:
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch, noise_func(input=self.output),
                                   self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output,
                              target=self.target,
                              **cost_args)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size, self.output_size)), str(activation))
Exemplo n.º 6
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gsn/',
                 input_size=None,
                 hidden_size=1000,
                 layers=2,
                 walkbacks=4,
                 visible_activation='sigmoid',
                 hidden_activation='tanh',
                 input_sampling=True,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy',
                 cost_args=None,
                 add_noise=True,
                 noiseless_h1=True,
                 hidden_noise='gaussian',
                 hidden_noise_level=2,
                 input_noise='salt_and_pepper',
                 input_noise_level=0.4,
                 noise_decay='exponential',
                 noise_annealing=1,
                 image_width=None,
                 image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        All constructor arguments are forwarded to the parent class initializer, then the noise
        functions, layer sizes, activations, cost function and parameters are set up, and finally
        the computation graph is built with `build_computation_graph()`.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. The second element is used as the input variable `X`; if no hook
            is given a new matrix variable named 'X' is created.
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            When given, the first element replaces `hidden_size` and the second element is passed
            through `unpack_hiddens` to obtain the hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). The expected order is all
            weights followed by all biases: `layers` weights + `layers + 1` biases when `tied_weights`,
            or `2 * layers` weights + `layers + 1` biases otherwise.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input (visible layer). If shape is provided in `inputs_hook`,
            this is optional.
        hidden_size : int or list of int
            The size (dimensionality) of the hidden layers. A single value is repeated for every one of
            the `layers` hidden layers; a list must contain exactly `layers` entries.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's GSN paper). A warning is logged
            when this is smaller than `2 * layers` (or `2 * layers - 1` for an odd number of layers).
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible
            layer. Only binary activations (e.g. 'sigmoid') are supported; anything else raises
            NotImplementedError. See opendeep.utils.activation for a list of available activation functions.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens
            layer. See opendeep.utils.activation for a list of available activation functions. Alternatively,
            you can pass your own function to be used as long as it is callable.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the
            next walkback (next step in the Gibbs chain).
        mrg : random
            A random number generator that is used when adding noise into the network, for sampling from the
            input, and for weight initialization. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            If True, only one weight matrix per pair of adjacent layers is created (lower -> higher).
            If False, an additional set of higher -> lower weight matrices is created as well.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameters. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise (corrupt) the input before passing it through the computation graph during
            training.
        noiseless_h1 : bool
            Whether to not add noise (corrupt) the first hidden layer during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layers. See opendeep.utils.noise for options.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. It is stored in a
            shared variable so it can be changed after construction.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options.
        input_noise_level : float
            The amount of noise used to corrupt the input. It is stored in a shared variable so it can be
            changed after construction.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function
            specified in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If neither width nor
            height is given, both are chosen as close to square factors of `input_size` as possible. If only the
            height is given, the width is derived as `input_size / image_height` when that divides evenly.
        image_height : int
            If the input should be represented as an image, the height of the input image. If neither width nor
            height is given, both are chosen as close to square factors of `input_size` as possible. If only the
            width is given, the height is derived as `input_size / image_width` when that divides evenly.
        kwargs : dict
            Any extra keyword arguments; they are forwarded to the parent initializer together with the
            named arguments.

        Raises
        ------
        NotImplementedError
            If `visible_activation` is not a binary activation.
        AssertionError
            If the number of hidden sizes or the number of hooked parameters does not match `layers`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        # NOTE: this must stay the very first statement so locals() only holds the constructor arguments.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width = _w
            self.image_height = _h
        elif image_height is None or image_width is None:
            # exactly one dimension was given: derive the other one from input_size when it divides evenly,
            # instead of silently leaving it as None.
            known = image_width if image_height is None else image_height
            derived = None
            if (isinstance(self.input_size, int) and isinstance(known, int)
                    and known > 0 and self.input_size % known == 0):
                derived = self.input_size // known
            else:
                log.warning(
                    'Could not infer the missing image dimension from input_size %s and the given dimension %s.',
                    str(self.input_size), str(known))
            self.image_height = derived if image_height is None else image_height
            self.image_width = derived if image_width is None else image_width
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]

        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers (one less for an odd number of layers)
        min_walkbacks = 2 * layers if layers % 2 == 0 else 2 * layers - 1
        if walkbacks < min_walkbacks:
            log.warning(
                'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                'Generaly want 2X walkbacks to layers', str(layers),
                str(walkbacks))

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(
            noise_annealing)  # noise schedule parameter
        # noise levels live in shared variables so they can be updated after the graph is built
        self.hidden_noise_level = sharedX(hidden_noise_level,
                                          dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise,
                                      noise_level=self.hidden_noise_level,
                                      mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level,
                                         dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise,
                                     noise_level=self.input_noise_level,
                                     mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, its shape entry overrides the hidden_size argument
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            # a single size is shared by every hidden layer
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size

        # unpack the hooked hidden layers now that layer_sizes is known
        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError(
                "Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2 * layers]
                self.bias_list = self.params_hook[2 * layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                    name="W_{0!s}_{1!s}".format(i, i + 1),
                    rng=mrg,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for i in range(layers)
            ]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend([
                    get_weights(
                        weights_init=weights_init,
                        shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                        name="W_{0!s}_{1!s}".format(i + 1, i),
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
                    for i in reversed(range(layers))
                ])
            # one bias vector per layer (visible layer included), initialized to bias_init.
            self.bias_list = [
                get_bias(shape=(self.layer_sizes[i], ),
                         name='b_' + str(i),
                         init_values=bias_init) for i in range(layers + 1)
            ]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph(
        )
Exemplo n.º 7
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gru/',
                 input_size=None,
                 hidden_size=None,
                 output_size=None,
                 activation='sigmoid',
                 hidden_activation='relu',
                 inner_hidden_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse',
                 cost_args=None,
                 noise='dropout',
                 noise_level=None,
                 noise_decay=False,
                 noise_decay_amount=.99,
                 forward=True,
                 clip_recurrent_grads=False):
        """
        Initialize a GRU (gated recurrent unit) network.

        All constructor arguments are forwarded to the parent class initializer. The constructor then
        creates (or takes from `params_hook`) the input-to-hidden, hidden-to-hidden and hidden-to-output
        parameters, runs the recurrence with `theano.scan` over `recurrent_step`, optionally applies
        switchable noise to the hiddens, and defines `self.output` and `self.cost`.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. The leading dimension of the
            hooked variable is treated as the temporal dimension: a 3D tensor is used as-is as
            (timesteps, batch_size, data_dim); a 1D or 2D tensor is expanded to 3D with a batch size of 1;
            a tensor with more than 3 dimensions is flattened to 3D.
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent
            nets, this will be the initial starting value for hidden layers, and its shape entry is used as
            the hidden size.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). Expected order:
            (W_x_z, W_x_r, W_x_h, U_h_z, U_h_r, U_h_h, W_h_y, b_z, b_r, b_h, b_y).
        outdir : str
            The location to produce outputs from training or running the model. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden units.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        inner_hidden_activation : str or callable
            The activation to perform for the hidden gates.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the output bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the gate and hidden bias parameters (b_z, b_r, b_h).
            Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to apply to the hidden layers. See opendeep.utils.noise for options.
            A falsy value disables noise entirely.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
            If None, the noise function's own default is used and no noise schedule is created.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        forward : bool
            The direction this recurrent model should go over its inputs. True means forward, False mean backward.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (the hidden-to-hidden
            weights). If it is a float, the gradients for those weights will be hard clipped to the range
            `+-clip_recurrent_grads`. This applies to freshly created and to hooked parameters alike.

        Raises
        ------
        NotImplementedError
            If the hooked input has an unsupported number of dimensions (e.g. a scalar).
        """
        # NOTE: this must stay the very first statement so locals() only holds the constructor arguments.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GRU, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(
            hidden_activation)
        self.inner_hidden_activation_func = get_activation_function(
            inner_hidden_activation)

        # output activation function!
        activation_func = get_activation_function(activation)

        # Cost function
        cost_function = get_cost_function(cost_function)
        cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding %s noise switch.', str(noise))
            if noise_level is not None:
                # keep the level in a shared variable so a noise schedule can update it later
                noise_level = sharedX(value=noise_level)
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="gru_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(
                    noise_decay, noise_level, noise_level.get_value(),
                    noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 1D tensor, assume it is of the form (timesteps,) i.e. batch_size and data_dim are 1.
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]
            input_ndim = self.input.ndim

            if input_ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'),
                                           [1, 2])
                self.input_size = 1

            elif input_ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif input_ndim == 3:
                # already in the canonical (timesteps, batch_size, data_dim) form - use as-is.
                pass

            elif input_ndim > 3:
                self.input = self.input.flatten(3)
                # NOTE(review): flatten(3) collapses the trailing dimensions into one, so summing the
                # entries of input_size looks suspicious (a product may be intended) - kept as-is, confirm.
                self.input_size = sum(self.input_size)
            else:
                raise NotImplementedError(
                    "Recurrent input with %d dimensions not supported!" %
                    self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            (W_x_z, W_x_r, W_x_h, U_h_z, U_h_r, U_h_h, W_h_y, b_z, b_r, b_h,
             b_y) = self.params_hook
        # otherwise, construct our params
        else:
            # all input-to-hidden weights
            W_x_z, W_x_r, W_x_h = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.input_size, self.hidden_size),
                    name="W_x_%s" % sub,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for sub in ['z', 'r', 'h']
            ]
            # all hidden-to-hidden weights
            U_h_z, U_h_r, U_h_h = [
                get_weights(
                    weights_init=r_weights_init,
                    shape=(self.hidden_size, self.hidden_size),
                    name="U_h_%s" % sub,
                    # if gaussian
                    mean=r_weights_mean,
                    std=r_weights_std,
                    # if uniform
                    interval=r_weights_interval) for sub in ['z', 'r', 'h']
            ]
            # hidden-to-output weights
            W_h_y = get_weights(
                weights_init=weights_init,
                shape=(self.hidden_size, self.output_size),
                name="W_h_y",
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)
            # biases
            b_z, b_r, b_h = [
                get_bias(shape=(self.hidden_size, ),
                         name="b_%s" % sub,
                         init_values=r_bias_init) for sub in ['z', 'r', 'h']
            ]
            # output bias
            b_y = get_bias(shape=(self.output_size, ),
                           name="b_y",
                           init_values=bias_init)

        # keep references to the raw (unclipped) recurrent parameters for self.params, then clip the
        # gradients flowing through the versions used in the computation if requested. This is done for
        # hooked parameters too, so clip_recurrent_grads is never silently ignored.
        recurrent_params = [U_h_z, U_h_r, U_h_h]
        if clip_recurrent_grads:
            clip = abs(clip_recurrent_grads)
            U_h_z, U_h_r, U_h_h = [
                theano.gradient.grad_clip(p, -clip, clip)
                for p in recurrent_params
            ]

        # put all the parameters into our list, and make sure it is in the same order as when we try to load
        # them from a params_hook!!!
        self.params = [W_x_z, W_x_r, W_x_h
                       ] + recurrent_params + [W_h_y, b_z, b_r, b_h, b_y]

        # make h_init the right sized tensor (zeros shaped like one timestep of hidden pre-activations)
        if self.hiddens_hook is None:
            h_init = T.zeros_like(T.dot(xs[0], W_x_h))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_z = T.dot(xs, W_x_z) + b_z
        x_r = T.dot(xs, W_x_r) + b_r
        x_h = T.dot(xs, W_x_h) + b_h

        # now do the recurrent stuff
        self.hiddens, self.updates = theano.scan(
            fn=self.recurrent_step,
            sequences=[x_z, x_r, x_h],
            outputs_info=[h_init],
            non_sequences=[U_h_z, U_h_r, U_h_h],
            go_backwards=not forward,
            name="gru_scan",
            strict=True)

        # add noise (like dropout) if we wanted it!
        if noise:
            self.hiddens = T.switch(self.noise_switch,
                                    noise_func(input=self.hiddens),
                                    self.hiddens)

        # now compute the outputs from the leftover (top level) hiddens
        self.output = activation_func(T.dot(self.hiddens, W_h_y) + b_y)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_function(output=self.output, target=ys, **cost_args)

        log.info("Initialized a GRU!")
Exemplo n.º 8
0
    def __init__(self, inputs=None,
                 noise='dropout', noise_level=0.5, noise_decay=False, noise_decay_amount=0.99,
                 mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
        """
        Build a layer whose output is its input corrupted by a noise function.

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            The (shape, variable) pair feeding this layer. `shape` is a tuple with one entry per dimension of
            the variable: an integer when the size of that dimension is known, otherwise None. A matrix with
            unknown batch size and 784 features would be described as
            [((None, 784), <TensorType(float32, matrix)>)].
        noise : str
            Name of the noise function to apply; see opendeep.utils.noise for the options. Pick one that
            suits the unit activation (e.g. Gaussian for tanh or other real-valued activations).
        noise_level : float
            Strength handed to the noise function, e.g. the standard deviation for gaussian noise, the
            interval for uniform noise, or the dropout amount. When None, the noise function is created
            without an explicit level and no noise schedule is set up.
        noise_decay : str or False
            Name of the decay function (see opendeep.utils.decay) used to schedule `noise_level` over
            training, or False for no scheduling. Scheduling lets the model learn larger variance features
            first and smaller ones later (a kind of curriculum learning) and may help convergence.
        noise_decay_amount : float
            How much `noise_level` is reduced after each training epoch by the `noise_decay` function.
        mrg : random
            Random number generator used by the noise function. Theano's sandbox.rng_mrg.MRG_RandomStreams
            is the recommended choice.
        switch : boolean
            If True, a shared `noise_switch` variable is created so noise can be turned on for training and
            off for testing. If False, the noise is always applied.
        """
        super(Noise, self).__init__(
            inputs=inputs, outputs=inputs[0],
            noise=noise, noise_level=noise_level,
            noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
            mrg=mrg, switch=switch
        )
        # per the superclass contract self.inputs is a list of (shape, variable) pairs;
        # this layer only works on the variable of the first pair
        first_pair = self.inputs[0]
        self.inputs = first_pair[1]

        log.debug('Adding %s noise switch.' % str(noise))

        # when a level is given it is wrapped in a shared variable before building the noise function
        level_given = noise_level is not None
        noise_kwargs = dict(mrg=mrg)
        if level_given:
            noise_level = sharedX(value=noise_level)
            noise_kwargs['noise_level'] = noise_level
        corrupt = get_noise(noise, **noise_kwargs)

        # the switch starts at 1, i.e. noise is applied by default
        # (the original author was unsure whether gradients depend on this initial value)
        if switch:
            self.noise_switch = sharedX(value=1, name="noise_switch")

        # a noise schedule only makes sense when there is a shared level to decay
        if noise_decay and level_given:
            self.noise_schedule = get_decay_function(noise_decay,
                                                     noise_level,
                                                     noise_level.get_value(),
                                                     noise_decay_amount)

        # finally corrupt the inputs, optionally gated by the switch
        noisy_inputs = corrupt(input=self.inputs)
        if not switch:
            self.outputs = noisy_inputs
        else:
            self.outputs = Tswitch(self.noise_switch, noisy_inputs, self.inputs)
Exemplo n.º 9
0
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/',
                 input_size=None, hidden_size=None, output_size=None,
                 layers=1,
                 activation='sigmoid', hidden_activation='relu',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse', cost_args=None,
                 noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Set up a simple recurrent network and build its computation graph.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Lets the model take its input from another model's variable (for example feeding a Softmax
            model from a DAE's hidden layer to obtain a supervised classifier). The shape information
            (normally the input dimensionality, i.e. n_in) currently has to be included.
        hiddens_hook : Tuple of (shape, variable)
            Lets the model take its hidden representation from elsewhere. For recurrent nets the variable
            is used as the initial value of the hidden layers.
        params_hook : List(theano shared variable)
            Shared variables to use as the model parameters instead of creating new ones. Useful for having
            two versions of a model that share the same parameters.
        outdir : str
            Where outputs from training or running the :class:`RNN` are written. Nothing is saved if None.
        input_size : int
            Dimensionality of the input. Optional when the shape is given through `inputs_hook`.
        hidden_size : int
            Dimensionality of the hidden layers. Optional when the shape is given through `hiddens_hook`.
        output_size : int
            Dimensionality of the output.
        layers : int
            Number of stacked hidden layers.
        activation : str or callable
            Activation applied after the hiddens -> output dot product. It should match the output unit
            type, e.g. 'sigmoid' for binary outputs. Either a name from opendeep.utils.activation or any
            callable.
        hidden_activation : str or callable
            Activation used for the hidden layers. Either a name from opendeep.utils.activation or any
            callable.
        mrg : random
            Random number generator used when adding noise. Theano's sandbox.rng_mrg.MRG_RandomStreams is
            the recommended choice.
        weights_init : str
            Initialization method for the model weights; see opendeep.utils.nnet.
        weights_interval : str or float
            The +- interval to use when `weights_init` is uniform; see opendeep.utils.nnet.
        weights_mean : float
            The mean to use when `weights_init` is Gaussian.
        weights_std : float
            The standard deviation to use when `weights_init` is Gaussian.
        bias_init : float
            Initial value of the bias parameter; the default 0.0 is usually preferred.
        r_weights_init : str
            Initialization method for the recurrent weights; see opendeep.utils.nnet.
        r_weights_interval : str or float
            The +- interval to use when `r_weights_init` is uniform; see opendeep.utils.nnet.
        r_weights_mean : float
            The mean to use when `r_weights_init` is Gaussian.
        r_weights_std : float
            The standard deviation to use when `r_weights_init` is Gaussian.
        r_bias_init : float
            Initial value of the recurrent bias parameter; the default 0.0 is usually preferred.
        cost_function : str or callable
            Function computing the output cost of the model. Either a name from opendeep.utils.cost or any
            callable.
        cost_args : dict
            Extra keyword arguments passed to `cost_function`. None is stored as an empty dict.
        noise : str
            Type of noise for the hidden layers and outputs; see opendeep.utils.noise. Pick one that suits
            the unit activation (e.g. Gaussian for tanh or other real-valued activations). A falsy value
            disables the noise setup entirely.
        noise_level : float
            Strength handed to the noise function specified by `noise`, e.g. the standard deviation for
            gaussian noise, the interval for uniform noise, or the dropout amount.
        noise_decay : str or False
            Name of the decay function (see opendeep.utils.decay) used to schedule `noise_level` over
            training, or False for no scheduling. Scheduling lets the model learn larger variance features
            first and smaller ones later (a kind of curriculum learning) and may help convergence.
        noise_decay_amount : float
            How much `noise_level` is reduced after each training epoch by the `noise_decay` function.
        direction : str
            Direction in which the model traverses its input: 'forward', 'backward', or 'bidirectional'.
            With 'bidirectional' two passes are made over the sequence and the two sets of hiddens are
            merged before the final decoder.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients of the parameters that unroll over timesteps (the
            hidden -> hidden weights, not the input -> hidden ones). A float hard-clips those gradients to
            the range `+-clip_recurrent_grads`.

        Raises
        ------
        AssertionError
            When asserting various properties of input parameters. See error messages.
        NotImplementedError
            When the hooked input variable has no dimensions at all.
        """
        # This snapshot has to remain the first statement: it must contain exactly the constructor
        # arguments, so no other local may be bound before locals() is read.
        constructor_args = dict(locals())
        del constructor_args['self']
        super(RNN, self).__init__(**constructor_args)

        ##################
        # specifications #
        ##################
        self.direction = direction
        self.bidirectional = direction == "bidirectional"
        self.backward = direction == "backward"
        self.layers = layers
        self.noise = noise

        # initialization settings for the regular weights ...
        self.weights_init, self.weights_mean = weights_init, weights_mean
        self.weights_std, self.weights_interval = weights_std, weights_interval

        # ... and for the recurrent weights
        self.r_weights_init, self.r_weights_mean = r_weights_init, r_weights_mean
        self.r_weights_std, self.r_weights_interval = r_weights_std, r_weights_interval

        self.bias_init, self.r_bias_init = bias_init, r_bias_init

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # activation for the recurrent hidden layers
        self.hidden_activation_func = get_activation_function(hidden_activation)

        # activation for the output layer
        self.activation_func = get_activation_function(activation)

        # cost function plus its optional keyword arguments
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args if cost_args else dict()

        # noise is only set up when a noise type was requested
        if self.noise:
            log.debug('Adding %s noise switch.' % str(noise))
            # when a level is given it is wrapped in a shared variable before building the noise function
            level_given = noise_level is not None
            noise_kwargs = dict(mrg=mrg)
            if level_given:
                noise_level = sharedX(value=noise_level)
                noise_kwargs['noise_level'] = noise_level
            self.noise_func = get_noise(noise, **noise_kwargs)

            # the switch starts at 1, i.e. noise is applied by default
            # (the original author was unsure whether gradients depend on this initial value)
            # NOTE(review): the variable name says "basiclayer" although this is the RNN - looks copy-pasted.
            self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

            # a noise schedule only makes sense when there is a shared level to decay
            if noise_decay and level_given:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # With an inputs_hook the leading tensor dimension is always treated as time, and the input is
        # brought into the 3D form (timesteps, batch_size, data_dim):
        #   1D -> two broadcastable axes are appended and un-broadcast, input_size becomes 1
        #   2D -> taken as (timesteps, data_dim), i.e. batch_size 1; a batch axis is inserted
        #   3D -> used as-is
        #   >3D -> taken as (timesteps, batch_size, data...) and flattened to 3D
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]
            rank = self.input.ndim

            if rank < 1:
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % rank)

            if rank == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1
            elif rank == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
            elif rank > 3:
                self.input = self.input.flatten(3)
                # NOTE(review): presumably input_size is a sequence of the trailing sizes here; flatten
                # merges dimensions multiplicatively, so confirm that a sum is really intended.
                self.input_size = sum(self.input_size)
        else:
            # without a hook the optimizer is assumed to deliver (batches, timesteps, data),
            # so the first two axes are swapped to get (timesteps, batches, data)
            self.input = T.tensor3("Xs").dimshuffle(1, 0, 2)

        # Targets for supervised training arrive as (batches, timesteps, output), the same ordering as the
        # optimizer input, and get the same axis swap.
        self.target = T.tensor3("Ys").dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        if self.hiddens_hook is None:
            # h_init is handled once the parameters exist (it has to match the size of the computed hiddens)
            self.hidden_size = hidden_size
        else:
            # the hook supplies both the initial hidden value and the hidden size
            self.h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]

        ##################
        # for generating #
        ##################
        # symbolic scalar: number of recurrent steps to run when generating from the model
        self.n_steps = T.iscalar("generate_n_steps")

        graph = self.build_computation_graph()
        self.output, self.hiddens, self.updates, self.cost, self.params = graph
Exemplo n.º 10
0
    def __init__(self, config=None, defaults=_default,
                 inputs_hook=None, params_hook=None,
                 input_size=None, output_size=None,
                 activation=None,
                 cost=None, cost_args=None,
                 weights_init=None, weights_mean=None, weights_std=None, weights_interval=None,
                 bias_init=None,
                 noise=None, noise_level=None, mrg=None,
                 outdir=None,
                 **kwargs):
        """
        Initialize a basic fully-connected layer computing activation(dot(input, W) + b).

        All arguments are forwarded to the superclass constructor, which combines the `defaults` and
        `config` dictionaries with the explicitly passed values; afterwards the configuration is read back
        from attributes on `self`.

        Parameters
        ----------
        config : dict-like, optional
            Configuration combined with `defaults` by the superclass (exact accepted types are decided
            there - confirm against the Model class).
        defaults : dict-like
            Default configuration values; the module-level `_default` unless overridden.
        inputs_hook : Tuple of (shape, variable)
            If given, must have exactly two elements. The shape (when truthy) replaces `input_size` and the
            variable is used as the layer input instead of a new symbolic matrix.
        params_hook : List(theano shared variable)
            If given, must hold exactly two parameters, the weights matrix W and the bias vector b, which
            are used instead of initializing new ones.
        input_size : int
            Dimensionality of the input. Overridden by a truthy shape in `inputs_hook`.
        output_size : int
            Dimensionality of the output. Falls back to the input size when falsy.
        activation : str or callable
            Passed to `get_activation_function`; applied after the input -> output dot product.
        cost : str or callable
            Passed to `get_cost_function`; compares the output with the target.
        cost_args : dict
            Extra keyword arguments for the cost function. An empty dict is used if none are configured.
        weights_init : str
            Initialization method handed to `get_weights`.
        weights_mean : float
            Mean handed to `get_weights` (used for gaussian initialization).
        weights_std : float
            Standard deviation handed to `get_weights` (used for gaussian initialization).
        weights_interval : str or float
            Interval handed to `get_weights` (used for uniform initialization).
        bias_init : float
            Initial value handed to `get_bias`.
        noise : str
            Noise type handed to `get_noise`. If falsy, no noise is added to the output.
        noise_level : float
            Noise level handed to `get_noise`; omitted from the call when None.
        mrg : random
            Random number generator handed to `get_noise`.
        outdir : str
            Not used in this constructor; only forwarded to the superclass.
        **kwargs
            Forwarded to the superclass as a single `kwargs` entry.

        Raises
        ------
        AssertionError
            If `inputs_hook` does not have two elements or `params_hook` does not hold two parameters.
        """
        # Combine the defaults and config dictionaries with the initial parameters in the superclass.
        # Take a copy before dropping 'self': the mapping returned by locals() should not be modified.
        # This must stay the first statement so only the constructor arguments are captured.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(BasicLayer, self).__init__(**initial_parameters)
        # all configuration parameters are now in self!

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if self.inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(self.inputs_hook) == 2, 'Expected inputs_hook to be tuple!'  # make sure inputs_hook is a tuple
            self.input_size = self.inputs_hook[0] or self.input_size
            self.input = self.inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.fmatrix('X')

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        self.target = T.fmatrix('Y')

        # either grab the output's desired size from the parameter directly, or copy the input size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(self.activation)
        # cost function!
        cost_func = get_cost_function(self.cost)

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(self.params_hook) == 2, \
                "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(len(self.params_hook))
            W, b = self.params_hook
        else:
            W = get_weights(weights_init=self.weights_init,
                            shape=(self.input_size, self.output_size),
                            name="W",
                            # if gaussian
                            mean=self.weights_mean,
                            std=self.weights_std,
                            # if uniform
                            interval=self.weights_interval)

            # grab the bias vector
            b = get_bias(shape=self.output_size, name="b", init_values=self.bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if self.noise:
            log.debug('Adding noise switch.')
            if self.noise_level is not None:
                noise_func = get_noise(self.noise, self.noise_level, self.mrg)
            else:
                noise_func = get_noise(self.noise, mrg=self.mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch,
                                   noise_func(input=self.output),
                                   self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        # cost_args defaults to None in the signature; fall back to an empty dict (as the RNN constructor does)
        # so that unpacking cannot fail when neither the defaults nor the config supplied one.
        self.cost = cost_func(output=self.output, target=self.target, **(self.cost_args or {}))

        log.debug("Initialized a basic fully-connected layer with shape %s and activation: %s",
                  str((self.input_size, self.output_size)), str(self.activation))
Exemplo n.º 11
0
    def __init__(self, inputs=None,
                 noise='dropout', noise_level=0.5, noise_decay=False, noise_decay_amount=0.99,
                 mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
        """
        Create a noise layer: its output is the input passed through a noise function.

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            A (shape, variable) pair describing the input of this layer. `shape` has one element per
            dimension of the variable, each being the integer size of that dimension or None when the size
            is unknown. For a matrix with unknown batch size and a fixed feature size of 784 the shape is
            (None, 784), and the full form of `inputs` is
            [((None, 784), <TensorType(float32, matrix)>)].
        noise : str
            Which noise to apply to the output (options are listed in opendeep.utils.noise). It should fit
            the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc.
        noise_level : float
            Amount of noise for the chosen noise function - the standard deviation for gaussian noise, the
            interval for uniform noise, the dropout amount, and so on. With None the noise function is
            built without a level and no noise schedule is created.
        noise_decay : str or False
            False to keep `noise_level` fixed, otherwise the name of the decay type (see
            opendeep.utils.decay) used to lower it during training. Such noise scheduling helps the model
            learn larger variance features first and smaller ones later, and may speed up convergence.
        noise_decay_amount : float
            Reduction applied to `noise_level` after every training epoch by the `noise_decay` function.
        mrg : random
            Random number generator for the noise; Theano's sandbox.rng_mrg.MRG_RandomStreams is
            recommended.
        switch : boolean
            True creates a `noise_switch` shared variable so that noise can be enabled while training and
            disabled while testing. False applies the noise unconditionally.
        """
        super(Noise, self).__init__(
            inputs=inputs,
            noise=noise, noise_level=noise_level,
            noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
            mrg=mrg, switch=switch
        )
        # the superclass leaves a list in self.inputs; its first entry is unpacked into
        # the layer's output size and the input variable
        first_shape, first_variable = self.inputs[0]
        self.output_size = first_shape
        self.inputs = first_variable

        log.debug('Adding %s noise switch.' % str(noise))

        # build the noise function, turning a given level into a shared variable first
        if noise_level is None:
            apply_noise = get_noise(noise, mrg=mrg)
        else:
            noise_level = sharedX(value=noise_level)
            apply_noise = get_noise(noise, noise_level=noise_level, mrg=mrg)

        # the switch is initialised to 1 so that noise is applied by default
        # (the original author was unsure whether gradients depend on this initial value)
        if switch:
            self.noise_switch = sharedX(value=1, name="noise_switch")

        # noise scheduling needs the shared level created above
        if noise_decay and noise_level is not None:
            self.noise_schedule = get_decay_function(
                noise_decay,
                noise_level,
                noise_level.get_value(),
                noise_decay_amount
            )

        # corrupt the inputs; with a switch the clean inputs stay selectable
        corrupted = apply_noise(input=self.inputs)
        self.outputs = Tswitch(self.noise_switch, corrupted, self.inputs) if switch else corrupted