Exemplo n.º 1
0
def bernoulli_csl(switch=0):
    """
    Driver that runs a Bernoulli CSL computation on MNIST test data.

    The real-valued MNIST test inputs (clipped to [1e-10, 1 - 1e-5]) are used as
    the Bernoulli means, and the binarized MNIST test inputs are used as the
    examples to score, grouped as 1000 minibatches of 10 rows of 784 values.

    Parameters
    ----------
    switch : int or bool
        If truthy, treat the means as a single chain (2D matrix) and use
        `_compile_csl_fn_v2` with `compute_CSL_with_minibatches_one_chain`.
        Otherwise reshape the means to (10, 100, 10, 784) and use
        `_compile_csl_fn` with `compute_CSL_with_minibatches`.
    """
    real_mnist = MNIST()
    # The train/valid subsets are fetched exactly as before even though only the
    # test subset is used below (the fetch itself may matter - TODO confirm).
    _train_inputs, _ = real_mnist.getSubset(TRAIN)
    _valid_inputs, _ = real_mnist.getSubset(VALID)
    test_inputs, _ = real_mnist.getSubset(TEST)

    binary_mnist = MNIST(binary=True)
    _train_inputs_bin, _ = binary_mnist.getSubset(TRAIN)
    _valid_inputs_bin, _ = binary_mnist.getSubset(VALID)
    test_inputs_bin, _ = binary_mnist.getSubset(TEST)

    # keep the means strictly inside (0, 1) so that log(mean) and log(1 - mean) stay finite
    bernoulli_means = numpy.clip(as_floatX(test_inputs).eval(), 1e-10, 1 - 1e-5)

    example_batches = as_floatX(test_inputs_bin.reshape((1000, 10, 784))).eval()

    if not switch:
        # 4D layout: iterated over by compute_CSL_with_minibatches, presumably handing
        # (100, 10, 784) slices of chains to the compiled function - TODO confirm
        chain_groups = bernoulli_means.reshape(10, 100, 10, 784)
        compute_CSL_with_minibatches(_compile_csl_fn(), example_batches, chain_groups)
    else:
        # the means stay a 2D matrix, representing only one chain
        single_chain_fn = _compile_csl_fn_v2(bernoulli_means)
        compute_CSL_with_minibatches_one_chain(single_chain_fn, example_batches)

    del real_mnist
    del binary_mnist
Exemplo n.º 2
0
def bernoulli_csl(switch=0):
    """
    Driver that runs a Bernoulli CSL computation on MNIST test data.

    The real-valued MNIST test inputs (clipped to [1e-10, 1 - 1e-5]) are used as
    the Bernoulli means, and the binarized MNIST test inputs are used as the
    examples to score, grouped as 1000 minibatches of 10 rows of 784 values.

    Parameters
    ----------
    switch : int or bool
        If truthy, treat the means as a single chain (2D matrix) and use
        `_compile_csl_fn_v2` with `compute_CSL_with_minibatches_one_chain`.
        Otherwise reshape the means to (10, 100, 10, 784) and use
        `_compile_csl_fn` with `compute_CSL_with_minibatches`.
    """
    real_mnist = MNIST()
    # The train/valid subsets are fetched exactly as before even though only the
    # test subset is used below (the fetch itself may matter - TODO confirm).
    _train_inputs, _ = real_mnist.getSubset(TRAIN)
    _valid_inputs, _ = real_mnist.getSubset(VALID)
    test_inputs, _ = real_mnist.getSubset(TEST)

    binary_mnist = MNIST(binary=True)
    _train_inputs_bin, _ = binary_mnist.getSubset(TRAIN)
    _valid_inputs_bin, _ = binary_mnist.getSubset(VALID)
    test_inputs_bin, _ = binary_mnist.getSubset(TEST)

    # keep the means strictly inside (0, 1) so that log(mean) and log(1 - mean) stay finite
    bernoulli_means = numpy.clip(as_floatX(test_inputs).eval(), 1e-10, 1 - 1e-5)

    example_batches = as_floatX(test_inputs_bin.reshape((1000, 10, 784))).eval()

    if not switch:
        # 4D layout: iterated over by compute_CSL_with_minibatches, presumably handing
        # (100, 10, 784) slices of chains to the compiled function - TODO confirm
        chain_groups = bernoulli_means.reshape(10, 100, 10, 784)
        compute_CSL_with_minibatches(_compile_csl_fn(), example_batches, chain_groups)
    else:
        # the means stay a 2D matrix, representing only one chain
        single_chain_fn = _compile_csl_fn_v2(bernoulli_means)
        compute_CSL_with_minibatches_one_chain(single_chain_fn, example_batches)

    del real_mnist
    del binary_mnist
Exemplo n.º 3
0
def _compile_csl_fn():
    """
    Build the theano function for Bernoulli CSL over parallel chains.

    The compiled function takes `(minibatch, means)`:

    - minibatch: matrix, M (# of examples) * D (data dim)
    - means: 3D tensor, N (# of chains) * K (samples per chain) * D (data dim)

    For every example/chain pair it sums the Bernoulli log-probability over the
    data dimension, applies `log_sum_exp_theano` over the samples of the chain,
    and subtracts log(K).

    NOTE(review): the original author flagged this function as buggy ("not doing
    properly by chains") and as using too much GPU memory; neither has been
    diagnosed here.

    Returns
    -------
    theano function
        Function computing an M * N matrix where each element is the LL of one
        example against one chain (assuming `log_sum_exp_theano` drops the
        reduced axis - TODO confirm).
    """
    log.debug('building theano fn for Bernoulli CSL')
    chain_means = T.tensor3('chains')
    examples = T.matrix('inputs')

    # Shapes used only for the debugging test values:
    # chains averaged over, minibatch size, data dim, chain length.
    n_chains, batch_size, data_dim, chain_length = 5, 10, 784, 100
    examples.tag.test_value = as_floatX(
        numpy.random.binomial(1, 0.5, size=(batch_size, data_dim))
    )
    chain_means.tag.test_value = as_floatX(
        numpy.random.uniform(size=(n_chains, chain_length, data_dim))
    )

    # log of the (symbolic) number of samples in each chain
    log_chain_length = T.log(chain_means.shape[1])

    # broadcast both operands to (M, N, K, D)
    examples_4d = examples.dimshuffle(0, 'x', 'x', 1)
    means_4d = chain_means.dimshuffle('x', 0, 1, 2)

    per_dim_ll = examples_4d * T.log(means_4d) + (1. - examples_4d) * T.log(1. - means_4d)
    per_sample_ll = per_dim_ll.sum(axis=3)
    per_chain_ll = log_sum_exp_theano(per_sample_ll, axis=2) - log_chain_length

    return function(
        inputs=[examples, chain_means],
        outputs=per_chain_ll,
        name='CSL_independent_bernoulli_fn'
    )
def _compile_csl_fn():
    """
    Build the theano function for Bernoulli CSL over parallel chains.

    The compiled function takes `(minibatch, means)`:

    - minibatch: matrix, M (# of examples) * D (data dim)
    - means: 3D tensor, N (# of chains) * K (samples per chain) * D (data dim)

    For every example/chain pair it sums the Bernoulli log-probability over the
    data dimension, applies `log_sum_exp_theano` over the samples of the chain,
    and subtracts log(K).

    NOTE(review): the original author flagged this function as buggy ("not doing
    properly by chains") and as using too much GPU memory; neither has been
    diagnosed here.

    Returns
    -------
    theano function
        Function computing an M * N matrix where each element is the LL of one
        example against one chain (assuming `log_sum_exp_theano` drops the
        reduced axis - TODO confirm).
    """
    log.debug('building theano fn for Bernoulli CSL')
    chain_means = T.tensor3('chains')
    examples = T.matrix('inputs')

    # Shapes used only for the debugging test values:
    # chains averaged over, minibatch size, data dim, chain length.
    n_chains, batch_size, data_dim, chain_length = 5, 10, 784, 100
    examples.tag.test_value = as_floatX(
        numpy.random.binomial(1, 0.5, size=(batch_size, data_dim))
    )
    chain_means.tag.test_value = as_floatX(
        numpy.random.uniform(size=(n_chains, chain_length, data_dim))
    )

    # log of the (symbolic) number of samples in each chain
    log_chain_length = T.log(chain_means.shape[1])

    # broadcast both operands to (M, N, K, D)
    examples_4d = examples.dimshuffle(0, 'x', 'x', 1)
    means_4d = chain_means.dimshuffle('x', 0, 1, 2)

    per_dim_ll = examples_4d * T.log(means_4d) + (1. - examples_4d) * T.log(1. - means_4d)
    per_sample_ll = per_dim_ll.sum(axis=3)
    per_chain_ll = log_sum_exp_theano(per_sample_ll, axis=2) - log_chain_length

    return function(
        inputs=[examples, chain_means],
        outputs=per_chain_ll,
        name='CSL_independent_bernoulli_fn'
    )
Exemplo n.º 5
0
def get_bias(shape, name="b", init_values=None):
    """
    This creates a theano shared variable for the bias parameter - normally initialized to zeros,
    but you can specify other values.

    The stored value is `numpy.ones(shape) * init_values`, cast with `as_floatX`.

    Parameters
    ----------
    shape : tuple
        The shape to use for the bias vector/matrix.
    name : str
        The name to give the shared variable.
    init_values : float or array_like, optional
        Value(s) multiplied into an array of ones with the given shape, i.e. the initial bias.
        An array must broadcast against `shape`. If None, the bias is initialized to zeros.

    Returns
    -------
    shared variable
        The theano shared variable with given shape.
    """
    # Compare against None explicitly: `init_values or 0` raises ValueError for
    # multi-element arrays because their truth value is ambiguous.
    if init_values is None:
        init_values = 0

    log.debug("Initializing bias variable with shape %s", str(shape))
    # an array of ones scaled by the requested initial value(s)
    val = as_floatX(numpy.ones(shape=shape, dtype=theano.config.floatX) * init_values)
    return theano.shared(value=val, name=name)
Exemplo n.º 6
0
    def __init__(self, param, initial, reduction_factor):
        """
        Set up decay of a theano shared variable.

        The shared variable is immediately set to `initial` (cast with `as_floatX`).

        Parameters
        ----------
        param : shared variable
            The theano variable to decay. It must already be a shared variable, which is
            checked by requiring both `get_value` and `set_value` attributes.
        initial : float
            The initial value the variable should have.
        reduction_factor : float
            The amount of reduction (depending on subclass's algorithm) each epoch.

        Raises
        ------
        AssertionError
            If `param` lacks `get_value` or `set_value` (an error is logged first).
        """
        # a shared variable is recognized by its get_value()/set_value() accessors
        has_getter = hasattr(param, 'get_value')
        has_setter = hasattr(param, 'set_value')
        if not has_getter:
            log.error("Parameter doesn't have a get_value() function! It is supposed to be a shared variable...")
        if not has_setter:
            log.error("Parameter doesn't have a set_value() function! It is supposed to be a shared variable...")
        assert has_getter
        assert has_setter

        self.param = param
        self.initial = initial
        # start the variable off at its initial value
        self.param.set_value(as_floatX(self.initial))
        self.reduction_factor = reduction_factor
Exemplo n.º 7
0
    def __init__(self, param, initial, reduction_factor):
        """
        Set up decay of a theano shared variable.

        The shared variable is immediately set to `initial` (cast with `as_floatX`).

        Parameters
        ----------
        param : shared variable
            The theano variable to decay. It must already be a shared variable, which is
            checked by requiring both `get_value` and `set_value` attributes.
        initial : float
            The initial value the variable should have.
        reduction_factor : float
            The amount of reduction (depending on subclass's algorithm) each epoch.

        Raises
        ------
        AssertionError
            If `param` lacks `get_value` or `set_value` (an error is logged first).
        """
        # a shared variable is recognized by its get_value()/set_value() accessors
        has_getter = hasattr(param, 'get_value')
        has_setter = hasattr(param, 'set_value')
        if not has_getter:
            log.error("Parameter doesn't have a get_value() function! It is supposed to be a shared variable...")
        if not has_setter:
            log.error("Parameter doesn't have a set_value() function! It is supposed to be a shared variable...")
        assert has_getter
        assert has_setter

        self.param = param
        self.initial = initial
        # start the variable off at its initial value
        self.param.set_value(as_floatX(self.initial))
        self.reduction_factor = reduction_factor
Exemplo n.º 8
0
def get_bias(shape, name="b", init_values=None):
    """
    This creates a theano shared variable for the bias parameter - normally initialized to zeros,
    but you can specify other values.

    The stored value is `numpy.ones(shape) * init_values`, cast with `as_floatX`.

    Parameters
    ----------
    shape : tuple
        The shape to use for the bias vector/matrix.
    name : str
        The name to give the shared variable.
    init_values : float or array_like, optional
        Value(s) multiplied into an array of ones with the given shape, i.e. the initial bias.
        An array must broadcast against `shape`. If None, the bias is initialized to zeros.

    Returns
    -------
    shared variable
        The theano shared variable with given shape.
    """
    # Compare against None explicitly: `init_values or 0` raises ValueError for
    # multi-element arrays because their truth value is ambiguous.
    if init_values is None:
        init_values = 0

    log.debug("Initializing bias variable with shape %s", str(shape))
    # an array of ones scaled by the requested initial value(s)
    val = as_floatX(numpy.ones(shape=shape, dtype=theano.config.floatX) * init_values)
    return theano.shared(value=val, name=name)
Exemplo n.º 9
0
def rectifier(x):
    """
    Apply the rectifier (ReLU) nonlinearity element-wise.

    Parameters
    ----------
    x : tensor
        Symbolic Tensor (or compatible).

    Returns
    -------
    tensor
        Element-wise rectifier: rectifier(x) = max(0,x) applied to `x`.

    .. note::

        Computed as rectifier(x) = (x + abs(x)) / 2 rather than with
        T.maximum(0, x); the formulation comes from the Lasagne framework
        (https://github.com/benanne/Lasagne/blob/master/lasagne/nonlinearities.py)
        and is reported to be faster, as pointed out by @SnippyHolloW.
        See https://github.com/SnippyHolloW/abnet/blob/807aeb9/layers.py#L15

    """
    # x + |x| is 2x for positive entries and 0 for negative ones
    twice_positive_part = x + abs(x)
    return twice_positive_part / as_floatX(2.0)
Exemplo n.º 10
0
def rectifier(x):
    """
    Apply the rectifier (ReLU) nonlinearity element-wise.

    Parameters
    ----------
    x : tensor
        Symbolic Tensor (or compatible).

    Returns
    -------
    tensor
        Element-wise rectifier: rectifier(x) = max(0,x) applied to `x`.

    .. note::

        Computed as rectifier(x) = (x + abs(x)) / 2 rather than with
        T.maximum(0, x); the formulation comes from the Lasagne framework
        (https://github.com/benanne/Lasagne/blob/master/lasagne/nonlinearities.py)
        and is reported to be faster, as pointed out by @SnippyHolloW.
        See https://github.com/SnippyHolloW/abnet/blob/807aeb9/layers.py#L15

    """
    # x + |x| is 2x for positive entries and 0 for negative ones
    twice_positive_part = x + abs(x)
    return twice_positive_part / as_floatX(2.0)
Exemplo n.º 11
0
def get_weights_uniform(shape, interval='montreal', name="W", rng=None):
    """
    This initializes a shared variable with a given shape for weights drawn from a Uniform distribution with
    low = -interval and high = interval.

    Interval can either be a number to use, or a string key to one of the predefined formulas in the
    _uniform_interval dictionary.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    interval : float or str
        Either a number for your own custom interval, or a string key to one of the predefined formulas.
    name : str
        The name to give the shared variable.
    rng : random
        The random number generator to use with a .uniform method. Defaults to numpy.random.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and name drawn from a uniform distribution.

    Raises
    ------
    NotImplementedError
        If the string name for the interval couldn't be found in the dictionary.
    """
    if rng is None:
        rng = numpy.random
    # If the interval parameter is a string, grab the appropriate formula from the function dictionary,
    # and apply the appropriate shape numbers to it.
    if isinstance(interval, six.string_types):
        interval_func = _uniform_interval.get(interval)
        if interval_func is None:
            # Both values must be passed to `%` as one tuple; formatting with only the first
            # one raised TypeError before the intended error could be logged or raised.
            message = ('Could not find uniform interval formula %s, try one of %s instead.'
                       % (str(interval), str(_uniform_interval.keys())))
            log.error(message)
            raise NotImplementedError(message)
        log.debug("Creating weights with shape %s from Uniform distribution with formula name: %s",
                  str(shape), str(interval))
        # the formula turns the shape into the numeric +- interval
        interval = interval_func(shape)
    else:
        log.debug("Creating weights with shape %s from Uniform distribution with given interval +- %s",
                  str(shape), str(interval))
    # build the uniform weights tensor
    val = as_floatX(rng.uniform(low=-interval, high=interval, size=shape))
    # check if a theano rng was used (the sample is then symbolic and must be evaluated)
    if isinstance(val, T.TensorVariable):
        val = val.eval()
    # make it into a shared variable
    return theano.shared(value=val, name=name)
Exemplo n.º 12
0
def get_weights_uniform(shape, interval='montreal', name="W", rng=None):
    """
    This initializes a shared variable with a given shape for weights drawn from a Uniform distribution with
    low = -interval and high = interval.

    Interval can either be a number to use, or a string key to one of the predefined formulas in the
    _uniform_interval dictionary.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    interval : float or str
        Either a number for your own custom interval, or a string key to one of the predefined formulas.
    name : str
        The name to give the shared variable.
    rng : random
        The random number generator to use with a .uniform method. Defaults to numpy.random.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and name drawn from a uniform distribution.

    Raises
    ------
    NotImplementedError
        If the string name for the interval couldn't be found in the dictionary.
    """
    if rng is None:
        rng = numpy.random
    # If the interval parameter is a string, grab the appropriate formula from the function dictionary,
    # and apply the appropriate shape numbers to it.
    if isinstance(interval, six.string_types):
        interval_func = _uniform_interval.get(interval)
        if interval_func is None:
            # Both values must be passed to `%` as one tuple; formatting with only the first
            # one raised TypeError before the intended error could be logged or raised.
            message = ('Could not find uniform interval formula %s, try one of %s instead.'
                       % (str(interval), str(_uniform_interval.keys())))
            log.error(message)
            raise NotImplementedError(message)
        log.debug("Creating weights with shape %s from Uniform distribution with formula name: %s",
                  str(shape), str(interval))
        # the formula turns the shape into the numeric +- interval
        interval = interval_func(shape)
    else:
        log.debug("Creating weights with shape %s from Uniform distribution with given interval +- %s",
                  str(shape), str(interval))
    # build the uniform weights tensor
    val = as_floatX(rng.uniform(low=-interval, high=interval, size=shape))
    # check if a theano rng was used (the sample is then symbolic and must be evaluated)
    if isinstance(val, T.TensorVariable):
        val = val.eval()
    # make it into a shared variable
    return theano.shared(value=val, name=name)
Exemplo n.º 13
0
def get_weights_gaussian(shape, mean=None, std=None, name="W", rng=None):
    """
    This initializes a shared variable with the given shape for weights drawn from a
    Gaussian distribution with mean and std.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    mean : float
        The mean to use for the Gaussian distribution. Defaults to 0 when None.
    std : float
        The standard deviation to use for the Gaussian distribution. Defaults to 0.05 when None.
        An explicit 0 produces constant weights equal to `mean` (no sampling).
    name : str
        The name to give the shared variable.
    rng : random
        A given random number generator to use with .normal method. The numpy.random module and
        numpy.random.RandomState instances are called with (loc, scale, size); any other generator
        is called theano-style with (avg, std, size) and its result is evaluated with .eval().
        Defaults to numpy.random.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and drawn from a Gaussian distribution.
    """
    # Only substitute defaults for None: `std or default` silently replaced an explicit
    # std=0 with 0.05, which made the constant-weights branch below unreachable.
    if mean is None:
        mean = 0
    if std is None:
        std = 0.05

    log.debug("Creating weights with shape %s from Gaussian mean=%s, std=%s",
              str(shape), str(mean), str(std))
    if rng is None:
        rng = numpy.random

    if std != 0:
        # numpy-style generators: the numpy.random module itself or a RandomState instance
        if isinstance(rng, (type(numpy.random), numpy.random.RandomState)):
            val = numpy.asarray(rng.normal(loc=mean, scale=std, size=shape),
                                dtype=theano.config.floatX)
        else:
            # otherwise the generator is called with theano-style keywords; the symbolic
            # sample has to be evaluated to get concrete values
            val = numpy.asarray(rng.normal(avg=mean, std=std, size=shape).eval(),
                                dtype=theano.config.floatX)
    else:
        # zero standard deviation: every weight equals the mean
        val = as_floatX(mean * numpy.ones(shape, dtype=theano.config.floatX))

    # check if a theano rng was used
    if isinstance(val, T.TensorVariable):
        val = val.eval()
    # make it into a shared variable
    return theano.shared(value=val, name=name)
Exemplo n.º 14
0
def get_weights_gaussian(shape, mean=None, std=None, name="W", rng=None):
    """
    This initializes a shared variable with the given shape for weights drawn from a
    Gaussian distribution with mean and std.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    mean : float
        The mean to use for the Gaussian distribution. Defaults to 0 when None.
    std : float
        The standard deviation to use for the Gaussian distribution. Defaults to 0.05 when None.
        An explicit 0 produces constant weights equal to `mean` (no sampling).
    name : str
        The name to give the shared variable.
    rng : random
        A given random number generator to use with .normal method. The numpy.random module and
        numpy.random.RandomState instances are called with (loc, scale, size); any other generator
        is called theano-style with (avg, std, size) and its result is evaluated with .eval().
        Defaults to numpy.random.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and drawn from a Gaussian distribution.
    """
    # Only substitute defaults for None: `std or default` silently replaced an explicit
    # std=0 with 0.05, which made the constant-weights branch below unreachable.
    if mean is None:
        mean = 0
    if std is None:
        std = 0.05

    log.debug("Creating weights with shape %s from Gaussian mean=%s, std=%s",
              str(shape), str(mean), str(std))
    if rng is None:
        rng = numpy.random

    if std != 0:
        # numpy-style generators: the numpy.random module itself or a RandomState instance
        if isinstance(rng, (type(numpy.random), numpy.random.RandomState)):
            val = numpy.asarray(rng.normal(loc=mean, scale=std, size=shape),
                                dtype=theano.config.floatX)
        else:
            # otherwise the generator is called with theano-style keywords; the symbolic
            # sample has to be evaluated to get concrete values
            val = numpy.asarray(rng.normal(avg=mean, std=std, size=shape).eval(),
                                dtype=theano.config.floatX)
    else:
        # zero standard deviation: every weight equals the mean
        val = as_floatX(mean * numpy.ones(shape, dtype=theano.config.floatX))

    # check if a theano rng was used
    if isinstance(val, T.TensorVariable):
        val = val.eval()
    # make it into a shared variable
    return theano.shared(value=val, name=name)
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with
    a vector mu, mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    The means are clipped to [1e-10, 1 - 1e-5] and their logs are precomputed with numpy,
    so they are baked into the compiled function as constants. For each input row the
    function returns -log(k) + log_sum_exp over the k rows of `mu` of the summed
    per-dimension Bernoulli log-probabilities.

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function of a single float32 matrix input computing the Bernoulli CSL log likelihood.
    """
    log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))

    # keep the means away from 0 and 1 so both logs below are finite
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    # add a leading broadcast axis: (1, k, D)
    mu = mu[None, :, :]
    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)

    # number of samples (rows of mu) the likelihood is averaged over
    k = mu.shape[1]

    # there are two terms in the log(p(x|mu))
    term_1 = -T.log(k)
    # inputs broadcast against every sample: (M, 1, D)
    x_broadcast = x.dimshuffle(0, 'x', 1)
    c = T.sum(x_broadcast * inner_1 + (1. - x_broadcast) * inner_2, axis=2)
    term_2 = log_sum_exp_theano(c, axis=1)

    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
Exemplo n.º 16
0
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with
    a vector mu, mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    The means are clipped to [1e-10, 1 - 1e-5] and their logs are precomputed with numpy,
    so they are baked into the compiled function as constants. For each input row the
    function returns -log(k) + log_sum_exp over the k rows of `mu` of the summed
    per-dimension Bernoulli log-probabilities.

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function of a single float32 matrix input computing the Bernoulli CSL log likelihood.
    """
    log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))

    # keep the means away from 0 and 1 so both logs below are finite
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    # add a leading broadcast axis: (1, k, D)
    mu = mu[None, :, :]
    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)

    # number of samples (rows of mu) the likelihood is averaged over
    k = mu.shape[1]

    # there are two terms in the log(p(x|mu))
    term_1 = -T.log(k)
    # inputs broadcast against every sample: (M, 1, D)
    x_broadcast = x.dimshuffle(0, 'x', 1)
    c = T.sum(x_broadcast * inner_1 + (1. - x_broadcast) * inner_2, axis=2)
    term_2 = log_sum_exp_theano(c, axis=1)

    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
Exemplo n.º 17
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gsn/',
                 input_size=None,
                 hidden_size=1000,
                 layers=2,
                 walkbacks=4,
                 visible_activation='sigmoid',
                 hidden_activation='tanh',
                 input_sampling=True,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy',
                 cost_args=None,
                 add_noise=True,
                 noiseless_h1=True,
                 hidden_noise='gaussian',
                 hidden_noise_level=2,
                 input_noise='salt_and_pepper',
                 input_noise_level=0.4,
                 noise_decay='exponential',
                 noise_annealing=1,
                 image_width=None,
                 image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's
            output layer gives a generative recurrent model.) For now, it needs to include the shape
            information (normally the dimensionality of the hiddens i.e. n_hidden).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional.
            The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an
            unsupervised model. The output is a reconstruction of the input.
        hidden_size : int
            The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than
            `input_size`, which is known as *overcomplete*.
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer.
            This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample
            from the DAE, which means the model generates inputs in sequence, where each generated input is compared
            to the original input to create the reconstruction cost for training. For running the model, the very last
            generated input in the Gibbs chain is used as the output.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the next
            walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the
            process more stochastic - more likely to find spurious modes in the model's representation.
        mrg : random
            A random number generator that is used when adding noise into the network and for sampling from the input.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
            determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise (corrupt) the input before passing it through the computation graph during training.
            This should most likely be set to the default of True, because this is a *denoising* autoencoder after all.
        noiseless_h1 : bool
            Whether to not add noise (corrupt) the hidden layer during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise
            for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper
            for binary units, etc.
        input_noise_level : float
            The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper,
            standard deviation for Gaussian, interval for Uniform, etc.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        image_height : int
            If the input should be represented as an image, the height of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width = _w
            self.image_height = _h
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]

        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers
        if layers % 2 == 0:
            if walkbacks < 2 * layers:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers', str(layers),
                    str(walkbacks))
        else:
            if walkbacks < 2 * layers - 1:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers', str(layers),
                    str(walkbacks))

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(
            noise_annealing)  # noise schedule parameter
        self.hidden_noise_level = sharedX(hidden_noise_level,
                                          dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise,
                                      noise_level=self.hidden_noise_level,
                                      mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level,
                                         dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise,
                                     noise_level=self.input_noise_level,
                                     mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, unpack the hidden layers in the tensor
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size

        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError(
                "Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2 * layers]
                self.bias_list = self.params_hook[2 * layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                    name="W_{0!s}_{1!s}".format(i, i + 1),
                    rng=mrg,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for i in range(layers)
            ]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend([
                    get_weights(
                        weights_init=weights_init,
                        shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                        name="W_{0!s}_{1!s}".format(i + 1, i),
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
                    for i in reversed(range(layers))
                ])
            # initialize each layer bias to 0's.
            self.bias_list = [
                get_bias(shape=(self.layer_sizes[i], ),
                         name='b_' + str(i),
                         init_values=bias_init) for i in range(layers + 1)
            ]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph(
        )
Exemplo n.º 18
0
 def decay(self):
     """
     Set the parameter to ``initial / (1 + reduction_factor * epoch)``, then advance the epoch counter.

     The new value is converted with `as_floatX` before it is stored through ``self.param.set_value``.
     """
     shrink = 1 + self.reduction_factor * self.epoch
     self.param.set_value(as_floatX(self.initial / shrink))
     # count this call so the next one divides by a larger denominator
     self.epoch += 1
Exemplo n.º 19
0
 def decay(self):
     """
     Multiply the parameter's current value by ``reduction_factor`` and store the result.

     The product is converted with `as_floatX` before it is written back through ``self.param.set_value``.
     """
     current = self.param.get_value()
     self.param.set_value(as_floatX(current * self.reduction_factor))
Exemplo n.º 20
0
 def decay(self):
     """
     Subtract ``reduction_factor`` from the parameter's current value, never storing less than zero.

     The result is converted with `as_floatX` before it is written back through ``self.param.set_value``.
     """
     lowered = self.param.get_value() - self.reduction_factor
     # floor the result at zero
     floored = numpy.max([0, lowered])
     self.param.set_value(as_floatX(floored))
Exemplo n.º 21
0
 def decay(self):
     """
     Scale the parameter by ``reduction_factor``.

     Reads the current value with ``self.param.get_value()``, multiplies it by ``self.reduction_factor``,
     converts the product with `as_floatX`, and stores it with ``self.param.set_value``.
     """
     scaled = self.reduction_factor.__rmul__(self.param.get_value()) \
         if False else self.param.get_value() * self.reduction_factor
     self.param.set_value(as_floatX(scaled))
Exemplo n.º 22
0
 def decay(self):
     """
     Recompute the parameter from its initial value and the number of decay steps taken so far.

     The stored value becomes ``initial / (1 + reduction_factor * epoch)`` (converted with `as_floatX`),
     after which ``self.epoch`` is incremented by one.
     """
     steps_term = self.reduction_factor * self.epoch
     decayed = self.initial / (1 + steps_term)
     self.param.set_value(as_floatX(decayed))
     self.epoch += 1
Exemplo n.º 23
0
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
                 input_size=None, hidden_size=1000,
                 layers=2, walkbacks=4,
                 visible_activation='sigmoid', hidden_activation='tanh',
                 input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy', cost_args=None,
                 add_noise=True, noiseless_h1=True,
                 hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4,
                 noise_decay='exponential', noise_annealing=1,
                 image_width=None, image_height=None,
                 **kwargs):
        """
        Set up a GSN: store the hyperparameters, create (or adopt) the parameters, and build the computation graph.

        Every constructor argument except `self` is first forwarded to the parent constructor. This method then
        reads `inputs_hook`, `hiddens_hook`, `params_hook` and `input_size` back from `self`, so the parent
        constructor is expected to have set those attributes.

        Parameters
        ----------
        inputs_hook : tuple of (shape, variable)
            Routing information for taking the input from another model. When it is set on the model, its second
            element becomes the input `X`; otherwise a new `T.matrix('X')` is created.
        hiddens_hook : tuple of (shape, variable)
            Routing information for taking the hidden representation from another model. When it is set on the
            model, its first element overrides `hidden_size` and its second element is passed to `unpack_hiddens`.
        params_hook : list of theano shared variables
            Existing parameters to use instead of creating new ones, ordered as all weights followed by all biases.
            With tied weights, `2*layers + 1` entries are required (the first `layers` are weights); without tied
            weights, `3*layers + 1` entries are required (the first `2*layers` are weights).
        outdir : str
            Directory for saved outputs. Not used directly here; forwarded to the parent constructor.
        input_size : int
            Dimensionality of the visible layer. Not used directly here; `self.input_size` is read after the
            parent constructor has run.
        hidden_size : int or list of int
            Size of the hidden layers (normalized with `raise_to_list`). A single value is repeated for each of
            the `layers` hidden layers; otherwise exactly `layers` values must be given.
        layers : int
            The number of hidden layers.
        walkbacks : int
            The number of walkback steps. A warning is logged when this is below `2*layers` (even `layers`)
            or below `2*layers - 1` (odd `layers`).
        visible_activation : str or callable
            Activation for the visible layer, resolved with `get_activation_function`. It must be recognized by
            `is_binary`; anything else is rejected.
        hidden_activation : str or callable
            Activation for the hidden layers, resolved with `get_activation_function`.
        input_sampling : bool
            Whether to sample from the generated input during walkbacks. Stored as `self.input_sampling`.
        mrg : random
            Random number generator handed to the noise functions and the weight initializer; its `binomial`
            method becomes `self.visible_sampling`. Note that the default instance is created once, when this
            method is defined, so every model relying on the default shares the same generator object.
        tied_weights : bool
            If True, only the `layers` lower-to-higher weight matrices are created. If False, `layers` additional
            higher-to-lower matrices are created as well.
        weights_init : str
            The initialization method passed to `get_weights`.
        weights_interval : str or float
            The `interval` argument passed to `get_weights` (for uniform initialization).
        weights_mean : float
            The `mean` argument passed to `get_weights` (for gaussian initialization).
        weights_std : float
            The `std` argument passed to `get_weights` (for gaussian initialization).
        bias_init : float
            The initial value passed to `get_bias` for every layer's bias.
        cost_function : str or callable
            The reconstruction cost, resolved with `get_cost_function`.
        cost_args : dict
            Extra keyword arguments for the cost function. A falsy value is stored as an empty dict.
        add_noise : bool
            Whether to corrupt the input with noise. Stored as `self.add_noise`.
        noiseless_h1 : bool
            Whether to leave the first hidden layer free of noise. Stored as `self.noiseless_h1`.
        hidden_noise : str
            Name of the noise function for the hidden layers, resolved with `get_noise`.
        hidden_noise_level : float
            Noise level for `hidden_noise`; wrapped with `sharedX` and stored as `self.hidden_noise_level`.
        input_noise : str
            Name of the noise function for the input, resolved with `get_noise`.
        input_noise_level : float
            Noise level for `input_noise`; wrapped with `sharedX` and stored as `self.input_noise_level`.
        noise_decay : str or False
            The type of decay to apply to the input noise level, or False for none. Stored as `self.noise_decay`.
        noise_annealing : float
            The noise schedule parameter; converted with `as_floatX` and stored as `self.noise_annealing`.
        image_width : int
            Width to use when the input is viewed as an image.
        image_height : int
            Height to use when the input is viewed as an image. When both `image_width` and `image_height` are
            None, both are derived with `closest_to_square_factors(self.input_size)`. When only one of them is
            given, the other is stored as None.
        **kwargs
            Extra keyword arguments; they are part of the argument snapshot forwarded to the parent constructor.

        Raises
        ------
        NotImplementedError
            If the visible activation is not binary according to `is_binary`.
        AssertionError
            If the number of hidden sizes does not match `layers`, or `params_hook` has an unexpected length.
        """
        # Snapshot the call arguments before any other local name is bound, so that exactly the constructor
        # arguments (minus `self`) are handed to the parent constructor.
        initial_parameters = dict(locals())
        del initial_parameters['self']
        super(GSN, self).__init__(**initial_parameters)

        # Image dimensions: honor whatever the caller specified; only when nothing was specified
        # fall back to the factorization of the input size that is closest to a square.
        if image_height is not None or image_width is not None:
            self.image_height = image_height
            self.image_width = image_width
        else:
            square_h, square_w = closest_to_square_factors(self.input_size)
            self.image_width = square_w
            self.image_height = square_h

        ###################
        # Input variable  #
        ###################
        # inputs_hook is a (shape, input) tuple; without it, make a fresh symbolic matrix.
        self.X = T.matrix('X') if self.inputs_hook is None else self.inputs_hook[1]

        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers (one less when the number of layers is odd)
        suggested_walkbacks = 2 * layers if layers % 2 == 0 else 2 * layers - 1
        if walkbacks < suggested_walkbacks:
            log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                        'Generaly want 2X walkbacks to layers',
                        str(layers), str(walkbacks))

        # plain hyperparameters
        self.layers = layers
        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.input_sampling = input_sampling
        self.noiseless_h1 = noiseless_h1
        self.add_noise = add_noise
        self.noise_decay = noise_decay
        self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter

        # noise levels and their noise functions (hidden first, then input)
        self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

        # a hiddens_hook carries the hidden size as its first element
        hooked_hiddens = self.hiddens_hook is not None
        if hooked_hiddens:
            hidden_size = self.hiddens_hook[0]
        self.hiddens_flag = hooked_hiddens

        # layer sizes from h0 to hK, where h0 is the visible layer
        hidden_sizes = list(raise_to_list(hidden_size))
        if len(hidden_sizes) != 1:
            assert len(hidden_sizes) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                "Hiddens %d and layers %d" % (len(hidden_sizes), self.layers)
            self.layer_sizes = [self.input_size] + hidden_sizes
        else:
            # a single size is shared by every hidden layer
            self.layer_sizes = [self.input_size] + hidden_sizes * self.layers

        # unpack the hooked hidden layers only after the layer sizes are known
        if hooked_hiddens:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        self.hidden_activation = get_activation_function(hidden_activation)
        self.visible_activation = get_activation_function(visible_activation)
        # the sampling function has to suit the visible activation; only binary is handled
        if not is_binary(self.visible_activation):
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError("Non-binary visible activation not supported yet!")
        self.visible_sampling = mrg.binomial

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args if cost_args else dict()

        ###############
        # Parameters! #
        ###############
        if self.params_hook is None:
            # no hook: create the weights and biases from layer_sizes
            weights = []
            for lower in range(layers):
                upper = lower + 1
                weights.append(get_weights(weights_init=weights_init,
                                           shape=(self.layer_sizes[lower], self.layer_sizes[upper]),
                                           name="W_{0!s}_{1!s}".format(lower, upper),
                                           rng=mrg,
                                           # if gaussian
                                           mean=weights_mean,
                                           std=weights_std,
                                           # if uniform
                                           interval=weights_interval))
            # untied weights need separate higher->lower matrices, created from the top layer downwards
            if not tied_weights:
                for upper in range(layers, 0, -1):
                    lower = upper - 1
                    weights.append(get_weights(weights_init=weights_init,
                                               shape=(self.layer_sizes[upper], self.layer_sizes[lower]),
                                               name="W_{0!s}_{1!s}".format(upper, lower),
                                               rng=mrg,
                                               # if gaussian
                                               mean=weights_mean,
                                               std=weights_std,
                                               # if uniform
                                               interval=weights_interval))
            self.weights_list = weights

            # one bias per layer, visible layer included
            biases = []
            for layer_idx in range(layers + 1):
                biases.append(get_bias(shape=(self.layer_sizes[layer_idx],),
                                       name='b_' + str(layer_idx),
                                       init_values=bias_init))
            self.bias_list = biases
        else:
            # params_hook: weights come first, biases after
            hooked_params = self.params_hook
            if self.tied_weights:
                # tied weights: layers weights + (layers + 1) biases
                n_weights = layers
                assert len(hooked_params) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(hooked_params))
            else:
                # untied weights: 2*layers weights + (layers + 1) biases
                n_weights = 2 * layers
                assert len(hooked_params) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(hooked_params))
            self.weights_list = hooked_params[:n_weights]
            self.bias_list = hooked_params[n_weights:]

        # the full parameter list: weights followed by biases
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # with everything configured, build the computational graph
        graph_outputs = self.build_computation_graph()
        self.cost, self.monitors, self.output, self.hiddens = graph_outputs
Exemplo n.º 24
0
 def decay(self):
     """
     Lower the parameter by ``reduction_factor``, clamping the stored value at zero.

     The clamped result is converted with `as_floatX` and written back through ``self.param.set_value``.
     """
     reduced = self.param.get_value() - self.reduction_factor
     # the larger of 0 and the reduced value is kept
     clamped = numpy.max([0, reduced])
     self.param.set_value(as_floatX(clamped))