Example #1
def get_corrupted_input(rng, input, corruption_level, ntype='zeromask'):
    MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2**30))
    #theano_rng = RandomStreams()
    if corruption_level == 0.0:
        return input

    if ntype == 'zeromask':
        return MRG.binomial(size=input.shape,
                            n=1,
                            p=1 - corruption_level,
                            dtype=theano.config.floatX) * input
    elif ntype == 'gaussian':
        return input + MRG.normal(size=input.shape,
                                  avg=0.0,
                                  std=corruption_level,
                                  dtype=theano.config.floatX)
    elif ntype == 'salt_pepper':

        # salt and pepper noise
        print('DAE uses salt and pepper noise')
        a = MRG.binomial(size=input.shape, n=1,
                         p=1 - corruption_level, dtype=theano.config.floatX)
        b = MRG.binomial(size=input.shape, n=1,
                         p=corruption_level, dtype=theano.config.floatX)

        # wherever the keep-mask `a` dropped a unit, substitute the 0/1 draw from `b`
        c = T.eq(a, 0) * b
        return input * a + c
def add_gaussian_noise(IN, std=1, MRG=None):
    if MRG is None:
        MRG = RNG_MRG.MRG_RandomStreams(1)
    print('GAUSSIAN NOISE : ', std)
    noise = MRG.normal(avg=0, std=std, size=IN.shape, dtype='float32')
    OUT = IN + noise
    return OUT
def corrupt_input(IN, p=0.5, MRG=None):
    if MRG is None:
        MRG = RNG_MRG.MRG_RandomStreams(1)
    # salt and pepper? masking?
    noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32')
    IN = IN * noise
    return IN
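The helpers above only build symbolic Theano expressions; nothing is sampled until they are compiled. A minimal usage sketch for the masking variant (not part of the original project; variable names are illustrative):

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox import rng_mrg as RNG_MRG

x = T.matrix('x')
mrg = RNG_MRG.MRG_RandomStreams(1)
corrupt_fn = theano.function([x], corrupt_input(x, p=0.5, MRG=mrg))

batch = np.random.rand(5, 10).astype(theano.config.floatX)
print(corrupt_fn(batch))  # roughly half of the entries are zeroed out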
Example #4
    def __init__(self, input=None, n_visible=784, n_hidden=500, \
        W=None, hbias=None, vbias=None, numpy_rng=None,
        theano_rng=None, enhanced_grad_flag=False, batch_sz=100, mpf_type='1bit'):

        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.enhanced_grad_flag = enhanced_grad_flag
        self.batch_sz = batch_sz

        if numpy_rng is None:
            # create a number generator
            numpy_rng = np.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RNG_MRG.MRG_RandomStreams(numpy_rng.randint(2**30))
            #theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        num_vishid = n_visible * n_hidden

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.mpf_type = mpf_type
        self._init_params(numpy_rng, n_hidden, n_visible, mpf_type)
        self.theano_rng = theano_rng
def salt_and_pepper(IN, p=0.2, MRG=None):
    if MRG is None:
        MRG = RNG_MRG.MRG_RandomStreams(1)
    # salt and pepper noise
    a = MRG.binomial(size=IN.shape, n=1, p=1 - p, dtype='float32')
    b = MRG.binomial(size=IN.shape, n=1, p=0.5, dtype='float32')
    c = T.eq(a, 0) * b
    return IN * a + c
Example #6
    def __init__(self, eta=0, gamma=0.55, seed=180891):

        self.eta_sqrt = shared_floatx(sqrt(eta), "eta")
        add_role(self.eta_sqrt, ALGORITHM_HYPERPARAMETER)

        self.gamma_half = shared_floatx(gamma/2, "gamma")
        add_role(self.gamma_half, ALGORITHM_HYPERPARAMETER)

        self.theano_random = rng_mrg.MRG_RandomStreams(seed=seed)
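This excerpt does not show how theano_random is consumed. Given that the constructor stores sqrt(eta) and gamma/2, one plausible reading is an SGLD-style rule that injects annealed Gaussian noise into the updates; the helper below is a sketch under that assumption, not code from the original project:

def noisy_update(param, grad, lr, t, eta_sqrt, gamma_half, theano_random):
    # assumed noise schedule: std = sqrt(eta) / t**(gamma/2) == sqrt(eta / t**gamma)
    std = eta_sqrt / (t ** gamma_half)
    noise = theano_random.normal(size=param.shape, avg=0.0, std=std,
                                 dtype=param.dtype)
    return param - lr * grad + noise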
Example #7
File: dvae.py  Project: piperod/DVAE
    def __init__(self, model_params):

        [
            self.batch_sz, self.num_dim, self.num_hids, numpy_rng,
            self.dim_sample, binaryF
        ] = model_params

        self.numpy_rng = numpy_rng
        self.init_params(numpy_rng)
        self.last_layer = stochastic_layer(self.num_hids[0], self.num_dim,
                                           binaryF, numpy_rng)
        self.params = self.params + self.last_layer.params

        self.MRG = RNG_MRG.MRG_RandomStreams(numpy_rng.randint(2**30))
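The rest of dvae.py is not shown, so how self.MRG is used later is an assumption. In VAE-style models such a stream typically supplies the epsilon of the reparameterization trick; a hypothetical method in that spirit (names and shapes are illustrative only):

    def sample_z(self, mu, log_sigma):
        # hypothetical: draw eps ~ N(0, 1) from the shared MRG stream and
        # form a reparameterized sample z = mu + exp(log_sigma) * eps
        eps = self.MRG.normal(size=mu.shape, avg=0.0, std=1.0,
                              dtype=theano.config.floatX)
        return mu + T.exp(log_sigma) * eps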
Example #8
def main():
    data = TextDataset(
        path='../../../../datasets/shakespeare_input.txt',
        source=
        "http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt",
        target_n_future=1,
        sequence_length=50)

    rnn = RNN(outdir='outputs/rnn/',
              input_size=len(data.vocab),
              hidden_size=128,
              output_size=len(data.vocab),
              layers=2,
              activation='softmax',
              hidden_activation='relu',
              mrg=RNG_MRG.MRG_RandomStreams(1),
              weights_init='uniform',
              weights_interval='montreal',
              bias_init=0.0,
              r_weights_init='identity',
              r_bias_init=0.0,
              cost_function='nll',
              cost_args=None,
              noise='dropout',
              noise_level=.7,
              noise_decay='exponential',
              noise_decay_amount=.99,
              direction='forward')

    cost_monitor = Monitor("cost",
                           rnn.get_train_cost(),
                           train=False,
                           valid=True,
                           test=True)

    optimizer = RMSProp(model=rnn,
                        dataset=data,
                        grad_clip=5.,
                        hard_clip=False,
                        learning_rate=2e-3,
                        lr_decay='exponential',
                        lr_decay_factor=0.97,
                        decay=0.95,
                        batch_size=50,
                        epochs=50)
    # optimizer = AdaDelta(model=gsn, dataset=mnist, n_epoch=200, batch_size=100, learning_rate=1e-6)
    optimizer.train(monitor_channels=cost_monitor)
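The mrg stream passed above drives the dropout noise (noise='dropout', noise_level=.7). Stripped of the framework, dropout with an MRG stream reduces to a binomial mask; the function below is a sketch, not OpenDeep's exact implementation:

from theano.sandbox import rng_mrg as RNG_MRG

def dropout(x, keep_prob, mrg=RNG_MRG.MRG_RandomStreams(1)):
    # zero each unit independently with probability 1 - keep_prob
    mask = mrg.binomial(size=x.shape, n=1, p=keep_prob, dtype=x.dtype)
    return x * mask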
Example #9
    def compile_sampling(self, data_train, data_valid, data_test,
                         training_n_samples):
        X = tt.matrix('X')
        batch = tt.iscalar('batch')
        n_samples = tt.iscalar('n_samples')

        n_layers = len(self.layers)
        samples = [None] * n_layers

        samples[0] = replicate_batch(X, n_samples)

        if "gpu" in theano.config.device:
            from theano.sandbox import rng_mrg
            srng = rng_mrg.MRG_RandomStreams(seed=42)
        else:
            srng = tt.shared_randomstreams.RandomStreams(seed=42)

        for layer in range(n_layers - 1):
            samples[layer + 1] = self.compute_samples(srng, samples[layer],
                                                      layer)

        givens = dict()
        givens[X] = data_valid[batch * self.batch_size:(batch + 1) *
                               self.batch_size]
        self.sample_convergence = theano.function([batch, n_samples],
                                                  samples,
                                                  givens=givens)

        givens[n_samples] = np.int32(training_n_samples)
        givens[X] = data_train[batch * self.batch_size:(batch + 1) *
                               self.batch_size]
        self.sample_train = theano.function([batch], samples, givens=givens)

        givens[X] = data_valid[batch * self.batch_size:(batch + 1) *
                               self.batch_size]
        self.sample_valid = theano.function([batch], samples, givens=givens)

        givens[X] = data_test[batch * self.batch_size:(batch + 1) *
                              self.batch_size]
        self.sample_test = theano.function([batch], samples, givens=givens)
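The device check above picks the GPU-friendly MRG generator only when Theano is actually running on a GPU and falls back to the standard shared RandomStreams otherwise. The same pattern extracted as a small standalone helper (a sketch):

import theano

def make_rng(seed=42):
    if "gpu" in theano.config.device:
        from theano.sandbox import rng_mrg
        return rng_mrg.MRG_RandomStreams(seed=seed)
    import theano.tensor as tt
    return tt.shared_randomstreams.RandomStreams(seed=seed)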
Example #10
class DenoisingAutoencoder(GSN):
    '''
    Class for creating a new Denoising Autoencoder (DAE)
    This is a special case of a GSN with only one hidden layer
    '''
    # Default values to use for some DAE parameters
    _defaults = {# gsn parameters
                "walkbacks": 1,
                "input_size": None,  # number of input units - please specify for your dataset!
                "hidden_size": 1500,
                "visible_activation": 'sigmoid',
                "hidden_activation": 'tanh',
                "input_sampling": True,
                "MRG": RNG_MRG.MRG_RandomStreams(1),
                # train param
                "cost_function": 'binary_crossentropy',
                # noise parameters
                "noise_annealing": 1.0, #no noise schedule by default
                "add_noise": True,
                "noiseless_h1": True,
                "hidden_add_noise_sigma": 2,
                "input_salt_and_pepper": 0.4,
                # data parameters
                "output_path": 'outputs/dae/',
                "is_image": True,
                "vis_init": False}

    def __init__(self, config=None, defaults=_defaults, inputs_hook=None, hiddens_hook=None, dataset=None):
        # init Model
        # force the model to have one layer - DAE is a specific GSN with a single hidden layer
        defaults['layers'] = 1
        if config:
            config['layers'] = 1
        super(DenoisingAutoencoder, self).__init__(config=config,
                                  defaults=defaults,
                                  inputs_hook=inputs_hook,
                                  hiddens_hook=hiddens_hook,
                                  dataset=dataset)
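A hedged instantiation sketch: the config keys come from _defaults above, while the dataset object (my_dataset) and the exact training behaviour are placeholders, since they live elsewhere in OpenDeep:

config = {
    "input_size": 784,              # e.g. flattened 28x28 images
    "hidden_size": 1000,
    "input_salt_and_pepper": 0.3,   # corruption level on the visible units
    "output_path": 'outputs/dae_custom/',
}
dae = DenoisingAutoencoder(config=config, dataset=my_dataset)  # my_dataset: placeholder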
Example #11
    def __init__(self,
                 inputs=None,
                 hiddens=None,
                 params=None,
                 outdir='outputs/lstm/',
                 activation='relu',
                 gate_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize an LSTM.

        Parameters
        ----------
        inputs : List of [tuple(shape, `Theano.TensorType`)]
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. The `inputs` variables are expected to be of the form (timesteps, batch, data).
            `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. The length of `shape` should be equal to number of
            dimensions in `Theano.TensorType`, where the shape element is an integer representing the size for its
            dimension, or None if the shape isn't known. For example, if you have a matrix with unknown batch size
            but fixed feature size of 784, `shape` would be: (None, 784). The full form of `inputs` would be:
            [((None, 784), <TensorType(float32, matrix)>)].
        hiddens : int or Tuple of (shape, `Theano.TensorType`)
            Int for the number of hidden units to use, or a tuple of shape, expression to route the starting
            hidden values from elsewhere.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The location to produce outputs from training or running the :class:`LSTM`. If None, nothing will be saved.
        activation : str or callable
            The nonlinear (or linear) activation to perform for the hidden units.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        gate_activation : str or callable
            The activation to perform for the hidden gates (default sigmoid).
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing input-hidden model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent hidden-hidden model weights.
            See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and adding them together.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(LSTM, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        backward = direction.lower() == 'backward'
        bidirectional = direction.lower() == 'bidirectional'

        ########################
        # activation functions #
        ########################
        # recurrent hidden activation functions!
        self.hidden_activation_func = get_activation_function(activation)
        self.gate_activation_func = get_activation_function(gate_activation)

        ##########
        # inputs #
        ##########
        # inputs are expected to have the shape (n_timesteps, batch_size, data)
        if len(self.inputs) > 1:
            raise NotImplementedError(
                "Expected 1 input, found %d. Please merge inputs before passing "
                "to the model!" % len(self.inputs))
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        if isinstance(input_shape, int):
            self.input_size = ((None, ) *
                               (self.input.ndim - 1)) + (input_shape, )
        else:
            self.input_size = input_shape
        assert self.input_size is not None, "Need to specify the shape for at least the last dimension of the input!"
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.input.ndim == 1:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 'x'), 1, 2)

        elif self.input.ndim == 2:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

        elif self.input.ndim > 3:
            self.input = self.input.flatten(3)
            self.input_size = self.input_size[:2] + (prod(self.input_size[2:]),)

        ###########
        # hiddens #
        ###########
        # have only 1 hiddens
        assert len(
            self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(
                self.hiddens)
        self.hiddens = self.hiddens[0]
        # if hiddens is an int (hidden size parameter, not routing info)
        h_init = None
        if isinstance(self.hiddens, int):
            self.hidden_size = self.hiddens
        elif isinstance(self.hiddens, tuple):
            hidden_shape, h_init = self.hiddens
            if isinstance(hidden_shape, int):
                self.hidden_size = hidden_shape
            else:
                self.hidden_size = hidden_shape[-1]
        else:
            raise AssertionError(
                "Hiddens need to be an int or tuple of (shape, theano_expression), found %s"
                % type(self.hiddens))

        # output shape is going to be 3D with (timesteps, batch_size, hidden_size)
        self.output_size = (None, None, self.hidden_size)

        ##########################################################
        # parameters - make sure to deal with params dict input! #
        ##########################################################
        # all input-to-hidden weights
        W_c, W_i, W_f, W_o = [
            self.params.get(
                "W_%s" % sub,
                get_weights(
                    weights_init=weights_init,
                    shape=(self.input_size[-1], self.hidden_size),
                    name="W_%s" % sub,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # all hidden-to-hidden weights
        U_c, U_i, U_f, U_o = [
            self.params.get(
                "U_%s" % sub,
                get_weights(
                    weights_init=r_weights_init,
                    shape=(self.hidden_size, self.hidden_size),
                    name="U_%s" % sub,
                    # if gaussian
                    mean=r_weights_mean,
                    std=r_weights_std,
                    # if uniform
                    interval=r_weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # if bidirectional, make hidden-to-hidden weights again to go the opposite direction
        U_c_b, U_i_b, U_f_b, U_o_b = None, None, None, None
        if bidirectional:
            U_c_b, U_i_b, U_f_b, U_o_b = [
                self.params.get(
                    "U_%s_b" % sub,
                    get_weights(
                        weights_init=r_weights_init,
                        shape=(self.hidden_size, self.hidden_size),
                        name="U_%s_b" % sub,
                        # if gaussian
                        mean=r_weights_mean,
                        std=r_weights_std,
                        # if uniform
                        interval=r_weights_interval))
                for sub in ['c', 'i', 'f', 'o']
            ]
        # biases
        b_c, b_i, b_f, b_o = [
            self.params.get(
                "b_%s" % sub,
                get_bias(shape=(self.hidden_size, ),
                         name="b_%s" % sub,
                         init_values=r_bias_init))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # clip gradients if we are doing that
        recurrent_params = [U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b]
        if clip_recurrent_grads:
            clip = abs(clip_recurrent_grads)
            U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b = [
                grad_clip(param, -clip, clip) if param is not None else None
                for param in recurrent_params
            ]

        # put all the parameters into our dictionary
        self.params = {
            "W_c": W_c,
            "W_i": W_i,
            "W_f": W_f,
            "W_o": W_o,
            "U_c": U_c,
            "U_i": U_i,
            "U_f": U_f,
            "U_o": U_o,
            "b_c": b_c,
            "b_i": b_i,
            "b_f": b_f,
            "b_o": b_o,
        }
        if bidirectional:
            self.params.update({
                "U_c_b": U_c_b,
                "U_i_b": U_i_b,
                "U_f_b": U_f_b,
                "U_o_b": U_o_b,
            })

        # make h_init the right sized tensor
        if h_init is None:
            h_init = zeros_like(dot(self.input[0], W_c))

        c_init = zeros_like(dot(self.input[0], W_c))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_c = dot(self.input, W_c) + b_c
        x_i = dot(self.input, W_i) + b_i
        x_f = dot(self.input, W_f) + b_f
        x_o = dot(self.input, W_o) + b_o

        # now do the recurrent stuff
        (self.hiddens,
         _), self.updates = scan(fn=self.recurrent_step,
                                 sequences=[x_c, x_i, x_f, x_o],
                                 outputs_info=[h_init, c_init],
                                 non_sequences=[U_c, U_i, U_f, U_o],
                                 go_backwards=backward,
                                 name="lstm_scan",
                                 strict=True)

        # if bidirectional, do the same in reverse!
        if bidirectional:
            (hiddens_b,
             _), updates_b = scan(fn=self.recurrent_step,
                                  sequences=[x_c, x_i, x_f, x_o],
                                  outputs_info=[h_init, c_init],
                                  non_sequences=[U_c_b, U_i_b, U_f_b, U_o_b],
                                  go_backwards=not backward,
                                  name="lstm_scan_back",
                                  strict=True)
            # flip the hiddens to be the right direction
            hiddens_b = hiddens_b[::-1]
            # update stuff
            self.updates.update(updates_b)
            self.hiddens += hiddens_b

        log.info("Initialized an LSTM!")
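A construction sketch based on the docstring above (all sizes are placeholders): a 3D (timesteps, batch, features) symbolic input with unknown timestep and batch dimensions, 93 input features, and 128 hidden units.

import theano.tensor as T

x = T.tensor3('x')                                        # (timesteps, batch, data)
lstm = LSTM(inputs=[((None, None, 93), x)], hiddens=128)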
Example #12
import numpy
import theano
from theano.sandbox import rng_mrg
from ssrbm.truncated import truncated_normal as tnorm
from utils import sharedX
import pylab as pl

rng = rng_mrg.MRG_RandomStreams(1231)

avg = sharedX(5., name='mean')
std = sharedX(1, name='std')
r = tnorm(size=(10000,), avg=avg, std=std,
        lbound=numpy.cast['float32'](-2),
        ubound=numpy.cast['float32'](-0.5),
        theano_rng=rng,
        dtype=theano.config.floatX)
f = theano.function([], r)
x = f()
import pdb; pdb.set_trace()
pl.hist(x)
pl.show()
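For comparison, an untruncated draw from the same MRG stream (a sketch using only the normal method seen in the earlier examples):

r_plain = rng.normal(size=(10000,), avg=5., std=1.,
                     dtype=theano.config.floatX)
f_plain = theano.function([], r_plain)
pl.hist(f_plain(), bins=50)
pl.show()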
Example #13
    def __init__(self, inputs=None, params=None, outdir='outputs/conv1d',
                 n_filters=None, filter_size=None, stride=None, border_mode='valid',
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0,
                 activation='rectifier',
                 convolution='mc0',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a 1-D convolutional layer.

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. Shape of the incoming data:
            (batch_size, num_channels, data_dimensionality). Most likely, your channels
            will be 1. For example, batches of text will be of the form (N, 1, D) where N=examples in minibatch and
            D=dimensionality (chars, words, etc.)
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        n_filters : int
            The number of filters to use (convolution kernels).
        filter_size : int
            The size of the convolution filter.
        stride : int
            The distance between the receptive field centers of neighboring units. This is the 'stride' of the
            convolution operation.
        border_mode : str, one of 'valid', 'full', 'same'
            A string indicating the convolution border mode.
            If 'valid', the convolution is only computed where the input and the
            filter fully overlap.
            If 'full', the convolution is computed wherever the input and the
            filter overlap by at least one position.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        activation : str or Callable
            The activation function to apply to the layer. See opendeep.utils.activation for options.
        convolution : str or Callable
            The 1-dimensional convolution implementation to use. The default of 'mc0' is normally fine. See
            opendeep.utils.conv1d_implementations for alternatives. (This is necessary because Theano only
            supports 2D convolutions at the moment).
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.

        Notes
        -----
        Theano's default convolution function (`theano.tensor.nnet.conv.conv2d`)
        does not support the 'same' border mode by default. This layer emulates
        it by performing a 'full' convolution and then cropping the result, which
        may negatively affect performance.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(Conv1D, self).__init__(**initial_parameters)
        if self.inputs is None:
            return

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        # expect input to be in the form (B, C, I) (batch, channel, input data)
        # inputs_hook is a tuple of (Shape, Input)
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        assert self.input.ndim == 3, "Expected 3D input variable with form (batch, channel, input_data)"
        assert len(input_shape) == 3, "Expected 3D input shape with form (batch, channel, input_data)"

        n_channels = input_shape[1]

        filter_shape = (n_filters, n_channels, filter_size)

        # activation function!
        activation_func = get_activation_function(activation)

        # convolution function!
        convolution_func = get_conv1d_function(convolution)

        outshape = ConvOp.getOutputShape(
            inshp=(input_shape[-1],),
            kshp=(filter_size,),
            stride=(stride,),
            mode=border_mode
        )
        self.output_size = (input_shape[0], n_filters) + outshape

        ##########
        # Params #
        ##########
        W = self.params.get(
            "W",
            get_weights(weights_init=weights_init,
                        shape=filter_shape,
                        name="W",
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
        )

        b = self.params.get(
            "b",
            get_bias(shape=(n_filters,), name="b", init_values=bias_init)
        )

        # Finally have the two parameters!
        self.params = OrderedDict([("W", W), ("b", b)])

        ########################
        # Computational Graph! #
        ########################
        if border_mode in ['valid', 'full']:
            conved = convolution_func(self.input,
                                      W,
                                      subsample=(stride,),
                                      image_shape=input_shape,
                                      filter_shape=filter_shape,
                                      border_mode=border_mode)
        else:
            log.error("Invalid border mode: '%s'" % border_mode)
            raise RuntimeError("Invalid border mode: '%s'" % border_mode)

        self.output = activation_func(conved + b.dimshuffle('x', 0, 'x'))
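A small worked example of the shape computation used above, calling ConvOp.getOutputShape with the same keyword arguments (the import path assumes an older Theano layout): a length-100 input with filter_size=5, stride=1 and 'valid' mode should give (100 - 5) + 1 = 96.

from theano.tensor.nnet.conv import ConvOp

out_len = ConvOp.getOutputShape(inshp=(100,), kshp=(5,), stride=(1,), mode='valid')
print(out_len)  # expected: (96,)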
Example #14
    def __init__(self, inputs=None, params=None, outdir='outputs/conv2d',
                 n_filters=None, filter_size=None, stride=(1, 1), border_mode='valid',
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0,
                 activation='rectifier',
                 convolution='conv2d',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a 2-dimensional convolutional layer.

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. Shape of the incoming data:
            (batch_size, num_channels, input_height, input_width).
            If input_size is None, it can be inferred. However, border_mode can't be 'same'.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        n_filters : int
            The number of filters to use (convolution kernels).
        filter_size : tuple(int) or int
            (filter_height, filter_width). If it is an int, size will be duplicated across height and width.
        stride : tuple(int)
            The distance between the receptive field centers of neighboring units. This is the 'stride' of the
            convolution operation.
        border_mode : str, one of 'valid', 'full'
            A string indicating the convolution border mode.
            If 'valid', the convolution is only computed where the input and the
            filter fully overlap.
            If 'full', the convolution is computed wherever the input and the
            filter overlap by at least one position.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        activation : str or Callable
            The activation function to apply to the layer. See opendeep.utils.activation for options.
        convolution : str or Callable
            The 2-dimensional convolution implementation to use. The default of 'conv2d' is normally fine because it
            uses theano's tensor.nnet.conv.conv2d, which cherry-picks the best implementation with a meta-optimizer if
            you set the theano configuration flag 'optimizer_including=conv_meta'. Otherwise, you could pass a
            callable function, such as cudnn or cuda-convnet if you don't want to use the meta-optimizer.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.

        Notes
        -----
        Theano's default convolution function (`theano.tensor.nnet.conv.conv2d`)
        does not support the 'same' border mode by default. This layer emulates
        it by performing a 'full' convolution and then cropping the result, which
        may negatively affect performance.
        """
        super(Conv2D, self).__init__(**{arg: val for (arg, val) in locals().items() if arg != 'self'})

        ##################
        # specifications #
        ##################
        # expect input to be in the form (B, C, 0, 1) (batch, channel, rows, cols)
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        assert self.input.ndim == 4, "Expected 4D input variable with form (batch, channel, rows, cols)"
        assert len(input_shape) == 4, "Expected 4D input shape with form (batch, channel, rows, cols)"

        n_channels = input_shape[1]

        if isinstance(filter_size, int):
            filter_size = (filter_size, )*2

        # activation function!
        activation_func = get_activation_function(activation)

        # convolution function!
        if convolution == 'conv2d':
            # using the theano flag optimizer_including=conv_meta will let this conv function optimize itself.
            convolution_func = conv2d
        else:
            assert callable(convolution), "Input convolution was not 'conv2d' and was not Callable."
            convolution_func = convolution

        # filter shape should be in the form (num_filters, num_channels, filter_size[0], filter_size[1])

        outshape = ConvOp.getOutputShape(
            inshp=input_shape[-2:],
            kshp=filter_size,
            stride=stride,
            mode=border_mode
        )
        self.output_size = (input_shape[0], n_filters) + outshape

        filter_shape = (n_filters, n_channels) + filter_size

        ##########
        # Params #
        ##########
        W = self.params.get(
            "W",
            get_weights(weights_init=weights_init,
                        shape=filter_shape,
                        name="W",
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
        )

        b = self.params.get(
            "b",
            get_bias(shape=(n_filters, ), name="b", init_values=bias_init)
        )

        # Finally have the two parameters!
        self.params = OrderedDict([("W", W), ("b", b)])

        ########################
        # Computational Graph! #
        ########################
        if border_mode in ['valid', 'full']:
            conved = convolution_func(self.input,
                                      W,
                                      subsample=stride,
                                      image_shape=input_shape,
                                      filter_shape=filter_shape,
                                      border_mode=border_mode)
        else:
            raise RuntimeError("Invalid border mode: '%s'" % border_mode)

        self.output = activation_func(conved + b.dimshuffle('x', 0, 'x', 'x'))
Example #15
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/rnn/',
                 input_size=None,
                 hidden_size=None,
                 output_size=None,
                 layers=1,
                 activation='sigmoid',
                 hidden_activation='relu',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse',
                 cost_args=None,
                 noise='dropout',
                 noise_level=None,
                 noise_decay=False,
                 noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize a simple recurrent network.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters.
        outdir : str
            The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        layers : int
            The number of stacked hidden layers to use.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden layers.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and merging them before running through the final decoder.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.

        Raises
        ------
        AssertionError
            When asserting various properties of input parameters. See error messages.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(RNN, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        self.direction = direction
        self.bidirectional = (direction == "bidirectional")
        self.backward = (direction == "backward")
        self.layers = layers
        self.noise = noise

        self.weights_init = weights_init
        self.weights_mean = weights_mean
        self.weights_std = weights_std
        self.weights_interval = weights_interval

        self.r_weights_init = r_weights_init
        self.r_weights_mean = r_weights_mean
        self.r_weights_std = r_weights_std
        self.r_weights_interval = r_weights_interval

        self.bias_init = bias_init
        self.r_bias_init = r_bias_init

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(
            hidden_activation)

        # output activation function!
        self.activation_func = get_activation_function(activation)

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if self.noise:
            log.debug('Adding %s noise switch.' % str(noise))
            if noise_level is not None:
                noise_level = sharedX(value=noise_level)
                self.noise_func = get_noise(noise,
                                            noise_level=noise_level,
                                            mrg=mrg)
            else:
                self.noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1,
                                        name="basiclayer_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(
                    noise_decay, noise_level, noise_level.get_value(),
                    noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), 1, 2)
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                pass

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                self.input_size = sum(self.input_size)
            else:
                raise NotImplementedError(
                    "Recurrent input with %d dimensions not supported!" %
                    self.input.ndim)
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            xs = T.tensor3("Xs")
            xs = xs.dimshuffle(1, 0, 2)
            self.input = xs

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        ys = T.tensor3("Ys")
        ys = ys.dimshuffle(1, 0, 2)
        self.target = ys

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            self.h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph(
        )
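The dimshuffle calls above swap the leading axes because the optimizer delivers data as (batches, timesteps, data) while the scan loop expects (timesteps, batches, data). A quick standalone check of that axis swap (sketch):

import numpy as np
import theano
import theano.tensor as T

xs = T.tensor3("Xs")
swap = theano.function([xs], xs.dimshuffle(1, 0, 2))
batch = np.zeros((16, 50, 10), dtype=theano.config.floatX)   # (batches, timesteps, data)
print(swap(batch).shape)                                     # (50, 16, 10)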
Example #16
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/convpool',
                 input_size=None,
                 filter_shape=None,
                 convstride=4,
                 padsize=0,
                 group=1,
                 poolsize=3,
                 poolstride=2,
                 weights_init='gaussian',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=.01,
                 bias_init=0,
                 local_response_normalization=False,
                 convolution='conv2d',
                 activation='rectifier',
                 mrg=RNG_MRG.MRG_RandomStreams(1)):
        """
        Initialize a convpool layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables).
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : tuple
            Shape of the incoming data: (batch_size, num_channels, input_height, input_width).
        filter_shape : tuple
            (num_filters, num_channels, filter_height, filter_width). This is also the shape of the weights matrix.
        convstride : int
            The distance between the receptive field centers of neighboring units. This is the 'subsample' of theano's
            convolution operation.
        padsize : int
            This is the border_mode for theano's convolution operation.
        group : int
            Not yet supported, used for multi-gpu implementation.
            .. todo:: support multi-gpu
        poolsize : int
            How much to downsample the output.
        poolstride : int
            The stride width for downsampling the output.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        activation : str or Callable
            The activation function to apply to the layer. See opendeep.utils.activation for options.
        convolution : str or Callable
            The 2-dimensional convolution implementation to use. The default of 'conv2d' is normally fine because it
            uses theano's tensor.nnet.conv.conv2d, which cherry-picks the best implementation with a meta-optimizer if
            you set the theano configuration flag 'optimizer_including=conv_meta'. Otherwise, you could pass a
            callable function, such as cudnn or cuda-convnet if you don't want to use the meta-optimizer.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        super(ConvPoolLayer, self).__init__(
            **{arg: val for (arg, val) in locals().items() if arg != 'self'})

        # deal with the inputs coming from inputs_hook - necessary for now to give an input hook
        # inputs_hook is a tuple of (Shape, Input)
        if self.inputs_hook:
            assert len(
                self.inputs_hook
            ) == 2, "expecting inputs_hook to be tuple of (shape, input)"
            self.input = inputs_hook[1]
        else:
            self.input = T.ftensor4("X")

        self.group = group

        #######################
        # layer configuration #
        #######################
        # activation function!
        self.activation_func = get_activation_function(activation)

        # convolution function!
        if convolution == 'conv2d':
            # using the theano flag optimizer_including=conv_meta will let this conv function optimize itself.
            self.convolution_func = T.nnet.conv2d
        else:
            assert callable(
                convolution
            ), "Input convolution was not 'conv2d' and was not Callable."
            self.convolution_func = convolution

        # expect image_shape to be bc01!
        self.channel = self.input_size[1]

        self.convstride = convstride
        self.padsize = padsize

        self.poolstride = poolstride
        self.poolsize = poolsize

        # if lib_conv is cudnn, it works only on square images and the grad works only when channel % 16 == 0

        assert self.group in [
            1, 2
        ], "group argument needs to be 1 or 2 (1 for default conv2d)"

        filter_shape = numpy.asarray(filter_shape)
        self.input_size = numpy.asarray(self.input_size)

        if local_response_normalization:
            lrn_func = cross_channel_normalization_bc01
        else:
            lrn_func = None

        ################################################
        # Params - make sure to deal with params_hook! #
        ################################################
        if self.group == 1:
            if self.params_hook:
                # make sure the params_hook has W and b
                assert len(self.params_hook) == 2, \
                    "Expected 2 params (W and b) for ConvPoolLayer, found {0!s}!".format(len(self.params_hook))
                self.W, self.b = self.params_hook
            else:
                self.W = get_weights(
                    weights_init=weights_init,
                    shape=filter_shape,
                    name="W",
                    rng=mrg,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval)

                self.b = get_bias(shape=filter_shape[0],
                                  init_values=bias_init,
                                  name="b")

            self.params = [self.W, self.b]

        else:
            filter_shape[0] = filter_shape[0] / 2
            filter_shape[1] = filter_shape[1] / 2

            self.input_size[0] = self.input_size[0] / 2
            self.input_size[1] = self.input_size[1] / 2
            if self.params_hook:
                assert len(self.params_hook
                           ) == 4, "expected params_hook to have 4 params"
                self.W0, self.W1, self.b0, self.b1 = self.params_hook
            else:
                self.W0 = get_weights_gaussian(shape=filter_shape, name="W0")
                self.W1 = get_weights_gaussian(shape=filter_shape, name="W1")
                self.b0 = get_bias(shape=filter_shape[0],
                                   init_values=bias_init,
                                   name="b0")
                self.b1 = get_bias(shape=filter_shape[0],
                                   init_values=bias_init,
                                   name="b1")
            self.params = [self.W0, self.b0, self.W1, self.b1]

        #############################################
        # build appropriate graph for conv. version #
        #############################################
        self.output = self._build_computation_graph()

        # Local Response Normalization (for AlexNet)
        if local_response_normalization and lrn_func is not None:
            self.output = lrn_func(self.output)

        log.debug("convpool layer initialized with shape_in: %s",
                  str(self.input_size))
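        # NOTE (added): a minimal usage sketch for this layer. The keyword names
        # match the constructor parameters referenced above (input_size is bc01:
        # batch, channels, rows, cols); the concrete values are illustrative
        # assumptions, not taken from the original code.
        #
        #     layer = ConvPoolLayer(input_size=(128, 3, 32, 32),
        #                           filter_shape=(64, 3, 5, 5),  # (n_filters, channels, h, w)
        #                           convstride=1, padsize=2,
        #                           poolsize=3, poolstride=2,
        #                           activation='rectifier',
        #                           convolution='conv2d')
        #     conv_out = layer.output  # symbolic Theano expression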
Example #17
def experiment(state, channel):
    if state.test_model and 'config' in os.listdir('.'):
        print('Loading local config file')
        config_file = open('config', 'r')
        config = config_file.readlines()
        try:
            config_vals = config[0].split('(')[1:][0].split(')')[:-1][0].split(', ')
        except Exception:
            config_vals = config[0][3:-1].replace(': ', '=').replace("'", "").split(', ')
            config_vals = filter(
                lambda x: not 'jobman' in x and not '/' in x and not ':' in x
                and not 'experiment' in x, config_vals)

        for CV in config_vals:
            print(CV)
            if CV.startswith('test'):
                print('Do not override testing switch')
                continue
            try:
                exec('state.' + CV, globals(), locals())
            except Exception:
                exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] +
                     "'", globals(), locals())

    else:
        # Save the current configuration
        # Useful for logs/experiments
        print('Saving config')
        f = open('config', 'w')
        f.write(str(state))
        f.close()

    print(state)
    # Load the data, train = train+valid, and shuffle train
    # Targets are not used (they would be misaligned after shuffling train)
    if state.dataset == 'MNIST':
        (train_X, train_Y), (valid_X,
                             valid_Y), (test_X,
                                        test_Y) = load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))

    elif state.dataset == 'MNIST_binary':
        (train_X,
         train_Y), (valid_X,
                    valid_Y), (test_X,
                               test_Y) = load_mnist_binary(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))

    elif state.dataset == 'TFD':
        (train_X, train_Y), (valid_X,
                             valid_Y), (test_X,
                                        test_Y) = load_tfd(state.data_path)

    N_input = train_X.shape[1]
    root_N_input = int(numpy.sqrt(N_input))
    numpy.random.seed(1)
    numpy.random.shuffle(train_X)
    train_X = theano.shared(train_X)
    valid_X = theano.shared(valid_X)
    test_X = theano.shared(test_X)

    # Theano variables and RNG
    X = T.fmatrix()  # Input of the graph
    index = T.lscalar()  # index to minibatch
    MRG = RNG_MRG.MRG_RandomStreams(1)

    # Network and training specifications
    K = state.K  # number of hidden layers
    N = state.N  # number of walkbacks
    layer_sizes = [N_input] + [state.hidden_size] * K  # layer sizes, from h0 to hK (h0 is the visible layer)
    learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    annealing = cast32(state.annealing)  # exponential annealing coefficient
    momentum = theano.shared(cast32(state.momentum))  # momentum term

    # PARAMETERS : weights list and bias list.
    # initialize a list of weights and biases based on layer_sizes
    weights_list = [
        get_shared_weights(
            layer_sizes[i], layer_sizes[i + 1],
            numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i + 1])), 'W')
        for i in range(K)
    ]
    bias_list = [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)]

    if state.test_model:
        # Load the parameters of the last epoch
        # maybe if the path is given, load these specific attributes
        param_files = list(
            filter(lambda x: 'params' in x, os.listdir('.'))
        )  # https://stackoverflow.com/questions/15876259/typeerror-filter-object-is-not-subscriptable
        max_epoch_idx = numpy.argmax(
            [int(x.split('_')[-1].split('.')[0]) for x in param_files])
        params_to_load = param_files[max_epoch_idx]
        with open(params_to_load, 'rb') as f:
            PARAMS = pk.load(f, encoding='bytes')
        [
            p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(PARAMS[:len(weights_list)], weights_list)
        ]
        [
            p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(PARAMS[len(weights_list):], bias_list)
        ]

    # Util functions
    def dropout(IN, p=0.5):
        noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32')
        OUT = (IN * noise) / cast32(p)
        return OUT
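    # NOTE (added): dividing by p above is "inverted dropout": scaling the kept
    # units at train time so the expected activation matches the noiseless value,
    # E[x * Bernoulli(p) / p] = x. A quick numpy check of that identity
    # (illustrative only, independent of the Theano graph):
    #
    #     rng_check = numpy.random.RandomState(0)
    #     x = numpy.ones((10000,), dtype='float32')
    #     mask = rng_check.binomial(n=1, p=0.5, size=x.shape)
    #     print((x * mask / 0.5).mean())  # ~1.0, same expectation as x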

    def add_gaussian_noise(IN, std=1):
        print('GAUSSIAN NOISE : ', std)
        noise = MRG.normal(avg=0, std=std, size=IN.shape, dtype='float32')
        OUT = IN + noise
        return OUT

    def corrupt_input(IN, p=0.5):
        # salt and pepper? masking?
        noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32')
        IN = IN * noise
        return IN

    def salt_and_pepper(IN, p=0.2):
        # salt and pepper noise
        print('DAE uses salt and pepper noise')
        a = MRG.binomial(size=IN.shape, n=1, p=1 - p, dtype='float32')
        b = MRG.binomial(size=IN.shape, n=1, p=0.5, dtype='float32')
        c = T.eq(a, 0) * b
        return IN * a + c
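    # NOTE (added): in salt_and_pepper above, `a` keeps each pixel with
    # probability 1-p, and wherever a pixel is dropped (a == 0) it is replaced
    # by the fair coin `b`, so corrupted pixels become 0 or 1 with equal
    # probability. A numpy sketch of the same composition (illustrative only):
    #
    #     rng_check = numpy.random.RandomState(0)
    #     x = rng_check.uniform(size=(5,)).astype('float32')
    #     a = rng_check.binomial(n=1, p=1 - 0.2, size=x.shape)  # keep mask
    #     b = rng_check.binomial(n=1, p=0.5, size=x.shape)      # salt vs pepper
    #     corrupted = x * a + (a == 0) * b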

    # Odd layer update function
    # just a loop over the odd layers
    def update_odd_layers(hiddens, noisy):
        for i in range(1, K + 1, 2):
            print(i)
            if noisy:
                simple_update_layer(hiddens, None, i)
            else:
                simple_update_layer(hiddens, None, i, add_noise=False)

    # Even layer update
    # p_X_chain is given to append the p(X|...) at each update (one update = odd update + even update)
    def update_even_layers(hiddens, p_X_chain, noisy):
        for i in range(0, K + 1, 2):
            print(i)
            if noisy:
                simple_update_layer(hiddens, p_X_chain, i)
            else:
                simple_update_layer(hiddens, p_X_chain, i, add_noise=False)

    # The layer update function
    # hiddens   :   list containing the symbolic theano variables [visible, hidden1, hidden2, ...]
    #               layer_update will modify this list inplace
    # p_X_chain :   list containing the successive p(X|...) at each update
    #               update_layer will append to this list
    # add_noise     : pre and post activation gaussian noise

    def simple_update_layer(hiddens, p_X_chain, i, add_noise=True):
        # Compute the dot product, whatever layer
        post_act_noise = 0

        if i == 0:
            hiddens[i] = T.dot(hiddens[i + 1],
                               weights_list[i].T) + bias_list[i]

        elif i == K:
            hiddens[i] = T.dot(hiddens[i - 1],
                               weights_list[i - 1]) + bias_list[i]

        else:
            # next layer        :   layers[i+1], assigned weights : W_i
            # previous layer    :   layers[i-1], assigned weights : W_(i-1)
            hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + T.dot(
                hiddens[i - 1], weights_list[i - 1]) + bias_list[i]

        # Add pre-activation noise if NOT input layer
        if i == 1 and state.noiseless_h1:
            print('>>NO noise in first layer')
            add_noise = False

        # pre activation noise
        if i != 0 and add_noise:
            print('Adding pre-activation gaussian noise')
            hiddens[i] = add_gaussian_noise(hiddens[i],
                                            state.hidden_add_noise_sigma)

        # ACTIVATION!
        if i == 0:
            print('Sigmoid units')
            hiddens[i] = T.nnet.sigmoid(hiddens[i])
        else:
            print('Hidden units')
            hiddens[i] = hidden_activation(hiddens[i])

        # post activation noise
        if i != 0 and add_noise:
            print('Adding post-activation gaussian noise')
            hiddens[i] = add_gaussian_noise(hiddens[i],
                                            state.hidden_add_noise_sigma)

        # build the reconstruction chain
        if i == 0:
            # if input layer -> append p(X|...)
            p_X_chain.append(hiddens[i])

            # sample from p(X|...)
            if state.input_sampling:
                print('Sampling from input')
                sampled = MRG.binomial(p=hiddens[i],
                                       size=hiddens[i].shape,
                                       dtype='float32')
            else:
                print('>>NO input sampling')
                sampled = hiddens[i]
            # add noise
            sampled = salt_and_pepper(sampled, state.input_salt_and_pepper)

            # set input layer
            hiddens[i] = sampled

    def update_layers(hiddens, p_X_chain, noisy=True):
        print('odd layer update')
        update_odd_layers(hiddens, noisy)
        print()
        print('even layer update')
        update_even_layers(hiddens, p_X_chain, noisy)
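    # NOTE (added): one call to update_layers therefore touches layers in
    # odd-then-even order; e.g. for K = 3 (visible h0 plus hiddens h1, h2, h3):
    #
    #     update_odd_layers  -> h1, h3
    #     update_even_layers -> h0, h2   (h0 appends p(X|...) to p_X_chain)
    #
    # so every layer reads only from already-updated neighbours, which is the
    # alternating, Gibbs-like update scheme used for the walkbacks.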

    ''' F PROP '''
    if state.act == 'sigmoid':
        print('Using sigmoid activation')
        hidden_activation = T.nnet.sigmoid
    elif state.act == 'rectifier':
        print('Using rectifier activation')
        hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif state.act == 'tanh':
        hidden_activation = lambda x: T.tanh(x)
    ''' Corrupt X '''
    X_corrupt = salt_and_pepper(X, state.input_salt_and_pepper)
    ''' hidden layer init '''
    hiddens = [X_corrupt]
    p_X_chain = []
    print("Hidden units initialization")
    for w, b in zip(weights_list, bias_list[1:]):
        # init with zeros
        print("Init hidden units at zero before creating the graph")
        hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))

    # The layer update scheme
    print("Building the graph :", N, "updates")
    for i in range(N):
        update_layers(hiddens, p_X_chain)

    # COST AND GRADIENTS
    print('Cost w.r.t p(X|...) at every step in the graph')
    #COST        =   T.mean(T.nnet.binary_crossentropy(reconstruction, X))
    COST = [T.mean(T.nnet.binary_crossentropy(rX, X)) for rX in p_X_chain]
    #COST = [T.mean(T.sqr(rX-X)) for rX in p_X_chain]
    show_COST = COST[-1]
    COST = numpy.sum(COST)
    #COST = T.mean(COST)

    params = weights_list + bias_list
    print('======== COST:', COST)
    print('======== params:', params)
    gradient = T.grad(COST, params)

    gradient_buffer = [
        theano.shared(numpy.zeros(x.get_value().shape, dtype='float32'))
        for x in params
    ]

    m_gradient = [
        momentum * gb + (cast32(1) - momentum) * g
        for (gb, g) in zip(gradient_buffer, gradient)
    ]
    g_updates = [(p, p - learning_rate * mg)
                 for (p, mg) in zip(params, m_gradient)]
    b_updates = zip(gradient_buffer, m_gradient)

    updates = OrderedDict(g_updates + list(b_updates))
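    # NOTE (added): the three lists above implement momentum SGD; for each
    # parameter p with gradient g and buffer m, the compiled updates apply
    #
    #     m <- momentum * m + (1 - momentum) * g
    #     p <- p - learning_rate * m
    #
    # A tiny numeric sketch of the same recurrence (illustrative values):
    #
    #     m, lr, mom = 0.0, 0.1, 0.9
    #     for g in [1.0, 1.0, 1.0]:
    #         m = mom * m + (1 - mom) * g
    #         print(m, -lr * m)  # buffer ramps up toward g: 0.1, 0.19, 0.271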

    f_cost = theano.function(inputs=[X], outputs=show_COST)

    indexed_batch = train_X[index * state.batch_size:(index + 1) *
                            state.batch_size]
    sampled_batch = MRG.binomial(p=indexed_batch,
                                 size=indexed_batch.shape,
                                 dtype='float32')

    f_learn = theano.function(inputs=[index],
                              updates=updates,
                              givens={X: indexed_batch},
                              outputs=show_COST)

    f_test = theano.function(inputs=[X],
                             outputs=[X_corrupt] + hiddens + p_X_chain,
                             on_unused_input='warn')

    #############
    # Denoise some numbers  :   show number, noisy number, reconstructed number
    #############
    import random as R
    R.seed(1)
    random_idx = numpy.array(R.sample(range(len(test_X.get_value())), 100))
    numbers = test_X.get_value()[random_idx]

    f_noise = theano.function(inputs=[X],
                              outputs=salt_and_pepper(
                                  X, state.input_salt_and_pepper))
    noisy_numbers = f_noise(test_X.get_value()[random_idx])

    # Recompile the graph without noise for reconstruction function
    hiddens_R = [X]
    p_X_chain_R = []

    for w, b in zip(weights_list, bias_list[1:]):
        # init with zeros
        hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w)))

    # The layer update scheme
    for i in range(N):
        update_layers(hiddens_R, p_X_chain_R, noisy=False)

    f_recon = theano.function(inputs=[X], outputs=p_X_chain_R[-1])

    ############
    # Sampling #
    ############

    # the input to the sampling function
    network_state_input = [X] + [T.fmatrix() for i in range(K)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    #network_state_output    =   network_state_input

    network_state_output = [X] + network_state_input[1:]

    visible_pX_chain = []

    # ONE update
    update_layers(network_state_output, visible_pX_chain, noisy=True)

    if K == 1:
        f_sample_simple = theano.function(inputs=[X],
                                          outputs=visible_pX_chain[-1])

    # WHY IS THERE A WARNING????
    # because the first odd layers are not used -> directly computed FROM THE EVEN layers
    # unused input = warn
    f_sample2 = theano.function(inputs=network_state_input,
                                outputs=network_state_output +
                                visible_pX_chain,
                                on_unused_input='warn')

    def sample_some_numbers_single_layer():
        x0 = test_X.get_value()[:1]
        samples = [x0]
        x = f_noise(x0)
        for i in range(399):
            x = f_sample_simple(x)
            samples.append(x)
            x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32')
            x = f_noise(x)
        return numpy.vstack(samples)

    def sampling_wrapper(NSI):
        out = f_sample2(*NSI)
        NSO = out[:len(network_state_output)]
        vis_pX_chain = out[len(network_state_output):]
        return NSO, vis_pX_chain

    def sample_some_numbers(N=400):
        # The network's initial state
        init_vis = test_X.get_value()[:1]

        noisy_init_vis = f_noise(init_vis)

        network_state = [[noisy_init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [noisy_init_vis]

        for i in range(N - 1):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def plot_samples(epoch_number):
        to_sample = time.time()
        if K == 1:
            # one layer model
            V = sample_some_numbers_single_layer()
        else:
            V, H0 = sample_some_numbers()
        img_samples = PIL.Image.fromarray(
            tile_raster_images(V, (root_N_input, root_N_input), (20, 20)))

        fname = 'samples_epoch_' + str(epoch_number) + '.png'
        img_samples.save(fname)
        print('Took ' + str(time.time() - to_sample) +
              ' to sample 400 numbers')

    ##############
    # Inpainting #
    ##############
    def inpainting(digit):
        # The network's initial state

        # NOISE INIT
        init_vis = cast32(numpy.random.uniform(size=digit.shape))

        #noisy_init_vis  =   f_noise(init_vis)
        #noisy_init_vis  =   cast32(numpy.random.uniform(size=init_vis.shape))

        # INDEXES FOR VISIBLE AND NOISY PART
        noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2))
        fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2))

        # function to re-init the visible to the same noise

        # FUNCTION TO RESET HALF VISIBLE TO DIGIT
        def reset_vis(V):
            V[0][fixed_idx] = digit[0][fixed_idx]
            return V

        # INIT DIGIT : NOISE and RESET HALF TO DIGIT
        init_vis = reset_vis(init_vis)

        network_state = [[init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [init_vis]

        for i in range(49):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # reset half the digit
            net_state_out[0] = reset_vis(net_state_out[0])
            vis_pX_chain[0] = reset_vis(vis_pX_chain[0])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def save_params(n, params):
        print('saving parameters...')
        save_path = 'params_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            pk.dump(params, f, protocol=pk.HIGHEST_PROTOCOL)
        finally:
            f.close()

    # TRAINING
    n_epoch = state.n_epoch
    batch_size = state.batch_size
    STOP = False
    counter = 0

    train_costs = []
    valid_costs = []
    test_costs = []

    if state.vis_init:
        bias_list[0].set_value(
            logit(numpy.clip(train_X.get_value().mean(axis=0), 0.001, 0.9)))
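    # NOTE (added): vis_init sets the visible bias to logit(mean activation),
    # logit(q) = log(q / (1 - q)), so that sigmoid(bias) reproduces the
    # per-pixel mean of the training data; clipping the mean into [0.001, 0.9]
    # keeps the logit finite for pixels that are always off and bounded for
    # pixels that are almost always on.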

    if state.test_model:
        # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting
        print('Testing : skip training')
        STOP = True

    while not STOP:
        counter += 1
        t = time.time()
        print(counter, '\t', end='')

        #train
        train_cost = []
        for i in range(len(train_X.get_value(borrow=True)) // batch_size):
            #train_cost.append(f_learn(train_X[i * batch_size : (i+1) * batch_size]))
            #training_idx = numpy.array(range(i*batch_size, (i+1)*batch_size), dtype='int32')
            train_cost.append(f_learn(i))
        train_cost = numpy.mean(train_cost)
        train_costs.append(train_cost)
        print('Train : ', trunc(train_cost), '\t', end='')

        #valid
        valid_cost = []
        for i in range(len(valid_X.get_value(borrow=True)) // 100):
            valid_cost.append(
                f_cost(valid_X.get_value()[i * 100:(i + 1) * 100]))
        valid_cost = numpy.mean(valid_cost)
        #valid_cost  =   123
        valid_costs.append(valid_cost)
        print('Valid : ', trunc(valid_cost), '\t', end='')

        #test
        test_cost = []
        for i in range(len(test_X.get_value(borrow=True)) // 100):
            test_cost.append(
                f_cost(test_X.get_value()[i * 100:(i + 1) * 100]))
        test_cost = numpy.mean(test_cost)
        test_costs.append(test_cost)
        print('Test  : ', trunc(test_cost), '\t', end='')

        if counter >= n_epoch:
            STOP = True

        print('time : ', trunc(time.time() - t), end=' ')

        print('MeanVisB : ', trunc(bias_list[0].get_value().mean()), end=' ')

        print('W : ', [
            trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list
        ])

        if (counter % 5) == 0:
            # Checking reconstruction
            reconstructed = f_recon(noisy_numbers)
            # Concatenate stuff
            stacked = numpy.vstack([
                numpy.vstack([
                    numbers[i * 10:(i + 1) * 10],
                    noisy_numbers[i * 10:(i + 1) * 10],
                    reconstructed[i * 10:(i + 1) * 10]
                ]) for i in range(10)
            ])

            number_reconstruction = PIL.Image.fromarray(
                tile_raster_images(stacked, (root_N_input, root_N_input),
                                   (10, 30)))
            #epoch_number    =   reduce(lambda x,y : x + y, ['_'] * (4-len(str(counter)))) + str(counter)
            number_reconstruction.save('number_reconstruction' + str(counter) +
                                       '.png')

            #sample_numbers(counter, 'seven')
            plot_samples(counter)

            #save params
            save_params(counter, params)

        # ANNEAL!
        new_lr = learning_rate.get_value() * annealing
        learning_rate.set_value(new_lr)
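        # NOTE (added): with this multiplicative update the learning rate decays
        # exponentially, lr_t = lr_0 * annealing**t after t epochs. Illustrative
        # values (assuming lr_0 = 0.25 and annealing = 0.995, not the actual
        # config):
        #
        #     lr = 0.25
        #     for epoch in range(3):
        #         lr *= 0.995
        #         print(epoch, lr)  # ~0.2488, ~0.2475, ~0.2463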

    # Save
    state.train_costs = train_costs
    state.valid_costs = valid_costs
    state.test_costs = test_costs

    # if test

    # 10k samples
    print('Generating 10,000 samples')
    samples, _ = sample_some_numbers(N=10000)
    f_samples = 'samples.npy'
    numpy.save(f_samples, samples)
    print('saved digits')

    # parzen
    print('Evaluating parzen window')
    import likelihood_estimation_parzen
    likelihood_estimation_parzen.main(0.20, 'mnist')

    # Inpainting
    print('Inpainting')
    test_X = test_X.get_value()

    numpy.random.seed(2)
    test_idx = numpy.arange(len(test_Y))

    for Iter in range(10):

        numpy.random.shuffle(test_idx)
        test_X = test_X[test_idx]
        test_Y = test_Y[test_idx]

        digit_idx = [(test_Y == i).argmax() for i in range(10)]
        inpaint_list = []

        for idx in digit_idx:
            DIGIT = test_X[idx:idx + 1]
            V_inpaint, H_inpaint = inpainting(DIGIT)
            inpaint_list.append(V_inpaint)

        INPAINTING = numpy.vstack(inpaint_list)

        plot_inpainting = PIL.Image.fromarray(
            tile_raster_images(INPAINTING, (root_N_input, root_N_input),
                               (10, 50)))

        fname = 'inpainting_' + str(Iter) + '.png'
        #fname   =   os.path.join(state.model_path, fname)

        plot_inpainting.save(fname)

        if False and __name__ == "__main__":
            os.system('eog inpainting.png')

    if __name__ == '__main__':
        import ipdb
        ipdb.set_trace()

    return
def experiment(state, outdir_base='./'):
    rng.seed(1)  #seed the numpy random generator
    # Initialize output directory and files
    data.mkdir_p(outdir_base)
    outdir = outdir_base + "/" + state.dataset + "/"
    data.mkdir_p(outdir)
    logfile = outdir + "log.txt"
    with open(logfile, 'w') as f:
        f.write("MODEL 2, {0!s}\n\n".format(state.dataset))
    train_convergence_pre = outdir + "train_convergence_pre.csv"
    train_convergence_post = outdir + "train_convergence_post.csv"
    valid_convergence_pre = outdir + "valid_convergence_pre.csv"
    valid_convergence_post = outdir + "valid_convergence_post.csv"
    test_convergence_pre = outdir + "test_convergence_pre.csv"
    test_convergence_post = outdir + "test_convergence_post.csv"

    print
    print "----------MODEL 2, {0!s}--------------".format(state.dataset)
    print

    #load parameters from config file if this is a test
    config_filename = outdir + 'config'
    if state.test_model and 'config' in os.listdir(outdir):
        config_vals = load_from_config(config_filename)
        for CV in config_vals:
            print CV
            if CV.startswith('test'):
                print 'Do not override testing switch'
                continue
            try:
                exec('state.' + CV) in globals(), locals()
            except:
                exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] +
                     "'") in globals(), locals()
    else:
        # Save the current configuration
        # Useful for logs/experiments
        print 'Saving config'
        with open(config_filename, 'w') as f:
            f.write(str(state))

    print state
    # Load the data, train = train+valid, and sequence
    artificial = False
    if state.dataset == 'MNIST_1' or state.dataset == 'MNIST_2' or state.dataset == 'MNIST_3':
        (train_X,
         train_Y), (valid_X,
                    valid_Y), (test_X,
                               test_Y) = data.load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
        train_Y = numpy.concatenate((train_Y, valid_Y))
        artificial = True
        try:
            dataset = int(state.dataset.split('_')[1])
        except:
            raise AssertionError(
                "artificial dataset number not recognized. Input was " +
                state.dataset)
    else:
        raise AssertionError("dataset not recognized.")

    train_X = theano.shared(train_X)
    train_Y = theano.shared(train_Y)
    valid_X = theano.shared(valid_X)
    valid_Y = theano.shared(valid_Y)
    test_X = theano.shared(test_X)
    test_Y = theano.shared(test_Y)

    if artificial:
        print 'Sequencing MNIST data...'
        print 'train set size:', len(train_Y.eval())
        print 'valid set size:', len(valid_Y.eval())
        print 'test set size:', len(test_Y.eval())
        data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X,
                                 test_Y, dataset, rng)
        print 'train set size:', len(train_Y.eval())
        print 'valid set size:', len(valid_Y.eval())
        print 'test set size:', len(test_Y.eval())
        print 'Sequencing done.'
        print

    N_input = train_X.eval().shape[1]
    root_N_input = int(numpy.sqrt(N_input))

    # Network and training specifications
    layers = state.layers  # number hidden layers
    walkbacks = state.walkbacks  # number of walkbacks
    layer_sizes = [N_input] + [state.hidden_size] * layers  # layer sizes, from h0 to hK (h0 is the visible layer)
    learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    annealing = cast32(state.annealing)  # exponential annealing coefficient
    momentum = theano.shared(cast32(state.momentum))  # momentum term

    # PARAMETERS : weights list and bias list.
    # initialize a list of weights and biases based on layer_sizes
    weights_list = [
        get_shared_weights(layer_sizes[i],
                           layer_sizes[i + 1],
                           name="W_{0!s}_{1!s}".format(i, i + 1))
        for i in range(layers)
    ]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
    recurrent_weights_list = [
        get_shared_weights(layer_sizes[i + 1],
                           layer_sizes[i],
                           name="V_{0!s}_{1!s}".format(i + 1, i))
        for i in range(layers)
    ]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
    bias_list = [
        get_shared_bias(layer_sizes[i], name='b_' + str(i))
        for i in range(layers + 1)
    ]  # initialize each layer to 0's.

    # Theano variables and RNG
    MRG = RNG_MRG.MRG_RandomStreams(1)
    X = T.fmatrix('X')
    Xs = [
        T.fmatrix(name="X_initial") if i == 0 else T.fmatrix(name="X_" +
                                                             str(i + 1))
        for i in range(walkbacks + 1)
    ]
    hiddens_input = [X] + [
        T.fmatrix(name="h_" + str(i + 1)) for i in range(layers)
    ]
    hiddens_output = hiddens_input[:1] + hiddens_input[1:]

    # Check variables for bad inputs and stuff
    if state.batch_size > len(Xs):
        warnings.warn(
            "Batch size should not be bigger than walkbacks+1 (len(Xs)) unless you know what you're doing. You need to know the sequence length beforehand."
        )
    if state.batch_size <= 0:
        raise AssertionError("batch size cannot be <= 0")
    ''' F PROP '''
    if state.hidden_act == 'sigmoid':
        print 'Using sigmoid activation for hiddens'
        hidden_activation = T.nnet.sigmoid
    elif state.hidden_act == 'rectifier':
        print 'Using rectifier activation for hiddens'
        hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif state.hidden_act == 'tanh':
        print 'Using hyperbolic tangent activation for hiddens'
        hidden_activation = lambda x: T.tanh(x)
    else:
        raise AssertionError(
            "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid"
            .format(state.hidden_act))

    if state.visible_act == 'sigmoid':
        print 'Using sigmoid activation for visible layer'
        visible_activation = T.nnet.sigmoid
    elif state.visible_act == 'softmax':
        print 'Using softmax activation for visible layer'
        visible_activation = T.nnet.softmax
    else:
        raise AssertionError(
            "Did not recognize visible activation {0!s}, please use sigmoid or softmax"
            .format(state.visible_act))

    def update_layers(hiddens,
                      p_X_chain,
                      Xs,
                      sequence_idx,
                      noisy=True,
                      sampling=True):
        print 'odd layer updates'
        update_odd_layers(hiddens, noisy)
        print 'even layer updates'
        update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy,
                           sampling)
        print 'done full update.'
        print
        # choose the correct output for hidden_outputs based on batch_size and walkbacks (this is due to an issue with batches, see note in run_story2.py)
        if state.batch_size <= len(Xs) and sequence_idx == state.batch_size - 1:
            return hiddens
        else:
            return None

    # Odd layer update function
    # just a loop over the odd layers
    def update_odd_layers(hiddens, noisy):
        for i in range(1, len(hiddens), 2):
            print 'updating layer', i
            simple_update_layer(hiddens, None, None, None, i, add_noise=noisy)

    # Even layer update
    # p_X_chain is given to append the p(X|...) at each full update (one update = odd update + even update)
    def update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy,
                           sampling):
        for i in range(0, len(hiddens), 2):
            print 'updating layer', i
            simple_update_layer(hiddens,
                                p_X_chain,
                                Xs,
                                sequence_idx,
                                i,
                                add_noise=noisy,
                                input_sampling=sampling)

    # The layer update function
    # hiddens   :   list containing the symbolic theano variables [visible, hidden1, hidden2, ...]
    #               layer_update will modify this list inplace
    # p_X_chain :   list containing the successive p(X|...) at each update
    #               update_layer will append to this list
    # add_noise     : pre and post activation gaussian noise

    def simple_update_layer(hiddens,
                            p_X_chain,
                            Xs,
                            sequence_idx,
                            i,
                            add_noise=True,
                            input_sampling=True):
        # Compute the dot product, whatever layer
        # If the visible layer X
        if i == 0:
            print 'using', recurrent_weights_list[i]
            hiddens[i] = (T.dot(hiddens[i + 1], recurrent_weights_list[i]) +
                          bias_list[i])
        # If the top layer
        elif i == len(hiddens) - 1:
            print 'using', weights_list[i - 1]
            hiddens[i] = T.dot(hiddens[i - 1],
                               weights_list[i - 1]) + bias_list[i]
        # Otherwise in-between layers
        else:
            # next layer        :   hiddens[i+1], assigned weights : W_i
            # previous layer    :   hiddens[i-1], assigned weights : W_(i-1)
            print "using {0!s} and {1!s}".format(weights_list[i - 1],
                                                 recurrent_weights_list[i])
            hiddens[i] = T.dot(
                hiddens[i + 1], recurrent_weights_list[i]) + T.dot(
                    hiddens[i - 1], weights_list[i - 1]) + bias_list[i]

        # Add pre-activation noise if NOT input layer
        if i == 1 and state.noiseless_h1:
            print '>>NO noise in first hidden layer'
            add_noise = False

        # pre activation noise
        if i != 0 and add_noise:
            print 'Adding pre-activation gaussian noise for layer', i
            hiddens[i] = add_gaussian_noise(hiddens[i],
                                            state.hidden_add_noise_sigma)

        # ACTIVATION!
        if i == 0:
            print 'Sigmoid units activation for visible layer X'
            hiddens[i] = visible_activation(hiddens[i])
        else:
            print 'Hidden units {} activation for layer'.format(state.hidden_act), i
            hiddens[i] = hidden_activation(hiddens[i])

        # post activation noise
        # why is there post activation noise? Because there is already pre-activation noise, this just doubles the amount of noise between each activation of the hiddens.
        # if i != 0 and add_noise:
        #     print 'Adding post-activation gaussian noise for layer', i
        #     hiddens[i] = add_gaussian(hiddens[i], state.hidden_add_noise_sigma)

        # build the reconstruction chain if updating the visible layer X
        if i == 0:
            # if input layer -> append p(X|...)
            p_X_chain.append(
                hiddens[i])  #what the predicted next input should be

            if sequence_idx + 1 < len(Xs):
                next_input = Xs[sequence_idx + 1]
                # sample from p(X|...) - SAMPLING NEEDS TO BE CORRECT FOR INPUT TYPES I.E. FOR BINARY MNIST SAMPLING IS BINOMIAL. real-valued inputs should be gaussian
                if input_sampling:
                    print 'Sampling from input'
                    sampled = MRG.binomial(p=next_input,
                                           size=next_input.shape,
                                           dtype='float32')
                else:
                    print '>>NO input sampling'
                    sampled = next_input
                # add noise
                sampled = salt_and_pepper(sampled, state.input_salt_and_pepper)

                # DOES INPUT SAMPLING MAKE SENSE FOR SEQUENTIAL? - not really since it was used in walkbacks which was gibbs.
                # set input layer
                hiddens[i] = sampled

    def build_graph(hiddens, Xs, noisy=True, sampling=True):
        predicted_X_chain = [
        ]  # the visible layer that gets generated at each update_layers run
        H_chain = [
        ]  # either None or hiddens that gets generated at each update_layers run, this is used to determine what the correct hiddens_output should be
        print "Building the graph :", walkbacks, "updates"
        for i in range(walkbacks):
            print "Forward Prediction {!s}/{!s}".format(i + 1, walkbacks)
            H_chain.append(
                update_layers(hiddens, predicted_X_chain, Xs, i, noisy,
                              sampling))
        return predicted_X_chain, H_chain

    '''Build the main training graph'''
    # corrupt x
    hiddens_output[0] = salt_and_pepper(hiddens_output[0],
                                        state.input_salt_and_pepper)
    # build the computation graph and the generated visible layers and appropriate hidden_output
    predicted_X_chain, H_chain = build_graph(hiddens_output,
                                             Xs,
                                             noisy=True,
                                             sampling=state.input_sampling)
    #     predicted_X_chain, H_chain = build_graph(hiddens_output, Xs, noisy=False, sampling=state.input_sampling) #testing one-hot without noise

    # choose the correct output for hiddens_output (this is due to the issue with batches - see note in run_story2.py)
    # this finds the not-None element of H_chain and uses that for hiddens_output
    h_empty = [True if h is None else False for h in H_chain]
    if False in h_empty:  # if there was a not-None element
        hiddens_output = H_chain[h_empty.index(
            False
        )]  # set hiddens_output to the appropriate element from H_chain

    ######################
    # COST AND GRADIENTS #
    ######################
    print
    if state.cost_funct == 'binary_crossentropy':
        print 'Using binary cross-entropy cost!'
        cost_function = lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y))
    elif state.cost_funct == 'square':
        print "Using square error cost!"
        cost_function = lambda x, y: T.mean(T.sqr(x - y))
    else:
        raise AssertionError(
            "Did not recognize cost function {0!s}, please use binary_crossentropy or square"
            .format(state.cost_funct))
    print 'Cost w.r.t p(X|...) at every step in the graph'

    costs = [
        cost_function(predicted_X_chain[i], Xs[i + 1])
        for i in range(len(predicted_X_chain))
    ]
    # outputs for the functions
    show_COSTs = [costs[0]] + [costs[-1]]

    # cost for the gradient
    # care more about the immediate next predictions rather than the future - use exponential decay
    #     COST = T.sum(costs)
    COST = T.sum([
        T.exp(-i / T.ceil(walkbacks / 3)) * costs[i] for i in range(len(costs))
    ])
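    # NOTE (added): the exponential factor down-weights later predictions so the
    # model cares most about the immediate next input. E.g. assuming
    # walkbacks = 6 (so ceil(walkbacks / 3) = 2 with integer division), the
    # per-step weights exp(-i / 2) are roughly
    #
    #     i      : 0     1     2     3     4     5
    #     weight : 1.00  0.61  0.37  0.22  0.14  0.08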

    params = weights_list + recurrent_weights_list + bias_list
    print "params:", params

    print "creating functions..."
    gradient = T.grad(COST, params)

    gradient_buffer = [
        theano.shared(numpy.zeros(param.get_value().shape, dtype='float32'))
        for param in params
    ]

    m_gradient = [
        momentum * gb + (cast32(1) - momentum) * g
        for (gb, g) in zip(gradient_buffer, gradient)
    ]
    param_updates = [(param, param - learning_rate * mg)
                     for (param, mg) in zip(params, m_gradient)]
    gradient_buffer_updates = zip(gradient_buffer, m_gradient)

    updates = OrderedDict(param_updates + gradient_buffer_updates)

    #odd layer h's not used from input -> calculated directly from even layers (starting with h_0) since the odd layers are updated first.
    f_cost = theano.function(inputs=hiddens_input + Xs,
                             outputs=hiddens_output + show_COSTs,
                             on_unused_input='warn')

    f_learn = theano.function(inputs=hiddens_input + Xs,
                              updates=updates,
                              outputs=hiddens_output + show_COSTs,
                              on_unused_input='warn')

    print "functions done."
    print

    #############
    # Denoise some numbers  :   show number, noisy number, reconstructed number
    #############
    import random as R
    R.seed(1)
    # a function to add salt and pepper noise
    f_noise = theano.function(inputs=[X],
                              outputs=salt_and_pepper(
                                  X, state.input_salt_and_pepper))

    # Recompile the graph without noise for reconstruction function - the input x_recon is already going to be noisy, and this is to test on a simulated 'real' input.
    X_recon = T.fvector("X_recon")
    Xs_recon = [T.fvector("Xs_recon")]
    hiddens_R_input = [X_recon] + [
        T.fvector(name="h_recon_" + str(i + 1)) for i in range(layers)
    ]
    hiddens_R_output = hiddens_R_input[:1] + hiddens_R_input[1:]

    # The layer update scheme
    print "Creating graph for noisy reconstruction function at checkpoints during training."
    p_X_chain_R, H_chain_R = build_graph(hiddens_R_output,
                                         Xs_recon,
                                         noisy=False)

    # choose the correct output from H_chain for hidden_outputs based on batch_size and walkbacks
    # choose the correct output for hiddens_output
    h_empty = [True if h is None else False for h in H_chain_R]
    if False in h_empty:  # if there was a set of hiddens output from the batch_size-1 element of the chain
        hiddens_R_output = H_chain_R[h_empty.index(
            False
        )]  # extract out the not-None element from the list if it exists
    # if state.batch_size <= len(Xs_recon):
    #     for i in range(len(hiddens_R_output)):
    #         hiddens_R_output[i] = H_chain_R[state.batch_size - 1][i]

    f_recon = theano.function(inputs=hiddens_R_input + Xs_recon,
                              outputs=hiddens_R_output +
                              [p_X_chain_R[0], p_X_chain_R[-1]],
                              on_unused_input="warn")

    ############
    # Sampling #
    ############

    # the input to the sampling function
    X_sample = T.fmatrix("X_sampling")
    network_state_input = [X_sample] + [
        T.fmatrix("H_sampling_" + str(i + 1)) for i in range(layers)
    ]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates

    network_state_output = [X_sample] + network_state_input[1:]

    visible_pX_chain = []

    # ONE update
    print "Performing one walkback in network state sampling."
    _ = update_layers(network_state_output,
                      visible_pX_chain, [X_sample],
                      0,
                      noisy=True)

    if layers == 1:
        f_sample_simple = theano.function(inputs=[X_sample],
                                          outputs=visible_pX_chain[-1])

    # WHY IS THERE A WARNING????
    # because the first odd layers are not used -> directly computed FROM THE EVEN layers
    # unused input = warn
    f_sample2 = theano.function(inputs=network_state_input,
                                outputs=network_state_output +
                                visible_pX_chain,
                                on_unused_input='warn')

    def sample_some_numbers_single_layer():
        x0 = test_X.get_value()[:1]
        samples = [x0]
        x = f_noise(x0)
        for i in range(399):
            x = f_sample_simple(x)
            samples.append(x)
            x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32')
            x = f_noise(x)
        return numpy.vstack(samples)

    def sampling_wrapper(NSI):
        # * is the "splat" operator: It takes a list as input, and expands it into actual positional arguments in the function call.
        out = f_sample2(*NSI)
        NSO = out[:len(network_state_output)]
        vis_pX_chain = out[len(network_state_output):]
        return NSO, vis_pX_chain
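    # NOTE (added): a minimal illustration of the "splat" call used above,
    # independent of Theano:
    #
    #     def f(a, b, c):
    #         return a + b + c
    #     args = [1, 2, 3]
    #     f(*args)  # identical to f(1, 2, 3) -> 6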

    def sample_some_numbers(N=400):
        # The network's initial state
        init_vis = test_X.get_value()[:1]

        noisy_init_vis = f_noise(init_vis)

        network_state = [[noisy_init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [noisy_init_vis]

        for i in range(N - 1):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def plot_samples(epoch_number, iteration):
        to_sample = time.time()
        if layers == 1:
            # one layer model
            V = sample_some_numbers_single_layer()
        else:
            V, H0 = sample_some_numbers()
        img_samples = PIL.Image.fromarray(
            tile_raster_images(V, (root_N_input, root_N_input), (20, 20)))

        fname = outdir + 'samples_iteration_' + str(
            iteration) + '_epoch_' + str(epoch_number) + '.png'
        img_samples.save(fname)
        print 'Took ' + str(time.time() - to_sample) + ' to sample 400 numbers'

    ##############
    # Inpainting #
    ##############
    def inpainting(digit):
        # The network's initial state

        # NOISE INIT
        init_vis = cast32(numpy.random.uniform(size=digit.shape))

        #noisy_init_vis  =   f_noise(init_vis)
        #noisy_init_vis  =   cast32(numpy.random.uniform(size=init_vis.shape))

        # INDEXES FOR VISIBLE AND NOISY PART
        noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2))
        fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2))

        # function to re-init the visible to the same noise

        # FUNCTION TO RESET HALF VISIBLE TO DIGIT
        def reset_vis(V):
            V[0][fixed_idx] = digit[0][fixed_idx]
            return V

        # INIT DIGIT : NOISE and RESET HALF TO DIGIT
        init_vis = reset_vis(init_vis)

        network_state = [[init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [init_vis]

        for i in range(49):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # reset half the digit
            net_state_out[0] = reset_vis(net_state_out[0])
            vis_pX_chain[0] = reset_vis(vis_pX_chain[0])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def save_params_to_file(name, n, params, iteration):
        print 'saving parameters...'
        save_path = outdir + name + '_params_iteration_' + str(
            iteration) + '_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            f.close()

    ################
    # GSN TRAINING #
    ################
    def train_recurrent_GSN(iteration, train_X, train_Y, valid_X, valid_Y,
                            test_X, test_Y):
        print '----------------------------------------'
        print 'TRAINING GSN FOR ITERATION', iteration
        with open(logfile, 'a') as f:
            f.write(
                "--------------------------\nTRAINING GSN FOR ITERATION {0!s}\n"
                .format(iteration))

        # TRAINING
        n_epoch = state.n_epoch
        batch_size = state.batch_size
        STOP = False
        counter = 0
        if iteration == 0:
            learning_rate.set_value(cast32(
                state.learning_rate))  # learning rate
        times = []
        best_cost = float('inf')
        patience = 0

        print 'learning rate:', learning_rate.get_value()

        print 'train X size:', str(train_X.shape.eval())
        print 'valid X size:', str(valid_X.shape.eval())
        print 'test X size:', str(test_X.shape.eval())

        train_costs = []
        valid_costs = []
        test_costs = []
        train_costs_post = []
        valid_costs_post = []
        test_costs_post = []

        if state.vis_init:
            bias_list[0].set_value(
                logit(numpy.clip(train_X.get_value().mean(axis=0), 0.001,
                                 0.9)))

        if state.test_model:
            # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting
            print 'Testing : skip training'
            STOP = True

        while not STOP:
            counter += 1
            t = time.time()
            print counter, '\t',
            with open(logfile, 'a') as f:
                f.write("{0!s}\t".format(counter))
            #shuffle the data
            data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y,
                                     test_X, test_Y, dataset, rng)

            #train
            #init hiddens
            #             hiddens = [(T.zeros_like(train_X[:batch_size]).eval())]
            #             for i in range(len(weights_list)):
            #                 # init with zeros
            #                 hiddens.append(T.zeros_like(T.dot(hiddens[i], weights_list[i])).eval())
            hiddens = [
                T.zeros((batch_size, layer_size)).eval()
                for layer_size in layer_sizes
            ]
            train_cost = []
            train_cost_post = []
            for i in range(len(train_X.get_value(borrow=True)) / batch_size):
                xs = [
                    train_X.get_value(
                        borrow=True)[(i * batch_size) +
                                     sequence_idx:((i + 1) * batch_size) +
                                     sequence_idx]
                    for sequence_idx in range(len(Xs))
                ]
                xs, hiddens = fix_input_size(xs, hiddens)
                hiddens[0] = xs[0]
                _ins = hiddens + xs
                _outs = f_learn(*_ins)
                hiddens = _outs[:len(hiddens)]
                cost = _outs[-2]
                cost_post = _outs[-1]
                train_cost.append(cost)
                train_cost_post.append(cost_post)

            train_cost = numpy.mean(train_cost)
            train_costs.append(train_cost)
            train_cost_post = numpy.mean(train_cost_post)
            train_costs_post.append(train_cost_post)
            print 'Train : ', trunc(train_cost), trunc(train_cost_post), '\t',
            with open(logfile, 'a') as f:
                f.write("Train : {0!s} {1!s}\t".format(trunc(train_cost),
                                                       trunc(train_cost_post)))
            with open(train_convergence_pre, 'a') as f:
                f.write("{0!s},".format(train_cost))
            with open(train_convergence_post, 'a') as f:
                f.write("{0!s},".format(train_cost_post))

            #valid
            #init hiddens
            hiddens = [
                T.zeros((batch_size, layer_size)).eval()
                for layer_size in layer_sizes
            ]
            valid_cost = []
            valid_cost_post = []
            for i in range(len(valid_X.get_value(borrow=True)) / batch_size):
                xs = [
                    valid_X.get_value(
                        borrow=True)[(i * batch_size) +
                                     sequence_idx:((i + 1) * batch_size) +
                                     sequence_idx]
                    for sequence_idx in range(len(Xs))
                ]
                xs, hiddens = fix_input_size(xs, hiddens)
                hiddens[0] = xs[0]
                _ins = hiddens + xs
                _outs = f_cost(*_ins)
                hiddens = _outs[:-2]
                cost = _outs[-2]
                cost_post = _outs[-1]
                valid_cost.append(cost)
                valid_cost_post.append(cost_post)

            valid_cost = numpy.mean(valid_cost)
            valid_costs.append(valid_cost)
            valid_cost_post = numpy.mean(valid_cost_post)
            valid_costs_post.append(valid_cost_post)
            print 'Valid : ', trunc(valid_cost), trunc(valid_cost_post), '\t',
            with open(logfile, 'a') as f:
                f.write("Valid : {0!s} {1!s}\t".format(trunc(valid_cost),
                                                       trunc(valid_cost_post)))
            with open(valid_convergence_pre, 'a') as f:
                f.write("{0!s},".format(valid_cost))
            with open(valid_convergence_post, 'a') as f:
                f.write("{0!s},".format(valid_cost_post))

            #test
            #init hiddens
            hiddens = [
                T.zeros((batch_size, layer_size)).eval()
                for layer_size in layer_sizes
            ]
            test_cost = []
            test_cost_post = []
            for i in range(len(test_X.get_value(borrow=True)) / batch_size):
                xs = [
                    test_X.get_value(
                        borrow=True)[(i * batch_size) +
                                     sequence_idx:((i + 1) * batch_size) +
                                     sequence_idx]
                    for sequence_idx in range(len(Xs))
                ]
                xs, hiddens = fix_input_size(xs, hiddens)
                hiddens[0] = xs[0]
                _ins = hiddens + xs
                _outs = f_cost(*_ins)
                hiddens = _outs[:-2]
                cost = _outs[-2]
                cost_post = _outs[-1]
                test_cost.append(cost)
                test_cost_post.append(cost_post)

            test_cost = numpy.mean(test_cost)
            test_costs.append(test_cost)
            test_cost_post = numpy.mean(test_cost_post)
            test_costs_post.append(test_cost_post)
            print 'Test  : ', trunc(test_cost), trunc(test_cost_post), '\t',
            with open(logfile, 'a') as f:
                f.write("Test : {0!s} {1!s}\t".format(trunc(test_cost),
                                                      trunc(test_cost_post)))
            with open(test_convergence_pre, 'a') as f:
                f.write("{0!s},".format(test_cost))
            with open(test_convergence_post, 'a') as f:
                f.write("{0!s},".format(test_cost_post))

            #check for early stopping
            cost = train_cost
            if cost < best_cost * state.early_stop_threshold:
                patience = 0
                best_cost = cost
            else:
                patience += 1
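            # NOTE (added): this is relative-improvement early stopping: assuming
            # early_stop_threshold is slightly below 1 (e.g. 0.995), an epoch only
            # resets patience when the training cost drops below that fraction of
            # the best cost so far; otherwise patience grows until it reaches
            # state.early_stop_length and training stops.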

            if counter >= n_epoch or patience >= state.early_stop_length:
                STOP = True
                save_params_to_file('gsn', counter, params, iteration)

            timing = time.time() - t
            times.append(timing)

            print 'time : ', trunc(timing),

            print 'remaining: ', trunc(
                (n_epoch - counter) * numpy.mean(times) / 60 / 60), 'hrs',

            print 'B : ', [
                trunc(abs(b.get_value(borrow=True)).mean()) for b in bias_list
            ],

            print 'W : ', [
                trunc(abs(w.get_value(borrow=True)).mean())
                for w in weights_list
            ],

            print 'V : ', [
                trunc(abs(v.get_value(borrow=True)).mean())
                for v in recurrent_weights_list
            ]

            with open(logfile, 'a') as f:
                f.write("MeanVisB : {0!s}\t".format(
                    trunc(bias_list[0].get_value().mean())))

            with open(logfile, 'a') as f:
                f.write("W : {0!s}\t".format(
                    str([
                        trunc(abs(w.get_value(borrow=True)).mean())
                        for w in weights_list
                    ])))

            with open(logfile, 'a') as f:
                f.write("Time : {0!s} seconds\n".format(trunc(timing)))

            if (counter % state.save_frequency) == 0:
                # Checking reconstruction
                nums = test_X.get_value()[range(100)]
                noisy_nums = f_noise(test_X.get_value()[range(100)])
                reconstructed_prediction = []
                reconstructed_prediction_end = []
                #init reconstruction hiddens
                hiddens = [
                    T.zeros(layer_size).eval() for layer_size in layer_sizes
                ]
                for num in noisy_nums:
                    hiddens[0] = num
                    for i in range(len(hiddens)):
                        if len(hiddens[i].shape
                               ) == 2 and hiddens[i].shape[0] == 1:
                            hiddens[i] = hiddens[i][0]
                    _ins = hiddens + [num]
                    _outs = f_recon(*_ins)
                    hiddens = _outs[:len(hiddens)]
                    [reconstructed_1, reconstructed_n] = _outs[len(hiddens):]
                    reconstructed_prediction.append(reconstructed_1)
                    reconstructed_prediction_end.append(reconstructed_n)

                with open(logfile, 'a') as f:
                    f.write("\n")
                for i in range(len(nums)):
                    if len(
                            reconstructed_prediction[i].shape
                    ) == 2 and reconstructed_prediction[i].shape[0] == 1:
                        reconstructed_prediction[i] = reconstructed_prediction[
                            i][0]
                    print nums[i].tolist(
                    ), "->", reconstructed_prediction[i].tolist()
                    with open(logfile, 'a') as f:
                        f.write("{0!s} -> {1!s}\n".format(
                            nums[i].tolist(), [
                                trunc(n)
                                if n > 0.0001 else trunc(0.00000000000000000)
                                for n in reconstructed_prediction[i].tolist()
                            ]))
                with open(logfile, 'a') as f:
                    f.write("\n")

#                 # Concatenate stuff
#                 stacked = numpy.vstack([numpy.vstack([nums[i*10 : (i+1)*10], noisy_nums[i*10 : (i+1)*10], reconstructed_prediction[i*10 : (i+1)*10], reconstructed_prediction_end[i*10 : (i+1)*10]]) for i in range(10)])
#                 numbers_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (root_N_input,root_N_input), (10,40)))
#                 numbers_reconstruction.save(outdir+'gsn_number_reconstruction_iteration_'+str(iteration)+'_epoch_'+str(counter)+'.png')
#
#                 #sample_numbers(counter, 'seven')
#                 plot_samples(counter, iteration)
#
#                 #save params
#                 save_params_to_file('gsn', counter, params, iteration)

            # ANNEAL!
            new_lr = learning_rate.get_value() * annealing
            learning_rate.set_value(new_lr)

        # 10k samples
        print 'Generating 10,000 samples'
        samples, _ = sample_some_numbers(N=10000)
        f_samples = outdir + 'samples.npy'
        numpy.save(f_samples, samples)
        print 'saved digits'

    #####################
    # STORY 2 ALGORITHM #
    #####################
    for iter in range(state.max_iterations):
        train_recurrent_GSN(iter, train_X, train_Y, valid_X, valid_Y, test_X,
                            test_Y)
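# ---------------------------------------------------------------------------
# The training loop above combines patience-based early stopping (an epoch only
# resets patience when the cost beats best_cost * early_stop_threshold) with a
# multiplicative learning-rate anneal. A condensed, self-contained sketch of
# that schedule; the names below are illustrative, not the original variables:
def early_stop_and_anneal(costs, threshold=0.99, patience_limit=10,
                          lr=0.25, annealing=0.995):
    best, patience = float('inf'), 0
    for epoch, cost in enumerate(costs):
        if cost < best * threshold:      # meaningful improvement: reset patience
            best, patience = cost, 0
        else:                            # otherwise burn one unit of patience
            patience += 1
        lr *= annealing                  # anneal the learning rate every epoch
        if patience >= patience_limit:   # stagnated for too long: stop early
            return epoch, lr
    return len(costs) - 1, lr
# ---------------------------------------------------------------------------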
示例#19
0
    def __init__(self,
                 inputs=None,
                 outputs=None,
                 params=None,
                 outdir='outputs/basic',
                 activation='rectifier',
                 weights_init='uniform',
                 weights_mean=0,
                 weights_std=5e-3,
                 weights_interval='glorot',
                 bias_init=0.0,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a basic layer.

        Parameters
        ----------
        inputs : List of [tuple(shape, `Theano.TensorType`)]
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. The length of `shape` should be equal to number of
            dimensions in `Theano.TensorType`, where the shape element is an integer representing the size for its
            dimension, or None if the shape isn't known. For example, if you have a matrix with unknown batch size
            but fixed feature size of 784, `shape` would be: (None, 784). The full form of `inputs` would be:
            [((None, 784), <TensorType(float32, matrix)>)].
        outputs : int
            The dimensionality of the output for this model.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(Dense, self).__init__(**initial_parameters)
        if self.inputs is None:
            return

        ##################
        # specifications #
        ##################
        if len(self.inputs) > 1:
            raise NotImplementedError(
                "Expected 1 input to Dense, found %d. Please merge inputs before passing "
                "to the Dense model!" % len(self.inputs))
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        if isinstance(input_shape, int):
            self.input_size = ((None, ) *
                               (self.input.ndim - 1)) + (input_shape, )
        else:
            self.input_size = input_shape
        assert self.input_size is not None, "Need to specify the shape for the last dimension of the input!"

        # We also only have 1 output
        assert self.output_size is not None, "Need to specify outputs size!"
        out_size = self.output_size[0]
        if isinstance(out_size, int):
            self.output_size = self.input_size[:-1] + (out_size, )
        else:
            self.output_size = out_size

        # activation function!
        activation_func = get_activation_function(activation)

        #########################################################
        # parameters - make sure to deal with input dictionary! #
        #########################################################
        W = self.params.get("W") or get_weights(
            weights_init=weights_init,
            shape=(self.input_size[-1], self.output_size[-1]),
            name="W",
            rng=mrg,
            # if gaussian
            mean=weights_mean,
            std=weights_std,
            # if uniform
            interval=weights_interval)

        b = self.params.get("b") or get_bias(
            shape=self.output_size[-1], name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = OrderedDict([("W", W), ("b", b)])

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(dot(self.input, W) + b)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size[-1], self.output_size[-1])), str(activation))
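# ---------------------------------------------------------------------------
# A minimal usage sketch for the Dense layer defined above. This assumes the
# snippet is the OpenDeep `Dense` model and that it is importable as below
# (the import path is an assumption); everything else follows the constructor
# and the `self.output` attribute shown in this snippet.
import theano.tensor as T
from opendeep.models import Dense                  # assumed import path

x = T.matrix('x')                                  # batch of 784-dim inputs
layer = Dense(inputs=[((None, 784), x)],           # list of (shape, variable) pairs
              outputs=500,                         # output dimensionality
              activation='tanh')
hidden = layer.output                              # symbolic (None, 500) output
# ---------------------------------------------------------------------------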
示例#20
0
    def __init__(self,
                 prefix,
                 options,
                 create_param=True,
                 repeat_actions=False,
                 plan_steps=10,
                 ntimesteps=10,
                 inter_size=64,
                 dec_dim=500,
                 batch_size=None,
                 context_dim=-1,
                 use_gate=True,
                 always_recommit=False,
                 bounded_sigm_temp_act=False,
                 do_commit=True,
                 do_layerNorm=False):

        self.repeat_actions = repeat_actions
        self.ntimesteps = ntimesteps
        self.prefix = prefix
        self.inter_size = inter_size
        self.bounded_sigm_temp_act = bounded_sigm_temp_act
        self.dec_dim = dec_dim
        self.context_dim = context_dim
        self.use_gate = use_gate
        self.always_recommit = always_recommit
        self.do_commit = do_commit

        if not "st_estimator" in options:
            options['st_estimator'] = "GumbelSoftmax"
            self.st_estimator = "GumbelSoftmax"

        self.st_estimator = options['st_estimator']

        if self.st_estimator is None:
            self.st_estimator = "GumbelSoftmax"
            options['st_estimator'] = self.st_estimator

        self.rng = rng_mrg.MRG_RandomStreams(seed=1993)

        if 'plan_step' in options:
            self.plan_steps = options['plan_step']
        else:
            self.plan_steps = plan_steps

        self.only_use_w = False
        if 'only_use_w' in options:
            self.only_use_w = options['only_use_w']
            if self.only_use_w:
                print "We will only use the h2 state for the attention."
            else:
                print "We will use all the hidden state for the attention."

        if 'use_gate' in options:  # not ideal, but it's a pain to thread this option through everywhere
            self.use_gate = options['use_gate']
            if self.use_gate:
                print "We are using a gate in the planner"
            else:
                print "We won't be using the gate for the planner"

        self.learn_t = False
        if 'learn_t' in options:
            self.learn_t = options['learn_t']
            if self.learn_t:
                print "We are learning the temperature"
            else:
                print "We won't be learning the temperature"

        if self.st_estimator == "REINFORCE":
            print "Using REINFORCE"
        elif self.st_estimator == "GumbelSoftmax":
            print "Using GumbelSoftmax"
        else:
            raise ValueError("Wrong st estimator: {}".format(
                self.st_estimator))

        self.action_plan_steps = plan_steps
        if 'repeat_actions' in options:
            self.repeat_actions = options['repeat_actions']
            if self.repeat_actions:
                print "We will repeat the action until recommitment (and won't be using gates."
                self.action_plan_steps = 1
                self.use_gate = False
            else:
                print "We We will plan ahead all futur alignment."

        self.do_layerNorm = do_layerNorm
        if 'planning_do_layerNorm' in options:
            self.do_layerNorm = options['planning_do_layerNorm']
            if self.do_layerNorm:
                print "We are doing layernorm in the PAG network"
            else:
                print "We are not doing layernorm in the PAG network"

        self.actionPlanner = ActionPlan(inter_size=inter_size,
                                        context_size=context_dim,
                                        dec_size=dec_dim,
                                        create_param=create_param,
                                        batch_size=batch_size,
                                        repeat_actions=self.repeat_actions,
                                        plan_steps=self.plan_steps,
                                        ntimesteps=ntimesteps,
                                        options=options)

        if do_commit:
            self.commitplan = CommitmentPlan(
                create_param=create_param,
                bellow_size=dec_dim,
                plan_steps=self.plan_steps,
                bounded_sigm_temp_act=self.bounded_sigm_temp_act,
                options=options,
                rng=self.rng)
        else:
            print "WARNING, we are not doing any commitment."
            self.commitplan = CommitmentPlan(
                create_param=create_param,
                bellow_size=dec_dim,
                plan_steps=self.plan_steps,
                bounded_sigm_temp_act=self.bounded_sigm_temp_act,
                options=options,
                rng=self.rng)

        if create_param:
            self.init_params()
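# ---------------------------------------------------------------------------
# The planner above chooses between 'REINFORCE' and 'GumbelSoftmax' as its
# straight-through estimator, but the sampling code itself is not shown in this
# snippet. A generic Gumbel-Softmax sketch (standard formulation, not the
# author's implementation) using the same MRG random streams:
import theano.tensor as T
from theano.sandbox import rng_mrg

_rng = rng_mrg.MRG_RandomStreams(seed=1993)

def gumbel_softmax(logits, temperature=1.0, eps=1e-8):
    # perturb the logits with Gumbel(0, 1) noise ...
    u = _rng.uniform(size=logits.shape, low=0.0, high=1.0)
    gumbel = -T.log(-T.log(u + eps) + eps)
    # ... and soften the argmax into a differentiable distribution
    return T.nnet.softmax((logits + gumbel) / temperature)
# ---------------------------------------------------------------------------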
示例#21
0
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/basic',
                 input_size=None,
                 output_size=None,
                 activation='rectifier',
                 cost='mse',
                 cost_args=None,
                 weights_init='uniform',
                 weights_mean=0,
                 weights_std=5e-3,
                 weights_interval='montreal',
                 bias_init=0.0,
                 noise=None,
                 noise_level=None,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a basic layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. input_size).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
            this is optional.
        output_size : int
            The size (dimensionality) of the output from the layer.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        cost : str or callable
            The cost function to use when training the layer. This should be appropriate for the output type, i.e.
            mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        noise : str
            What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
            for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(Dense, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(
                inputs_hook
            ) == 2, 'Expected inputs_hook to be tuple!'  # make sure inputs_hook is a tuple
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.matrix('X')

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        if kwargs.get('out_as_probs') == False:
            self.target = T.vector('Y', dtype='int64')
        else:
            self.target = T.matrix('Y')

        # either grab the output's desired size from the parameter directly, or copy input_size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(activation)
        # cost function!
        cost_func = get_cost_function(cost)
        cost_args = cost_args or dict()

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(params_hook) == 2, \
                "Expected 2 params (W and b) for Dense, found {0!s}!".format(len(params_hook))
            W, b = params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=(self.input_size, self.output_size),
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)

            # grab the bias vector
            b = get_bias(shape=output_size, name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding noise switch.')
            if noise_level is not None:
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch, noise_func(input=self.output),
                                   self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output,
                              target=self.target,
                              **cost_args)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size, self.output_size)), str(activation))
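# ---------------------------------------------------------------------------
# The shared `switch` variable above lets one compiled graph run with or
# without noise: T.switch picks the noisy branch while the switch is 1 and the
# clean output when it is 0. Hypothetical usage against an instance of the
# layer defined above:
#     layer.switch.set_value(0)   # evaluation / validation: clean output
#     layer.switch.set_value(1)   # training: noise_func(output) is used
# ---------------------------------------------------------------------------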
示例#22
0
def run(rng_seed,ltype, mtype,load_path, load_epoch, sample=False, nclass=10, whichclass=None, verbose=False, class_list=None, ckernr=None, cri_ckern=None):
    
    assert ckernr is not None
    #  ltype -> GAN LSGAN WGAN 
    #    JS      0.4+-asdf
    #    LS
    #    WA
    #    MMD 
    #    IS


    ### MODEL PARAMS
    ### MODEL PARAMS
    # ltype       = sys.argv[3]
    # mtype       = 'js'
    # print 'ltype: ' + ltype
    # print 'mtype: ' + mtype
    mmdF        = False
    nndF        = False

    # CONV (DISC)
    conv_num_hid= 100
    num_channel = 3 #Fixed
    num_class   = 1 #Fixed
    D=64*64*3
    kern=int(ckernr.split('_')[0])

    ### OPT PARAMS
    batch_sz    = 100
    momentum    = 0.0 #Not Used
    lam         = 0.0
    
    epsilon_dis = 0.0002
    epsilon_gen = 0.0001
    
    # if mtype =='js' :
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=5 #FIXED
    #     J=1
    # elif mtype == 'ls':
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=5 #FIXED
    #     J=1
    # else:
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=2 #FIXED
    #     J=1

    # ganI (GEN)
    filter_sz   = 4 #FIXED
    nkerns      = [1,8,4,2,1]
    ckern       = int(ckernr.split('_')[-1]) #20
    num_hid1    = nkerns[0]*ckern*filter_sz*filter_sz #Fixed
    num_z       = 100

    ### TRAIN PARAMS
    num_epoch   = 10
    epoch_start = 0 #Fixed
    contF       = True #Fixed
    
    num_hids     = [num_hid1]
    
    input_width = 64
    input_height = 64
    input_depth = 3
    
    ### SAVE PARAM
    model_param_save = 'num_hid%d.batch%d.eps_dis%g.eps_gen%g.num_z%d.num_epoch%g.lam%g.data.100_CONV_lsun'%(conv_num_hid, batch_sz, epsilon_dis, epsilon_gen, num_z, num_epoch, lam)

    
    # device=sys.argv[1]
    import os
    os.environ['RNG_SEED'] = str(rng_seed)
    os.environ['LOAD_PATH'] = load_path
    os.environ['LOAD_EPOCH'] = str(load_epoch)
    os.environ['LTYPE'] = ltype
    # os.environ['MTYPE'] = mtype
    try:
        a = os.environ['CRI_KERN']
    except KeyError:
        if cri_ckern is not None:
            os.environ['CRI_KERN'] = cri_ckern
        else:
            raise RuntimeError('cri_kern not provided')
    
    import theano 
    import theano.sandbox.rng_mrg as RNG_MRG
    rng = np.random.RandomState(int(os.environ['RNG_SEED']))
    MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2 ** 30))
    
    from util_cifar10 import load_cifar10
    from utils import shared_dataset, unpickle
    
    
    import pwd; username = pwd.getpwuid(os.geteuid()).pw_name
    
    global nnd_path
    if username in ['hma02', 'mahe6562']:
        if username=='hma02':
            datapath = '/mnt/data/hma02/data/cifar10/cifar-10-batches-py/'
            save_path = '/mnt/data/hma02/gap/dcgan-cifar10/'
            nnd_path = '/mnt/data/hma02/gap/'
        else:
            datapath = '/scratch/g/gwtaylor/mahe6562/data/cifar10/cifar-10-batches-py/'
            save_path = '/scratch/g/gwtaylor/mahe6562/gap/dcgan-cifar10/'
            nnd_path = '/scratch/g/gwtaylor/mahe6562/gap/'
            
        import time; date = '%d-%d' % (time.gmtime()[1], time.gmtime()[2])
        import os; worker_id = os.getpid()
        save_path+= date+'-%d-%s/' % (worker_id,ltype)
        # if not os.path.exists(save_path):
        #     os.makedirs(save_path); print 'create dir',save_path
        #
        # save_the_env(dir_to_save='../mnist', path=save_path)
        
    global train_set_np,valid_set_np,test_set_np
    
    train_set_np, valid_set_np, test_set_np = load_cifar10(path=datapath, verbose=False)
    # 127.5 - 1. in order to rescale to -1 to 1.
    
    
    train_set_np[0] = train_set_np[0] / 255.0 #127.5 - 1.
    valid_set_np[0] = valid_set_np[0] / 255.0 #127.5 - 1.
    test_set_np[0]  = test_set_np[0]  / 255.0 #127.5 - 1.
    
    N ,D = train_set_np[0].shape; Nv,D = valid_set_np[0].shape; Nt,D = test_set_np[0].shape
    
    train_set = shared_dataset(train_set_np)
    valid_set = shared_dataset(valid_set_np)
    test_set  = shared_dataset(test_set_np )

    # print 'batch sz %d, epsilon gen %g, epsilon dis %g, hnum_z %d, num_conv_hid %g, num_epoch %di, lam %g' % \
#                                     (batch_sz, epsilon_gen, epsilon_dis, num_z, conv_num_hid, num_epoch, lam)

    book_keeping = []

    num_hids     = [num_hid1]
    train_params = [num_epoch, epoch_start, contF]
    opt_params   = [batch_sz, epsilon_gen, epsilon_dis, momentum, num_epoch, N, Nv, Nt, lam]    
    ganI_params  = [batch_sz, D, num_hids, rng, num_z, nkerns, ckern, num_channel]
    conv_params  = [conv_num_hid, D, num_class, batch_sz, num_channel, kern]
    
    if sample==True:
        samples = main(train_set, valid_set, test_set, opt_params, ganI_params, train_params, conv_params, sample)
        return 0,0,0,0
    else:
        te_score_ls, te_score_iw , mmd_te , is_sam = main(train_set, valid_set, test_set, opt_params, ganI_params, train_params, conv_params, sample)
    
        return te_score_ls, te_score_iw , mmd_te , is_sam
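# ---------------------------------------------------------------------------
# The two pixel rescalings mentioned in the comments above, shown side by side
# on a dummy uint8 batch (illustrative only; not part of the original run()):
import numpy as np

x = np.random.randint(0, 256, size=(4, 64 * 64 * 3)).astype('float32')
x01 = x / 255.0          # maps [0, 255] -> [0, 1]   (what the code uses)
x11 = x / 127.5 - 1.0    # maps [0, 255] -> [-1, 1]  (the commented alternative)
# ---------------------------------------------------------------------------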
示例#23
0
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/conv1d',
                 input_size=None,
                 filter_shape=None,
                 stride=None,
                 border_mode='valid',
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0,
                 activation='rectifier',
                 convolution='mc0',
                 mrg=RNG_MRG.MRG_RandomStreams(1)):
        """
        Initialize a 1-D convolutional layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables).
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : tuple
            Shape of the incoming data: (batch_size, num_channels, data_dimensionality). Most likely, your channels
            will be 1. For example, batches of text will be of the form (N, 1, D) where N=examples in minibatch and
            D=dimensionality (chars, words, etc.)
        filter_shape : tuple
            (num_filters, num_channels, filter_length). This is also the shape of the weights matrix.
        stride : int
            The distance between the receptive field centers of neighboring units. This is the 'stride' of the
            convolution operation.
        border_mode : str, one of 'valid', 'full', 'same'
            A string indicating the convolution border mode.
            If 'valid', the convolution is only computed where the input and the
            filter fully overlap.
            If 'full', the convolution is computed wherever the input and the
            filter overlap by at least one position.
            If 'same', the convolution is computed wherever the input and the
            filter overlap by at least half the filter size, when the filter size
            is odd. In practice, the input is zero-padded with half the filter size
            at the beginning and half at the end (or one less than half in the case
            of an even filter size). This results in an output length that is the
            same as the input length (for both odd and even filter sizes).
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        activation : str or Callable
            The activation function to apply to the layer. See opendeep.utils.activation for options.
        convolution : str or Callable
            The 1-dimensional convolution implementation to use. The default of 'mc0' is normally fine. See
            opendeep.utils.conv1d_implementations for alternatives. (This is necessary because Theano only
            supports 2D convolutions at the moment).
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.

        Notes
        -----
        Theano's default convolution function (`theano.tensor.nnet.conv.conv2d`)
        does not support the 'same' border mode by default. This layer emulates
        it by performing a 'full' convolution and then cropping the result, which
        may negatively affect performance.
        """
        super(Conv1D, self).__init__(
            **{arg: val for (arg, val) in locals().items() if arg != 'self'})

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        # expect input to be in the form (B, C, I) (batch, channel, input data)
        # inputs_hook is a tuple of (Shape, Input)
        if self.inputs_hook is not None:
            # make sure inputs_hook is a tuple
            assert len(
                self.inputs_hook
            ) == 2, "expecting inputs_hook to be tuple of (shape, input)"
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.ftensor3('X')

        # activation function!
        activation_func = get_activation_function(activation)

        # convolution function!
        convolution_func = get_conv1d_function(convolution)

        # filter shape should be in the form (num_filters, num_channels, filter_length)
        num_filters = filter_shape[0]
        filter_length = filter_shape[2]

        ################################################
        # Params - make sure to deal with params_hook! #
        ################################################
        if self.params_hook:
            # make sure the params_hook has W and b
            assert len(self.params_hook) == 2, \
                "Expected 2 params (W and b) for Conv1D, found {0!s}!".format(len(self.params_hook))
            W, b = self.params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=filter_shape,
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)

            b = get_bias(shape=(num_filters, ),
                         name="b",
                         init_values=bias_init)

        # Finally have the two parameters!
        self.params = [W, b]

        ########################
        # Computational Graph! #
        ########################
        if border_mode in ['valid', 'full']:
            conved = convolution_func(self.input,
                                      W,
                                      subsample=(stride, ),
                                      image_shape=self.input_size,
                                      filter_shape=filter_shape,
                                      border_mode=border_mode)
        elif border_mode == 'same':
            conved = convolution_func(self.input,
                                      W,
                                      subsample=(stride, ),
                                      image_shape=self.input_size,
                                      filter_shape=filter_shape,
                                      border_mode='full')
            shift = (filter_length - 1) // 2
            conved = conved[:, :, shift:self.input_size[2] + shift]

        else:
            log.error("Invalid border mode: '%s'" % border_mode)
            raise RuntimeError("Invalid border mode: '%s'" % border_mode)

        self.output = activation_func(conved + b.dimshuffle('x', 0, 'x'))
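# ---------------------------------------------------------------------------
# A tiny numpy illustration of the 'same' border mode emulation described in
# the docstring above: run a 'full' convolution and crop (filter_length - 1)//2
# elements from the front so the output keeps the input length. Illustrative
# only; the layer itself uses the Theano conv1d implementations.
import numpy as np

x = np.arange(10.0)                # input length D = 10
w = np.ones(4)                     # even filter length F = 4
full = np.convolve(x, w)           # 'full' convolution, length D + F - 1 = 13
shift = (len(w) - 1) // 2          # = 1 (one less than half for even F)
same = full[shift:len(x) + shift]  # crop back to the input length
assert same.shape == x.shape
# ---------------------------------------------------------------------------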
示例#24
0
"""
This module provides the important noise functions - mostly used for regularization purposes to prevent the
deep nets from overfitting.

Based on code from Li Yao (University of Montreal)
https://github.com/yaoli/GSN
"""
# standard libraries
import logging
from functools import partial
# third party libraries
import theano
import theano.tensor as T
import theano.sandbox.rng_mrg as RNG_MRG
import theano.compat.six as six

theano_random = RNG_MRG.MRG_RandomStreams(seed=23455)
# set a fixed seed when initializing RandomState for two purposes:
#  1. repeatable experiments; 2. for multiple-GPU, the same initial weights

log = logging.getLogger(__name__)

def get_noise(name, *args, **kwargs):
    """
    Helper function to return a partially applied noise functions - all you need to do is apply them to an input.

    Parameters
    ----------
    name : str
        Name of noise function to use (key in a function dictionary).

    Returns
示例#25
0
def experiment(state, channel):
    print 'LOADING MODEL CONFIG'
    config_path =   '/'+os.path.join(*state.model_path.split('/'))
    print state.model_path

    if 'config' in os.listdir(config_path):
        
        config_file = open(os.path.join(config_path, 'config'), 'r')
        config      =   config_file.readlines()
        try:
            config_vals =   config[0].split('(')[1:][0].split(')')[:-1][0].split(', ')
        except:
            config_vals =   config[0][3:-1].replace(': ','=').replace("'","").split(', ')
            config_vals =   filter(lambda x:not 'jobman' in x and not '/' in x and not ':' in x and not 'experiment' in x, config_vals)
        
        for CV in config_vals:
            print CV
            try:
                exec('state.'+CV) in globals(), locals()
            except:
                exec('state.'+CV.split('=')[0]+"='"+CV.split('=')[1]+"'") in globals(), locals()
    else:
        import pdb; pdb.set_trace()

    # LOAD DATA
    if 'mnist' in state.data_path:
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
    elif 'TFD' in state.data_path:
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_tfd(state.data_path)
    
    N_input =   train_X.shape[1]
    root_N_input = numpy.sqrt(N_input)


    #train_X = binarize(train_X)
    #valid_X = binarize(valid_X)
    #test_X = binarize(test_X)
    numpy.random.seed(1)
    numpy.random.shuffle(train_X)
    train_X = theano.shared(train_X)
    valid_X = theano.shared(valid_X)
    test_X  = theano.shared(test_X)
    # shuffle Y also if necessary

    # THEANO VARIABLES
    X       = T.fmatrix()
    index   = T.lscalar()
    MRG = RNG_MRG.MRG_RandomStreams(1)
    
    # SPECS
    K               =   state.K
    N               =   state.N
    layer_sizes     =   [N_input] + [state.hidden_size] * K
    learning_rate   =   theano.shared(cast32(state.learning_rate))
    annealing       =   cast32(state.annealing)
    momentum        =   theano.shared(cast32(state.momentum))

    # PARAMETERS
    # weights

    weights_list    =   [get_shared_weights(layer_sizes[i], layer_sizes[i+1], numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i+1] )), 'W') for i in range(K)]
    bias_list       =   [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)]


    # LOAD PARAMS
    print 'Loading model params...',
    print 'Loading last epoch...',
    param_files =   filter(lambda x: x.endswith('ft'), os.listdir(config_path))
    max_epoch   =   numpy.argmax([int(x.split('_')[-1].split('.')[0]) for x in param_files])

    params_to_load  =   os.path.join(config_path, param_files[max_epoch])
    F   =   open(params_to_load, 'r')

    n_params = len(weights_list) + len(bias_list)
    print param_files[max_epoch]

    for i in range(0, len(weights_list)):
        weights_list[i].set_value(ft.read(F))

    for i in range(len(bias_list)):
        bias_list[i].set_value(ft.read(F))


    print 'Model parameters loaded!!'

    # functions
    def dropout(IN, p = 0.5):
        noise   =   MRG.binomial(p = p, n = 1, size = IN.shape, dtype='float32')
        OUT     =   (IN * noise) / cast32(p)
        return OUT

    def add_gaussian_noise(IN, std = 1):
        print 'GAUSSIAN NOISE : ', std
        noise   =   MRG.normal(avg  = 0, std  = std, size = IN.shape, dtype='float32')
        OUT     =   IN + noise
        return OUT

    def corrupt_input(IN, p = 0.5):
        # salt and pepper? masking?
        noise   =   MRG.binomial(p = p, n = 1, size = IN.shape, dtype='float32')
        IN      =   IN * noise
        return IN

    def salt_and_pepper(IN, p = 0.2):
        # salt and pepper noise
        print 'DAE uses salt and pepper noise'
        a = MRG.binomial(size=IN.shape, n=1,
                              p = 1 - p,
                              dtype='float32')
        b = MRG.binomial(size=IN.shape, n=1,
                              p = 0.5,
                              dtype='float32')
        c = T.eq(a,0) * b
        return IN * a + c

    def update_odd_layers(hiddens, noisy):
        for i in range(1, K+1, 2):
            print i
            if noisy:
                simple_update_layer(hiddens, None, i)
            else:
                simple_update_layer(hiddens, None, i, mul_noise = False, add_noise = False)

    # we can append the reconstruction at each step
    def update_even_layers(hiddens, p_X_chain, autoregression, noisy):
        for i in range(0, K+1, 2):
            print i
            if noisy:
                simple_update_layer(hiddens, p_X_chain, i, autoregression)
            else:
                simple_update_layer(hiddens, p_X_chain, i, autoregression, mul_noise = False, add_noise = False)

    def simple_update_layer(hiddens, p_X_chain, i, autoregression=False, mul_noise=True, add_noise=True):
        # Compute the dot product, whatever layer
        post_act_noise  =   0
        if i == 0:
            hiddens[i]  =   T.dot(hiddens[i+1], weights_list[i].T) + bias_list[i]           

        elif i == K:
            hiddens[i]  =   T.dot(hiddens[i-1], weights_list[i-1]) + bias_list[i]
            # TODO compute d h_i / d h_(i-1)

            # derivee de h[i] par rapport a h[i-1]
            # W is what transpose...

            if state.scaled_noise:
            
                # to remove this, remove the post_act_noise variable initialisation and the following block
                # and put back post activation noise like it was (just normal calling of the function)
                W   =   weights_list[i-1]
                hn  =   T.tanh(hiddens[i])
                ww  =   T.dot(W.T, W)
                s   =   (cast32(1) - hn**2)
                jj  =   ww * s.dimshuffle(0, 'x', 1) * s.dimshuffle(0, 1, 'x')
                scale_noise =   lambda alpha : (alpha.dimshuffle(0, 1, 'x') * jj).sum(1)

                print 'SCALED_NOISE!!!, Last layer : set add_noise to False, add its own scaled noise'
                add_noise   =   False

                #pre_act_noise   =   MRG.normal(avg  = 0, std  = std, size = hn.shape, dtype='float32')
                post_act_noise  =   MRG.normal(avg  = 0, std  = state.hidden_add_noise_sigma, size = hn.shape, dtype='float32')

                #pre_act_noise   =   scale_noise(pre_act_noise)
                post_act_noise  =   scale_noise(post_act_noise)

                #hiddens[i]      +=  pre_act_noise


        else:
            # next layer        :   layers[i+1], assigned weights : W_i
            # previous layer    :   layers[i-1], assigned weights : W_(i-1)
            hiddens[i]  =   T.dot(hiddens[i+1], weights_list[i].T) + T.dot(hiddens[i-1], weights_list[i-1]) + bias_list[i]

        # Add pre-activation noise if NOT input layer
        if i==1 and state.noiseless_h1:
            print '>>NO noise in first layer'
            add_noise   =   False


        # pre activation noise            
        if i != 0 and add_noise and not state.scaled_noise:
            print 'Adding pre-activation gaussian noise'
            hiddens[i]  =   add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma)
      
      
      
       
        # ACTIVATION!
        if i == 0:
            print 'Sigmoid units'
            hiddens[i]  =   T.nnet.sigmoid(hiddens[i])
        else:
            print 'Hidden units'
            hiddens[i]  =   hidden_activation(hiddens[i])

        # post activation noise            
        if i != 0 and add_noise:
            print 'Adding post-activation gaussian noise'
            if state.scaled_noise:
                hiddens[i]  +=  post_act_noise
            else:
                hiddens[i]  =   add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma)


        # POST ACTIVATION NOISE 
        if i != 0 and mul_noise and state.hidden_dropout:
            # dropout if hidden
            print 'Dropping out', state.hidden_dropout
            hiddens[i]  =   dropout(hiddens[i], state.hidden_dropout)
        elif i == 0:
            # if input layer -> append p(X|...)
            p_X_chain.append(hiddens[i])
            
            # sample from p(X|...)
            if state.input_sampling:
                print 'Sampling from input'
                sampled     =   MRG.binomial(p = hiddens[i], size=hiddens[i].shape, dtype='float32')
            else:
                print '>>NO input sampling'
                sampled     =   hiddens[i]
            # add noise
            sampled     =   salt_and_pepper(sampled, state.input_salt_and_pepper)
            
            # set input layer
            hiddens[i]  =   sampled

    def update_layers(hiddens, p_X_chain, autoregression, noisy = True):
        print 'odd layer update'
        update_odd_layers(hiddens, noisy)
        print
        print 'even layer update'
        update_even_layers(hiddens, p_X_chain, autoregression, noisy)

 
    ''' F PROP '''
    #X = T.fmatrix()
    if state.act == 'sigmoid':
        print 'Using sigmoid activation'
        hidden_activation = T.nnet.sigmoid
    elif state.act == 'rectifier':
        print 'Using rectifier activation'
        hidden_activation = lambda x : T.maximum(cast32(0), x)
    elif state.act == 'tanh':
        hidden_activation = lambda x : T.tanh(x)    
   
    
    ''' Corrupt X '''
    X_corrupt   = salt_and_pepper(X, state.input_salt_and_pepper)

    f_noise = theano.function(inputs = [X], outputs = salt_and_pepper(X, state.input_salt_and_pepper))

    ''' Commented for now (unless we need more denoising stuff)
    #############
    # Denoise some numbers  :   show number, noisy number, reconstructed number
    #############
    import random as R
    R.seed(1)
    random_idx      =   numpy.array(R.sample(range(len(test_X.get_value())), 100))
    numbers         =   test_X.get_value()[random_idx]
    
    noisy_numbers   =   f_noise(test_X.get_value()[random_idx])

    # Recompile the graph without noise for reconstruction function
    hiddens_R     = [X]
    p_X_chain_R   = []

    for w,b in zip(weights_list, bias_list[1:]):
        # init with zeros
        hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w)))

    # The layer update scheme
    for i in range(2 * N * K):
        update_layers(hiddens_R, p_X_chain_R, noisy=False, autoregression=state.autoregression)

    f_recon = theano.function(inputs = [X], outputs = p_X_chain_R[-1]) 

    '''
    
    ######################
    # Sampling, round 2  #
    ######################
    
    # the input to the sampling function
    network_state_input     =   [X] + [T.fmatrix() for i in range(K)]
    # the first input will be a noisy number and zeros at the hidden layers; is this correct?
   
    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    #network_state_output    =   network_state_input
    # note: "list_b = list_a" only aliases the same list object; a new list is
    # built below so the output state can be updated independently of the input
    network_state_output    =   [X] + network_state_input[1:]
    

    visible_pX_chain        =   []

    #for i in range(2 * N * K):
    #    update_layers(network_state_output, visible_pX_chain, noisy=True, autoregression=False)

    # ONE update
    update_layers(network_state_output, visible_pX_chain, noisy=True, autoregression=False)

    # on_unused_input='warn' is needed because the first odd-layer inputs are unused:
    # they are computed directly from the even layers
    f_sample2   =   theano.function(inputs = network_state_input, outputs = network_state_output + visible_pX_chain, on_unused_input='warn')

    def sampling_wrapper(NSI):
        out             =   f_sample2(*NSI)
        NSO             =   out[:len(network_state_output)]
        vis_pX_chain    =   out[len(network_state_output):]
        return NSO, vis_pX_chain

    def sample_some_numbers(n_digits = 400):
        to_sample = time.time()
        # The network's initial state
        #init_vis    =   test_X.get_value()[:1]
        init_vis    =   test_X[:1]

        noisy_init_vis  =   f_noise(init_vis)

        network_state   =   [[noisy_init_vis] + [numpy.zeros((1,len(b.get_value())), dtype='float32') for b in bias_list[1:]]]

        visible_chain   =   [init_vis]

        noisy_h0_chain  =   [noisy_init_vis]

        for i in range(n_digits - 1):
           
            # feed the last state into the network, compute new state, and obtain visible units expectation chain 
            net_state_out, vis_pX_chain =   sampling_wrapper(network_state[-1])

            # append to the visible chain
            visible_chain   +=  vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)
            
            noisy_h0_chain.append(net_state_out[0])

        print 'Took ' + str(time.time() - to_sample) + ' to sample ' + str(n_digits) + ' digits'
        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)
    
    def plot_samples(epoch_number):
        V, H0 = sample_some_numbers()
        img_samples =   PIL.Image.fromarray(tile_raster_images(V, (root_N_input,root_N_input), (20,20)))
        
        fname       =   'samples_epoch_'+str(epoch_number)+'.png'
        img_samples.save(fname) 

    def save_params(n, params):
        fname   =   'params_epoch_'+str(n)+'.ft'
        f       =   open(fname, 'w')
        
        for p in params:
            ft.write(f, p.get_value(borrow=True))
       
        f.close() 


    def plot_one_digit(digit):
        plot_one    =   PIL.Image.fromarray(tile_raster_images(digit, (root_N_input,root_N_input), (1,1)))
        fname       =   'one_digit.png'
        plot_one.save(fname)
        os.system('eog one_digit.png')

    def inpainting(digit):
        # The network's initial state

        # NOISE INIT
        init_vis    =   cast32(numpy.random.uniform(size=digit.shape))

        #noisy_init_vis  =   f_noise(init_vis)
        #noisy_init_vis  =   cast32(numpy.random.uniform(size=init_vis.shape))

        # INDEXES FOR VISIBLE AND NOISY PART
        noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input/2))
        fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input/2))
        # function to re-init the visible to the same noise

        # FUNCTION TO RESET HALF VISIBLE TO DIGIT
        def reset_vis(V):
            V[0][fixed_idx] =   digit[0][fixed_idx]
            return V
        
        # INIT DIGIT : NOISE and RESET HALF TO DIGIT
        init_vis = reset_vis(init_vis)

        network_state   =   [[init_vis] + [numpy.zeros((1,len(b.get_value())), dtype='float32') for b in bias_list[1:]]]

        visible_chain   =   [init_vis]

        noisy_h0_chain  =   [init_vis]

        for i in range(49):
           
            # feed the last state into the network, compute new state, and obtain visible units expectation chain 
            net_state_out, vis_pX_chain =   sampling_wrapper(network_state[-1])


            # reset half the digit
            net_state_out[0] = reset_vis(net_state_out[0])
            vis_pX_chain[0]  = reset_vis(vis_pX_chain[0])

            # append to the visible chain
            visible_chain   +=  vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)
            
            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)
 

    #V_inpaint, H_inpaint = inpainting(test_X.get_value()[:1])
    #plot_one    =   PIL.Image.fromarray(tile_raster_images(V_inpaint, (root_N_input,root_N_input), (1,50)))
    #fname       =   'test.png'
    #plot_one.save(fname)
    #os.system('eog test.png')
                                   
   
    #get all digits, and do it a couple of times
    test_X  =   test_X.get_value()
    #test_Y  =   test_Y.get_value()

    numpy.random.seed(1)
    test_idx    =   numpy.arange(len(test_Y))


    for Iter in range(10):

        numpy.random.shuffle(test_idx)
        test_X = test_X[test_idx]
        test_Y = test_Y[test_idx]
    
        digit_idx = [(test_Y==i).argmax() for i in range(10)]
        inpaint_list = []

        for idx in digit_idx:
            DIGIT = test_X[idx:idx+1] 
            V_inpaint, H_inpaint = inpainting(DIGIT)
            inpaint_list.append(V_inpaint)

        INPAINTING  =   numpy.vstack(inpaint_list)

        plot_inpainting =   PIL.Image.fromarray(tile_raster_images(INPAINTING, (root_N_input,root_N_input), (10,50)))

        fname   =   'inpainting_'+str(Iter)+'.png'

        plot_inpainting.save(fname)

        if False and __name__ ==  "__main__":
            os.system('eog inpainting.png')



    # PARZEN 
    # Generating 10000 samples
    samples, _ = sample_some_numbers(n_digits=10000) 
    
    Mean, Std   =   main(state.sigma_parzen, samples, test_X)

    #plot_samples(999)
    #sample_numbers(counter, [])

    if __name__ == '__main__':
        return Mean, Std
        #import ipdb; ipdb.set_trace()
    
    return channel.COMPLETE
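# ---------------------------------------------------------------------------
# A small numpy rendering of the salt-and-pepper corruption that this file (and
# several of the other snippets) builds symbolically as IN * a + T.eq(a, 0) * b:
# a keep-mask `a` zeroes a fraction p of the entries, and the zeroed entries are
# then set to 0 or 1 with equal probability. Names here are illustrative only.
import numpy as np

def salt_and_pepper_np(x, p=0.2, rng=None):
    rng = rng or np.random.RandomState(1)
    a = rng.binomial(n=1, p=1 - p, size=x.shape)   # 1 = keep, 0 = corrupt
    b = rng.binomial(n=1, p=0.5, size=x.shape)     # salt (1) vs pepper (0)
    return x * a + (a == 0) * b
# ---------------------------------------------------------------------------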
示例#26
0
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/conv2d',
                 input_size=None,
                 filter_shape=None,
                 strides=(1, 1),
                 border_mode='valid',
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0,
                 activation='rectifier',
                 convolution='conv2d',
                 mrg=RNG_MRG.MRG_RandomStreams(1)):
        """
        Initialize a 2-dimensional convolutional layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables).
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : tuple
            Shape of the incoming data: (batch_size, num_channels, input_height, input_width).
            If input_size is None, it can be inferred. However, border_mode can't be 'same'.
        filter_shape : tuple
            (num_filters, num_channels, filter_height, filter_width). This is also the shape of the weights matrix.
        strides : tuple
            The distance between the receptive field centers of neighboring units along each dimension.
            This is the 'stride' of the convolution operation.
        border_mode : str, one of 'valid', 'full', 'same'
            A string indicating the convolution border mode.
            If 'valid', the convolution is only computed where the input and the
            filter fully overlap.
            If 'full', the convolution is computed wherever the input and the
            filter overlap by at least one position.
            If 'same', the convolution is computed wherever the input and the
            filter overlap by at least half the filter size, when the filter size
            is odd. In practice, the input is zero-padded with half the filter size
            at the beginning and half at the end (or one less than half in the case
            of an even filter size). This results in an output length that is the
            same as the input length (for both odd and even filter sizes).
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        activation : str or Callable
            The activation function to apply to the layer. See opendeep.utils.activation for options.
        convolution : str or Callable
            The 2-dimensional convolution implementation to use. The default of 'conv2d' is normally fine because it
            uses theano's tensor.nnet.conv.conv2d, which cherry-picks the best implementation with a meta-optimizer if
            you set the theano configuration flag 'optimizer_including=conv_meta'. Otherwise, you could pass a
            callable function, such as cudnn or cuda-convnet if you don't want to use the meta-optimizer.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.

        Notes
        -----
        Theano's default convolution function (`theano.tensor.nnet.conv.conv2d`)
        does not support the 'same' border mode by default. This layer emulates
        it by performing a 'full' convolution and then cropping the result, which
        may negatively affect performance.
        """
        super(Conv2D, self).__init__(
            **{arg: val for (arg, val) in locals().items() if arg != 'self'})

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        # expect input to be in the form (B, C, 0, 1) (batch, channel, rows, cols)
        # inputs_hook is a tuple of (Shape, Input)
        if self.inputs_hook:
            # make sure inputs_hook is a tuple
            assert len(
                self.inputs_hook
            ) == 2, "expecting inputs_hook to be tuple of (shape, input)"
            self.input = self.inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.ftensor4('X')

        # activation function!
        activation_func = get_activation_function(activation)

        # convolution function!
        if convolution == 'conv2d':
            # using the theano flag optimizer_including=conv_meta will let this conv function optimize itself.
            convolution_func = T.nnet.conv2d
        else:
            assert callable(
                convolution
            ), "Input convolution was not 'conv2d' and was not Callable."
            convolution_func = convolution

        # filter shape should be in the form (num_filters, num_channels, filter_size[0], filter_size[1])
        num_filters = filter_shape[0]
        filter_size = filter_shape[2:]  # (filter_height, filter_width)

        ################################################
        # Params - make sure to deal with params_hook! #
        ################################################
        if self.params_hook:
            # make sure the params_hook has W and b
            assert len(self.params_hook) == 2, \
                "Expected 2 params (W and b) for Conv2D, found {0!s}!".format(len(self.params_hook))
            W, b = self.params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=filter_shape,
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)

            b = get_bias(shape=(num_filters, ),
                         name="b",
                         init_values=bias_init)

        # Finally have the two parameters!
        self.params = [W, b]

        ########################
        # Computational Graph! #
        ########################
        if border_mode in ['valid', 'full']:
            conved = convolution_func(self.input,
                                      W,
                                      subsample=stride,
                                      image_shape=self.input_size,
                                      filter_shape=filter_shape,
                                      border_mode=border_mode)
        elif border_mode == 'same':
            assert self.input_size is not None, "input_size has to be specified for border_mode 'same'!"
            conved = convolution_func(self.input,
                                      W,
                                      subsample=stride,
                                      image_shape=self.input_size,
                                      filter_shape=filter_shape,
                                      border_mode='full')
            shift_x = (filter_size[0] - 1) // 2
            shift_y = (filter_size[1] - 1) // 2
            conved = conved[:, :, shift_x:self.input_size[2] + shift_x,
                            shift_y:self.input_size[3] + shift_y]
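            # e.g. a (B, C, 28, 28) input with a 5x5 filter: the 'full' convolution gives
            # (B, F, 32, 32); shift_x = shift_y = 2, so cropping [2:30, 2:30] recovers the
            # original 28x28 spatial size ('same' behavior).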
        else:
            raise RuntimeError("Invalid border mode: '%s'" % border_mode)

        self.output = activation_func(conved + b.dimshuffle('x', 0, 'x', 'x'))
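A minimal construction sketch for the Conv2D layer above. The input/filter shapes, the activation name, and the reliance on default arguments are illustrative assumptions, not values taken from the original code:

import theano.sandbox.rng_mrg as RNG_MRG

# Hypothetical usage; shapes and activation are assumptions.
conv = Conv2D(input_size=(None, 1, 28, 28),    # (batch, channels, rows, cols), batch size unknown
              filter_shape=(20, 1, 5, 5),      # 20 filters over 1 channel, 5x5 receptive field
              border_mode='same',              # output keeps the 28x28 spatial size (see Notes above)
              activation='tanh',
              mrg=RNG_MRG.MRG_RandomStreams(1))
# conv.input  : symbolic ftensor4 fed into the layer
# conv.output : symbolic (batch, 20, 28, 28) activation map
# conv.params : [W, b]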
Example #27
0
    def __init__(self,
                 inputs=None,
                 outputs=None,
                 params=None,
                 outdir='outputs/softmax',
                 weights_init='uniform',
                 weights_mean=0,
                 weights_std=5e-3,
                 weights_interval='glorot',
                 bias_init=0.0,
                 out_as_probs=True,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a Softmax layer.

        Parameters
        ----------
        inputs : List of [tuple(shape, `Theano.TensorType`)]
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `shape` is a tuple giving the known size of each dimension of the
            `Theano.TensorType`, so its length should equal the number of dimensions of that tensor. Each element
            is either an integer (the size of that dimension) or None if the size isn't known. For example, a matrix
            with an unknown batch size but a fixed feature size of 784 has `shape` (None, 784), and the full form of
            `inputs` would be [((None, 784), <TensorType(float32, matrix)>)].
        outputs : int
            The dimensionality of the output for this model.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        out_as_probs : bool
            Whether to output the probability distribution over all classes or the argmax prediction (the single
            predicted class index). True means output the distribution of size `outputs`; False means output the
            integer index of the class with the highest probability.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        # init the fully connected generic layer with a softmax activation function
        super(Softmax, self).__init__(inputs=inputs,
                                      outputs=outputs,
                                      params=params,
                                      outdir=outdir,
                                      activation='softmax',
                                      weights_init=weights_init,
                                      weights_mean=weights_mean,
                                      weights_std=weights_std,
                                      weights_interval=weights_interval,
                                      bias_init=bias_init,
                                      out_as_probs=out_as_probs,
                                      mrg=mrg,
                                      **kwargs)
        if self.inputs is None:
            return
        # the outputs of the layer are the probabilities of being in a given class
        self.p_y_given_x = super(Softmax, self).get_outputs()
        self.y_pred = argmax(self.p_y_given_x, axis=1)

        if out_as_probs:
            self.output = self.p_y_given_x
        else:
            self.output = self.y_pred

        self.out_as_probs = out_as_probs
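A minimal usage sketch for the Softmax layer above; the 784-dimensional input and 10 classes are illustrative assumptions:

import theano.tensor as T

# Hypothetical usage; the input tensor and sizes are assumptions.
x = T.fmatrix('x')                              # (batch, 784) feature matrix
softmax = Softmax(inputs=[((None, 784), x)],    # known feature size, unknown batch size
                  outputs=10,                   # 10 target classes
                  out_as_probs=False)           # output class indices rather than probabilities
# softmax.p_y_given_x : (batch, 10) class probabilities
# softmax.y_pred      : (batch,) argmax class indices (== softmax.output when out_as_probs=False)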
Example #28
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gsn/',
                 input_size=None,
                 hidden_size=1000,
                 layers=2,
                 walkbacks=4,
                 visible_activation='sigmoid',
                 hidden_activation='tanh',
                 input_sampling=True,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy',
                 cost_args=None,
                 add_noise=True,
                 noiseless_h1=True,
                 hidden_noise='gaussian',
                 hidden_noise_level=2,
                 input_noise='salt_and_pepper',
                 input_noise_level=0.4,
                 noise_decay='exponential',
                 noise_annealing=1,
                 image_width=None,
                 image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's
            output layer gives a generative recurrent model.) For now, it needs to include the shape
            information (normally the dimensionality of the hiddens i.e. n_hidden).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional.
            The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an
            unsupervised model. The output is a reconstruction of the input.
        hidden_size : int
            The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than
            `input_size`, which is known as *overcomplete*.
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer.
            This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample
            from the DAE, which means the model generates inputs in sequence, where each generated input is compared
            to the original input to create the reconstruction cost for training. For running the model, the very last
            generated input in the Gibbs chain is used as the output.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the next
            walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the
            process more stochastic - more likely to find spurious modes in the model's representation.
        mrg : random
            A random number generator that is used when adding noise into the network and for sampling from the input.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
            determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise to (corrupt) the input before passing it through the computation graph during
            training. This should most likely be left at the default of True, because this is a *denoising*
            autoencoder after all.
        noiseless_h1 : bool
            Whether to leave the first hidden layer uncorrupted (i.e. skip adding noise to it) during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise
            for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper
            for binary units, etc.
        input_noise_level : float
            The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper,
            standard deviation for Gaussian, interval for Uniform, etc.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If not specified, it will
            be set from the closest-to-square factorization of `input_size`.
        image_height : int
            If the input should be represented as an image, the height of the input image. If not specified, it will
            be set from the closest-to-square factorization of `input_size`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width = _w
            self.image_height = _h
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]

        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers
        if layers % 2 == 0:
            if walkbacks < 2 * layers:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generally want at least 2x the number of layers.', str(layers),
                    str(walkbacks))
        else:
            if walkbacks < 2 * layers - 1:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generally want at least 2x the number of layers.', str(layers),
                    str(walkbacks))
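        # e.g. layers=2 (even) -> recommended walkbacks >= 2*2 = 4 (the defaults satisfy this);
        #      layers=3 (odd)  -> recommended walkbacks >= 2*3 - 1 = 5.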

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(
            noise_annealing)  # noise schedule parameter
        self.hidden_noise_level = sharedX(hidden_noise_level,
                                          dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise,
                                      noise_level=self.hidden_noise_level,
                                      mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level,
                                         dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise,
                                     noise_level=self.input_noise_level,
                                     mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, unpack the hidden layers in the tensor
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hidden sizes and number of hidden layers mismatch. " + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size
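        # e.g. input_size=784, hidden_size=1000, layers=2 -> layer_sizes = [784, 1000, 1000]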

        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError(
                "Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
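        # e.g. with layers=3: tied weights expect 3 weight matrices + 4 biases = 7 params,
        #      untied weights expect 6 weight matrices + 4 biases = 10 params.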
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2 * layers]
                self.bias_list = self.params_hook[2 * layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                    name="W_{0!s}_{1!s}".format(i, i + 1),
                    rng=mrg,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for i in range(layers)
            ]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend([
                    get_weights(
                        weights_init=weights_init,
                        shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                        name="W_{0!s}_{1!s}".format(i + 1, i),
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
                    for i in reversed(range(layers))
                ])
            # initialize each layer bias to 0's.
            self.bias_list = [
                get_bias(shape=(self.layer_sizes[i], ),
                         name='b_' + str(i),
                         init_values=bias_init) for i in range(layers + 1)
            ]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph(
        )
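A minimal construction sketch for the GSN above; the MNIST-style input size and the other settings shown are illustrative assumptions:

import theano.sandbox.rng_mrg as RNG_MRG

# Hypothetical usage; sizes and noise settings are assumptions.
gsn = GSN(input_size=784,                  # e.g. flattened 28x28 binary images
          hidden_size=1000,
          layers=2,
          walkbacks=4,                     # at least 2*layers, per the warning above
          visible_activation='sigmoid',    # appropriate for binary inputs
          hidden_activation='tanh',
          input_noise='salt_and_pepper',
          input_noise_level=0.4,
          mrg=RNG_MRG.MRG_RandomStreams(1))
# gsn.output : the final sample of the Gibbs chain (the reconstruction of gsn.X)
# gsn.cost   : the reconstruction cost used for training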
Example #29
0
'''The programs and documents are distributed without any warranty, express or
 implied.  As the programs were written for research purposes only, they have
 not been tested to the degree that would be advisable in any important
 application.  All use of these programs is entirely at the user's own risk.'''
'''Demo of Generating images with recurrent adversarial networks.
For more information, see: http://arxiv.org/abs/1602.05110
'''

import os, sys, gzip, time, timeit, glob  # glob is needed for the file listings below
import theano
import numpy as np
import scipy as sp

import theano.sandbox.rng_mrg as RNG_MRG
rng = np.random.RandomState()
MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2**30))

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from optimize_gan import *
from gran import *
from utils import *
from util_cifar10 import *

#datapath='/data/lisa/data/cifar10/cifar-10-batches-py/'
#datapath='/eecs/research/asr/chris/DG_project/dataset/cifar-10-batches-py/'
datapath = '/home/daniel/Documents/data/cifar10/cifar-10-batches-py/'
''' Battle between two models M1 and M2'''
def run(rng_seed,
        ltype,
        mtype,
        load_path,
        load_epoch,
        verbose=False,
        ckernr=None,
        cri_ckern=None):

    assert ckernr is not None

    np_rng = np.random.RandomState(rng_seed)  # only for shuflling files
    import base.subnets.layers.utils as utils
    import base.subnets.layers.someconfigs as someconfigs
    someconfigs.backend = 'gpuarray'
    utils.rng = np.random.RandomState(
        rng_seed)  # for init network and corrupt images
    rng = utils.rng

    #  ltype -> GAN LSGAN WGAN
    #    JS      0.4+-asdf
    #    LS
    #    WA
    #    MMD
    #    IS

    ### MODEL PARAMS
    # ltype       = sys.argv[3]
    # mtype       = 'js'
    # print 'ltype: ' + ltype
    # print 'mtype: ' + mtype
    mmdF = False
    nndF = False

    # CONV (DISC)
    conv_num_hid = 100
    num_channel = 3  #Fixed
    num_class = 1  #Fixed
    D = 64 * 64 * 3
    kern = int(ckernr.split('_')[0])

    ### OPT PARAMS
    batch_sz = 100
    momentum = 0.0  #Not Used
    lam = 0.0

    epsilon_dis = 0.0002
    epsilon_gen = 0.0001

    # if mtype =='js' :
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=5 #FIXED
    #     J=1
    # elif mtype == 'ls':
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=5 #FIXED
    #     J=1
    # else:
    #     epsilon_dis = 0.0002
    #     epsilon_gen = 0.0001
    #     K=2 #FIXED
    #     J=1

    # ganI (GEN)
    filter_sz = 4  #FIXED
    nkerns = [8, 4, 2, 1, 3]
    ckern = int(ckernr.split('_')[-1])  #20
    num_hid1 = nkerns[0] * ckern * filter_sz * filter_sz  #Fixed
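    # e.g. ckernr='64_20' gives kern=64 (discriminator) and ckern=20 (generator),
    # so num_hid1 = 8 * 20 * 4 * 4 = 2560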
    num_steps = 3  # time steps
    num_z = 100

    lam1 = 0.000001
    ### TRAIN PARAMS
    num_epoch = 2
    epoch_start = 0  #Fixed
    contF = True  #Fixed

    num_hids = [num_hid1]

    input_width = 64
    input_height = 64
    input_depth = 3

    N = 1000
    Nv = N
    Nt = N  #Dummy variable

    ### SAVE PARAM
    model_param_save = 'num_hid%d.batch%d.eps_dis%g.eps_gen%g.num_z%d.num_epoch%g.lam%g.ts%d.data.100_CONV_lsun' % (
        conv_num_hid, batch_sz, epsilon_dis, epsilon_gen, num_z, num_epoch,
        lam1, num_steps)

    # device=sys.argv[1]
    import os
    os.environ['RNG_SEED'] = str(rng_seed)
    os.environ['LOAD_PATH'] = load_path
    os.environ['LOAD_EPOCH'] = str(load_epoch)
    os.environ['LTYPE'] = ltype
    # os.environ['MTYPE'] = mtype
    # use CRI_KERN from the environment if set; otherwise fall back to the cri_ckern argument
    try:
        os.environ['CRI_KERN']
    except KeyError:
        if cri_ckern is not None:
            os.environ['CRI_KERN'] = cri_ckern
        else:
            raise RuntimeError('cri_kern not provided')

    import theano
    import theano.sandbox.rng_mrg as RNG_MRG
    rng = np.random.RandomState(int(os.environ['RNG_SEED']))
    MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2**30))

    import pwd
    username = pwd.getpwuid(os.geteuid()).pw_name

    # if username=='djkim117':
    #     save_path = '/work/djkim117/params/gap/lsun/'
    #     datapath = '/work/djkim117/lsun/church/preprocessed_toy_100/'
    # elif username=='imj':
    #     datapath = '/work/djkim117/lsun/church/preprocessed_toy_100/'
    #     save_path = '/work/imj/gap/dcgans/lsun/dcgan4_100swap_30epoch_noise'
    if username == 'mahe6562':
        datapath = '/scratch/g/gwtaylor/mahe6562/data/lsun/bedroom/preprocessed_toy_100/'

    #---

    # store the filenames into a list.
    train_filenames = sorted(
        glob.glob(datapath + 'train_hkl_b100_b_100/*' + '.hkl'))

    #4.shuffle train data order for each worker
    indices = np_rng.permutation(len(train_filenames))
    train_filenames = np.array(train_filenames)[indices].tolist()
    #---

    valid_filenames = sorted(
        glob.glob(datapath + 'val_hkl_b100_b_100/*' + '.hkl'))
    test_filenames = sorted(
        glob.glob(datapath + 'test_hkl_b100_b_100/*' + '.hkl'))

    num_hids = [num_hid1]
    train_params = [
        num_epoch, epoch_start, contF, train_filenames, valid_filenames,
        test_filenames
    ]
    opt_params = [
        batch_sz, epsilon_gen, epsilon_dis, momentum, num_epoch, N, Nv, Nt,
        lam1
    ]
    ganI_params = [
        batch_sz, D, num_hids, rng, num_z, nkerns, ckern, num_channel,
        num_steps
    ]
    conv_params = [conv_num_hid, D, num_class, batch_sz, num_channel, kern]
    book_keeping = main(opt_params, ganI_params, train_params, conv_params)

    te_score_ls, te_score_iw, mmd_te = main(opt_params, ganI_params,
                                            train_params, conv_params)

    return te_score_ls, te_score_iw, mmd_te  #, is_sam
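A hypothetical invocation of `run`; the loss type, checkpoint path, epoch, and kernel string below are placeholders rather than values from the original experiments:

if __name__ == '__main__':
    # Placeholder arguments for illustration only.
    te_ls, te_iw, mmd = run(rng_seed=1234,
                            ltype='GAN',                    # loss type (GAN / LSGAN / WGAN, per the notes above)
                            mtype='js',
                            load_path='/path/to/saved/params',
                            load_epoch=30,
                            ckernr='64_20',                 # '<disc kern>_<gen ckern>'
                            cri_ckern='64')
    print(te_ls, te_iw, mmd)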