Example No. 1
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    R = MRG_RandomStreams(234, use_cuda=False)
    u = R.binomial(size=size, p=mean)
    f = theano.function(var_input, u, mode=mode)
    out = f(*input)

    # Increase the number of steps if the size implies only a few samples
    if numpy.prod(const_size) < 10:
        steps_ = steps * 100
    else:
        steps_ = steps
    basictest(f, steps_, const_size, prefix="mrg  cpu", inputs=input, allow_01=True, target_avg=mean, mean_rtol=rtol)

    if mode != "FAST_COMPILE" and cuda_available:
        R = MRG_RandomStreams(234, use_cuda=True)
        u = R.binomial(size=size, p=mean, dtype="float32")
        # this test with the GPU doesn't make sense otherwise
        assert u.dtype == "float32"
        f = theano.function(
            var_input, theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u), borrow=True), mode=mode_with_gpu
        )
        gpu_out = numpy.asarray(f(*input))

        basictest(
            f, steps_, const_size, prefix="mrg  gpu", inputs=input, allow_01=True, target_avg=mean, mean_rtol=rtol
        )
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)

    RR = theano.tensor.shared_randomstreams.RandomStreams(234)

    uu = RR.binomial(size=size, p=mean)
    ff = theano.function(var_input, uu, mode=mode)
    # It's not our problem if numpy generates 0 or 1
    basictest(ff, steps_, const_size, prefix="numpy", allow_01=True, inputs=input, target_avg=mean, mean_rtol=rtol)
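
A minimal standalone sketch of the property the test above checks: samples from MRG_RandomStreams.binomial should have a mean close to p. It assumes only that Theano is installed; the shape and seed here are arbitrary.

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

R = MRG_RandomStreams(234)
u = R.binomial(size=(1000, 50), p=0.3)   # 0/1 samples with mean p
f = theano.function([], u)
sample = f()
print(sample.mean())                     # expected to be close to 0.3
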
Example No. 2
    def get_fixed_var_descr(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """

        assert Y is not None

        batch_size = model.batch_size

        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            theano_rng = MRG_RandomStreams(2012+11+20)
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Example No. 3
def dropout(x, level, noise_shape=None, seed=None):
    '''Sets entries in `x` to zero at random,
    while rescaling the surviving entries by 1 / (1 - level).

    # Arguments
        x: tensor
        level: fraction of the entries in the tensor
            that will be set to 0.
        noise_shape: shape for randomly generated keep/drop flags,
            must be broadcastable to the shape of `x`
        seed: random seed to ensure determinism.
    '''
    if level < 0. or level >= 1:
        raise Exception('Dropout level must be in interval [0, 1[.')
    if seed is None:
        seed = np.random.randint(1, 10e6)

    rng = RandomStreams(seed=seed)
    retain_prob = 1. - level

    if noise_shape is None:
        random_tensor = rng.binomial(x.shape, p=retain_prob, dtype=x.dtype)
    else:
        random_tensor = rng.binomial(noise_shape, p=retain_prob, dtype=x.dtype)
        random_tensor = T.patternbroadcast(random_tensor, [dim == 1 for dim in noise_shape])

    x *= random_tensor
    x /= retain_prob
    return x
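
A small usage sketch for the dropout helper above, assuming Theano is installed and the function is imported together with its module-level np and RandomStreams:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = dropout(x, level=0.3, seed=42)            # keep probability 0.7, survivors rescaled by 1/0.7
f = theano.function([x], y)
print(f(np.ones((2, 4), dtype='float32')))    # surviving entries are roughly 1.43, the rest 0
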
def build_model(tparams, options):

    opt_ret = dict()

    trng = RandomStreams(1234)
    p = 0.5
    retain_prob = 1. - p
    print('dropout: {0}'.format(p))

    # description string: #words x #samples
    # text: text sentence
    # hypothesis: hypothesis sentence
    text_embedding = tensor.tensor3('text_embedding', dtype='float32')
    # text = tensor.matrix('text', dtype='int64')
    text_mask = tensor.matrix('text_mask', dtype='float32')
    hypothesis_embedding = tensor.tensor3('hypothesis_embedding', dtype='float32')
    # hypothesis = tensor.matrix('hypothesis', dtype='int64')
    hypothesis_mask = tensor.matrix('hypothesis_mask', dtype='float32')

    label = tensor.vector('label', dtype='int64')

    # encoder
    proj = get_layer(options['encoder'])[1](tparams, text_embedding, None, options,
                                            prefix='encoder',
                                            mask=text_mask)
    ctx = proj[0][-1]
    dec_ctx = ctx
    # dropout
    dec_ctx_dropped = dec_ctx
    dec_ctx_dropped *= trng.binomial(dec_ctx_dropped.shape, p=retain_prob, dtype=dec_ctx_dropped.dtype)
    dec_ctx_dropped /= retain_prob

    # decoder (hypothesis)
    proj_hypo = get_layer(options['decoder'])[1](tparams, hypothesis_embedding, dec_ctx, options,
                                             prefix='h_decode_t',
                                             mask=hypothesis_mask)
    proj_hypo_dropped = get_layer(options['decoder'])[1](tparams, hypothesis_embedding, dec_ctx_dropped, options,
                                             prefix='h_decode_t',
                                             mask=hypothesis_mask)
    hypo_ctx = proj_hypo[0][-1]
    hypo_ctx_dropped = proj_hypo_dropped[0][-1]
    # dropout
    hypo_ctx_dropped *= trng.binomial(hypo_ctx_dropped.shape, p=retain_prob, dtype=hypo_ctx_dropped.dtype)
    hypo_ctx_dropped /= retain_prob


    # cost (cross entropy)

    logit = get_layer('ff')[1](tparams, hypo_ctx, options, prefix='ff_logit', activ='tensor.nnet.sigmoid')
    logit_dropped = get_layer('ff')[1](tparams, hypo_ctx_dropped, options, prefix='ff_logit', activ='tensor.nnet.sigmoid')

    # flatten logit
    logit = logit.flatten()
    logit_dropped = logit_dropped.flatten()
    cost = binary_crossentropy(logit_dropped, label)
    cost = tensor.mean(cost)
    acc = tensor.mean(tensor.eq(tensor.round(logit), label))

    return text_embedding, text_mask, hypothesis_embedding, hypothesis_mask, label, cost, acc
Example No. 5
def get_sequence_dropout_mask(shape, p, stocdrop=False):
    srng = RandomStreams(seed=np.random.randint(1e6))
    if not stocdrop:
        return srng.binomial(size=shape, p=1.0 - p, dtype=floatX) / (1.0 - p)
    else:
        # FIXME assumes shape of dim (time steps, batch size, hidden size)
        col_mask = srng.binomial(size=(shape[0], shape[1], 1), p=1.0 - p, dtype=floatX)
        mask = T.tile(col_mask, (1, 1, shape[2]))
        return mask
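
A brief usage sketch for get_sequence_dropout_mask, assuming the function and its module-level floatX are importable and Theano is installed:

import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')                                  # (time steps, batch size, hidden size)
mask = get_sequence_dropout_mask(X.shape, p=0.3)    # inverted dropout: mask is pre-scaled by 1/0.7
f = theano.function([X], X * mask)
print(f(np.ones((4, 2, 5), dtype=theano.config.floatX)))
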
Example No. 6
def rbm_ais_gibbs_for_v(rbmA_params, rbmB_params, beta, v_sample, seed=23098):
    """
    Parameters
    ----------
    rbmA_params: list
        Parameters of the baserate model (usually infinite temperature). List
        should be of length 3 and contain numpy.ndarrays corresponding to model
        parameters (weights, visbias, hidbias).

    rbmB_params: list
        similar to rbmA_params, but for model at temperature 1.

    beta: theano.shared
        scalar, the inverse temperature at which we wish to sample.

    v_sample: theano.shared
        matrix of shape (n_runs, nvis), state of current particles.

    seed: int
        optional seed parameter for sampling from binomial units.
    """

    (weights_a, visbias_a, hidbias_a) = rbmA_params
    (weights_b, visbias_b, hidbias_b) = rbmB_params

    theano_rng = RandomStreams(seed)

    # equation 15 (Salakhutdinov & Murray 2008)
    ph_a = nnet.sigmoid(
        (1 - beta) * (tensor.dot(v_sample, weights_a) + hidbias_a))
    ha_sample = theano_rng.binomial(
        size=(v_sample.shape[0], len(hidbias_a)),
        n=1,
        p=ph_a,
        dtype=config.floatX)

    # equation 16 (Salakhutdinov & Murray 2008)
    ph_b = nnet.sigmoid(beta * (tensor.dot(v_sample, weights_b) + hidbias_b))
    hb_sample = theano_rng.binomial(
        size=(v_sample.shape[0], len(hidbias_b)),
        n=1,
        p=ph_b,
        dtype=config.floatX)

    # equation 17 (Salakhutdinov & Murray 2008)
    pv_act = (1 - beta) * (tensor.dot(ha_sample, weights_a.T) + visbias_a) + \
                beta * (tensor.dot(hb_sample, weights_b.T) + visbias_b)
    pv = nnet.sigmoid(pv_act)
    new_v_sample = theano_rng.binomial(
        size=(v_sample.shape[0], len(visbias_b)),
        n=1,
        p=pv,
        dtype=config.floatX)

    return new_v_sample
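
A hypothetical call sketch for rbm_ais_gibbs_for_v. The shapes and parameter values below are made up purely for illustration, and it assumes the function above is importable along with its own imports (tensor, nnet, RandomStreams, config):

import numpy as np
import theano
from theano import config

n_runs, nvis, nhid = 16, 784, 500
rbmA_params = [np.zeros((nvis, nhid), dtype=config.floatX),     # weights of the base-rate model
               np.zeros(nvis, dtype=config.floatX),             # visible biases
               np.zeros(nhid, dtype=config.floatX)]             # hidden biases
rbmB_params = [0.01 * np.random.randn(nvis, nhid).astype(config.floatX),
               np.zeros(nvis, dtype=config.floatX),
               np.zeros(nhid, dtype=config.floatX)]
beta = theano.shared(np.asarray(0.5, dtype=config.floatX), name='beta')
v_sample = theano.shared(np.random.binomial(1, 0.5, (n_runs, nvis)).astype(config.floatX))

new_v = rbm_ais_gibbs_for_v(rbmA_params, rbmB_params, beta, v_sample)
gibbs_step = theano.function([], new_v)
v_next = gibbs_step()    # one Gibbs sweep over all runs at inverse temperature beta
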
Example No. 7
 def compute_output(self, network, in_vw):
     p = network.find_hyperparameter(["dropout_probability",
                                      "probability",
                                      "p"],
                                     0)
     if p == 0:
         network.copy_variable(
             name="default",
             previous_variable=in_vw,
             tags={"output"},
         )
     else:
         rescale_factor = 1 / (1 - p)
         mask_shape = in_vw.shape
         if any(s is None for s in mask_shape):
             # NOTE: this uses symbolic shape - can be an issue with
             # theano.clone and random numbers
             # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
             warnings.warn("using symbolic shape for dropout mask, "
                           "which can be an issue with theano.clone")
             mask_shape = in_vw.variable.shape
         # TODO save this state so that we can seed the rng
         srng = MRG_RandomStreams()
         # keep each unit with probability 1 - p, then rescale by 1 / (1 - p)
         mask = rescale_factor * srng.binomial(mask_shape, p=1 - p, dtype=floatX)
         network.create_variable(
             "default",
             variable=in_vw.variable * mask,
             shape=in_vw.shape,
             tags={"output"},
         )
Example No. 8
class SemMemModule(MergeLayer):
    # Semantic Memory Module (= Word Embedding Layer)
    # The Lasagne library has MergeLayer, a basic layer class that accepts multiple inputs.
    # The Semantic Memory Module and its parameters are shared with the Input Module and the Question Module.
    # Therefore, it may not act as an ordinary feed-forward layer and needs extra code to be trained.
    def __init__(self, incomings, voc_size, hid_state_size, W=Normal(), **kwargs):
        # Initialize parameters and create theano variables
        super(SemMemModule, self).__init__(incomings, **kwargs)
        self.hid_state_size = hid_state_size
        self.W = self.add_param(W, (voc_size, hid_state_size), name='Word_Embedding', regularizable=False)
        self.rand_stream = RandomStreams(np.random.randint(1, 2147462579))
    
    def get_output_shape_for(self, input_shapes):
        # Define output shape for certain input shapes (helps debugging)
        return (None, None, self.hid_state_size)

    def get_output_for(self, inputs, **kwargs):
        # Core part that actually describes how the theano variables work to produce the output
        # input is in the shape of (batch, sentence, word)
        # word_dropout is the variable that determines the proportion of words to be masked to 0-vectors
        input         = inputs[0]
        word_dropout  = inputs[1]

        # Apply the input tensor to the word embedding matrix and the word_dropout mask, then
        # flatten to shape (batch*sentence, word, hid_state) so it fits the GRU library.
        embedded = T.reshape(self.W[input], (-1, input.shape[2], self.hid_state_size))
        word_mask = self.rand_stream.binomial((input.shape[0] * input.shape[1], input.shape[2]),
                                              p=1 - word_dropout, dtype=theano.config.floatX)
        return embedded * word_mask.dimshuffle((0, 1, 'x'))
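
A rough usage sketch for SemMemModule under a few assumptions: Lasagne is installed, the class above is importable together with its module's imports, and the word-dropout rate is fed in as a scalar input. The vocabulary size, hidden size, and index tensor below are made up for illustration.

import numpy as np
import theano
import theano.tensor as T
import lasagne

idx = T.itensor3('idx')                 # word indices of shape (batch, sentence, word)
word_dropout = T.scalar('word_dropout')

l_idx = lasagne.layers.InputLayer((None, None, None), input_var=idx)
l_drop = lasagne.layers.InputLayer((), input_var=word_dropout)
sem = SemMemModule([l_idx, l_drop], voc_size=1000, hid_state_size=32)

emb = sem.get_output_for([idx, word_dropout])   # (batch*sentence, word, hid_state)
f = theano.function([idx, word_dropout], emb)
out = f(np.zeros((2, 3, 4), dtype='int32'), 0.1)
print(out.shape)                                # (6, 4, 32)
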
Example No. 9
class SimpleModel(Model):

    def __init__(self, nvis, num_hid, num_class):
        self.__dict__.update(locals())
        del self.self

        self.input_space = VectorSpace(nvis)
        self.output_space = VectorSpace(num_class)
        self.theano_rng = MRG_RandomStreams(2012 + 10 + 16)
        rng = np.random.RandomState([16,10,2012])

        self.W = sharedX(rng.uniform(-.05,.05,(nvis, num_hid)))
        self.hb = sharedX(np.zeros((num_hid,)) - 1.)
        self.V = sharedX(rng.uniform(-.05,.05,(num_hid, num_class)))
        self.cb = sharedX(np.zeros((num_class,)))

        self._params = [self.W, self.hb, self.V, self.cb ]

    def get_weights(self):
        return self.W.get_value()

    def get_weights_format(self):
        return ('v','h')

    def emit(self, X):

        Z = T.dot(X, self.W) + self.hb
        exp_H = T.nnet.sigmoid(Z)
        H = self.theano_rng.binomial(p = exp_H, n = 1, size = exp_H.shape, dtype = exp_H.dtype)

        Zc = T.dot(H, self.V) + self.cb

        return exp_H, H, Zc
Example No. 10
class BitFlip:

    def __init__(self,  nvis, prob):
        """ A conditional distribution that flips
        bits
        """

        self.__dict__.update(locals())
        del self.self

        self.s_rng = RandomStreams(17)

    def random_design_matrix(self, X):

        flip =  self.s_rng.binomial(size=X.shape, n = 1, p = self.prob, dtype=config.floatX)

        return X * (1-flip) + (1-X)*flip

    def is_symmetric(self):
        """ A property of conditional distributions
        P(Y|X)
        Return true if P(y|x) = P(x|y) for all x,y
        """

        return True
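
A small sanity-check sketch for BitFlip, assuming the class above and its imports (RandomStreams, config) are available:

import numpy as np
import theano
import theano.tensor as T
from theano import config

X = T.matrix('X')
flipper = BitFlip(nvis=4, prob=0.25)       # flip each bit independently with probability 0.25
Y = flipper.random_design_matrix(X)
f = theano.function([X], Y)
batch = np.asarray([[0., 1., 0., 1.]], dtype=config.floatX)
print(f(batch))                            # each entry is either the input bit or its complement
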
def dropout(X, p=0.):
    srng = RandomStreams()
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
class SimpleBernoulliSampleLayer(lasagne.layers.Layer):
    """
    Simple sampling layer drawing samples from bernoulli distributions.

    Parameters
    ----------
    mean : :class:`Layer` instances
          Parameterizing the mean value of each bernoulli distribution
    seed : int
        seed to random stream
    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean,
                 seed=lasagne.random.get_rng().randint(1, 2147462579),
                 **kwargs):
        super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs)

        self._srng = RandomStreams(seed)

    def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)):
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shape):
        return input_shape

    def get_output_for(self, mu, **kwargs):
        return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype)
Example No. 13
class AdditiveMaskedDiagonalMND:

    def __init__(self, init_beta, nvis, prob):
        """ A conditional distribution that adds
        gaussian noise with diagonal precision
        matrix beta to another variable that it
        conditions on
        """

        self.__dict__.update(locals())
        del self.self

        self.beta = sharedX(np.ones((nvis,))*init_beta)
        assert self.beta.ndim == 1

        self.s_rng = RandomStreams(17)

    def random_design_matrix(self, X):
        """ X: a theano variable containing a design matrix
        of observations of the random vector to condition on."""
        Z = self.s_rng.normal(size=X.shape,
                              avg=0., std=1./T.sqrt(self.beta), dtype=config.floatX)

        mask = self.s_rng.binomial(size=X.shape, n = 1, p = self.prob, dtype=config.floatX)

        return X+mask*Z

    def is_symmetric(self):
        """ A property of conditional distributions
        P(Y|X)
        Return true if P(y|x) = P(x|y) for all x,y
        """

        return True
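
A quick usage sketch for AdditiveMaskedDiagonalMND, assuming the class above is importable along with its module's sharedX (e.g. pylearn2's), RandomStreams, T, np, and config:

import numpy as np
import theano
import theano.tensor as T
from theano import config

X = T.matrix('X')
noiser = AdditiveMaskedDiagonalMND(init_beta=4., nvis=3, prob=0.5)
Y = noiser.random_design_matrix(X)
f = theano.function([X], Y)
batch = np.zeros((2, 3), dtype=config.floatX)
print(f(batch))    # roughly half the entries receive zero-mean noise with precision beta, the rest stay 0
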
Example No. 14
class FullyConnectedLayer(object):
    """Used to create a fully connected layer of neurons."""
    
    def __init__(self, n_inputs, n_outputs, dropout_rate = 0.0, activation_fn=sigmoid):
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.dropout_rate = dropout_rate
        self.activation_fn = activation_fn
        self.is_convolutional = False
        
        # Initializing weights and biases to samples from normal Gaussian
        self.w = shared(rng.normal(0,1.0/(self.n_outputs*(1-self.dropout_rate)),(self.n_outputs,self.n_inputs)).astype(config.floatX), borrow=True)
        self.b = shared(rng.normal(0,1.0,(self.n_outputs,)).astype(config.floatX), borrow=True)
        self.params = [self.w, self.b]
        
        self.nrg = RandomStreams()
    
    def set_inpt(self, inpt, training):
        self.inpt = T.flatten(inpt, 2)
        if training:
            # Bernoulli mask with keep probability 1 - dropout_rate, applied to the flattened input
            bern = self.nrg.binomial(size=T.shape(self.inpt), p=1.0-self.dropout_rate, ndim=2).astype(config.floatX)
            self.inpt = self.inpt*bern
        else:
            # At test time, scale activations by the keep probability instead of masking
            self.inpt = self.inpt*(1-self.dropout_rate)
        
        self.output = T.dot(self.w, self.inpt.T).T + self.b.dimshuffle('x',0)
        return self.activation_fn(self.output)
Example No. 15
class Dropout(MaskedLayer):
    '''
        Hinton's dropout.
    '''
    def __init__(self, p):
        super(Dropout, self).__init__()
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        X = self.get_input(train)
        if self.p > 0.:
            retain_prob = 1. - self.p
            if train:
                X *= self.srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
            else:
                X *= retain_prob
        return X

    def calc_output_dims(self, lastdims):
        return lastdims

    def get_config(self):
        return {"name": self.__class__.__name__,
                "p": self.p}
Example No. 16
class Dropout(object):
    def __init__(self, input, p, is_train_stage):
        self.p = p
        self.srng = RandomStreams(seed = np.random.randint(10e6))
        self.output = T.switch(is_train_stage,
            input * self.srng.binomial(input.shape, p = p, dtype = theano.config.floatX) / (1 - self.p),
            input)
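
A short usage sketch for the switch-based Dropout class above, assuming Theano is installed and the class is importable with its module's np and RandomStreams:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
is_train = T.iscalar('is_train')                 # 1 during training, 0 at test time
layer = Dropout(input=x, p=0.5, is_train_stage=is_train)
f = theano.function([x, is_train], layer.output)

data = np.ones((2, 3), dtype=theano.config.floatX)
print(f(data, 1))    # training mode: random mask, surviving entries scaled to 2.0
print(f(data, 0))    # test mode: input passed through unchanged
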
Example No. 17
class SampleBernoulli(Layer):
    """
    Layer which samples a Bernoulli distribution whose statistics (mean, 'p') are given 
    as inputs to the layer.

    :param mode:    'maximum_likelihood' for maximum likelihood sample, 
                    'random' for random sample,
                    'mean_field' for mean-field approximation.
    """
    def __init__(self, mode='maximum_likelihood'):
        super(SampleBernoulli, self).__init__()
        self.mode = mode
        if self.mode == 'random':
            self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        p = self.get_input(train)
        if self.mode == 'maximum_likelihood':
            # draw maximum likelihood sample from Bernoulli distribution
            #    x* = argmax_x p(x) = 1         if p(x=1) >= 0.5
            #                         0         otherwise
            return T.round(p, mode='half_away_from_zero')
        elif self.mode == 'random':
            # draw random sample from Bernoulli distribution
            #    x* = x ~ p(x) = 1              if p(x=1) > uniform(0, 1)
            #                    0              otherwise
            return self.srng.binomial(size=p.shape, n=1, p=p, dtype=theano.config.floatX)
        elif self.mode == 'mean_field':
            # draw mean-field approximation sample from Bernoulli distribution
            #    x* = E[p(x)] = E[Bern(x; p)] = p
            return p
        else:
            raise NotImplementedError('Unknown sample mode!')
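
The three modes described in the docstring reduce to three different Theano expressions on the same probability tensor. A standalone sketch of just that math, without the Layer machinery above (assuming only Theano):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

p = T.matrix('p')                     # Bernoulli means in [0, 1]
srng = RandomStreams(seed=123)

ml = T.round(p, mode='half_away_from_zero')                               # maximum_likelihood: 1 iff p >= 0.5
rnd = srng.binomial(size=p.shape, n=1, p=p, dtype=theano.config.floatX)   # random: sample x ~ Bern(p)
mf = p                                                                    # mean_field: pass the mean through

f = theano.function([p], [ml, rnd, mf])
print(f(np.asarray([[0.1, 0.5, 0.9]], dtype=theano.config.floatX)))
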
Example No. 18
class DropOp(Layer):
    """
    This layers randomly drops elements of the input by multiplying with a
    mask sampled from a binomial distribution
    """
    def __init__(self, rng = None, name=None, dropout=1.):
        super(DropOp, self).__init__(0, 0, None, name)
        self.dropout = dropout
        if dropout < 1.:
            self.trng = RandomStreams(rng.randint(1e5))

    def fprop(self, state_below, use_noise = True):
        print 'dropop use noise:', use_noise
        self.out = state_below
        if self.dropout < 1.:
            if use_noise:
                print 'training use noise'
                self.out = self.out * self.trng.binomial(self.out.shape,
                                                         n=1,
                                                         p=self.dropout,
                                                         dtype=self.out.dtype)
            else:
                print 'decoding not use noise'
                self.out = self.out * self.dropout
        return self.out
Example No. 19
 def compute_output(self, network, in_vw):
     deterministic = network.find_hyperparameter(["deterministic"])
     p = network.find_hyperparameter(["dropout_probability", "probability", "p"], 0)
     if deterministic or p == 0:
         network.copy_vw(name="default", previous_vw=in_vw, tags={"output"})
     else:
         rescale_factor = 1 / (1 - p)
         mask_shape = in_vw.shape
         if any(s is None for s in mask_shape):
             # NOTE: this uses symbolic shape - can be an issue with
             # theano.clone and random numbers
             # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
             warnings.warn("using symbolic shape for dropout mask, " "which can be an issue with theano.clone")
             mask_shape = in_vw.symbolic_shape()
         # FIXME generalize to other shape dimensions.
         # assume this is of the form bc01 (batch, channel, width, height)
         mask_shape = mask_shape[:2]
         # TODO save this state so that we can seed the rng
         srng = MRG_RandomStreams()
         # set bernoulli probability to be inverse of dropout probability
         # because 1 means to keep the unit
         bernoulli_prob = 1 - p
         mask = rescale_factor * srng.binomial(mask_shape, p=bernoulli_prob, dtype=fX)
         mask = mask.dimshuffle(0, 1, "x", "x")
         network.create_vw("default", variable=in_vw.variable * mask, shape=in_vw.shape, tags={"output"})
 def output(self, x, a):
     p = self.p
     srng = RandomStreams()
     if p > 0:
         retain_prob = 1 - p
         x *= srng.binomial(x.shape, p=retain_prob, dtype=theano.config.floatX)
         x /= retain_prob
     return x
Example No. 21
class Dropout:

    def __init__(self, inp, p):
        # NOTE need to set p to 0 during testing
        self.srng = RandomStreams(seed=np.random.randint(1e6))
        self.p = p
        self.inp = inp
        self.out = self.inp * self.srng.binomial(size=self.inp.shape, p=1.0 - self.p, dtype=floatX) / (1.0 - self.p)
Example No. 22
def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=1.-p, size=x.shape, dtype=theano.config.floatX)
        return x * mask
    return x
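
A brief usage sketch for the dropout helper above, assuming Theano is installed and the function is imported with its module's RandomStreams. Note that this variant masks without the 1/(1-p) rescaling used by the other examples:

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(1234)
x = T.matrix('x')
y = dropout(rng, x, p=0.25)      # zero out roughly 25% of the entries
f = theano.function([x], y)
print(f(np.ones((2, 4), dtype=theano.config.floatX)))
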
Example No. 23
    def __init__(self, x, p=0.5):
        use_noise = theano.shared(numpy_floatX(0.))
        trng = RandomStreams(415)

        self.output = T.switch(use_noise,
                               (x * trng.binomial(x.shape, p=p, n=1, dtype=x.dtype)),
                               x * p
                               )
Example No. 24
class WordDropoutLayer(Layer):
    """Dropout layer
    Sets values to zero with probability p. See notes for disabling dropout
    during testing.
    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        the layer feeding into this layer, or the expected input shape
    p : float or scalar tensor
        The probability of setting a value to zero
    rescale : bool
        If true the input is rescaled with input / (1-p) when deterministic
        is False.
    Notes
    -----
    The dropout layer is a regularizer that randomly sets input values to
    zero; see [1]_, [2]_ for why this might improve generalization.
    During training you should set deterministic to false and during
    testing you should set deterministic to true.
    If rescale is true the input is scaled with input / (1-p) when
    deterministic is false, see references for further discussion. Note that
    this implementation scales the input at training time.
    References
    ----------
    .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I.,
           Salakhutdinov, R. R. (2012):
           Improving neural networks by preventing co-adaptation of feature
           detectors. arXiv preprint arXiv:1207.0580.
    .. [2] Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., &
           Salakhutdinov, R. R. (2014):
           Dropout: A Simple Way to Prevent Neural Networks from Overfitting.
           Journal of Machine Learning Research, 15, 1929-1958.
    """
    def __init__(self, incoming, p=0.5, **kwargs):
        super(WordDropoutLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p = p

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        deterministic : bool
            If true dropout and scaling is disabled, see notes
        """
        if deterministic or self.p == 0:
            return input
        else:
            retain_prob = 1 - self.p
            # use nonsymbolic shape for dropout mask if possible
            input_shape = self.input_shape
            if any(s is None for s in input_shape):
                input_shape = input.shape

            return input * self._srng.binomial(input_shape, p=retain_prob, dtype='int32')
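
A minimal Lasagne usage sketch for WordDropoutLayer, assuming Lasagne is installed and the class above is importable together with its module's get_rng and RandomStreams; the shapes are made up for illustration:

import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 5), input_var=x)
l_drop = WordDropoutLayer(l_in, p=0.2)
out_train = lasagne.layers.get_output(l_drop)                       # stochastic 0/1 word mask
out_test = lasagne.layers.get_output(l_drop, deterministic=True)    # identity at test time
f = theano.function([x], [out_train, out_test])
print(f(np.ones((2, 5), dtype=theano.config.floatX)))
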
Example No. 25
def dropout(x, level, seed=None):
    if level < 0. or level >= 1:
        raise Exception('Dropout level must be in interval [0, 1[.')
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    retain_prob = 1. - level
    x *= rng.binomial(x.shape, p=retain_prob, dtype=x.dtype)
    x /= retain_prob
    return x
Example No. 26
def dropout_layer(state_before, use_noise, trng=None):
    if trng is None:
        trng = RandomStreams(1234)

    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype) * 2,
        state_before)
    return proj
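
A short usage sketch for dropout_layer, assuming the function above is importable with its module's RandomStreams (so the default trng is created) and Theano is installed:

import numpy as np
import theano
import theano.tensor as tensor

use_noise = theano.shared(np.asarray(1., dtype=theano.config.floatX))
x = tensor.matrix('x')
y = dropout_layer(x, use_noise)        # trng defaults to RandomStreams(1234)
f = theano.function([x], y)

data = np.ones((2, 3), dtype=theano.config.floatX)
print(f(data))           # training: binary mask times 2 (since p = 0.5)
use_noise.set_value(0.)
print(f(data))           # test: input returned unchanged
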
Example No. 27
class Dropout:

    # NOTE p here is the probability that we drop (zero out) unit

    def __init__(self, inp, p):
        # NOTE need to set p to 0 during testing
        self.srng = RandomStreams(seed=np.random.randint(1e6))
        self.p = p
        self.inp = inp
        self.out = self.inp * self.srng.binomial(size=self.inp.shape, p=1.0 - self.p, dtype=floatX) / (1.0 - self.p)
Example No. 28
    def get_cost(self, X, Y, **kwargs):

        # Dream
        theano_rng = MRG_RandomStreams(2012 + 12 + 18)
        exp_y = T.nnet.softmax(T.alloc(0., self.batch_size, self.n_classes) + self.gyb)
        dy = theano_rng.multinomial(pvals = exp_y, dtype='float32')
        dy = block_gradient(dy)
        exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b)
        dh2 = theano_rng.binomial(p = exp_h2, size = exp_h2.shape, dtype='float32')
        dh2 = block_gradient(dh2)
        exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b)
        dh1 = theano_rng.binomial(p = exp_h1, size = exp_h1.shape, dtype='float32')
        dh1 = block_gradient(dh1)
        exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb)
        dv = theano_rng.binomial(p = exp_v, size = exp_v.shape, dtype='float32')
        dv = block_gradient(dv)

        # Explanation of dream
        zh1, rh1 = self.infer_h1(dv)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = T.nnet.sigmoid(zh2)
        zy = T.dot(rh2, self.ryw) + self.ryb

        # Probability of dream
        dream_prob = sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) + softmax_prob(zy, dy)

        # Explanation of reality
        zh1, rh1 = self.infer_h1(X)
        rh1 = block_gradient(rh1)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = theano_rng.binomial(p = T.nnet.sigmoid(zh2), size = zh2.shape, dtype='float32')
        rh2 = block_gradient(rh2)

        # Probability of reality
        real_prob = softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb, Y) + \
                sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) + \
                sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) + \
                sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X)

        return - dream_prob - real_prob + .0001 * (
            T.sqr(self.gvw).sum() + T.sqr(self.gh1w).sum() + \
                    T.sqr(self.gh2w).sum()
                )
Example No. 29
class DropoutLayer:
    def __init__(self, p):
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e8))

    def apply(self, x, training_time):
        if training_time:
            return x * self.srng.binomial(x.shape, p=1 - self.p, dtype=theano.config.floatX) / (1 - self.p)

        return x
Example No. 30
def dropout(X, p=0.):
    """
    Regularize by randomly dropping out units with probability p
    to help prevent overfitting.
    """
    if p > 0:
        retain_prob = 1 - p
        srng = RandomStreams()
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
Example No. 31
    def __init__(self,
                 num_neurons,
                 id=-1,
                 distribution='binomial',
                 verbose=2,
                 options=None):

        if verbose >= 3:
            print("... Creating a " + distribution + " random layer of " +
                  "output_shape " + str(num_neurons))

        super(random_layer, self).__init__(id=id,
                                           type='random',
                                           verbose=verbose)
        rng = numpy.random
        srng = RandomStreams(rng.randint(1, 2147462468), use_cuda=None)

        if isinstance(num_neurons, int):
            num_neurons = (num_neurons, )

        if distribution == 'binomial':
            if not 'p' in options.keys():
                if verbose >= 3:
                    print("... Needs input p, by default assuming 0.5")
                p = 0.5
            else:
                p = options["p"]

            self.output = srng.binomial(n=1,
                                        p=p,
                                        size=num_neurons,
                                        dtype=theano.config.floatX)

        elif distribution == 'uniform':
            if not 'limits' in options.keys():
                if verbose >= 3:
                    print("... Needs limits, assuming default (0,1)")
                limits = (0, 1)
            else:
                limits = options['limits']

            self.output = srng.uniform(size=num_neurons,
                                       low=limits[0],
                                       high=limits[1],
                                       dtype=theano.config.floatX)

        elif distribution == 'gaussian' or distribution == 'normal':
            if not 'mu' in options.keys():
                if verbose >= 3:
                    print("... Needs mu, assuming default 0")
                mu = 0
            else:
                mu = options['mu']
            if not 'sigma' in options.keys():
                if verbose >= 3:
                    print("... Needs sigma, assuming default 1")
                sigma = 1
            else:
                sigma = options['sigma']

            self.output = srng.normal(size=num_neurons,
                                      avg=mu,
                                      std=sigma,
                                      dtype=theano.config.floatX)

        self.output_shape = num_neurons
        self.num_neurons = num_neurons

        if verbose >= 3:
            print("... Random layer is created with output shape " +
                  str(self.output_shape))
Example No. 32
class VAE:
    def __init__(self,
                 n_in,
                 n_hidden,
                 n_out,
                 n_hidden_decoder=None,
                 trans_func=rectify,
                 batch_size=100):
        self.n_in = n_in
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.l_in = InputLayer((batch_size, n_in))
        self.batch_size = batch_size
        self.transf = trans_func

        self.srng = RandomStreams()

        l_in_encoder = lasagne.layers.InputLayer(shape=(batch_size, n_in))
        l_in_decoder = lasagne.layers.InputLayer(shape=(batch_size, n_out))

        l_prev_encoder = l_in_encoder
        l_prev_decoder = l_in_decoder

        for i in range(len(n_hidden)):
            l_tmp_encoder = lasagne.layers.DenseLayer(l_prev_encoder,
                                                      num_units=n_hidden[i],
                                                      W=lasagne.init.Uniform(),
                                                      nonlinearity=self.transf)
            l_prev_encoder = l_tmp_encoder

        # because you might want a decoder that is not the mirror of the encoder
        if n_hidden_decoder is None:
            n_hidden_decoder = n_hidden
        self.n_hidden_decoder = n_hidden_decoder

        for i in range(len(n_hidden_decoder)):
            l_tmp_decoder = lasagne.layers.DenseLayer(
                l_prev_decoder,
                num_units=n_hidden_decoder[-(i + 1)],
                W=lasagne.init.Uniform(),
                nonlinearity=self.transf)

            l_prev_decoder = l_tmp_decoder

        l_in = lasagne.layers.InputLayer(shape=(batch_size, n_in))
        self.model = VAELayer(l_in,
                              encoder=l_prev_encoder,
                              decoder=l_prev_decoder,
                              latent_size=n_out,
                              x_distribution='bernoulli',
                              qz_distribution='gaussian',
                              pz_distribution='gaussian')
        self.x = T.matrix('x')

    def build_model(self, train_x, test_x, valid_x, update, update_args):
        self.train_x = train_x
        self.test_x = test_x
        self.validation_x = valid_x
        self.update = update
        self.update_args = update_args
        self.index = T.iscalar('index')
        self.batch_slice = slice(self.index * self.batch_size,
                                 (self.index + 1) * self.batch_size)

        x = self.srng.binomial(size=self.x.shape, n=1, p=self.x)
        log_pz, log_qz_given_x, log_px_given_z = self.model.get_log_distributions(
            self.x)
        loss_eval = (log_pz + log_px_given_z - log_qz_given_x).sum()
        loss_eval /= self.batch_size

        all_params = get_all_params(self.model)
        updates = self.update(-loss_eval, all_params, *self.update_args)

        train_model = theano.function(
            [self.index],
            loss_eval,
            updates=updates,
            givens={
                self.x: self.train_x[self.batch_slice],
            },
        )

        test_model = theano.function(
            [self.index],
            loss_eval,
            givens={
                self.x: self.test_x[self.batch_slice],
            },
        )

        validate_model = theano.function(
            [self.index],
            loss_eval,
            givens={
                self.x: self.validation_x[self.batch_slice],
            },
        )

        return train_model, test_model, validate_model

    def draw_sample(self, z):
        return self.model.draw_sample(z)

    def get_output(self, dat):
        z, _, _ = self.model.get_z_mu_sigma(dat)
        return z

    def get_reconstruction(self, z):
        return self.model.decoder_output(z)
Example No. 33
class VisibleLayer(object):
    def __init__(self, v_dim, h_dim, v_type, mrng=None, rng=None, name=''):

        self.name = name if name != '' else 'v_layer'

        self.v_dim = v_dim
        self.h_dim = h_dim
        self.v_type = v_type

        seed = np.random.randint(1, 2**30)
        self._rng = RandomStreams(seed) if rng is None else rng
        self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng

        self._build_params()

    def set_total_count(self, total_count):
        if not (self.v_type == InputType.poisson):
            raise ValueError(
                "The input type should be Poisson to set total count")
        self.total_count = total_count

    def _build_params(self):
        # W to connect with hidden layer
        self.params = []
        if self.v_type == InputType.poisson:
            init_W = np.random.uniform(low=-1 / self.h_dim,
                                       high=1 / self.h_dim,
                                       size=(self.v_dim, self.h_dim))
            self.W = init_weight(self.v_dim,
                                 self.h_dim,
                                 value=init_W,
                                 name=self.name + '-W')
        else:
            self.W = init_weight(self.v_dim, self.h_dim, name=self.name + '-W')
        self.b_v = init_bias(self.v_dim, name=self.name + '-b_v')

        # For the binary, gaussian, and categorical cases
        self.params.extend([self.W, self.b_v])

        # The gaussian case has an additional sigma parameter
        if self.v_type == InputType.gaussian:
            self.sigma_v = T.ones(shape=(self.v_dim, ),
                                  dtype=theano.config.floatX)
            self.sigma_v.name = self.name + "-sigma_v"

    # Result in a vector of (n, 1)
    def v_free_term(self, v):
        if self.v_type == InputType.poisson:
            return -T.sum(T.gammaln(1 + v), axis=1)
        else:
            return 0

    # Result in a vector of (n, 1)
    def v_bias_term(self, v):
        # Note that for gaussian case, the v_bias should be negative
        if self.v_type == InputType.gaussian:
            return -T.sum((v - self.b_v)**2 / (2 * self.sigma_v**2), axis=1)
        else:
            return T.dot(v, self.b_v)

    # Result in a vector of (n, H)
    def v_weight_term(self, v):
        if self.v_type == InputType.gaussian:
            return T.dot((v / (self.sigma_v**2)), self.W)
        else:
            return T.dot(v, self.W)

    # Only support binary, gaussian and categorical
    def v_given_h(self, h):
        if self.v_type == InputType.binary:
            p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.v_type == InputType.gaussian:
            mu_v = self.b_v + T.dot(h, self.W.T)
            return mu_v

        elif self.v_type == InputType.categorical:
            p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.v_type == InputType.poisson:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError(
                    'Total count should be set for constrained Poisson')

            unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T))
            lmbd_v = unconstrained_lmbd_v * 1.0 / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \
                     * self.total_count
            return lmbd_v

    # Only support binary, gaussian and categorical
    def sample_v_given_h(self, h0_sample):
        if self.v_type == InputType.binary:
            v1_mean = self.v_given_h(h0_sample)
            v1_sample = self._mrng.binomial(size=v1_mean.shape,
                                            n=1,
                                            p=v1_mean,
                                            dtype=theano.config.floatX)
            return [v1_mean, v1_sample]

        elif self.v_type == InputType.gaussian:
            mu_v1 = self.v_given_h(h0_sample)  # Note that mu_v1 is returned

            v1_sample = self._mrng.normal(size=mu_v1.shape,
                                          avg=mu_v1,
                                          std=self.sigma_v,
                                          dtype=theano.config.floatX)
            return [mu_v1, v1_sample]

        # Note that there is constraint in the case of Multinomial
        elif self.v_type == InputType.categorical:
            prob_v1 = self.v_given_h(h0_sample)
            v1_sample = self._mrng.multinomial(pvals=prob_v1,
                                               n=1,
                                               dtype=theano.config.floatX)
            return [prob_v1, v1_sample]

        elif self.v_type == InputType.poisson:
            lmbd_v1 = self.v_given_h(h0_sample)
            # We have to use RandomStreams, not MRG_RandomStreams
            v1_sample = self._rng.poisson(size=lmbd_v1.shape,
                                          lam=lmbd_v1,
                                          dtype=theano.config.floatX)
            return [lmbd_v1, v1_sample]

    def l1_grad(self, l1):
        gW = l1_grad(self.W, l1)
        return [gW, 0]

    def l2_grad(self, l2):
        gW = l2_grad(self.W, l2)
        return [gW, 0]

    def nll_grad_formula(self, v0, vk, h0, hk):
        n_instances = v0.shape[0]

        gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances

        if self.v_type == InputType.gaussian:
            gb_v = T.mean((vk - v0) / (self.sigma_v**2), axis=0)
            grads = [gW, gb_v]
        else:
            gb_v = T.mean(vk - v0, axis=0)
            grads = [gW, gb_v]

        return grads

    def get_viewed_cost(self, v0, vk_stat):
        # Binary cross-entropy
        cost = 0
        if self.v_type == InputType.binary:
            # Clip to avoid log(0)
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat) +
                          (1 - v0) * T.log(1 - clip_vk_stat),
                          axis=1)

        # Sum square error
        elif self.v_type == InputType.gaussian:
            cost = T.sum((v0 - vk_stat)**2, axis=1)

        # Categorical cross-entropy
        elif self.v_type == InputType.categorical:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1)

        elif self.v_type == InputType.poisson:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum(
                -vk_stat + v0 * T.log(clip_vk_stat) - T.gammaln(1 + v0),
                axis=1)

        return cost

    def get_params(self):
        return self.params
Example No. 34
    def __init__(self, rng, layer_id, shape, X, mask, use_noise=1, p=0.5):
        """
        Basic RNN with dropout

        Parameters
        ----------
        :param rng: can be generated as numpy.random.seed(123)

        :type layer_id: str
        :param layer_id: id of this layer

        :type shape: tuple
        :param shape: (in_size, out_size) where
                      in_size is the input dimension
                      out_size is the hidden units' dimension

        :type X: a 3D or 2D variable, mostly a 3D one
        :param X: model inputs

        :type mask: theano variable
        :param mask: model inputs

        :type use_noise: theano variable
        :param use_noise: whether dropout is random

        :type p: float
        :param p: dropout ratio
        """
        prefix = 'Basic' + layer_id
        self.in_size, self.hid_size = shape

        # weights for input
        self.W = init_weights(shape=(self.in_size, self.hid_size),
                              name=prefix + '#W')
        # weights for hidden states
        self.U = init_weights(shape=(self.hid_size, self.hid_size),
                              name=prefix + '#U')
        # bias
        self.b = init_bias(size=self.hid_size, name=prefix + '#b')

        self.X = X
        self.mask = mask

        nsteps = X.shape[0]
        if X.ndim == 3:
            n_samples = X.shape[1]
        else:
            n_samples = 1

        assert mask is not None

        def _slice(x, n, dim):
            if x.ndim == 3:
                return x[:, :, n * dim:(n + 1) * dim]
            return x[:, n * dim:(n + 1) * dim]

        def _step(x_t, m_t, h_tm1):
            """
            This function computes one step of the recurrent update in this basic RNN

            Parameters
            ----------
            :type m_t: (n_samples, )
            :param m_t: mask

            :type x_t: (n_samples, in_size)
            :param x_t: input at time t

            :type h_tm1: (n_samples, hid_size)
            :param h_tm1: hidden state at time (t - 1)
            """
            # h_t with size (n_samples, hid_size)
            preact = T.dot(x_t, self.W) + T.dot(h_tm1, self.U) + self.b
            h_t = T.tanh(preact)
            # consider the mask
            h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_tm1

            return h_t

        h, updates = theano.scan(
            fn=_step,
            sequences=[self.X, self.mask],
            outputs_info=[T.alloc(floatX(0.), n_samples, self.hid_size)])
        # h here is of size (t, n_samples, hid_size)
        if p > 0:
            trng = RandomStreams(rng.randint(999999))
            drop_mask = trng.binomial(size=h.shape,
                                      n=1,
                                      p=(1 - p),
                                      dtype=theano.config.floatX)
            self.activation = T.switch(T.eq(use_noise, 1), h * drop_mask,
                                       h * (1 - p))
        else:
            self.activation = h

        self.params = [self.W, self.U, self.b]
Example No. 35
class MixedRBM(Model):
    def __init__(self,
                 v_dim=784,
                 h_dim=500,
                 v_types=[],
                 v_indices=[],
                 b_h=None,
                 input_var=None,
                 mrng=None,
                 rng=None,
                 name='',
                 **kwargs):
        name = 'mixed_rbm' if name == '' else name

        super(MixedRBM, self).__init__(name, )
        self.input = T.matrix('input')
        self.n_instances = self.input.shape[0]

        model_file = kwargs.get('model_file')
        if model_file is not None:
            self.load(model_file)
            self._load_params()

        else:
            self.v_dim = v_dim
            self.h_dim = h_dim
            self.v_types = v_types
            self.v_indices = v_indices

            seed = np.random.randint(1, 2**30)
            self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng
            self._rng = RandomStreams(seed) if rng is None else rng
            self._rng = None

            if hasattr(self.v_indices[0], '__iter__'):
                self.v_ranges = self.v_indices
            else:
                self.v_ranges = [None] * len(self.v_indices)
                for i in xrange(len(self.v_indices)):
                    self.v_ranges[i] = range(self.v_indices[i], self.v_indices[i + 1]) \
                        if i < len(self.v_indices) - 1 else range(self.v_indices[i], v_dim)

            self.v_layers = []

            for i in xrange(len(self.v_ranges)):
                self.v_ranges[i] = np.asarray(self.v_ranges[i], dtype=np.int32)
                v_layer = VisibleLayer(v_dim=len(self.v_ranges[i]),
                                       h_dim=self.h_dim,
                                       v_type=self.v_types[i],
                                       name='v_layer({})'.format(i),
                                       mrng=self._mrng,
                                       rng=self._rng)
                if v_types[i] == InputType.poisson:
                    total_count = T.sum(self.input[:, self.v_ranges[i]],
                                        axis=1,
                                        keepdims=True)
                    v_layer.set_total_count(total_count)

                self.v_layers.append(v_layer)

            self._build_mask()
            self._build_params()

    def print_model_info(self):
        print "\nInfo of model {}".format(self.name)
        print "v_dims: {} | h_dim: {}".format(self.v_dim, self.h_dim)
        for i in xrange(len(self.v_types)):
            print "v_types: {} | v_ranges: {}".format(self.v_types[i],
                                                      self.v_ranges[i])

    def get_save(self):
        return [
            self.name, self.v_dim, self.h_dim, self.v_indices, self.v_types,
            self._mrng, self._rng, self.big_mask, self.v_ranges, self.v_layers,
            self.b_h
        ]

    def set_load(self, saved_data):
        [
            self.name, self.v_dim, self.h_dim, self.v_indices, self.v_types,
            self._mrng, self._rng, self.big_mask, self.v_ranges, self.v_layers,
            self.b_h
        ] = saved_data

    def _load_params(self):
        self.params = [self.b_h]
        for i in xrange(len(self.v_layers)):
            self.params.extend(self.v_layers[i].get_params())

    def _build_params(self):
        self.b_h = init_bias(dim=self.h_dim, name=self.name + '-b_h')
        self.params = [self.b_h]

        for i in xrange(len(self.v_layers)):
            self.params.extend(self.v_layers[i].get_params())

    def _build_mask(self):
        big_m = np.zeros((self.v_dim, self.v_dim), dtype=theano.config.floatX)
        k = 0
        for i in xrange(len(self.v_ranges)):
            for j in xrange(len(self.v_ranges[i])):
                big_m[k, self.v_ranges[i][j]] = 1
                k += 1
        # self.big_mask = theano.shared(big_m, name='big_mask')
        # Sparse mask
        self.big_mask = sparse.shared(sp.csc_matrix(big_m), name='big_mask')

    def encode(self, v_data):
        h_code = self.h_given_v(self.input)
        fn = theano.function([self.input], h_code)
        return fn(v_data)

    def get_weight(self):
        Ws = [v_layer.W for v_layer in self.v_layers]
        return sparse.structured_dot(self.big_mask.T, T.concatenate(Ws,
                                                                    axis=0))

    def _vs(self, v):
        # A silly mistake was made here that took forever to track down
        # return [v[v_range] for v_range in self.v_ranges]
        return [v[:, v_range] for v_range in self.v_ranges]

    def score(self, data):
        free_fn = theano.function([self.input], self.free_energy(self.input))
        return free_fn(data)

    # Energy from many v an 1 h
    def energy(self, vs, h):
        v_free = 0
        v_bias = 0
        v_weight = 0
        for i in xrange(len(vs)):
            v_free += self.v_layers[i].v_free_term(vs[i])
            v_bias += self.v_layers[i].v_bias_term(vs[i])
            v_weight += self.v_layers[i].v_weight_term(vs[i])

        return -(v_free + v_bias + v_weight * h + T.dot(h, self.b_h))

    def free_energy(self, v):
        v_free = 0
        v_bias = 0
        v_weight = 0

        vs = self._vs(v)
        for i in xrange(len(vs)):
            v_free += self.v_layers[i].v_free_term(vs[i])
            v_bias += self.v_layers[i].v_bias_term(vs[i])
            v_weight += self.v_layers[i].v_weight_term(vs[i])

        h_term = T.sum(T.log(1 + T.exp(v_weight + self.b_h)), axis=1)
        return -(v_bias + v_free + h_term)

    def v_given_h(self, h):
        vs_stat = []
        for i in xrange(len(self.v_layers)):
            vs_stat.append(self.v_layers[i].v_given_h(h))
        return sparse.structured_dot(T.concatenate(vs_stat, axis=1),
                                     self.big_mask)

    def h_given_v(self, v):
        vs = self._vs(v)

        v_weight = 0
        for i in xrange(len(vs)):
            v_weight += self.v_layers[i].v_weight_term(vs[i])

        p_h_v = T.nnet.sigmoid(v_weight + self.b_h)
        return p_h_v

    # vs0_sample is  list contain samples of each v_type
    def sample_h_given_v(self, v0_sample):
        h1_mean = self.h_given_v(v0_sample)
        h1_sample = self._mrng.binomial(size=h1_mean.shape,
                                        n=1,
                                        p=h1_mean,
                                        dtype=theano.config.floatX)
        return [h1_mean, h1_sample]

    # sample vs1 given h0_sample
    def sample_v_given_h(self, h0_sample):
        vs_stat = []
        vs_sample = []

        for i in xrange(len(self.v_layers)):
            v1_stat, v1_sample = self.v_layers[i].sample_v_given_h(h0_sample)

            vs_stat.append(v1_stat)
            vs_sample.append(v1_sample)

        v_stat = sparse.structured_dot(T.concatenate(vs_stat, axis=1),
                                       self.big_mask)
        v_sample = sparse.structured_dot(T.concatenate(vs_sample, axis=1),
                                         self.big_mask)

        return [v_stat, v_sample]

    # One step of gibbs sampling
    def gibbs_hvh(self, h0_sample):
        # Here we use v1_stat to show that it is sufficient statistics of v1
        [v1_stat, v1_sample] = self.sample_v_given_h(h0_sample)
        [h1_mean, h1_sample] = self.sample_h_given_v(v1_sample)

        return [v1_stat, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        [h1_mean, h1_sample] = self.sample_h_given_v(v0_sample)
        [v1_stat, v1_sample] = self.sample_v_given_h(h1_sample)

        return [h1_mean, h1_sample, v1_stat, v1_sample]

    def run_CD_from_h(self, k, data_h):
        start_h = T.matrix("start_h")
        # [v_stats, v_samples, h_means, h_samples], updates \
        outputs, updates \
            = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h],
                          n_steps=k, name="gibbs_hvh")
        CD_fn = theano.function([start_h],
                                outputs=outputs[-1],
                                updates=updates)
        return CD_fn(data_h)

    def run_CD_from_v(self, k, data_v):
        start_v = T.matrix("start_v")
        # [h_means, h_samples, v_stats, v_samples], updates \
        outputs, updates \
            = theano.scan(fn=self.gibbs_vhv, outputs_info=[None, None, None, start_v],
                          n_steps=k, name="gibbs_vhv")
        CD_fn = theano.function([start_v],
                                outputs=outputs[-1],
                                updates=updates)
        return CD_fn(data_v)

    # Return visible variables
    def _gibbs_vhv_to_v_fn(self, steps, persis_v, is_sample=True, name=''):
        [h_means, h_samples, v_stats, v_samples], updates \
            = theano.scan(self.gibbs_vhv,
                          outputs_info=[None, None, None, persis_v],
                          n_steps=steps,  # init_gibbs is used for initialization
                          name='gibbs_vhv')
        updates.update({persis_v: v_samples[-1]})
        if is_sample:
            gibbs_fn = theano.function([],
                                       v_samples[-1],
                                       updates=updates,
                                       name=name)
        else:
            gibbs_fn = theano.function([],
                                       v_stats[-1],
                                       updates=updates,
                                       name=name)
        return gibbs_fn

    # Also return visible variables
    def _gibbs_hvh_to_v_fn(self, steps, persis_h, is_sample=True, name=''):
        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(self.gibbs_hvh,
                          outputs_info=[None, None, None, persis_h],
                          n_steps=steps,  # init_gibbs is used to initialize the chain
                          name='gibbs_hvh')
        updates.update({persis_h: h_samples[-1]})
        if is_sample:
            gibbs_fn = theano.function([],
                                       v_samples[-1],
                                       updates=updates,
                                       name=name)
        else:
            gibbs_fn = theano.function([],
                                       v_stats[-1],
                                       updates=updates,
                                       name=name)
        return gibbs_fn

    def sample_given_input(self,
                           input_x,
                           init_gibbs=1000,
                           betw_gibbs=100,
                           loops=10,
                           is_sample=False):
        print "Sample data from input using model {}".format(self.name)
        # If the input is 1-dimensional, reshape it to 2 dimensions
        if len(input_x.shape) == 1:
            persis_v = theano.shared(
                np.asarray(input_x.reshape(1, input_x.shape[0]),
                           dtype=theano.config.floatX))
        else:
            persis_v = theano.shared(
                np.asarray(input_x, dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_vhv_to_v_fn(init_gibbs,
                                                       persis_v,
                                                       is_sample=True,
                                                       name='init_sampling_fn')
        else:
            init_sampling_fn = None

        sample_fn = self._gibbs_vhv_to_v_fn(betw_gibbs,
                                            persis_v,
                                            is_sample=is_sample,
                                            name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print "Running sampling loop %d" % idx
            rv_data = sample_fn()
            rvs_data.append(rv_data)

        return np.asarray(rvs_data)
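
    # Hypothetical usage sketch (input_x assumed to be a numpy array whose
    # rows are visible vectors):
    #
    #     samples = model.sample_given_input(input_x, init_gibbs=1000,
    #                                        betw_gibbs=100, loops=10)
    #
    # This burns in the persistent chain for init_gibbs steps, then records
    # the visible state every betw_gibbs steps, returning an array of shape
    # (loops, n_rows, visible_dim).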

    # Sample randomly
    # We start from h and run the Gibbs chain until it reaches equilibrium
    def sample(self,
               init_gibbs=1000,
               betw_gibbs=100,
               n_samples=20,
               loops=10,
               is_sample=False):
        print "Sample random data using model {}".format(self.name)
        persis_h = theano.shared(
            np.zeros((n_samples, self.h_dim), dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_hvh_to_v_fn(init_gibbs,
                                                       persis_h,
                                                       is_sample=True,
                                                       name='init_sampling_fn')
        else:
            init_sampling_fn = None
        sample_fn = self._gibbs_hvh_to_v_fn(betw_gibbs,
                                            persis_h,
                                            is_sample=is_sample,
                                            name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print "Running sampling loop %d" % idx
            rv_data = sample_fn()
            rvs_data.append(rv_data)

        return np.asarray(rvs_data)

    def get_cost_updates(self, lr, k, persis_h, l1, l2, stable_update,
                         store_grad):
        # Run one sample step to get h
        h_mean, h_sample = self.sample_h_given_v(self.input)

        # Run normal CD
        start_h = persis_h if persis_h is not None else h_sample

        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h],
                          n_steps=k, name="gibbs_hvh")

        vk = v_samples[-1]
        v_stat_k = v_stats[-1]

        if persis_h is not None:
            updates[persis_h] = h_samples[-1]

        cost = self.get_viewed_cost(self.input, v_stat_k)
        cost = T.mean(cost)

        # For stable update, use mean value instead of random sampled value
        if stable_update:
            print "\nStable update is set to be True"
            updates = self.params_updates(self.input, v_stat_k, lr, l1, l2,
                                          updates, store_grad)
        else:
            print "\nStable update is set to be False"
            updates = self.params_updates(self.input, vk, lr, l1, l2, updates,
                                          store_grad)

        return cost, updates

    def get_viewed_cost(self, v0, v_stat):
        cost = 0

        vs0 = self._vs(v0)
        vs_stat = self._vs(v_stat)

        for i in xrange(len(self.v_layers)):
            type_cost = self.v_layers[i].get_viewed_cost(vs0[i], vs_stat[i])
            cost += type_cost

        return cost

    def nll_grad_formula(self, v0, vk):
        h0 = self.h_given_v(v0)
        hk = self.h_given_v(vk)

        gb_h = T.mean(hk - h0, axis=0)
        grads = [gb_h]

        vs0 = self._vs(v0)
        vsk = self._vs(vk)

        for i in xrange(len(self.v_layers)):
            grads.extend(self.v_layers[i].nll_grad_formula(
                vs0[i], vsk[i], h0, hk))

        return grads
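
    # With the plain SGD rule `param := param - lr * grad` used in
    # params_updates, gb_h = mean(hk - h0) realises the usual CD-k bias update
    # b_h := b_h + lr * (<h>_data - <h>_model), i.e. positive-phase statistics
    # from the data minus negative-phase statistics from the chain.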

    def l1_grad(self, l1):
        grads = [0]
        for i in xrange(len(self.v_layers)):
            grads.extend(self.v_layers[i].l1_grad(l1))
        return grads

    def l2_grad(self, l2):
        grads = [0]
        for i in xrange(len(self.v_layers)):
            grads.extend(self.v_layers[i].l2_grad(l2))
        return grads

    def params_updates(self, v0, vk, lr, l1, l2, updates, store_grad):
        if updates is None:
            updates = OrderedDict()
        if store_grad:
            self.stored_grads = OrderedDict()

        grads = [0 for _ in xrange(len(self.params))]

        o_grads = self.nll_grad_formula(v0, vk)
        grads = [grads[i] + o_grads[i] for i in xrange(len(self.params))]

        if store_grad:
            print "\nGradients over negative log-likelihood are stored in original_grads"
            o_shared_grads, updates = store_grads_in_update(
                self.params, o_grads, updates)
            self.stored_grads['original_grads'] = o_shared_grads

        if l1 is not None:
            print "Add L1 regularization ({}) to parameter updates".format(l1)
            l1_grads = self.l1_grad(l1)
            grads = [grads[i] + l1_grads[i] for i in xrange(len(self.params))]

            if store_grad:
                print "\nGradients over L1 regularization are stored in l1_grads"
                l1_shared_grads, updates = store_grads_in_update(
                    self.params, l1_grads, updates)
                self.stored_grads['l1_grads'] = l1_shared_grads

        if l2 is not None:
            print "Add L2 regularization ({}) to parameter updates".format(l2)
            l2_grads = self.l2_grad(l2)
            grads = [grads[i] + l2_grads[i] for i in xrange(len(self.params))]

            if store_grad:
                print "\nGradients over L2 regularization are stored in l2_grads"
                l2_shared_grads, updates = store_grads_in_update(
                    self.params, l2_grads, updates)
                self.stored_grads['l2_grads'] = l2_shared_grads

        if store_grad:
            print "\nGradients over total cost are stored in total_grads"
            t_shared_grads, updates = store_grads_in_update(
                self.params, grads, updates)
            self.stored_grads['total_grads'] = t_shared_grads

        grads = [grad.astype(theano.config.floatX) for grad in grads]

        if self.check_learning_algor():
            params_updates = self.learning_algor(grads, self.params, lr,
                                                 **self.learning_config)
            updates.update(params_updates)
        else:
            print "\nSimple SGD is used as training algorithm"
            for grad, param in zip(grads, self.params):
                updates[param] = param - grad * lr

        return updates

    def config_train(self, **kwargs):
        k = kwargs.get('CD_k')
        persis_h_data = kwargs.get('persis_h')
        l1 = kwargs.get('L1')
        l2 = kwargs.get('L2')

        if l1 is None:
            print "L1 should be set to enable sparse weight regularization"
        if l2 is None:
            print "L2 should be set to enable sparse weight regularization"

        stable_update = kwargs.get('stable_update')
        if stable_update is None:
            stable_update = False

        store_grad = kwargs.get('store_grad')
        if store_grad is None:
            store_grad = False

        self._build_train(k, persis_h_data, l1, l2, stable_update, store_grad)

    # persis_h_data is a numpy array
    def _build_train(self, k, persis_h_data, l1, l2, stable_update,
                     store_grad):
        persis_h = theano.shared(persis_h_data, borrow=True) \
            if persis_h_data is not None else None

        lr = T.scalar('lr')
        cost, updates = self.get_cost_updates(lr, k, persis_h, l1, l2,
                                              stable_update, store_grad)
        print "\nBuild computation graph for training function of model {}".format(
            self.name)
        self.train_fn = theano.function([self.input, lr],
                                        cost,
                                        updates=updates)

        rv = self.v_given_h(self.h_given_v(self.input))
        test_cost = self.get_viewed_cost(self.input, rv)
        test_cost = T.mean(test_cost)
        print "Build computation graph for validation function of model {}".format(
            self.name)
        self.valid_fn = theano.function([self.input], test_cost)
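
    # Hypothetical training sketch (x_batch, x_valid, n_epochs and lr_value
    # are assumed numpy arrays / plain Python values, not defined here):
    #
    #     model.config_train(CD_k=1, persis_h=None, L1=None, L2=None)
    #     for epoch in range(n_epochs):
    #         cost = model.train_fn(x_batch, lr_value)
    #         val_cost = model.valid_fn(x_valid)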
Exemplo n.º 36
0
rng = np.random.RandomState(123)
theano_rng = RandomStreams(rng.randint(2**30))

corruption_level = 0.1
training_epochs = 25
learning_rate = 0.1
batch_size = 128

W1 = init_weights(28 * 28, 900)
b1 = init_bias(900)
b1_prime = init_bias(28 * 28)
W1_prime = W1.transpose()
W2 = init_weights(900, 10)
b2 = init_bias(10)

tilde_x = theano_rng.binomial(
    size=x.shape, n=1, p=1 - corruption_level, dtype=theano.config.floatX) * x
y1 = T.nnet.sigmoid(T.dot(tilde_x, W1) + b1)
z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
cost1 = -T.mean(T.sum(x * T.log(z1) + (1 - x) * T.log(1 - z1), axis=1))

params1 = [W1, b1, b1_prime]
grads1 = T.grad(cost1, params1)
updates1 = [(param1, param1 - learning_rate * grad1)
            for param1, grad1 in zip(params1, grads1)]
train_da1 = theano.function(inputs=[x],
                            outputs=cost1,
                            updates=updates1,
                            allow_input_downcast=True)

p_y2 = T.nnet.softmax(T.dot(y1, W2) + b2)
y2 = T.argmax(p_y2, axis=1)
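
# Hypothetical training loop for the first denoising autoencoder layer
# (trX is an assumed (n_samples, 784) numpy array of training images):
#
#     for epoch in range(training_epochs):
#         costs = [train_da1(trX[i:i + batch_size])
#                  for i in range(0, len(trX), batch_size)]
#         print np.mean(costs)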
Exemplo n.º 37
0
class MultiLayer(Layer):
    """
    Implementing a standard feed forward MLP
    """
    def __init__(self,
                 rng,
                 n_in,
                 n_hids=[500, 500],
                 activation='TT.tanh',
                 scale=0.01,
                 sparsity=-1,
                 rank_n_approx=0,
                 rank_n_activ='lambda x: x',
                 weight_noise=False,
                 dropout=1.,
                 init_fn='sample_weights_classic',
                 bias_fn='init_bias',
                 bias_scale=0.,
                 learn_bias=True,
                 grad_scale=1.,
                 name=None):
        """
        :type rng: numpy random generator
        :param rng: numpy random generator

        :type n_in: int
        :param n_in: number of inputs units

        :type n_hids: list of ints
        :param n_hids: Number of hidden units on each layer of the MLP

        :type activation: string/function or list of
        :param activation: Activation function for the embedding layers. If
            a list it needs to have a value for each layer. If not, the same
            activation will be applied to all layers

        :type scale: float or list of
        :param scale: depending on the initialization function, it can be
            the standard deviation of the Gaussian from which the weights
            are sampled or the largest singular value. If a single value it
            will be used for each layer, otherwise it has to have one value
            for each layer

        :type sparsity: int or list of
        :param sparsity: if a single value, it will be used for each layer,
            otherwise it has to be a list with as many values as layers. If
            negative, it means the weight matrix is dense. Otherwise it
            means this many randomly selected input units are connected to
            an output unit

        :type rank_n_approx: int
        :param rank_n_approx: It applies to the first layer only. If
            positive and larger than 0, the first weight matrix is
            factorized into two matrices. The first one goes from input to
            `rank_n_approx` hidden units, the second from `rank_n_approx` to
            the number of units on the second layer

        :type rank_n_activ: string or function
        :param rank_n_activ: Function that is applied to the intermediary
            layer formed from factorizing the first weight matrix (Q: do we
            need this?)

        :type weight_noise: bool
        :param weight_noise: If true, the model is used with weight noise
            (and the corresponding shared variables are constructed, to keep track of the
            noise)

        :type dropout: float
        :param dropout: the probability with which hidden units are kept
            (units are dropped with probability 1 - dropout). If set to 1,
            dropout is not used

        :type init_fn: string or function
        :param init_fn: function used to initialize the weights of the
            layer. We recommend using either `sample_weights_classic` or
            `sample_weights` defined in the utils

        :type bias_fn: string or function
        :param bias_fn: function used to initialize the biases. We recommend
            using `init_bias` defined in the utils

        :type bias_scale: float
        :param bias_scale: argument passed to `bias_fn`, depicting the scale
            of the initial bias

        :type learn_bias: bool
        :param learn_bias: flag, saying if we should learn the bias or keep
            it constant


        :type grad_scale: float or theano scalar
        :param grad_scale: factor with which the gradients with respect to
            the parameters of this layer are scaled. It is used for
            differentiating between the different parameters of a model.

        :type name: string
        :param name: name of the layer (used to name parameters). NB: in
            this library names are very important because certain parts of the
            code rely on names to disambiguate between variables; therefore
            each layer should have a unique name.
        """

        assert rank_n_approx >= 0, "Please enter a valid rank_n_approx"
        self.rank_n_approx = rank_n_approx

        if isinstance(rank_n_activ, (str, unicode)):
            rank_n_activ = eval(rank_n_activ)
        self.rank_n_activ = rank_n_activ
        if type(n_hids) not in (list, tuple):
            n_hids = [n_hids]
        n_layers = len(n_hids)
        self.n_layers = n_layers
        if type(scale) not in (list, tuple):
            scale = [scale] * n_layers
        if type(sparsity) not in (list, tuple):
            sparsity = [sparsity] * n_layers
        for idx, sp in enumerate(sparsity):
            if sp < 0: sparsity[idx] = n_hids[idx]
        if type(activation) not in (list, tuple):
            activation = [activation] * n_layers
        if type(bias_scale) not in (list, tuple):
            bias_scale = [bias_scale] * n_layers
        if bias_fn not in (list, tuple):
            bias_fn = [bias_fn] * n_layers
        if init_fn not in (list, tuple):
            init_fn = [init_fn] * n_layers

        for dx in xrange(n_layers):
            if isinstance(bias_fn[dx], (str, unicode)):
                bias_fn[dx] = eval(bias_fn[dx])
            if isinstance(init_fn[dx], (str, unicode)):
                init_fn[dx] = eval(init_fn[dx])
            if isinstance(activation[dx], (str, unicode)):
                activation[dx] = eval(activation[dx])
        super(MultiLayer, self).__init__(n_in, n_hids[-1], rng, name)
        self.trng = RandomStreams(self.rng.randint(int(1e6)))
        self.activation = activation
        self.scale = scale
        self.sparsity = sparsity
        self.bias_scale = bias_scale
        self.bias_fn = bias_fn
        self.init_fn = init_fn
        self._grad_scale = grad_scale
        self.weight_noise = weight_noise
        self.dropout = dropout
        self.n_hids = n_hids
        self.learn_bias = learn_bias
        self._init_params()

    def _init_params(self):
        """
        Initialize the parameters of the layer, either by using sparse initialization or small
        isotropic noise.
        """
        self.W_ems = []
        self.b_ems = []
        if self.rank_n_approx:
            W_em1 = self.init_fn[0](self.n_in, self.rank_n_approx,
                                    self.sparsity[0], self.scale[0], self.rng)
            W_em2 = self.init_fn[0](self.rank_n_approx, self.n_hids[0],
                                    self.sparsity[0], self.scale[0], self.rng)
            self.W_em1 = theano.shared(W_em1, name='W1_0_%s' % self.name)
            self.W_em2 = theano.shared(W_em2, name='W2_0_%s' % self.name)
            self.W_ems = [self.W_em1, self.W_em2]

        else:
            W_em = self.init_fn[0](self.n_in, self.n_hids[0], self.sparsity[0],
                                   self.scale[0], self.rng)
            self.W_em = theano.shared(W_em, name='W_0_%s' % self.name)
            self.W_ems = [self.W_em]

        self.b_em = theano.shared(self.bias_fn[0](self.n_hids[0],
                                                  self.bias_scale[0],
                                                  self.rng),
                                  name='b_0_%s' % self.name)
        self.b_ems = [self.b_em]

        for dx in xrange(1, self.n_layers):
            W_em = self.init_fn[dx](self.n_hids[dx - 1] / self.pieces[dx],
                                    self.n_hids[dx], self.sparsity[dx],
                                    self.scale[dx], self.rng)
            W_em = theano.shared(W_em, name='W_%d_%s' % (dx, self.name))
            self.W_ems += [W_em]

            b_em = theano.shared(self.bias_fn[dx](self.n_hids[dx],
                                                  self.bias_scale[dx],
                                                  self.rng),
                                 name='b_%d_%s' % (dx, self.name))
            self.b_ems += [b_em]

        self.params = [x for x in self.W_ems]

        if self.learn_bias and self.learn_bias != 'last':
            self.params = [x for x in self.W_ems] + [x for x in self.b_ems]
        elif self.learn_bias == 'last':
            self.params = [x for x in self.W_ems] + [x
                                                     for x in self.b_ems][:-1]
        self.params_grad_scale = [self._grad_scale for x in self.params]
        if self.weight_noise:
            self.nW_ems = [
                theano.shared(x.get_value() * 0, name='noise_' + x.name)
                for x in self.W_ems
            ]
            self.nb_ems = [
                theano.shared(x.get_value() * 0, name='noise_' + x.name)
                for x in self.b_ems
            ]

            self.noise_params = [x for x in self.nW_ems
                                 ] + [x for x in self.nb_ems]
            self.noise_params_shape_fn = [
                constant_shape(x.get_value().shape) for x in self.noise_params
            ]

    def fprop(self,
              state_below,
              use_noise=True,
              no_noise_bias=False,
              first_only=False):
        """
        Constructs the computational graph of this layer.
        If the input consists of ints, we assume it is an index; otherwise we
        assume it is a set of floats.
        """
        if self.weight_noise and use_noise and self.noise_params:
            W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)]
            if not no_noise_bias:
                b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)]
            else:
                b_ems = self.b_ems
        else:
            W_ems = self.W_ems
            b_ems = self.b_ems
        if self.rank_n_approx:
            if first_only:
                emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0]))
                self.out = emb_val
                return emb_val
            emb_val = TT.dot(
                self.rank_n_activ(utils.dot(state_below, W_ems[0])), W_ems[1])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 1
        else:
            emb_val = utils.dot(state_below, W_ems[0])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 0

        emb_val = self.activation[0](emb_val)

        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(
                    emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
        for dx in xrange(1, self.n_layers):
            emb_val = utils.dot(emb_val, W_ems[st_pos + dx])
            if b_ems:
                emb_val = self.activation[dx](emb_val + b_ems[dx])
            else:
                emb_val = self.activation[dx](emb_val)

            if self.dropout < 1.:
                if use_noise:
                    emb_val = emb_val * self.trng.binomial(emb_val.shape,
                                                           n=1,
                                                           p=self.dropout,
                                                           dtype=emb_val.dtype)
                else:
                    emb_val = emb_val * self.dropout
        self.out = emb_val
        return emb_val
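
    # Note on the dropout convention above: `self.dropout` is the keep
    # probability, so at training time activations are multiplied by a
    # Bernoulli(p=dropout) mask, while at test time they are scaled by the
    # same factor so that they match the expected training-time activation.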
Exemplo n.º 38
0
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"):

    tparams_d = tparams_all[0]
    tparams_g = tparams_all[1]

    #rng = numpy.random.RandomState(4567)
    trng = RandomStreams(SEED)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(x_, m_, h_, c_):

        preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \
                 tensor.dot(h_, tparams_g[_p(prefix, 'U')])
        
        i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')]))
        c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')]))

        c = f * c_ + i * c
        
        h = o * tensor.tanh(c)

        s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb']))

        #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb'])
        x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), 
                         tparams_d['Wemb'])

        x_out = s.argmax(axis=1)

        m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_
        
        #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')]

        return x_out, x_t, m, h, c


    ##############################################################################################
    rval, updates = theano.scan(_step,
                                outputs_info=[None,
                                              input_state,
                                              tensor.alloc(numpy_floatX(1.), input_state.shape[0]),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])],
                                name=_p(prefix, '_layers'),
                                n_steps=maxlen)


    #proj_0 = rval[1]#tensor.tanh(rval[0])

    m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX)
    
    #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2

    # maxlen is assumed to be a plain Python int here; a symbolic comparison
    # cannot be evaluated in a Python `if` statement.
    if maxlen > 4:
        x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0])
        x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :],
                                 x2),
                                 axis=0)


        m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0])
        m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 m2), 
                                 axis=0)
    
        xt2 = tparams_d['Wemb'][x2]

        return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22)

    else:
        return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
Exemplo n.º 39
0
def random_binomial(shape, n=0, p=0.5, dtype=K.floatx(), seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, n=n, p=p, dtype=dtype)
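
# Hypothetical usage sketch (batch_size and dim assumed): a Bernoulli keep-mask
# for dropout would be drawn with n=1 (one trial per entry); note that the
# default n=0 performs zero trials and therefore always returns zeros.
#
#     mask = random_binomial((batch_size, dim), n=1, p=0.8)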
Exemplo n.º 40
0
class UnitsDropOut(object):
    """
    Adds Dropout to any unit type.
    """
    def __init__(self, variables, dropout_h=0., dropout_v=0., **kwargs):
        try:
            variables['input']
        except KeyError:
            raise KeyError(
                "Dictionary 'variables' needs an entry with key 'input'")

        rng = np.random.RandomState()
        self.t_rng = RandomStreams(rng.randint(2**30))

        self.level_h_ = theano.shared(np.cast[fx](dropout_h))
        self.level_v_ = theano.shared(np.cast[fx](dropout_v))

        act_fun_h = self.act_fun_h
        self.act_fun_h = lambda x: self.dropout(act_fun_h(x), self.level_h_)

        self.input = self.dropout(variables['input'], self.level_v_)

        self.do_suspended = False
        self.callback_add(partial(self.dropout_suspend, True),
                          Notifier.MAKE_FINISHED,
                          forward=True)
        self.callback_add(partial(self.dropout_suspend, False),
                          Notifier.TRAINING_START,
                          forward=True)
        self.callback_add(partial(self.dropout_suspend, True),
                          Notifier.TRAINING_STOP,
                          forward=True)

    def dropout(self, x, level):
        """ This function keeps '1-level' entries of the inputs the same
        and zero-out randomly selected subset of size 'level'
        """
        return self.t_rng.binomial(size=x.shape, p=1. - level, dtype=fx) * x
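
    # For example, with level = 0.2 roughly 80% of the entries of x pass
    # through unchanged and the rest are zeroed, so the expected value of the
    # returned tensor is (1 - level) * x.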

    def dropout_suspend(self, suspend=True):
        if suspend:
            if not self.do_suspended:
                self.level_v_tmp = self.level_v
                self.level_h_tmp = self.level_h
                self.W.set_value(self.W.get_value() / (1 / (1 - self.level_h) *
                                                       (1 - self.level_v)))
                self.level_v = 0.
                self.level_h = 0.
                self.do_suspended = True
            else:
                LOGGER.warning("Dropout already suspended, nothing to do.")
        else:
            if self.do_suspended:
                self.level_v = self.level_v_tmp
                self.level_h = self.level_h_tmp
                self.W.set_value(self.W.get_value() * (1 / (1 - self.level_h) *
                                                       (1 - self.level_v)))
                self.do_suspended = False
            else:
                LOGGER.warning("Dropout was not suspended, nothing to do.")

    @property
    def level_h(self):
        return self.level_h_.get_value()

    @level_h.setter
    def level_h(self, value):
        #assert not self.do_suspended, "Please unsuspend dropout to change its level."
        self.level_h_.set_value(value)

    @property
    def level_v(self):
        return self.level_v_.get_value()

    @level_v.setter
    def level_v(self, value):
        #assert not self.do_suspended, "Please unsuspend dropout to change its level."
        self.level_v_.set_value(value)
Exemplo n.º 41
0
class Model(object):
    def __init__(self, config):

        self._params = []  # shared variables for learned parameters
        self._sticky_hidden_states = [
        ]  # shared variables which are reset before each epoch
        self._np_rng = np.random.RandomState(config.seed // 2 + 123)
        self._theano_rng = RandomStreams(
            config.seed // 2 + 321)  # generates random numbers directly on GPU
        self._init_scale = config.init_scale
        self._is_training = tt.iscalar('is_training')
        self._lr = theano.shared(cast_floatX(config.learning_rate), 'lr')

        input_data = tt.imatrix('input_data')  # (batch_size, num_steps)
        targets = tt.imatrix('targets')  # (batch_size, num_steps)
        noise_x = tt.matrix('noise_x')  # (batch_size, num_steps)

        # Embed input words and apply variational dropout (for each sample, the embedding of
        # a dropped word type consists of all zeros at all occurrences of that word type in the sample).
        embedding = self.make_param((config.vocab_size, config.hidden_size),
                                    'uniform')
        inputs = embedding[
            input_data.T]  # (num_steps, batch_size, hidden_size)
        inputs = self.apply_dropout(inputs, tt.shape_padright(noise_x.T))

        rhn_updates = []
        for _ in range(config.num_layers):
            # y shape: (num_steps, batch_size, hidden_size)
            y, sticky_state_updates = self.RHNLayer(
                inputs, config.depth, config.batch_size, config.hidden_size,
                config.drop_i, config.drop_s, config.init_T_bias,
                config.init_other_bias, config.tied_noise)
            rhn_updates += sticky_state_updates
            inputs = y

        noise_o = self.get_dropout_noise(
            (config.batch_size, config.hidden_size), config.drop_o)
        outputs = self.apply_dropout(
            y,
            tt.shape_padleft(noise_o))  # (num_steps, batch_size, hidden_size)

        # logits
        softmax_w = embedding.T if config.tied_embeddings else self.make_param(
            (config.hidden_size, config.vocab_size), 'uniform')
        softmax_b = self.make_param((config.vocab_size, ),
                                    config.init_other_bias)
        logits = tt.dot(
            outputs,
            softmax_w) + softmax_b  # (num_steps, batch_size, vocab_size)

        # probabilities and prediction loss
        flat_logits = logits.reshape(
            (config.batch_size * config.num_steps, config.vocab_size))
        flat_probs = tt.nnet.softmax(flat_logits)
        flat_targets = targets.T.flatten()  # (batch_size * num_steps,)
        xentropies = tt.nnet.categorical_crossentropy(
            flat_probs, flat_targets)  # (batch_size * num_steps,)
        pred_loss = xentropies.sum() / config.batch_size

        # weight decay
        l2_loss = 0.5 * tt.sum(tt.stack([tt.sum(p**2) for p in self._params]))

        loss = pred_loss + config.weight_decay * l2_loss
        grads = theano.grad(loss, self._params)

        # gradient clipping
        global_grad_norm = tt.sqrt(
            tt.sum(tt.stack([tt.sum(g**2) for g in grads])))
        clip_factor = ifelse(
            global_grad_norm < config.max_grad_norm, cast_floatX(1),
            tt.cast(config.max_grad_norm / global_grad_norm, floatX))

        param_updates = [(p, p - self._lr * clip_factor * g)
                         for p, g in zip(self._params, grads)]

        self.train = theano.function([input_data, targets, noise_x],
                                     loss,
                                     givens={self._is_training: np.int32(1)},
                                     updates=rhn_updates + param_updates)

        self.evaluate = theano.function(
            [input_data, targets],
            loss,
            # Note that noise_x is unused in the computation graph of this function since _is_training is false.
            givens={
                self._is_training: np.int32(0),
                noise_x: tt.zeros((config.batch_size, config.num_steps))
            },
            updates=rhn_updates)

        self._num_params = np.sum(
            [param.get_value().size for param in self._params])

        if config.load_model:
            self.load(config.load_model)

    @property
    def lr(self):
        return self._lr.get_value()

    @property
    def num_params(self):
        return self._num_params

    def make_param(self, shape, init_scheme):
        """Create Theano shared variables, which are used as trainable model parameters."""
        if isinstance(init_scheme, numbers.Number):
            init_value = np.full(shape, init_scheme, floatX)
        elif init_scheme == 'uniform':
            init_value = self._np_rng.uniform(low=-self._init_scale,
                                              high=self._init_scale,
                                              size=shape).astype(floatX)
        else:
            raise AssertionError('unsupported init_scheme')
        p = theano.shared(init_value)
        self._params.append(p)
        return p

    def apply_dropout(self, x, noise):
        return ifelse(self._is_training, noise * x, x)

    def get_dropout_noise(self, shape, dropout_p):
        keep_p = 1 - dropout_p
        noise = cast_floatX(1. / keep_p) * self._theano_rng.binomial(
            size=shape, p=keep_p, n=1, dtype=floatX)
        return noise
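
    # get_dropout_noise implements "inverted" dropout: the Bernoulli(keep_p)
    # mask is pre-scaled by 1/keep_p, so the expected value of noise * x
    # equals x and no rescaling is needed at evaluation time (where
    # apply_dropout simply returns x).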

    def assign_lr(self, lr):
        self._lr.set_value(cast_floatX(lr))

    def reset_hidden_state(self):
        for sticky_hidden_state in self._sticky_hidden_states:
            sticky_hidden_state.set_value(
                np.zeros_like(sticky_hidden_state.get_value()))

    def save(self, save_path):
        with open(save_path, 'wb') as f:
            for p in self._params:
                cPickle.dump(p.get_value(),
                             f,
                             protocol=cPickle.HIGHEST_PROTOCOL)

    def load(self, load_path):
        with open(load_path, 'rb') as f:
            for p in self._params:
                p.set_value(cPickle.load(f))

    def linear(self, x, in_size, out_size, bias, bias_init=None):
        assert bias == (bias_init is not None)
        w = self.make_param((in_size, out_size), 'uniform')
        y = tt.dot(x, w)
        if bias:
            b = self.make_param((out_size, ), bias_init)
            y += b
        return y

    def RHNLayer(self, inputs, depth, batch_size, hidden_size, drop_i, drop_s,
                 init_T_bias, init_H_bias, tied_noise):
        """Variational Recurrent Highway Layer (Theano implementation).

    References:
      Zilly, J, Srivastava, R, Koutnik, J, Schmidhuber, J., "Recurrent Highway Networks", 2016
    Args:
      inputs: Theano variable, shape (num_steps, batch_size, hidden_size).
      depth: int, the number of RHN inner layers i.e. the number of micro-timesteps per timestep.
      drop_i: float, probability of dropout over inputs.
      drop_s: float, probability of dropout over recurrent hidden state.
      init_T_bias: a valid bias_init argument for linear(), initialization of bias of transform gate T.
      init_H_bias: a valid bias_init argument for linear(), initialization of bias of non-linearity H.
      tied_noise: boolean, whether to use the same dropout masks when calculating H and when calculating T.
    Returns:
      y: Theano variable, recurrent hidden states at each timestep. Shape (num_steps, batch_size, hidden_size).
      sticky_state_updates: a list of (shared variable, new shared variable value).
    """
        # We first compute the linear transformation of the inputs over all timesteps.
        # This is done outside of scan() in order to speed up computation.
        # The result is then fed into scan()'s step function, one timestep at a time.
        noise_i_for_H = self.get_dropout_noise((batch_size, hidden_size),
                                               drop_i)
        noise_i_for_T = self.get_dropout_noise(
            (batch_size,
             hidden_size), drop_i) if not tied_noise else noise_i_for_H

        i_for_H = self.apply_dropout(inputs, noise_i_for_H)
        i_for_T = self.apply_dropout(inputs, noise_i_for_T)

        i_for_H = self.linear(i_for_H,
                              in_size=hidden_size,
                              out_size=hidden_size,
                              bias=True,
                              bias_init=init_H_bias)
        i_for_T = self.linear(i_for_T,
                              in_size=hidden_size,
                              out_size=hidden_size,
                              bias=True,
                              bias_init=init_T_bias)

        # Dropout noise for recurrent hidden state.
        noise_s = self.get_dropout_noise((batch_size, hidden_size), drop_s)
        if not tied_noise:
            noise_s = tt.stack(
                noise_s,
                self.get_dropout_noise((batch_size, hidden_size), drop_s))

        def step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s):
            """
      Args:
        Elements of sequences given to scan():
          i_for_H_t: linear trans. of inputs for calculating non-linearity H at timestep t. Shape (batch_size, hidden_size).
          i_for_T_t: linear trans. of inputs for calculating transform gate T at timestep t. Shape (batch_size, hidden_size).
        Result of previous step function invocation (equals the outputs_info given to scan() on first timestep):
          y_tm1: Shape (batch_size, hidden_size).
        Non-sequences given to scan() (these are the same at all timesteps):
          noise_s: (batch_size, hidden_size) or (2, batch_size, hidden_size), depending on value of tied_noise.
      """
            tanh, sigm = tt.tanh, tt.nnet.sigmoid
            noise_s_for_H = noise_s if tied_noise else noise_s[0]
            noise_s_for_T = noise_s if tied_noise else noise_s[1]

            s_lm1 = y_tm1
            for l in range(depth):
                s_lm1_for_H = self.apply_dropout(s_lm1, noise_s_for_H)
                s_lm1_for_T = self.apply_dropout(s_lm1, noise_s_for_T)
                if l == 0:
                    # On the first micro-timestep of each timestep we already have bias
                    # terms summed into i_for_H_t and into i_for_T_t.
                    H = tanh(i_for_H_t + self.linear(s_lm1_for_H,
                                                     in_size=hidden_size,
                                                     out_size=hidden_size,
                                                     bias=False))
                    T = sigm(i_for_T_t + self.linear(s_lm1_for_T,
                                                     in_size=hidden_size,
                                                     out_size=hidden_size,
                                                     bias=False))
                else:
                    H = tanh(
                        self.linear(s_lm1_for_H,
                                    in_size=hidden_size,
                                    out_size=hidden_size,
                                    bias=True,
                                    bias_init=init_H_bias))
                    T = sigm(
                        self.linear(s_lm1_for_T,
                                    in_size=hidden_size,
                                    out_size=hidden_size,
                                    bias=True,
                                    bias_init=init_T_bias))
                s_l = (H - s_lm1) * T + s_lm1
                s_lm1 = s_l

            y_t = s_l
            return y_t

        # The recurrent hidden state of the RHN is sticky (the last hidden state of one batch is carried over to the next batch,
        # to be used as an initial hidden state).  These states are kept in shared variables and are reset before every epoch.
        y_0 = theano.shared(np.zeros((batch_size, hidden_size), floatX))
        self._sticky_hidden_states.append(y_0)

        y, _ = theano.scan(step_fn,
                           sequences=[i_for_H, i_for_T],
                           outputs_info=[y_0],
                           non_sequences=[noise_s])

        y_last = y[-1]
        sticky_state_updates = [(y_0, y_last)]

        return y, sticky_state_updates
Exemplo n.º 42
0
 def make_output(self, output, collapse=True, sample_mean=None, gamma=None):
     self.output = output
     if collapse and self.depth > 1:
         self.output = self.make_consensus(self.output)
         if self.attrs['consensus'] == 'flat':
             self.attrs['n_out'] *= self.depth
     if self.attrs['batch_norm']:
         self.output = self.batch_norm(self.output,
                                       self.attrs['n_out'],
                                       sample_mean=sample_mean,
                                       gamma=gamma)
     if self.attrs['residual']:
         from NetworkHiddenLayer import concat_sources
         z, n_in = concat_sources(self.sources,
                                  unsparse=True,
                                  expect_source=False)
         assert n_in == self.attrs['n_out']
         self.output += z
     if self.attrs['layer_drop'] > 0.0:
         # Stochastic Depth, http://arxiv.org/abs/1603.09382
         from NetworkHiddenLayer import concat_sources
         z, n_in = concat_sources(self.sources,
                                  unsparse=True,
                                  expect_source=False)
         n_out = self.attrs['n_out']
         if n_in != n_out:
             print("Layer drop with additional projection %i -> %i" %
                   (n_in, n_out),
                   file=log.v4)
             if n_in > 0:
                 self.W_drop = self.add_param(
                     self.create_forward_weights(n_in,
                                                 n_out,
                                                 name="W_drop_%s" %
                                                 self.name))
                 z = T.dot(z, self.W_drop)
             else:
                 z = 0
         if self.train_flag:
             from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
             rng = RandomStreams(self.rng.randint(1234) + 1)
             import theano.ifelse
             drop = rng.binomial(n=1,
                                 p=self.attrs['layer_drop'],
                                 size=(1, ),
                                 dtype='int8')[0]
             # drop = theano.printing.Print("drop")(drop)
             self.output = theano.ifelse.ifelse(drop, z, self.output)
         else:
             drop = self.attrs['layer_drop']
             self.output = numpy.float32(drop) * z + numpy.float32(
                 1.0 - drop) * self.output
     if self.attrs['sparse']:
         self.output = T.argmax(self.output, axis=-1, keepdims=True)
     if self.attrs['sparse_filtering']:
         # https://dlacombejr.github.io/programming/2015/09/13/sparse-filtering-implemenation-in-theano.html
         fs = T.sqrt(self.output**2 + 1e-8)  # numerical stability
         l2fs = T.sqrt(T.sum(fs**2, axis=1))  # l2 norm of row
         nfs = fs / l2fs.dimshuffle(0, 'x')  # normalize rows
         l2fn = T.sqrt(T.sum(nfs**2, axis=0))  # l2 norm of column
         self.output = nfs / l2fn.dimshuffle('x', 0)  # normalize columns
     self.output.name = "%s.output" % self.name
     self._output = output
Exemplo n.º 43
0
class ssRBM(Model):
    """Spike & Slab Restricted Boltzmann Machine (RBM)  """

    def load_params(self, model_path):
        fp = open(model_path, 'r')
        model = pickle.load(fp)
        self.Wv.set_value(model.Wv.get_value())
        self.Wh.set_value(model.Wh.get_value())
        self.hbias.set_value(model.hbias.get_value())
        self.mu.set_value(model.mu.get_value())
        self.alpha.set_value(model.alpha.get_value())
        self.beta.set_value(model.beta.get_value())
        # sync random number generators
        self.rng.set_state(model.rng.get_state())
        self.theano_rng.rstate = model.theano_rng.rstate
        for (self_rng_state, model_rng_state) in \
                zip(self.theano_rng.state_updates, 
                    model.theano_rng.state_updates):
            self_rng_state[0].set_value(model_rng_state[0].get_value())
        # reset timestamps
        self.batches_seen = model.batches_seen
        self.examples_seen = model.examples_seen
        fp.close()

    def __init__(self, 
            input=None, Wv=None, hbias=None,
            numpy_rng = None, theano_rng = None,
            n_h=100, n_v=100, bw_h=10, init_from=None,
            neg_sample_steps=1,
            lr = 1e-3, lr_anneal_coeff=0, lr_timestamp=None, lr_mults = {},
            iscales={}, clip_min={}, clip_max={}, l1 = {}, l2 = {},
            sp_moving_avg=0.98, sp_type='KL', sp_weight={}, sp_targ={},
            batch_size = 13,
            scalar_b = False,
            sparse_hmask = None, 
            learn_h_weights = False,
            unit_norm_filters = True,
            compile=True,
            parametrize_sqrt_precision=True,
            debug=False,
            seed=1241234,
            my_save_path=None):
        """
        :param n_h: number of h-hidden units
        :param n_v: number of visible units
        :param iscales: optional dictionary containing initialization scale for each parameter
        :param neg_sample_steps: number of sampling updates to perform in negative phase.
        :param l1: hyper-parameter controlling amount of L1 regularization
        :param l2: hyper-parameter controlling amount of L2 regularization
        :param batch_size: size of positive and negative phase minibatch
        :param compile: compile sampling and learning functions
        :param seed: seed used to initialize numpy and theano RNGs.
        """
        super(ssRBM,self).__init__()
        for k in ['mu','alpha','beta', 'Wv', 'hbias']: assert k in iscales.keys()
        for k in ['h']: assert k in sp_weight.keys()
        for k in ['h']: assert k in sp_targ.keys()

        ### make sure all parameters are floatX ###
        for (k,v) in l1.iteritems(): l1[k] = npy_floatX(v)
        for (k,v) in l2.iteritems(): l2[k] = npy_floatX(v)
        for (k,v) in sp_weight.iteritems(): sp_weight[k] = npy_floatX(v)
        for (k,v) in sp_targ.iteritems(): sp_targ[k] = npy_floatX(v)
        for (k,v) in clip_min.iteritems(): clip_min[k] = npy_floatX(v)
        for (k,v) in clip_max.iteritems(): clip_max[k] = npy_floatX(v)

        # dump initialization parameters to object
        for (k,v) in locals().iteritems():
            if k!='self': setattr(self,k,v)

        # allocate random number generators
        self.rng = numpy.random.RandomState(seed) if numpy_rng is None else numpy_rng
        self.theano_rng = RandomStreams(self.rng.randint(2**30)) if theano_rng is None else theano_rng

        ############### ALLOCATE PARAMETERS #################
        self.n_s = self.n_h * bw_h

        # allocate bilinear-weight matrices
        self.Wh = sharedX(sparse_hmask.mask, name='Wh')

        if Wv is None:
            wv_val =  self.rng.randn(n_v, self.n_s) * iscales['Wv']
            self.Wv = sharedX(wv_val, name='Wv')
        else:
            self.Wv = Wv

        # allocate shared variables for bias parameters
        if hbias is None:
            self.hbias = sharedX(iscales['hbias'] * numpy.ones(n_h), name='hbias') 
        else:
            self.hbias = hbias

        # mean (mu) and precision (alpha) parameters on s
        self.mu = sharedX(iscales['mu'] * numpy.ones(self.n_s), name='mu')
        self.alpha = sharedX(iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
        self.alpha_prec = self.alpha**2 if parametrize_sqrt_precision else self.alpha

        # diagonal of precision matrix of visible units
        self.beta = sharedX(iscales['beta'] * numpy.ones(n_v), name='beta')
        self.beta_prec = self.beta**2 if parametrize_sqrt_precision else self.beta

        #### load layer 1 parameters from file ####
        if init_from:
            self.load_params(init_from)

        # allocate shared variable for persistent chain
        self.neg_v  = sharedX(self.rng.rand(batch_size, n_v), name='neg_v')
        self.neg_ev = sharedX(self.rng.rand(batch_size, n_v), name='neg_ev')
        self.neg_s  = sharedX(self.rng.rand(batch_size, self.n_s), name='neg_s')
        self.neg_h  = sharedX(self.rng.rand(batch_size, n_h), name='neg_h')
       
        # moving average values for sparsity
        self.sp_pos_v = sharedX(self.rng.rand(1,self.n_v), name='sp_pos_v')
        self.sp_pos_h = sharedX(self.rng.rand(1,self.n_h), name='sp_pos_h')

        # learning rate - implemented as shared parameter for GPU
        self.lr_shrd = sharedX(lr, name='lr_shrd')
        self.lr_mults_it = {}
        self.lr_mults_shrd = {}
        for (k,v) in lr_mults.iteritems():
            # make sure all learning rate multipliers are float64
            self.lr_mults_it[k] = tools.HyperParamIterator(lr_timestamp, lr_mults[k])
            self.lr_mults_shrd[k] = sharedX(self.lr_mults_it[k].value, 
                                            name='lr_mults_shrd'+k)

        # allocate symbolic variable for input
        self.input = T.matrix('input') if input is None else input
        
        # configure input-space (new pylearn2 feature?)
        self.input_space = VectorSpace(n_v)

        # counters used by pylearn2 trainers
        self.batches_seen = 0                    # incremented on every batch
        self.examples_seen = 0                   # incremented on every training example
        self.force_batch_size = batch_size  # force minibatch size

        self.error_record = []

        ## ESTABLISH LIST OF LEARNT MODEL PARAMETERS ##
        self.params = [self.Wv, self.hbias, self.mu, self.alpha, self.beta]
        if self.learn_h_weights:
            self.params += [self.Wh]
        
        if compile: self.do_theano()

    def do_theano(self):
        """ Compiles all theano functions needed to use the model"""

        init_names = dir(self)

        ###### All fields you don't want to get pickled (e.g., theano functions) should be created below this line

        # SAMPLING: NEGATIVE PHASE
        neg_updates = self.neg_sampling_updates(n_steps=self.neg_sample_steps)
        self.sample_neg_func = function([], [], updates=neg_updates, name='sample_neg_func')

        pos_updates = {}

        # determine maximum likelihood cost
        main_cost = [self.ml_cost(),
                     self.get_sparsity_cost(),
                     self.get_reg_cost(self.l2, self.l1)]
 
        ##
        # COMPUTE GRADIENTS WRT. TO ALL COSTS
        ##
        learning_grads = utils_cost.compute_gradients(*main_cost)

        ##
        # BUILD UPDATES DICTIONARY
        ##
        learning_updates = utils_cost.get_updates(
                learning_grads,
                self.lr_shrd,
                multipliers = self.lr_mults_shrd)
        if self.learn_h_weights:
            learning_updates[self.Wh] *= self.sparse_hmask.mask
        learning_updates.update(pos_updates)
      
        # build theano function to train on a single minibatch
        self.batch_train_func = function([self.input], [],
                                         updates=learning_updates, name='train_rbm_func')

        # enforce constraints function
        constraint_updates = {}
        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params]
            param = getattr(self, k)
            constraint_updates[param] = T.clip(param, param, v)
        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params]
            param = getattr(self, k)
            constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param)
        ## Residual variance on beta is scalar valued
        if self.scalar_b:
            beta = constraint_updates.get(self.beta, self.beta)
            constraint_updates[self.beta] = T.mean(beta) * T.ones_like(beta)
        # constrain filters to have unit norm
        if self.unit_norm_filters:
            Wv = constraint_updates.get(self.Wv, self.Wv)
            constraint_updates[self.Wv] = Wv / T.sqrt(T.sum(Wv**2, axis=0))
        self.enforce_constraints = theano.function([],[], updates=constraint_updates)

        ###### All fields you don't want to get pickled should be created above this line
        final_names = dir(self)
        self.register_names_to_del( [ name for name in (final_names) if name not in init_names ])

        # Before we start learning, make sure constraints are enforced
        self.enforce_constraints()

    def learn(self, dataset, batch_size):

        x = dataset.get_batch_design(batch_size, include_labels=False)
        self.learn_mini_batch(x)

        # accounting...
        self.examples_seen += self.batch_size
        self.batches_seen += 1

        # modify learning rate multipliers
        for (k, iter) in self.lr_mults_it.iteritems():
            if iter.next():
                print 'self.batches_seen = ', self.batches_seen
                self.lr_mults_shrd[k].set_value(iter.value)
                print 'lr_mults_shrd[%s] = %f' % (k,iter.value)

        self.enforce_constraints()

        # save to different path each epoch
        if self.my_save_path and self.batches_seen%1000==0:
            fname = self.my_save_path + '_e%i.pkl' % (self.batches_seen/1000)
            print 'Saving to %s ...' %fname,
            serial.save(fname, self)
            print 'done'


    def learn_mini_batch(self, x):

        # anneal learning rate
        self.lr_shrd.set_value(self.lr / (1. + self.lr_anneal_coeff * self.batches_seen))

        # perform negative phase sampling
        self.sample_neg_func()
        if self.debug and (
            numpy.isnan(self.neg_h.get_value()).any() or
            numpy.isnan(self.neg_s.get_value()).any() or
            numpy.isnan(self.neg_v.get_value()).any()):
            import pdb; pdb.set_trace()

        # update parameters
        self.batch_train_func(x)


    def energy(self, h_sample, s_sample, v_sample):
        """
        Computes energy for a given configuration of (h,s,v).
        :param h_sample: T.matrix of shape (batch_size, n_h)
        :param s_sample: T.matrix of shape (batch_size, bw_h * n_h)
        :param v_sample: T.matrix of shape (batch_size, n_v)
        """
        energy = -T.sum(s_sample * 
                        T.dot(v_sample, self.Wv) *
                        T.dot(h_sample, self.Wh), axis=1)
        energy += T.sum(0.5 * self.alpha_prec * s_sample**2, axis=1)
        energy += T.sum(0.5 * self.beta_prec * v_sample**2, axis=1)

        energy -= T.sum(self.alpha_prec * self.mu * s_sample *
                        T.dot(h_sample, self.Wh), axis=1)

        energy += T.sum(0.5 * self.alpha_prec * self.mu**2 *
                        T.dot(h_sample, self.Wh), axis=1)

        energy -= T.dot(h_sample, self.hbias)

        return energy
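
    # In equation form, the energy computed above is
    #   E(h, s, v) = - sum_i s_i (v Wv)_i (h Wh)_i
    #                + 1/2 sum_i alpha_i s_i^2 + 1/2 sum_j beta_j v_j^2
    #                - sum_i alpha_i mu_i s_i (h Wh)_i
    #                + 1/2 sum_i alpha_i mu_i^2 (h Wh)_i
    #                - h . hbias
    # where alpha and beta denote alpha_prec and beta_prec (the precisions,
    # squared when parametrize_sqrt_precision is set).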

    def __call__(self, v, output_type='hs'):
        assert output_type in ['h', 'hs']
        h_mean = self.h_given_v(v)
        s_mean = self.s_given_hv(h_mean, v)
        output_prods = {
                'h': h_mean,
                'hs': T.dot(h_mean, self.Wh) * s_mean
                }
        return output_prods[output_type]

    ######################################
    # MATH FOR CONDITIONAL DISTRIBUTIONS #
    ######################################

    def h_given_v(self, v_sample):
        """
        Compute mean activation of h given v.
        :param v_sample: T.matrix of shape (batch_size, n_v)
        """
        from_v = T.dot(v_sample, self.Wv)

        temp =  0.5 * 1./self.alpha_prec * from_v**2
        temp += from_v * self.mu
        h_mean = T.dot(temp, self.Wh.T) + self.hbias
        
        return T.nnet.sigmoid(h_mean)

    def sample_h_given_v(self, v_sample):
        """
        Generates sample from p(h|v)
        """
        h_mean = self.h_given_v(v_sample)
        h_sample = self.theano_rng.binomial(size=(self.batch_size,self.n_h),
                                            n=1, p=h_mean, dtype=floatX)
        return h_sample

    def s_given_hv(self, h_sample, v_sample):
        from_h = T.dot(h_sample, self.Wh)
        from_v = T.dot(v_sample, self.Wv)
        s_mean = (1./self.alpha_prec * from_v + self.mu) * from_h
        return s_mean

    def sample_s_given_hv(self, h_sample, v_sample):
        s_mean = self.s_given_hv(h_sample, v_sample)
        s_sample = self.theano_rng.normal(
                size=(self.batch_size, self.n_s),
                avg = s_mean, 
                std = T.sqrt(1./self.alpha_prec), dtype=floatX)
        return s_sample

    def v_given_hs(self, h_sample, s_sample):
        """
        Computes the mean-activation of visible units, given all other variables.
        :param h_sample: T.matrix of shape (batch_size, n_h)
        :param s_sample: T.matrix of shape (batch_size, n_s)
        """
        from_h = T.dot(h_sample, self.Wh)
        v_mean = 1./self.beta_prec * T.dot(s_sample * from_h, self.Wv.T)
        return v_mean

    def sample_v_given_hs(self, h_sample, s_sample):
        v_mean = self.v_given_hs(h_sample, s_sample)
        v_sample = self.theano_rng.normal(
                size=(self.batch_size, self.n_v),
                avg = v_mean, 
                std = T.sqrt(1./self.beta_prec), dtype=floatX)
        return v_sample


    ##################
    # SAMPLING STUFF #
    ##################

    def neg_sampling(self, h_sample, s_sample, v_sample, n_steps=1):
        """
        Gibbs step for the negative phase, which alternately samples from:
        p(h|v), p(s|h,v) and p(v|h,s)
        :param h_sample: T.matrix of shape (batch_size, n_h)
        :param s_sample: T.matrix of shape (batch_size, n_s)
        :param v_sample: T.matrix of shape (batch_size, n_v)
        :param n_steps: number of Gibbs updates to perform in negative phase.
        """

        def gibbs_iteration(h1, s1, v1):
            h2 = self.sample_h_given_v(v1)
            s2 = self.sample_s_given_hv(h2, v1)
            v2 = self.sample_v_given_hs(h2, s2)
            return [h2, s2, v2]

        [new_h, new_s, new_v] , updates = theano.scan(
                gibbs_iteration,
                outputs_info = [h_sample, s_sample, v_sample],
                n_steps=n_steps)

        return [new_h[-1], new_s[-1], new_v[-1]]

    def neg_sampling_updates(self, n_steps=1):
        """
        Implements the negative phase, generating samples from p(h,s,v).
        :param n_steps: scalar, number of Gibbs steps to perform.
        """
        [new_h, new_s, new_v] =  self.neg_sampling(self.neg_h, self.neg_s,
                                  self.neg_v, n_steps=n_steps)

        # we want to plot the expected value of the samples
        new_ev = self.v_given_hs(new_h, new_s)

        updates = {self.neg_h : new_h,
                   self.neg_s : new_s,
                   self.neg_v : new_v,
                   self.neg_ev: new_ev}

        return updates

    def ml_cost(self):
        """
        Variational approximation to the maximum likelihood positive phase.
        Uses self.input as the batch of training examples.
        :return: utils_cost.Cost object wrapping the cost and its parameters.
        """
        pos_h = self.h_given_v(self.input)
        pos_s = self.s_given_hv(pos_h, self.input)

        pos_cost = T.sum(self.energy(pos_h, pos_s, self.input)) 
        neg_cost = T.sum(self.energy(self.neg_h, self.neg_s, self.neg_v))
        batch_cost = pos_cost - neg_cost
        cost = batch_cost / self.batch_size

        # build gradient of cost with respect to model parameters
        cte = [pos_h, pos_s, self.neg_h, self.neg_s, self.neg_v]

        return utils_cost.Cost(cost, self.params, cte)
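
    # In words: the cost above is the usual energy-difference objective,
    #   cost = ( sum_batch E(h+, s+, v+) - sum_batch E(h-, s-, v-) ) / batch_size
    # where (h+, s+) are the variational means given the data (positive phase)
    # and (h-, s-, v-) are the persistent negative-phase samples.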

    def get_sparsity_cost(self):

        # update mean activation using exponential moving average
        hack_h   = self.h_given_v(self.sp_pos_v)

        # define loss based on value of sp_type
        if self.sp_type == 'KL':
            eps = 1./self.batch_size
            loss = lambda targ, val: - targ * T.log(eps + val) - (1.-targ) * T.log(1. - val + eps)
        elif self.sp_type.startswith('Lee07'):
            loss = lambda targ, val: abs(targ - val)
        else:
            raise NotImplementedError('Sparsity type %s is not implemented' % self.sp_type)

        cost = T.zeros((), dtype=floatX)

        params = []
        if self.sp_weight['h']: 
            cost += self.sp_weight['h']  * T.sum(loss(self.sp_targ['h'], hack_h.mean(axis=0)))
            params += [self.hbias]

        if self.sp_type in ['KL','Lee07'] and self.sp_weight['h']:
            params += [self.Wv, self.alpha, self.mu]

        return utils_cost.Cost(cost, params)


    ##############################
    # GENERIC OPTIMIZATION STUFF #
    ##############################
    def get_reg_cost(self, l2=None, l1=None):
        """
        Builds the symbolic expression for the L1/L2 regularization cost on the
        model parameters, with the amount of regularization defined by the
        parameters below.
        :param l2: dict containing amount of L2 regularization for Wg, Wh and Wv
        :param l1: dict containing amount of L1 regularization for Wg, Wh and Wv
        """
        cost = T.zeros((), dtype=floatX)
        params = []

        for p in self.params:

            if l1.get(p.name, 0):
                cost += l1[p.name] * T.sum(abs(p))
                params += [p]

            if l2.get(p.name, 0):
                cost += l2[p.name] * T.sum(p**2)
                params += [p]
            
        return utils_cost.Cost(cost, params)

    def monitor_matrix(self, w, name=None):
        if name is None: assert hasattr(w, 'name')
        name = name if name else w.name

        return {name + '.min':  w.min(axis=[0,1]),
                name + '.max':  w.max(axis=[0,1]),
                name + '.absmean': abs(w).mean(axis=[0,1])}

    def monitor_vector(self, b, name=None):
        if name is None: assert hasattr(b, 'name')
        name = name if name else b.name

        return {name + '.min':  b.min(),
                name + '.max':  b.max(),
                name + '.absmean': abs(b).mean()}

    def get_monitoring_channels(self, x):
        chans = {}
        chans.update(self.monitor_matrix(self.Wv))
        chans.update(self.monitor_vector(self.hbias))
        chans.update(self.monitor_vector(self.alpha))
        chans.update(self.monitor_vector(self.mu))
        chans.update(self.monitor_vector(self.beta))
        chans.update(self.monitor_matrix(self.neg_h))
        chans.update(self.monitor_matrix(self.neg_s))
        chans.update(self.monitor_matrix(self.neg_v))
        return chans
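
The three conditionals above (h_given_v, s_given_hv, v_given_hs) are all that is needed for one Gibbs sweep of the sampler used in neg_sampling. Below is a minimal NumPy sketch of that sweep, kept outside the class purely for illustration; the shapes, the rng argument (a numpy.random.RandomState), and the assumption that alpha_prec and mu are vectors over the s-units while beta_prec is a vector over the visible units are not taken from the original code.

import numpy as np

def gibbs_sweep(v, Wv, Wh, hbias, mu, alpha_prec, beta_prec, rng):
    # p(h|v): quadratic + linear slab evidence, pooled through Wh
    from_v = v.dot(Wv)                                            # (batch, n_s)
    pre = 0.5 / alpha_prec * from_v ** 2 + from_v * mu
    h_mean = 1.0 / (1.0 + np.exp(-(pre.dot(Wh.T) + hbias)))
    h = rng.binomial(1, h_mean).astype(v.dtype)

    # p(s|h,v): Gaussian with mean gated by the hidden units
    from_h = h.dot(Wh)                                            # (batch, n_s)
    s = rng.normal((from_v / alpha_prec + mu) * from_h,
                   np.sqrt(1.0 / alpha_prec))

    # p(v|h,s): Gaussian reconstruction of the visible units
    v_new = rng.normal((s * from_h).dot(Wv.T) / beta_prec,
                       np.sqrt(1.0 / beta_prec))
    return h, s, v_new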
Exemplo n.º 44
0
class RBMrv_T:
    #class variables go here; instance variables go in the constructor
    
    def __init__(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01):
        '''
        constructor
        RBMrv_T(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01):
        
        noOfVisibleUnits (int):         must be perfect square
        noOfHiddenUnits (int):          must be perfect square
        CD_n (int):                     number of MCMC iterations during training; check whether model means are used when CD_n = 1
        aRate (float32):                update rate of parameter \underline{a} during training
        bRate (float32):                update rate of parameter \underline{b} during training
        omegaRate (float32):            update rate of parameter \boldsymbol{\omega} during training
        sigmaRate (float32):            update rate of parameter \underline{z} during training
        omega (numpy array of float32): \omega parameter matrix with noOfVisibleUnits rows x noOfHiddenUnits columns  
        b (numpy array of float32):     b parameter vector, size = noOfHiddenUnits
        a (numpy array of float32):     a parameter vector, size = noOfVisibleUnits
        z (numpy array of float32):     z parameter vector, size = noOfVisibleUnits
        rprop_e (float32):              rprop step-increase factor (the rprop path below is not used)
        rprop_en (float32):             rprop step-decrease factor (the rprop path below is not used)
        sparseTargetp (float32):        target mean hidden unit activation for training. between (0,1)
        
        '''
        
        self.epsilon = 0.0000001

        theano.config.exception_verbosity = 'high'
        #rprop parameters and variables, rprop not used 
        self.T_rprop_e = theano.shared(value=np.float32(rprop_e), name='T_rprop_e', borrow = True, allow_downcast=True)
        self.T_rprop_en = theano.shared(value=np.float32(rprop_en), name='T_rprop_en', borrow = True, allow_downcast=True)
        self.T_posUpdate = theano.shared(value=np.float32(0.5*(1.0+rprop_e)), name='T_posUpdate', borrow = True, allow_downcast=True)
        self.T_negUpdate = theano.shared(value=np.float32(0.5*(1.0-rprop_en)), name='T_negUpdate', borrow = True, allow_downcast=True)
        
        #network geometry and training parameters
        self.miniBatchSize = 0 #will be set in self.trainMB(...)
        self.parameterLoaded = False
        self.parameterSaved = False
        self.sparseTargetp = sparseTargetp
        self.CD_n = CD_n
        self.nv = noOfVisibleUnits
        self.nh = noOfHiddenUnits
        self.dimV = int(math.sqrt(self.nv))
        self.dimH = int(math.sqrt(self.nh))
        self.aRate = np.float32(aRate)
        self.bRate = np.float32(bRate)
        self.omegaRate = np.float32(omegaRate)
        self.sigmaRate = np.float32(sigmaRate)
        #initialise v and h 
        self.v = np.float32(np.random.uniform(0, 1.0, self.nv))
        self.h = np.float32(np.random.binomial(1.0,0.5,self.nh))
        self.logLikelihood = []
        self.likelihood4plot = []
        
        
        self.T_aRate = theano.shared(value=np.float32(aRate), name='T_aRate', borrow = True, allow_downcast=True)
        self.T_bRate = theano.shared(value=np.float32(bRate), name='T_bRate', borrow = True, allow_downcast=True)
        self.T_omgRate = theano.shared(value=np.float32(omegaRate), name='T_omgRate', borrow = True, allow_downcast = True)
        self.T_sigRate = theano.shared(value=np.float32(sigmaRate), name='T_sigRate', borrow = True, allow_downcast = True)
        
        self.loadedRates = [aRate, bRate, omegaRate, sigmaRate]#for load/saveParameters(); shows the previously used rates, which may differ from the rates declared in the constructor
   
        self.T_rng = RandomStreams() #use_cuda parameter set if on GPU
        #successive calls on this T_rng keep returning new values, so for MCMC even with the
        #same start v vector called twice consecutively you'll get different outputs.
        #This is normal: the same T_rng gets called, without reset, giving different outputs every time.
        
        self.T_CD_n = theano.shared(value=CD_n, name='T_CD_n', borrow = True, allow_downcast=True)
              
        if omega is None: #careful! use "1.0" instead of "1" below else it all rounds to zeros!!!
            omega = np.float32(np.random.uniform((-1.0)*(1.0/(np.sqrt(self.nh+self.nv))),(1.0/(np.sqrt(self.nh+self.nv))),self.nv*self.nh).reshape((self.nv,self.nh)))
        self.omega = omega
        self.T_omega = theano.shared(value=omega,name='T_omega',borrow=True, allow_downcast=True)
        #rprop previous gradient
        self.Tomg_grad_prev = theano.shared(value=np.float32(np.abs(omega*omegaRate)+omegaRate), name='Tomg_grad_prev', borrow = True, allow_downcast=True)
        #RMSprop accumulated gradient RMS
        self.Tomg_rmsH = theano.shared(value=omega,name='Tomg_rmsH', borrow=True, allow_downcast=True)
        
        if b is None:
            b = np.float32(np.random.uniform((-1.0)*(1.0/(self.nv)),(1.0/(self.nv)),self.nh))
        self.b = b
        self.T_b = theano.shared(value=b,name='T_b',borrow=True, allow_downcast=True)
        #rprop previous gradient
        self.Tb_grad_prev = theano.shared(value=np.float32(np.abs(bRate*b)+bRate), name='Tb_grad_prev', borrow = True, allow_downcast=True)
        #RMSprop accumulated gradient RMS
        self.Tb_rmsH = theano.shared(value = b, name = 'Tb_rmsH', borrow = True, allow_downcast = True)
        
        if a is None:
            a = np.float32(np.random.uniform((-1.0)*(1.0/(self.nh)),(1.0/(self.nh)),self.nv))
        self.a = a
        self.T_a = theano.shared(value=a,name='T_a',borrow=True, allow_downcast=True)
        #rprop previous gradient
        self.Ta_grad_prev = theano.shared(value=np.float32(np.abs(aRate*a)+aRate), name='Ta_grad_prev', borrow = True, allow_downcast=True)
        #RMSprop accumulated gradient RMS
        self.Ta_rms = theano.shared(value=a, name='Ta_rms', borrow=True, allow_downcast=True)
        
        # for sigma parameter we train z instead with e^z = \sigma^2
        if z is None:
            z = np.float32(np.random.normal(0.0,(1.0/(self.nh*self.nh)),self.nv))#np.asarray([0.0]*self.nv, dtype=theano.config.floatX)
        self.z = z
        self.T_z = theano.shared(value=z,name='T_z',borrow=True, allow_downcast=True) 
        self.T_sigmaSqr = T.exp(self.T_z)
        #rprop previous gradient
        self.Tz_grad_prev = theano.shared(value=np.float32(np.float32(np.abs(z*sigmaRate)+sigmaRate)), name='Tz_grad_prev', borrow = True, allow_downcast=True)
        #RMSprop accumulated gradient RMS
        self.Tz_rmsH = theano.shared(value=z, name = 'Tz_rmsH', borrow=True, allow_downcast=True)
               
        self.T_logZk = theano.shared(value = np.float32(0.0), name = 'T_logZk', borrow=True, allow_downcast=True)

        #will print in ipython notebook:
        print("RBMrv constructed for " + str(len(self.v)) + " visible units and " + str(len(self.h)) + " hidden units.")
        #print(", with Energy function:")
        #display(Math(r'E(\vec{v},\vec{h}) = \sum_i \frac{(v_i-a_i)^2}{2\sigma_i^2} - \sum_i \sum_j \omega_{ij}h_j\frac{v_i}{\sigma_i^2} - \sum_j b_j h_j'))


        
    def genSamples(self, noOfsamples, separation):
        """ 
        Generate samples from the current (or loaded) parameters: genSamples(self, noOfsamples, separation) 
        
        Args:
        separation (int):             number of MCMC steps separating retained samples
        noOfsamples (int):            total number of samples returned
        
        Return:
        generatedSamples (np array): if the samples are images, use "generatedSamples[k].reshape((dimV,dimV))" for plotting
        
        """
        generatedSamples = []
        initSample = T.vector("initSample", dtype=theano.config.floatX)
        [scan_resV, scan_resH, H_meanStub, V_meanStub] , scan_updates = theano.scan(self.vtovMBall, outputs_info=[initSample, None, None, None] , n_steps=separation*(noOfsamples+1))
        genSampleFn = theano.function(inputs=[initSample], outputs =[scan_resV, scan_resH], allow_input_downcast = True, updates = scan_updates)
        
        [currentV, currentH] = genSampleFn(np.asarray([0.0]*self.nv, dtype=theano.config.floatX)) 
        generatedSamples = currentV[separation:separation*(noOfsamples+1):separation] 

        return generatedSamples

    def checkNaN(self):
        """
        prints NaN tests 
        works on parameters a, b, z, omega of current object
        """
        print("NaN test on omega: " + str(np.isnan(np.sum(np.sum(np.asarray(self.T_omega.eval()))))))
        print("NaN test on a: " + str(np.isnan(np.dot(np.asarray(self.T_a.eval()),np.asarray(self.T_a.eval())))))
        print("NaN test on b: " + str(np.isnan(np.dot(np.asarray(self.T_b.eval()),np.asarray(self.T_b.eval())))))
        print("NaN test on z: " + str(np.isnan(np.dot(np.asarray(self.T_z.eval()),np.asarray(self.T_z.eval())))))
        print("max z = " + str(np.max(np.asarray(self.T_z.eval()))) + ", min z =" + str(np.min(np.asarray(self.T_z.eval()))))

 
        
    def printParameters(self):
        """
        prints parameters a, b, z \sigma^2, omega
        """
        print("a = " + str(self.T_a.get_value()))
        print("b = " + str(self.T_b.get_value()))
        print("z = " + str(self.T_z.get_value()))
        print("sigma^2 = " + str([math.exp(zi) for zi in self.T_z.get_value()]))
        print("omega = " + str(self.T_omega.get_value()))
        
        
        
      
       
    def plotAllRF(self, noOfRFs = 25):
        """
        plots columns of \omega (one receptive field per hidden unit), each reshaped to a square image
        
        args:
        noOfRFs (int): has to be a perfect square and at most the number of hidden units
        """
        
        inputIndex = noOfRFs + 1
        fig, myAxis = plt.subplots(int(np.sqrt(noOfRFs)),int(np.sqrt(noOfRFs)))
            
        xpt, ypt = myAxis.shape
        fig.tight_layout()
        for xind in range(0,xpt):
            for yind in range(0, ypt):
                myAxis[xind][yind].imshow(self.T_omega.eval()[:,inputIndex].reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest')
                inputIndex = inputIndex + 1
        plt.show()
        #print("weights are between (" + str(np.min(np.min(self.T_omega.eval()))) + "," + str(np.max(np.max(self.T_omega.eval()))) + ")")
        
        
    def plotSD(self):
        """
        plot the variance parameter \sigma^2 = e^z as a square image
        """
        SDparameter = np.exp((np.asarray(self.T_z.eval())))
        fig=plt.figure()
        im=plt.imshow(SDparameter.reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest')
        fig.colorbar(im)
        
        
    def plot_a(self):
        """
        plot a parameter as an image
        """
        SDparameter = np.asarray(self.T_a.eval())
        fig=plt.figure()
        im=plt.imshow(SDparameter.reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest')
        fig.colorbar(im)
        
        
    def plot_b(self):
        """
        plot b parameter as an image
        """
        SDparameter = np.asarray(self.T_b.eval())
        fig = plt.figure()
        im = plt.imshow(SDparameter.reshape((self.dimH,self.dimH)), cmap = cm.Greys_r, interpolation='nearest')
        fig.colorbar(im)


    
    def saveParameters(self, fileName):
        """
        saves all essential parameters so simulation can resume after calling loadParameters()
        file saved in npz format
        
        args:
        fileName (string): in single quotes '...' and excluding extensions.
        """
        np.savez(fileName, T_omega = self.T_omega.eval(), Tomg_rmsH = self.Tomg_rmsH.eval(),
                             T_a = self.T_a.eval(), Ta_rms = self.Ta_rms.eval(),
                             T_b = self.T_b.eval(), Tb_rmsH = self.Tb_rmsH.eval(),
                             T_z = self.T_z.eval(), Tz_rmsH = self.Tz_rmsH.eval(),
                             Ta_grad_prev = self.Ta_grad_prev.eval(),
                             Tb_grad_prev = self.Tb_grad_prev.eval(),
                             Tz_grad_prev = self.Tz_grad_prev.eval(),
                             Tomg_grad_prev = self.Tomg_grad_prev.eval(),
                             logLikelihood = self.logLikelihood, likelihood4plot = self.likelihood4plot,
                             T_logZk = self.T_logZk.eval(),
                             loadedRates = self.loadedRates, miniBatchSize = self.miniBatchSize,
                             aRate = self.aRate, bRate = self.bRate, omegaRate = self.omegaRate, sigmaRate = self.sigmaRate,
                             CD_n = self.CD_n, sparseTargetp = self.sparseTargetp) 
        #print("parameters saved in: " + str(fileName) + ".npz")
        self.parameterSaved = True

        
        
    def loadParameters(self, fileName):
        """
        loads npz file to restore all simulation parameters
        make sure the parameters you're loading fit the current object (e.g. same #visible/#hidden units)
        
        args:
        fileName (string): in single quotes '...' and excluding extensions.
        """
        loadedFile = np.load(fileName + '.npz')
        self.miniBatchSize = loadedFile['miniBatchSize']
        self.aRate = np.float32(loadedFile['aRate']) #without explicit cast it turns into float64?!
        self.bRate = np.float32(loadedFile['bRate'])
        self.omegaRate = np.float32(loadedFile['omegaRate'])
        self.sigmaRate = np.float32(loadedFile['sigmaRate'])
        self.CD_n = loadedFile['CD_n']
        self.sparseTargetp = loadedFile['sparseTargetp']
        self.T_omega.set_value(loadedFile['T_omega'])
        self.Tomg_rmsH.set_value(loadedFile['Tomg_rmsH'])
        self.T_a.set_value(loadedFile['T_a'])
        self.Ta_rms.set_value(np.float32(loadedFile['Ta_rms']))
        self.T_b.set_value(loadedFile['T_b'])
        self.Tb_rmsH.set_value(loadedFile['Tb_rmsH'])
        self.T_z.set_value(loadedFile['T_z'])
        self.Tz_rmsH.set_value(loadedFile['Tz_rmsH'])
        self.Ta_grad_prev.set_value(loadedFile['Ta_grad_prev'])
        self.Tb_grad_prev.set_value(loadedFile['Tb_grad_prev'])
        self.Tz_grad_prev.set_value(loadedFile['Tz_grad_prev'])
        self.Tomg_grad_prev.set_value(loadedFile['Tomg_grad_prev'])
        self.logLikelihood = loadedFile['logLikelihood']
        self.likelihood4plot = loadedFile['likelihood4plot']
        self.likelihood4plot = self.likelihood4plot.tolist()
        self.T_logZk.set_value(loadedFile['T_logZk'])
        self.loadedRates = loadedFile['loadedRates']
        #print("after loading, omega = " + str(self.T_omega.eval()))   
        self.parameterLoaded = True
        
    def energyFnMB(self, VM, HM):
        """
        evaluates the energy functions of the RBM given row vector(s) of v and h
        
        
        args:
        VM (T.matrix): rows of visible layer values
        HM (T.matrix): rows of hidden layer values        
        
        return:
        a row Theano vector, elements being E(v_row, h_row)
        """
        T_bh = T.dot(HM, self.T_b)
        T_omghv = T.transpose(T.sum(T.mul(T.dot(T.mul(T.fill(VM, T.exp(-self.T_z)), VM), self.T_omega), HM), axis=1,acc_dtype=theano.config.floatX))
        T_Vsqr = T.mul(VM-T.fill(VM, self.T_a),VM-T.fill(VM, self.T_a))
        T_VsqrOmg = T.transpose(T.sum(T.mul(T.fill(T_Vsqr,np.float32(0.5)*T.exp(-self.T_z)),T_Vsqr),axis=1, acc_dtype=theano.config.floatX))
        return -T_VsqrOmg + T_omghv + T_bh
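
    # This corresponds to the energy displayed (commented out) in the constructor:
    #   E(v, h) = sum_i (v_i - a_i)^2 / (2 sigma_i^2)
    #             - sum_i sum_j omega_ij h_j v_i / sigma_i^2
    #             - sum_j b_j h_j,        with sigma_i^2 = exp(z_i);
    # the value returned for each row pair is -E, which is the sign convention
    # the gradient-ascent updates built in trainMB rely on.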
    
    
    def vtohMB(self, VsampM):
        """
        computes hidden unit outputs given visible unit values ("half" a MCMC iteration)
        computes in parallel given input rows of visible units
       
        args:
        VsampM (T.matrix): rows of visible unit outputs
        
        returns:
        a T.matrix, rows of hidden unit outputs
        
        """
        
        Vomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega)
        T_Hp = T.nnet.ultra_fast_sigmoid(T.fill(Vomg, self.T_b) + Vomg)
        vtohMBres = self.T_rng.binomial(size = T_Hp.shape, p=T_Hp, dtype=theano.config.floatX)
        return vtohMBres
        
        

    
    def vtovMBall(self, VsampM):
        """
        computes visible unit outputs given visible unit inputs (single MCMC iteration)
        multiple parallel MCMC iterations using rows of the input matrix
        
        args:
        VsampM (T.matrix): rows of this matrix are visible unit inputs
        
        return:
        ahtovMBres (T.matrix): rows of this matrix are visible unit outputs after a single MCMC iteration
        """
        #v to h part
        
        aVomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega)
        aT_Hp = T.nnet.ultra_fast_sigmoid(T.fill(aVomg, self.T_b) + aVomg)
        avtohMBres = self.T_rng.binomial(size = aT_Hp.shape, p=aT_Hp, dtype=theano.config.floatX)
        
        #h to v part:
        
        aT_omgH = T.transpose(T.dot(self.T_omega, T.transpose(avtohMBres)))
        aT_means = T.fill(aT_omgH, self.T_a) + aT_omgH
        ahtovMBres = self.T_rng.normal(size=aT_means.shape, avg=aT_means, std=T.fill(aT_means,T.sqrt(T.exp(self.T_z))), dtype=theano.config.floatX)
        return [ahtovMBres, avtohMBres, aT_Hp, aT_means]
        
    
    def htovMB(self, HsampM):
        """
        computes visible unit outputs given hidden unit inputs ("half" a MCMC iteration)
        computes in parallel given input rows of hidden units
       
        args:
        HsampM (T.matrix): rows of hidden unit inputs
        
        returns:
        a T.matrix, rows of visible unit outputs
        
        """
        
        
        T_omgH = T.transpose(T.dot(self.T_omega, T.transpose(HsampM)))
        T_means = T.fill(T_omgH, self.T_a) + T_omgH
        htovMBres = self.T_rng.normal(size=T_means.shape, avg=T_means, std=T.fill(T_means,T.sqrt(T.exp(self.T_z))), dtype=theano.config.floatX)
        return htovMBres
        
    def trainMB(self, V_egMin, noOfEpoch, noOfMiniBatchEx):
        """
        trains the current RBM object, returns nothing with parameter updates being internal
        
        args:
        V_egMin (theano.shared 2D array): call eval() to supply as argument. rows of this are input examples. V_egMin[N:M] extracts M-N examples, each of size noOfVisibleUnits
        noOfEpoch (int): total number of Epoch to simulate, each Epoch goes through V_egMin
        noOfMiniBatchEx (int): number of examples to be grouped into minibatches
        
        """
        self.miniBatchSize = noOfMiniBatchEx
        print("size of input example is: " + str(V_egMin.shape))
        V_egM = T.matrix(name="T_egM", dtype=theano.config.floatX)
        [V_CDmAcc, H_CDmAcc, H_CDmean, V_CDmean] , scan_updates = theano.scan(self.vtovMBall, outputs_info=[V_egM, None, None, None] , n_steps=self.CD_n)
        V_CDm = V_CDmAcc[-1] #these are matrices
        H_CDm = H_CDmAcc[-1] #these are matrices
        
       
        H_egM = self.vtohMB(V_egM)
        energyVector_eg = self.energyFnMB(V_egM, H_egM)
        energyVector_cd = self.energyFnMB(V_CDm, H_CDm)
        costFn = T.mean(energyVector_eg, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) - T.mean(energyVector_cd, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) 
        
        Ta_grad, Tb_grad, Tz_grad, Tomg_grad = T.grad(cost=costFn,
                                                        wrt=[self.T_a, self.T_b, self.T_z, self.T_omega],
                                                        consider_constant=[V_egM, H_egM, V_CDm, H_CDm])
        
        #regular gradient
        gradFromMB = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], 
                                     allow_input_downcast=True, 
                                     updates = scan_updates + [(self.T_a, self.T_a + self.aRate*Ta_grad),
                                                               (self.T_b, self.T_b + self.bRate*Tb_grad),
                                                               (self.T_z, self.T_z + self.sigmaRate*Tz_grad),
                                                               (self.T_omega, self.T_omega + self.omegaRate*Tomg_grad)],
                                     mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))
        
        #rprop: Code not used
        Ta_rpropMag = T.mul(T.abs_(self.Ta_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))) + 
                            T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))-np.float32(2.0))))      
        Ta_rprop = T.mul(T.sgn(Ta_grad),Ta_rpropMag.clip(np.float32(self.epsilon),50))
        Tb_rpropMag = T.mul(T.abs_(self.Tb_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))) + 
                            T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))-np.float32(2.0))))      
        Tb_rprop = T.mul(T.sgn(Tb_grad),Tb_rpropMag.clip(np.float32(self.epsilon),50))
        Tz_rpropMag = T.mul(T.abs_(self.Tz_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))) + 
                            T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))-np.float32(2.0))) )     
        Tz_rprop = T.mul(T.sgn(Tz_grad),Tz_rpropMag.clip(np.float32(self.epsilon),50))
        Tomg_rpropMag = T.mul(T.abs_(self.Tomg_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))) + 
                            T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))-np.float32(2.0))))      
        Tomg_rprop = T.mul(T.sgn(Tomg_grad),Tomg_rpropMag.clip(np.float32(self.epsilon),50)) 
        gradFromMBrprop = theano.function(inputs=[V_egM], outputs=[Ta_rprop, Tb_rprop, Tz_rprop, Tomg_rprop], 
                                     allow_input_downcast=True, 
                                     updates = scan_updates + [(self.T_a, self.T_a + Ta_rprop),
                                                               (self.T_b, self.T_b + Tb_rprop),
                                                               (self.T_z, self.T_z + Tz_rprop),
                                                               (self.T_omega, self.T_omega + Tomg_rprop),
                                                               (self.Ta_grad_prev, Ta_rprop),
                                                               (self.Tb_grad_prev, Tb_rprop),
                                                               (self.Tz_grad_prev, Tz_rprop),
                                                               (self.Tomg_grad_prev, Tomg_rprop)],
                                     mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))
        
        #RMSprop only: 
        [a_grad, b_grad, z_grad, omg_grad] = gradFromMB(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction
        if (not(self.parameterLoaded) and not(self.parameterSaved)):
            self.Ta_rms.set_value(np.float32(np.abs(a_grad))) # =  theano.shared(value = np.float32(np.abs(a_grad)), name = 'Ta_rms', borrow=True, allow_downcast=True)
        Tb_rms =  theano.shared(value = np.float32(np.abs(b_grad)), name = 'Tb_rms', borrow=True, allow_downcast=True)
        Tz_rms =  theano.shared(value = np.float32(np.abs(z_grad)), name = 'Tz_rms', borrow=True, allow_downcast=True)
        Tomg_rms =  theano.shared(value = np.float32(np.abs(omg_grad)), name = 'Tomg_rms', borrow=True, allow_downcast=True)
        gradFromMBRMSprop = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], 
                                     allow_input_downcast=True, 
                                     updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))),
                                                               (Tb_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tb_rms,Tb_rms))+T.mul(np.float32(0.1),T.mul(Tb_grad,Tb_grad)))),
                                                               (Tz_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tz_rms,Tz_rms))+T.mul(np.float32(0.1),T.mul(Tz_grad,Tz_grad)))),
                                                               (Tomg_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tomg_rms,Tomg_rms))+T.mul(np.float32(0.1),T.mul(Tomg_grad,Tomg_grad)))),
                                                               (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)),
                                                               (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad,T.maximum(np.float32(self.epsilon),Tb_rms)**-1)),
                                                               (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad,T.maximum(np.float32(self.epsilon),Tz_rms)**-1)),
                                                               (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad,T.maximum(np.float32(self.epsilon),Tomg_rms)**-1))],
                                                                 mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))  
        
        #sparse hidden units optimization + RMSprop:   
        #first calculate probability of hidden units firing given visible examples:
        aVomg = T.dot(T.mul(T.fill(V_egM, T.exp(-self.T_z)), V_egM), self.T_omega)
        aT_Hp = T.nnet.sigmoid(T.fill(aVomg, self.T_b) + aVomg)#T.nnet.ultra_fast_sigmoid() did not work for us 
        aT_HpMean = T.mean(aT_Hp) # mean activation over minibatch and all Hk
        #cross entropy between mean hidden unit activation and target mean activation probability "self.sparseTargetp" 
        sparseHcost = T.mul(np.float32(-self.sparseTargetp), T.log(aT_HpMean)) - T.mul((np.float32(1.0)-self.sparseTargetp), T.log(np.float32(1.0)-aT_HpMean))
        
        Tb_gradH, Tz_gradH, Tomg_gradH = T.grad(cost=sparseHcost,
                                                        wrt=[self.T_b, self.T_z, self.T_omega],
                                                        consider_constant=[V_egM])
        sparseGradFn = theano.function(inputs = [V_egM], outputs =[Tb_gradH, Tz_gradH, Tomg_gradH], allow_input_downcast=True, mode = 'FAST_RUN')
        
        [b_gradH, z_gradH, omg_gradH] = sparseGradFn(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction
        
        if (not(self.parameterLoaded) and not(self.parameterSaved)):
            self.Tb_rmsH.set_value(np.float32(np.abs(b_grad - b_gradH))) 
            self.Tz_rmsH.set_value(np.float32(np.abs(z_grad - z_gradH))) 
            self.Tomg_rmsH.set_value(np.float32(np.abs(omg_grad - omg_gradH))) 
        gradSparseH = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad, Tb_gradH, Tz_gradH, Tomg_gradH], 
                                     allow_input_downcast=True, 
                                     updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))),
                                                               (self.Tb_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tb_rmsH,self.Tb_rmsH))+T.mul(np.float32(0.1),T.mul(Tb_grad-Tb_gradH,Tb_grad-Tb_gradH)))),
                                                               (self.Tz_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tz_rmsH,self.Tz_rmsH))+T.mul(np.float32(0.1),T.mul(Tz_grad-Tz_gradH,Tz_grad-Tz_gradH)))),
                                                               (self.Tomg_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tomg_rmsH,self.Tomg_rmsH))+T.mul(np.float32(0.1),T.mul(Tomg_grad-Tomg_gradH,Tomg_grad-Tomg_gradH)))),
                                                               (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)),
                                                               (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad-Tb_gradH,T.maximum(np.float32(self.epsilon),self.Tb_rmsH)**-1)),
                                                               (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad-Tz_gradH,T.maximum(np.float32(self.epsilon),self.Tz_rmsH)**-1)),
                                                               (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad-Tomg_gradH,T.maximum(np.float32(self.epsilon),self.Tomg_rmsH)**-1))],
                                     mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) 
        
        #reconstruction errors:
        [V_egM_recon, H_egM_reconStub, H_meanStubC, V_meanStubC] = self.vtovMBall(V_egM)
        V_error = V_egM - V_egM_recon
        V_errorSqr = T.mul(V_error, V_error)
        reconError = theano.function(inputs = [V_egM], outputs = [T.mean(T.sum(V_errorSqr,axis=1, acc_dtype=theano.config.floatX), acc_dtype=theano.config.floatX)], 
                                     allow_input_downcast=True,
                                     mode='FAST_RUN')

        print("***************************************************************************************************")
        print("training network with " + str(self.nv) + " real visible units and " + str(self.nh) + " binary hidden units")
        print("reconstruction error before training = " + str(np.array(reconError(V_egMin))[0]))
        noOfMiniBatches = np.int(len(V_egMin)/noOfMiniBatchEx)
        print("number of mini-batches = " + str(noOfMiniBatches) + ", with " + str(noOfMiniBatchEx) + " examples per mini-batch")
        print("number of Epochs = " + str(noOfEpoch))
        print("***************************************************************************************************")        

        #input images are already randomised, with consecutive images belonging to different classes, so they are used directly as mini-batches.
        for j in xrange(noOfEpoch):
            pretime=time.time()
            for i in xrange(noOfMiniBatches):
                [a_upDate, b_upDate, z_upDate, omg_upDate, b_upDateH, z_upDateH, omg_upDateH] = gradSparseH(V_egMin[i*noOfMiniBatchEx:(i+1)*noOfMiniBatchEx])
                
            myErr = reconError(V_egMin)
            self.likelihood4plot = self.likelihood4plot + [np.float32(myErr)]
            print("epoch " + str(j) + ": reconstruction error = " + str(myErr[0])  + ", time taken = " + str(time.time() - pretime))

        print("\n***************************************************************************************************") 
        print("reconstruction error after training for " + str(noOfEpoch) + " epochs = " + str(np.array(reconError(V_egMin))[0]))
        self.checkNaN()
        print("***************************************************************************************************")         
        
        plt.figure()
        plt.plot(np.arange(0.0, len(self.likelihood4plot), 1), self.likelihood4plot)
        plt.show()
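
A hedged usage sketch for the RBMrv_T class above: only the constructor, trainMB and genSamples signatures come from the code; the data array, its 8x8 image shape and every hyper-parameter value are illustrative assumptions.

import numpy as np

# hypothetical 8x8 "image" data, flattened to 64 visible units per example
data = np.float32(np.random.uniform(0.0, 1.0, (1000, 64)))

rbm = RBMrv_T(noOfVisibleUnits=64, noOfHiddenUnits=64, CD_n=1,
              aRate=0.001, bRate=0.001, omegaRate=0.001, sigmaRate=0.0001,
              sparseTargetp=0.01)
rbm.trainMB(data, noOfEpoch=5, noOfMiniBatchEx=20)       # 50 mini-batches per epoch
samples = rbm.genSamples(noOfsamples=9, separation=100)  # 9 samples, 100 Gibbs steps apart
samples[0].reshape((8, 8))                               # back to image shape for plotting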
Exemplo n.º 45
0
class SparseDropoutLayer(Layer):
    """Dropout layer

    Sets values to zero with probability p. See notes for disabling dropout
    during testing.

    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        the layer feeding into this layer, or the expected input shape
    p : float or scalar tensor
        The probability of setting a value to zero
    rescale : bool
        If true the input is rescaled with input / (1-p) when deterministic
        is False.

    Notes
    -----
    The dropout layer is a regularizer that randomly sets input values to
    zero; see [1]_, [2]_ for why this might improve generalization.
    During training you should set deterministic to false and during
    testing you should set deterministic to true.

    If rescale is true the input is scaled with input / (1-p) when
    deterministic is false, see references for further discussion. Note that
    this implementation scales the input at training time.

    References
    ----------
    .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I.,
           Salakhutdinov, R. R. (2012):
           Improving neural networks by preventing co-adaptation of feature
           detectors. arXiv preprint arXiv:1207.0580.

    .. [2] Srivastava Nitish, Hinton, G., Krizhevsky, A., Sutskever,
           I., & Salakhutdinov, R. R. (2014):
           Dropout: A Simple Way to Prevent Neural Networks from Overfitting.
           Journal of Machine Learning Research, 15(1), 1929-1958.
    """
    def __init__(self, incoming, p=0.5, rescale=True, **kwargs):
        super(SparseDropoutLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p = p
        self.rescale = rescale

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        deterministic : bool
            If true dropout and scaling is disabled, see notes
        """
        if deterministic or self.p == 0:
            return input
        else:
            retain_prob = 1 - self.p
            if self.rescale:
                input *= 1 / retain_prob

            # use nonsymbolic shape for dropout mask if possible
            input_shape = self.input_shape
            if any(s is None for s in input_shape):
                input_shape = input.shape

            return sp.row_scale(
                input,
                self._srng.binomial(input_shape[:1],
                                    p=retain_prob,
                                    dtype=theano.config.floatX))
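
The layer above follows the inverted-dropout convention described in its notes: the surviving values are rescaled by 1/(1-p) at training time, so nothing needs to be rescaled at test time. A minimal dense NumPy sketch of that convention (element-wise, unlike the per-row masking that sp.row_scale applies above; all names are illustrative):

import numpy as np

def inverted_dropout(x, p=0.5, deterministic=False, rng=np.random):
    # At test time (deterministic=True) the input passes through unchanged.
    if deterministic or p == 0:
        return x
    retain_prob = 1.0 - p
    mask = rng.binomial(n=1, p=retain_prob, size=x.shape).astype(x.dtype)
    # Dividing by retain_prob here keeps the expected activation the same at
    # train and test time, so no rescaling is needed when predicting.
    return x * mask / retain_prob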
Exemplo n.º 46
0
def discrete_grads(loss, network, LR, update_type, best_params, H, N, th):

    W_params = lasagne.layers.get_all_params(
        network, discrete=True)  #Get all the weight parameters
    layers = lasagne.layers.get_all_layers(network)

    W_grads = []
    for layer in layers:
        params = layer.get_params(discrete=True)
        if params:
            W_grads.append(theano.grad(
                loss, wrt=layer.W))  #Here layer.W = weight_tune(param)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W_params,
                                   learning_rate=LR)

    for param, parambest in izip(W_params, best_params):

        L = 2 * H / pow(2, N)  #state step length in Z_N

        a = random.random()  #a is uniform in [0,1); c below is a binary gate equal to 1 with probability 0.8
        if a < 0.8:
            c = 1
        else:
            c = 0

        b = random.random()
        state_rand = T.round(
            b * pow(2, N)
        ) * L - H  #state_rand is a random state in the discrete weight space Z_N

        delta_W1 = c * (
            state_rand - parambest
        )  #parambest transfers to state_rand when c = 1 (probability 0.8), otherwise it stays unmoved
        delta_W1_direction = T.cast(T.sgn(delta_W1), theano.config.floatX)
        dis1 = T.abs_(delta_W1)  #the absolute distance
        k1 = delta_W1_direction * T.floor(dis1 / L)  #the integer part
        v1 = delta_W1 - k1 * L  #the decimal part
        Prob1 = T.abs_(v1 / L)  #the transfer probability
        Prob1 = T.tanh(
            th * Prob1
        )  #the nonlinear tanh() function accelerates the state transfer

        delta_W2 = updates[param] - param
        delta_W2_direction = T.cast(T.sgn(delta_W2), theano.config.floatX)
        dis2 = T.abs_(delta_W2)  #the absolute distance
        k2 = delta_W2_direction * T.floor(dis2 / L)  #the integer part
        v2 = delta_W2 - k2 * L  #the decimal part
        Prob2 = T.abs_(v2 / L)  #the transfer probability
        Prob2 = T.tanh(
            th * Prob2
        )  #the nonlinear tanh() function accelerates the state transfer

        srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
        Gate1 = T.cast(
            srng.binomial(n=1, p=Prob1,
                          size=T.shape(Prob1)), theano.config.floatX
        )  # Gate1 is a binary variable with probability of Prob1 to be 1
        Gate2 = T.cast(
            srng.binomial(n=1, p=Prob2,
                          size=T.shape(Prob2)), theano.config.floatX
        )  # Gate2 is a binary variable with probability of Prob2 to be 1

        delta_W1_new = (k1 + delta_W1_direction *
                        Gate1) * L  #delta_W1_new = k*L where k is an integer
        updates_param1 = T.clip(parambest + delta_W1_new, -H, H)
        updates_param1 = weight_tune(
            updates_param1, -H, H
        )  #fine tuning for guaranteeing each element strictly constrained in the discrete space

        delta_W2_new = (k2 + delta_W2_direction *
                        Gate2) * L  #delta_W2_new = k*L where k is an integer
        updates_param2 = T.clip(param + delta_W2_new, -H, H)
        updates_param2 = weight_tune(
            updates_param2, -H, H
        )  #fine tuning for guaranteeing each element strictly constrained in the discrete space

        # if update_type<100, the weight probabilistically transfers from parambest to state_rand, which helps to search the global minimum
        # else it probabilistically transfers from param to a state nearest to updates[param]
        updates[param] = T.switch(T.lt(update_type, 100), updates_param1,
                                  updates_param2)

    return updates
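
The core of the update above is a stochastic rounding of a continuous move onto the discrete grid Z_N with spacing L = 2H / 2^N: the whole number of grid steps is always taken, and one extra step is taken with probability tanh(th * |remainder| / L). A NumPy sketch of just that rule, leaving out the weight_tune fine-tuning and the parambest / global-search branch (function name and defaults are illustrative):

import numpy as np

def stochastic_round_to_grid(param, target, H, N, th, rng=np.random):
    L = 2.0 * H / 2 ** N                            # grid spacing of Z_N
    delta = target - param
    direction = np.sign(delta)
    k = direction * np.floor(np.abs(delta) / L)     # integer part of the move
    remainder = delta - k * L                       # decimal part of the move
    prob = np.tanh(th * np.abs(remainder / L))      # transfer probability
    gate = rng.binomial(n=1, p=prob)                # 1 with probability prob
    return np.clip(param + (k + direction * gate) * L, -H, H)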
Exemplo n.º 47
0
class DeepFishNet150:
    '''
        this class represents a convolutional neural network with 3 convolutional layers
        and 2 fully connected layers connected with a final softmax layer.

        Predicts 2 classes (objects and non-objects)
    '''
    def __init__(self,
                 imgSize=None,
                 crossvalidid=None,
                 loadData=True,
                 mode=None,
                 modelToLoad=None,
                 randomData=False,
                 dropout_params=None,
                 caffeModelName=None,
                 total_epochs=10):
        '''
            DeepFishNet150 constructor.
            Initializes variables and the weights associated with the network
        '''
        assert (mode != None)

        self.mode = mode
        self.srng = RandomStreams()
        self.imgSize = imgSize
        self.dataMatTrain = None
        self.labelMatTrain = None
        self.dataMatTest = None
        self.labelMatTest = None
        self.dropout_params = dropout_params
        self.total_epochs = total_epochs
        self.crossvalidid = crossvalidid

        self.totalTrainSamples = None
        self.totalTestSamples = None
        self.randomData = None
        self.caffeModelName = caffeModelName
        self.modelToLoad = modelToLoad
        # initialize your model
        self.initializeModel()

        if (mode == 'Train'):
            print 'call train your model'
            self.trainThisModel()
            # save your model for future use
            #self.saveThisModel()
        elif (mode == 'Test'):
            assert (modelToLoad != None)
            self.loadThisModel(modelToLoad)
            pass
        pass

    def moveToCaffeDir(self):
        '''
            utility function to change the current working directory to the caffe directory
        '''
        os.chdir(expanduser('~') + '/Programs/caffe/')
        pass

    def matrifyMyData(self):
        '''
            converts the training labels to one-hot form, e.g.
            [1, 0], [0, 1], [0, 1], .....
        '''

        nTrainLabel = []
        # nTestLabel = []
        self.uniqueClasses = np.unique(self.labelMatTrain)
        self.nClasses = self.uniqueClasses.shape[0]
        # print "nClasses ",self.nClasses, self.uniqueClasses
        dLabel = np.eye(self.nClasses)

        for i in range(self.totalTrainSamples):
            #print self.labelMatTrain[i][0], dLabel[self.labelMatTrain[i][0]]
            nTrainLabel.append(dLabel[self.labelMatTrain[i][0]])
        nTrainLabel = np.array(nTrainLabel)

        self.labelMatTrain = nTrainLabel

        return

    def floatX(self, X):
        '''
            float casting your input
        '''
        return np.asarray(X, dtype=theano.config.floatX)

    def init_weights(self,
                     shape,
                     weightType=None,
                     typeLayer=None,
                     caffeLayerName=None):
        '''
            return randomly initialized weights
        '''
        return theano.shared(self.floatX(np.random.randn(*shape) * 0.01),
                             borrow=True)

    def sigmoid(self, X):
        '''
            # 1 apply non-linear activation function of the given input.
        '''
        return 1.0 / (1.0 + T.exp(-X))

    def rectify(self, X):
        '''
            # 1 apply non-linear activation function of the given input.
            # 2 apparently ReLU is faster than sigmoid/tanh
        '''
        return T.maximum(X, 0.)

    def softmax(self, X):
        '''
            # get your final softmax classifier so that it returns probabilities
        '''
        # subtract the row-wise max for numerical stability
        e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
        return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

    def dropout(self, X, p=0.):
        '''
            # define dropout function with the given probability of retaining
        '''
        if p > 0:
            retain_prob = 1 - p
            X *= self.srng.binomial(X.shape,
                                    p=retain_prob,
                                    dtype=theano.config.floatX)
            X /= retain_prob
        return X

    def RMSprop(self, costC, paramsC, lr=0.02, rho=0.9, epsilon=1e-6):
        '''
            # your cost minimizing function.
        '''
        grads = T.grad(cost=costC, wrt=paramsC)
        updates = []
        ii = 0
        # print len(params)
        # print len(grads)
        for p, g in zip(paramsC, grads):
            # ii += 1
            # print ii, p.get_value()
            acc = theano.shared(p.get_value() * 0.)
            acc_new = rho * acc + (1 - rho) * g**2
            gradient_scaling = T.sqrt(acc_new + epsilon)
            g = g / gradient_scaling
            updates.append((acc, acc_new))
            updates.append((p, p - lr * g))
        return updates
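
    # Written out, the update above is plain RMSprop:
    #   acc   <- rho * acc + (1 - rho) * g^2
    #   param <- param - lr * g / sqrt(acc + epsilon)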

    def model(self,
              X,
              w1,
              w2,
              w3,
              w4,
              w5,
              w_output,
              p_drop_conv=0,
              p_drop_hidden=0):
        '''
            # your main model
            3 convolutional layers
            2 fully connected layers
            1 SoftMax layer
        '''
        # first convolutional layer
        if (self.mode == "Train"):
            l1a = self.rectify(conv2d(X, w1, border_mode='full'))
            l1 = max_pool_2d(l1a, (2, 2), st=None, padding=(0, 0), mode='max')
            l1 = self.dropout(l1, p_drop_conv)
        elif (self.mode == "Test"):
            l1a = self.rectify(
                conv2d(X, w1 * (1 - p_drop_conv), border_mode='full'))
            l1 = max_pool_2d(l1a, (2, 2), st=None, padding=(0, 0), mode='max')
        # convOut1 = conv2d(X, w1)
        convOut1 = l1a

        # second convolutional layer

        if (self.mode == "Train"):
            l2a = self.rectify(conv2d(l1, w2))
            l2 = max_pool_2d(l2a, (2, 2), st=None, padding=(0, 0), mode='max')
            l2 = self.dropout(l2, p_drop_conv)
        elif (self.mode == "Test"):
            l2a = self.rectify(conv2d(l1, w2 * (1 - p_drop_conv)))
            l2 = max_pool_2d(l2a, (2, 2), st=None, padding=(0, 0), mode='max')

        # # third convolutional layer
        if (self.mode == "Train"):
            l3a = self.rectify(conv2d(l2, w3))
            l3 = max_pool_2d(l3a, (2, 2), st=None, padding=(0, 0), mode='max')
            l3 = self.dropout(l3, p_drop_conv)
        elif (self.mode == "Test"):
            l3a = self.rectify(conv2d(l2, w3 * (1 - p_drop_conv)))
            l3 = max_pool_2d(l3a, (2, 2), st=None, padding=(0, 0), mode='max')

        # # flatten the output
        l3 = T.flatten(l3, outdim=2)

        # 1st fully connected layer
        if (self.mode == "Train"):
            l4 = self.rectify(T.dot(l3, w4))
            l4 = self.dropout(l4, p_drop_hidden)
        elif (self.mode == "Test"):
            l4 = self.rectify(T.dot(l3, w4 * (1 - p_drop_hidden)))

        # 2nd fully connected layer
        if (self.mode == "Train"):
            l5 = self.rectify(T.dot(l4, w5))
            l5 = self.dropout(l5, p_drop_hidden)
        elif (self.mode == "Test"):
            l5 = self.rectify(T.dot(l4, w5 * (1 - p_drop_hidden)))

        # connected the above output to softmax layer
        pyx = self.softmax(T.dot(l5, w_output))

        return l1, l2, l3, l4, l5, pyx, convOut1

    def getL1Norm(self, params, scaleForm=0.0001):
        '''
            get L1 normalization on the weights.
            performing regularization using L1 reduces the size of the weights and makes them sparse
        '''
        tsum = 0
        for eachParam in params:
            tsum += abs(eachParam).sum()
        return tsum * scaleForm
        pass

    def getL2Norm(self, params, scaleForm=0.0001):
        '''
            get L2 normalization on the weights.
            performing regularization using L2 gives rise to a unique solution for the weights
        '''
        tsum = 0
        for eachParam in params:
            tsum += T.sqrt((eachParam**2).sum())
        return tsum * scaleForm

    def getL2NormSquare(self, params, scaleForm=0.0001):
        '''
            get L2 (square) normalization  on the weights.
            less computationally expensive than pure L2 normalization
        '''
        tsum = 0
        for eachParam in params:
            tsum += abs(eachParam**2).sum()
        return tsum * scaleForm

    def initializeModel(self):
        '''
            define your deep learning model
        '''
        print 'defining model'

        X = T.ftensor4()
        Y = T.fmatrix()

        #initialize your weights, kernels
        # format n kernels, n channels, kernel_w x kernel_h
        # 20 kernels over the 3-channel input with 5 x 5 sized kernels
        w1 = self.init_weights((20, 3, 5, 5),
                               weightType='Xavier',
                               caffeLayerName='conv1')

        # 50 20-channel 5 x 5 sized kernel
        w2 = self.init_weights((50, 20, 5, 5),
                               weightType='Xavier',
                               caffeLayerName='conv2')

        # 50 50-channel 4 x 4 sized kernels
        w3 = self.init_weights((50, 50, 4, 4),
                               weightType='Xavier',
                               caffeLayerName='conv3')

        # flatten the inputs and pass to fully connected layer
        w4 = self.init_weights((14450, 1000), weightType='Xavier')

        # flatten the inputs and pass to fully connected layer
        w5 = self.init_weights((1000, 500), weightType='Xavier')

        # flatten the inputs and pass to fully connected layer
        w_output = self.init_weights((500, 2), weightType='Xavier')

        # define your deep model
        if (self.dropout_params == None):
            # if there is no default dropout params mentioned, just set them manually
            self.dropout_params = {}
            self.dropout_params['conv'] = 0.1
            self.dropout_params['fc'] = 0.2
        print 'initializing with dropout_params: ', self.dropout_params[
            'conv'], self.dropout_params['fc']
        noise_l1, noise_l2, noise_l3, noise_l4, noise_l5, noise_py_x, convOut1 = self.model(
            X,
            w1,
            w2,
            w3,
            w4,
            w5,
            w_output,
            p_drop_conv=self.dropout_params['conv'],
            p_drop_hidden=self.dropout_params['fc'])

        # get the predicted label from the predicted probabilities
        y_x = T.argmax(noise_py_x, axis=1)
        # y_x = noise_py_x >= 0.5

        self.learning_rate = 0.0001

        self.params = [w1, w2, w3, w4, w5, w_output]

        L1_norm = self.getL1Norm(self.params)
        L2_norm = self.getL2Norm(self.params)

        # pd = np.array(self.params)
        # mean categorical cross-entropy; note that L1_norm / L2_norm are
        # computed above but not added to the cost here (a short sketch after
        # this class shows one way to add them)
        self.cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))

        self.paramUpdates = self.RMSprop(self.cost,
                                         self.params,
                                         lr=self.learning_rate)
        #self.paramUpdates = self.MomentumOptimizer(self.cost, self.params, lr = self.learning_rate)

        if (self.modelToLoad != None):
            self.loadThisModel(self.modelToLoad)

        # self.cost = T.mean((T.nnet.binary_crossentropy(noise_py_x, Y)))

        print 'compiling functions'
        print 'current learning rate: ', self.learning_rate
        start_compilation_time = time.clock()
        if (self.mode == "Train"):
            print 'compiling train function starting at ', strftime(
                "%Y-%m-%d %H:%M:%S")
            self.train = theano.function(inputs=[X, Y],
                                         outputs=self.cost,
                                         updates=self.paramUpdates,
                                         allow_input_downcast=True)
        print 'compiling predict function'
        self.predict = theano.function(inputs=[X],
                                       outputs=y_x,
                                       allow_input_downcast=True)
        print 'compiling predictProb function'
        self.predictProb = theano.function(inputs=[X],
                                           outputs=noise_py_x,
                                           allow_input_downcast=True)
        end_compilation_time = time.clock()
        self.getFirstLayerOutput = theano.function(inputs=[X],
                                                   outputs=convOut1)
        print 'compiled the functions, ended at ', strftime(
            "%Y-%m-%d %H:%M:%S")
        print 'time taken to compile the functions: ', end_compilation_time - start_compilation_time

    def trainThisModel(self):
        '''
            iterate through the data
            train the classifier
        '''
        print 'training the model'
        # self.total_epochs = 10
        self.mini_batch_size = 32

        still_looping = True
        for each_epoch in range(self.total_epochs):
            if (still_looping == False):
                print 'exiting each_epoch loop'
                break

            print '--' * 5
            print 'epoch: ', each_epoch
            print '--' * 5
            iterId = 0
            costList = []
            self.cost = 0

            # path to the training LMDB for this cross-validation split
            trainFile = '/home/ganymede/Datasets/cross validation/' + str(
                self.crossvalidid) + '/train_lmdb_' + str(self.imgSize) + '/'
            # self.testMean = np.load('/home/jason/Desktop/Robotics/Dataset/lmdb generator/data_lmdb.npy')
            # self.trainMean = np.reshape(trainMean, (1, 3, self.imgSize, self.imgSize))

            lmdb_env = lmdb.open(trainFile)
            lmdb_txn = lmdb_env.begin()
            lmdb_cursor = lmdb_txn.cursor()
            datum = caffe_pb2.Datum()
            dataList = []
            labelList = []
            for key, value in lmdb_cursor:
                datum.ParseFromString(value)
                label = datum.label
                data = caffe.io.datum_to_array(datum)
                dataList.append(data)
                labelList.append(label)

                if (len(dataList) == self.mini_batch_size
                        and len(labelList) == self.mini_batch_size):

                    self.dataMatTrain = np.array(dataList)
                    self.labelMatTrain = np.array(labelList)
                    self.totalTrainSamples = self.dataMatTrain.shape[0]
                    # print self.dataMatTrain.shape, self.labelMatTrain.shape
                    self.labelMatTrain = self.labelMatTrain.reshape(
                        (self.mini_batch_size, 1))
                    # print self.dataMatTrain.shape, self.labelMatTrain.shape
                    self.matrifyMyData()
                    # print self.dataMatTrain.shape, self.labelMatTrain.shape
                    trX, trY = self.dataMatTrain, self.labelMatTrain
                    trX = self.getMeanNormalizedData(trX, 'Train')
                    ccost = self.train(trX, trY)

                    # print 'mean: ',np.mean(trX,axis=0)
                    predVals = self.predict(trX)
                    if (iterId % 100 == 0):
                        print 'epoch: ', each_epoch, 'iter: ' + str(
                            iterId) + ', cost: ', ccost
                        costList.append(ccost)
                        print '--'
                    iterId += 1
                    dataList = []
                    labelList = []
            avg_cost = sum(costList) / float(len(costList))
            self.saveThisModel('c' + str(self.crossvalidid) + "_" +
                               str(each_epoch) + '_' + str(avg_cost) +
                               '_TempModel_DeepLearningNode_' +
                               str(self.imgSize) + '.npz')

    def saveThisModel(self, fileName=None):
        '''
            get the variables from the classifier model
            save the model to disk
        '''
        os.chdir(curDirName)
        params = []
        for eachParam in self.params:
            params.append(eachParam.get_value())
        print len(params)
        print params[0].shape
        # try saving with npz format
        if (fileName == None):
            fileName = 'MyLeNet_' + str(
                strftime("%Y-%m-%d %H:%M:%S")) + 'total_epochs_' + str(
                    self.total_epochs) + '.npz'
        np.savez_compressed(fileName, params=params)
        print 'saving to ', os.getcwd()
        print 'saved to ', fileName
        self.fileName = fileName
        # self.moveToCaffeDir()
        pass

    def loadThisModel(self, modelToLoad):
        '''
            take the path of the classifier
            load the classifier model
            assign the variables to initialized model
        '''
        print 'loading this model'
        print modelToLoad
        print os.path.exists(modelToLoad) == True
        params = np.load(modelToLoad)
        allParams = params['params']
        print type(allParams)
        print allParams.shape
        for eachParam in range(allParams.shape[0]):
            print allParams[eachParam].shape
        #self.params = None
        for eachParam in range(allParams.shape[0]):
            self.params[eachParam].set_value(allParams[eachParam])
        print 'loaded the saved convnet classifier params'
        pass

    def writeFirstLayerToDisk(self, imgArray):
        '''
            take an image
            convolve it with the first-layer filters
            (the cv2.imwrite calls below are commented out, so nothing is
            actually written to disk at the moment)
        '''
        # print imgArray.shape
        convOut1 = self.getFirstLayerOutput(imgArray)
        # print convOut1.shape
        # cv2.imwrite(curDirName+"/vis/0.jpg", convOut1[0, 0,:,:].reshape(204, 204))
        # cv2.imwrite(curDirName+"/vis/1.jpg", convOut1[0, 1,:,:].reshape(204, 204))
        pass

    def predictThisImage(self, imgArray=None, meanSubtracted=True):
        '''
            take an image
            get the mean-normalized image
            return the predicted class of the image
        '''
        if (meanSubtracted == False):
            imgArray = self.getMeanNormalizedData(imgArray, 'Test')
        return self.predict(imgArray)

    def predictThisImageWithProbability(self,
                                        imgArray=None,
                                        meanSubtracted=True):
        '''
            take an image
            get the mean-normalized image
            return the probabilities of the image being an object / non-object
        '''
        if (meanSubtracted == False):
            imgArray = self.getMeanNormalizedData(imgArray, 'Test')
        return self.predictProb(imgArray)

    def getMeanNormalizedData(self, data, mode):
        '''
            1. take the image
            2. reshape it
            3. subtract the mean image
            4. divide by 255
            5. return the image
        '''
        self.testMean = np.load('/home/ganymede/Datasets/cross validation/' +
                                str(self.crossvalidid) + '/test_lmdb_' +
                                str(self.imgSize) + '.npy')
        self.testMean = np.reshape(self.testMean,
                                   (1, 3, self.imgSize, self.imgSize))
        data = data.astype('float32')
        # print 'subtracting mean'
        if (mode == "Train"):
            data -= self.trainMean
        elif (mode == "Test"):
            data -= self.testMean
        data = data / float(255.0)
        return data
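# --- Illustrative sketch (not part of the original example above) ---
# The class above computes L1/L2 penalties (getL1Norm / getL2Norm) but never adds
# them to the cross-entropy cost. A minimal, self-contained illustration of how
# the two could be combined; the weight shape and the 1e-4 scale are assumptions
# made only for this sketch.
import numpy as np
import theano
import theano.tensor as T

X_demo = T.fmatrix('X_demo')
Y_demo = T.fmatrix('Y_demo')
W_demo = theano.shared(np.random.randn(10, 2).astype('float32'), name='W_demo')

probs = T.nnet.softmax(T.dot(X_demo, W_demo))
xent = T.mean(T.nnet.categorical_crossentropy(probs, Y_demo))
l1_penalty = abs(W_demo).sum() * 1e-4        # what getL1Norm contributes per parameter
l2_penalty = (W_demo ** 2).sum() * 1e-4      # the cheaper "L2 square" variant
cost_with_reg = xent + l1_penalty + l2_penalty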
Exemplo n.º 48
0
class RVal(Elem, TensorWrapped, Masked):  # random value
    def __init__(self, seed=None, **kw):
        super(RVal, self).__init__(**kw)
        if seed is None:
            seed = np.random.randint(0, 1e6)
        self.rng = RandomStreams(seed=seed)
        self.value = None

    def binomial(self, shape, n=1, p=0.5, ndim=None, dtype="int32"):
        if isinstance(shape, Elem):
            shape = shape.d
        self.value = self.rng.binomial(shape, n, p, ndim, dtype)
        return self

    def normal(self, shape, avg=0.0, std=1.0, ndim=None, dtype=None):
        if isinstance(shape, Elem):
            shape = shape.d
        self.value = self.rng.normal(shape, avg, std, ndim, dtype)
        return self

    def multinomial(self,
                    shape,
                    n=1,
                    pvals=None,
                    without_replacement=False,
                    ndim=None,
                    dtype="int32"):
        if isinstance(shape, Elem):
            shape = shape.d
        if without_replacement:
            self.value = self.rng.multinomial_wo_replacement(
                shape, n, pvals, ndim, dtype)
        else:
            self.value = self.rng.multinomial(shape, n, pvals, ndim, dtype)
        return self

    def gumbel(self, shape, eps=1e-10):
        if isinstance(shape, Elem):
            shape = shape.d
        x = self.rng.uniform(shape, 0.0, 1.0)
        self.value = -theano.tensor.log(-theano.tensor.log(x + eps) + eps)
        return self

    @property
    def d(self):
        return self.value

    @property
    def v(self):
        return self.value.eval()

    @property
    def allparams(self):
        return set()

    @property
    def allupdates(self):
        return {}

    @property
    def all_extra_outs(self):
        return {}
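# --- Illustrative sketch (not part of the original example above) ---
# RVal.gumbel() produces standard Gumbel noise g = -log(-log(u)). A minimal
# sketch of the Gumbel-max trick built directly on Theano's RandomStreams:
# adding Gumbel noise to log-probabilities and taking the argmax draws a sample
# from the corresponding categorical distribution. The probabilities below are
# assumptions made only for this sketch.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(seed=42)
log_probs = T.constant(np.log([0.2, 0.5, 0.3]).astype('float32'))
u = rng.uniform(size=log_probs.shape, low=0.0, high=1.0)
gumbel_noise = -T.log(-T.log(u + 1e-10) + 1e-10)
sample = T.argmax(log_probs + gumbel_noise)
draw = theano.function([], sample)   # each call returns 0, 1 or 2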
Exemplo n.º 49
0
class FourwayLstm(superclass.RNN):
    '''
    Fields:
    '''

    hidden_dimension = 200
    input_dimension = 81
    srng = None
    use_cross_entropy_loss = True
    '''
    Initialization:
    '''
    def __init__(self, optimizer_config_path, loss="cross-entropy"):
        n_layers = 4

        self.input_lstm_layer = network_ops.fourdirectional_lstm_layer(
            'input_layer_', self.input_dimension * 2 + 1,
            self.hidden_dimension)

        self.lstm_layers = [
            network_ops.fourdirectional_lstm_layer('layer_' + str(l),
                                                   self.hidden_dimension * 4,
                                                   self.hidden_dimension)
            for l in range(n_layers - 1)
        ]

        self.output_convolution = network_ops.linear_layer_on_tensor(
            'output_layer', self.hidden_dimension * 4, 1)

        self.layers = [self.input_lstm_layer
                       ] + self.lstm_layers + [self.output_convolution]

        self.use_cross_entropy_loss = loss == "cross-entropy"

        super().__init__('sentence', optimizer_config_path)

    '''
    Theano functions:
    '''

    def __pairwise_features(self, V, Vs, sentence_length):
        thingy, _ = theano.scan(
            fn=lambda x, y: T.concatenate([y, T.zeros(1), x]),
            sequences=Vs,
            non_sequences=V)

        root_feature = T.concatenate(
            (T.ones(1), T.zeros(self.input_dimension)))
        root_features = T.concatenate((V, root_feature))

        flat_version = thingy.flatten()
        with_root = T.concatenate((root_features, flat_version))
        in_shape = T.reshape(with_root,
                             newshape=(sentence_length + 1,
                                       self.input_dimension * 2 + 1))
        return in_shape

    def theano_sentence_loss(self, Vs, gold):
        preds = self.theano_sentence_prediction(Vs)

        if self.use_cross_entropy_loss:
            losses = T.nnet.categorical_crossentropy(preds, gold)
        else:
            losses = T.pow(preds - gold, 2)

        return T.sum(losses)

    def dropout(self, tensor, dropout_prob=0.5, training=True):
        if not training:
            return tensor

        if self.srng is None:
            self.srng = RandomStreams(seed=12345)

        keep_prob = 1.0 - dropout_prob
        mask = self.srng.binomial(size=tensor.shape,
                                  p=keep_prob,
                                  dtype='floatX')
        return tensor * mask / keep_prob

    def theano_sentence_prediction(self, Vs):
        pairwise_vs, _ = theano.scan(fn=self.__pairwise_features,
                                     outputs_info=None,
                                     sequences=Vs,
                                     non_sequences=[Vs, Vs.shape[0]])

        pairwise_vs = self.dropout(pairwise_vs,
                                   dropout_prob=0.2,
                                   training=self.input_lstm_layer.training)

        full_matrix = self.input_lstm_layer.function(pairwise_vs)

        for layer in self.lstm_layers:
            full_matrix = self.dropout(full_matrix,
                                       dropout_prob=0.5,
                                       training=self.input_lstm_layer.training)
            full_matrix = layer.function(full_matrix)

        full_matrix = self.dropout(full_matrix,
                                   dropout_prob=0.5,
                                   training=self.input_lstm_layer.training)

        final_matrix = self.output_convolution.function(full_matrix)[:, :, 0]

        if self.use_cross_entropy_loss:
            final_matrix = T.nnet.softmax(final_matrix)

        return final_matrix
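# --- Illustrative sketch (not part of the original model above) ---
# FourwayLstm.dropout() uses "inverted" dropout: the kept activations are divided
# by keep_prob so the expected value of the output equals the input, which is why
# no rescaling is needed at test time. A minimal standalone check of that
# property (the shapes and keep_prob below are assumptions for the sketch):
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=12345)
x = T.matrix('x')
keep_prob = 0.8
mask = srng.binomial(size=x.shape, p=keep_prob, dtype=theano.config.floatX)
dropped = x * mask / keep_prob

f = theano.function([x], dropped)
data = np.ones((1000, 100), dtype=theano.config.floatX)
print(np.mean(f(data)))   # close to 1.0: the expectation is preserved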
Exemplo n.º 50
0
def test_binomial():
    #TODO: test size=None, ndim=X
    #TODO: test size=X, ndim!=X.ndim
    #TODO: test random seed in legal value(!=0 and other)
    #TODO: test sample_size not a multiple of guessed #streams
    #TODO: test size=Var, with shape that change from call to call
    # we test size as a tuple of ints and as a tensor.shape.
    # we test the param p with float values.

    if mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']:
        sample_size = (10, 50)
        steps = 50
        rtol = 0.02
    else:
        sample_size = (500, 50)
        steps = int(1e3)
        rtol = 0.01

    x = tensor.matrix()
    v = tensor.vector()
    for mean in [0.1, 0.5]:
        for size, var_input, input in [
            (sample_size, [], []),
            (x.shape, [x], [numpy.zeros(sample_size, dtype=config.floatX)])
        ]:

            #print ''
            #print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean)
            R = MRG_RandomStreams(234, use_cuda=False)
            # Note: we specify `nstreams` to avoid a warning.
            u = R.binomial(size=size,
                           p=mean,
                           nstreams=rng_mrg.guess_n_streams(size, warn=False))
            f = theano.function(var_input, u, mode=mode)
            #theano.printing.debugprint(f)
            out = f(*input)
            #print 'random?[:10]\n', out[0, 0:10]
            #print 'random?[-1,-10:]\n', out[-1, -10:]
            basictest(f,
                      steps,
                      sample_size,
                      prefix='mrg  cpu',
                      inputs=input,
                      allow_01=True,
                      target_avg=mean,
                      mean_rtol=rtol)

            if mode != 'FAST_COMPILE' and cuda_available:
                #print ''
                #print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean)
                R = MRG_RandomStreams(234, use_cuda=True)
                u = R.binomial(size=size,
                               p=mean,
                               dtype='float32',
                               nstreams=rng_mrg.guess_n_streams(size,
                                                                warn=False))
                #well, it's really that this test w GPU doesn't make sense otw
                assert u.dtype == 'float32'
                f = theano.function(
                    var_input,
                    theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                               borrow=True),
                    mode=mode_with_gpu)
                #theano.printing.debugprint(f)
                gpu_out = numpy.asarray(f(*input))
                #print 'random?[:10]\n', gpu_out[0, 0:10]
                #print 'random?[-1,-10:]\n', gpu_out[-1, -10:]
                basictest(f,
                          steps,
                          sample_size,
                          prefix='mrg  gpu',
                          inputs=input,
                          allow_01=True,
                          target_avg=mean,
                          mean_rtol=rtol)
                numpy.testing.assert_array_almost_equal(out,
                                                        gpu_out,
                                                        decimal=6)

            #print ''
            #print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size),
            #                                                       mean)
            RR = theano.tensor.shared_randomstreams.RandomStreams(234)

            uu = RR.binomial(size=size, p=mean)
            ff = theano.function(var_input, uu, mode=mode)
            # It's not our problem if numpy generates 0 or 1
            basictest(ff,
                      steps,
                      sample_size,
                      prefix='numpy',
                      allow_01=True,
                      inputs=input,
                      target_avg=mean,
                      mean_rtol=rtol)
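# --- Illustrative sketch (not part of the original test above) ---
# What basictest() essentially verifies for the binomial case: the empirical
# mean of the samples should match the requested probability p within mean_rtol.
# A minimal standalone version of that check (sizes and tolerance are
# assumptions for the sketch):
import numpy
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

R = MRG_RandomStreams(234)
u = R.binomial(size=(500, 50), p=0.1)
f = theano.function([], u)
samples = numpy.concatenate([f().ravel() for _ in range(100)])
assert abs(samples.mean() - 0.1) < 0.01   # empirical mean close to p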
Exemplo n.º 51
0
class SpatialDropoutLayer(Layer):
    """Spatial dropout layer
    Sets whole filter activations to zero with probability p. See notes for
    disabling dropout during testing.
    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        the layer feeding into this layer, or the expected input shape
    p : float or scalar tensor
        The probability of setting a value to zero
    rescale : bool
        If ``True``, the input is rescaled with ``input / (1 - p)`` when
        ``deterministic`` is ``False``.
    Notes
    -----
    The spatial dropout layer is a regularizer that randomly sets the values
    of whole feature maps to zero. It is an adaptation of normal dropout that
    is generally useful in fully convolutional settings, such as [1]_.

    It is also called a feature dropout layer.

    During training you should set ``deterministic`` to ``False`` and during
    testing you should set ``deterministic`` to ``True``.
    If ``rescale`` is ``True``, the input is scaled with ``input / (1 - p)``
    when ``deterministic`` is ``False``; see the references for further
    discussion. Note that this implementation scales the input at training
    time. A short usage sketch follows the class definition below.
    References
    ----------
    .. [1] Oliveira, G., Valada, A., Bollen, C., Burgard, W., Brox, T. (2016):
           Deep Learning for Human Part Discovery in Images. IEEE
           International Conference on Robotics and Automation (ICRA), IEEE,
           2016.
    """
    def __init__(self, incoming, p=0.5, rescale=True, **kwargs):
        super(SpatialDropoutLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p = p
        self.rescale = rescale

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        deterministic : bool
            If true dropout and scaling is disabled, see notes
        """
        if deterministic or self.p == 0:
            return input
        else:
            # Using theano constant to prevent upcasting
            one = T.constant(1)

            retain_prob = one - self.p
            if self.rescale:
                input /= retain_prob

            mask = self._srng.binomial(input.shape[:2],
                                       p=retain_prob,
                                       dtype=input.dtype)
            axes = [0, 1] + (['x'] * (input.ndim - 2))
            mask = mask.dimshuffle(*axes)

            return input * mask
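# --- Illustrative sketch (not part of the original layer above) ---
# A minimal usage sketch, assuming Lasagne is importable and SpatialDropoutLayer
# is the class defined above: because the mask only spans the first two axes,
# whole (sample, channel) feature maps are zeroed together.
import theano.tensor as T
import lasagne

x = T.tensor4('x')
l_in = lasagne.layers.InputLayer((None, 16, 32, 32), input_var=x)
l_sd = SpatialDropoutLayer(l_in, p=0.3)

train_out = lasagne.layers.get_output(l_sd, deterministic=False)  # dropout active
test_out = lasagne.layers.get_output(l_sd, deterministic=True)    # identity pass-through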
Exemplo n.º 52
0
class DropoutLayer(Layer):
    """Dropout layer
    Sets values to zero with probability p. See notes for disabling dropout
    during testing.
    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        the layer feeding into this layer, or the expected input shape
    p : float or scalar tensor
        The probability of setting a value to zero
    rescale : bool
        If ``True`` (the default), scale the input by ``1 / (1 - p)`` when
        dropout is enabled, to keep the expected output mean the same.
    shared_axes : tuple of int
        Axes to share the dropout mask over. By default, each value can be
        dropped individually. ``shared_axes=(0,)`` uses the same mask across
        the batch. ``shared_axes=(2, 3)`` uses the same mask across the
        spatial dimensions of 2D feature maps.
    Notes
    -----
    The dropout layer is a regularizer that randomly sets input values to
    zero; see [1]_, [2]_ for why this might improve generalization.
    The behaviour of the layer depends on the ``deterministic`` keyword
    argument passed to :func:`lasagne.layers.get_output`. If ``True``, the
    layer behaves deterministically, and passes on the input unchanged. If
    ``False`` or not specified, dropout (and possibly scaling) is enabled.
    Usually, you would use ``deterministic=False`` at train time and
    ``deterministic=True`` at test time.
    See also
    --------
    dropout_channels : Drops full channels of feature maps
    spatial_dropout : Alias for :func:`dropout_channels`
    dropout_locations : Drops full pixels or voxels of feature maps
    References
    ----------
    .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I.,
           Salakhutdinov, R. R. (2012):
           Improving neural networks by preventing co-adaptation of feature
           detectors. arXiv preprint arXiv:1207.0580.
    .. [2] Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever,
           I., & Salakhutdinov, R. R. (2014):
           Dropout: A Simple Way to Prevent Neural Networks from Overfitting.
           Journal of Machine Learning Research, 15(1), 1929-1958.
    """
    def __init__(self, incoming, p=0.5, rescale=True, shared_axes=(),
                 **kwargs):
        super(DropoutLayer, self).__init__(incoming, **kwargs)
        #TODO: use same random
        #self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        r = get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(r)
        print(self, r)
        self.p = p
        self.rescale = rescale
        self.shared_axes = tuple(shared_axes)

    def get_output_for(self, input, deterministic=False, **kwargs):
        if deterministic or self.p == 0:
            return input
        else:
            # Using theano constant to prevent upcasting
            one = T.constant(1, dtype='int8')

            retain_prob = one - self.p
            if self.rescale:
                input /= retain_prob

            # use nonsymbolic shape for dropout mask if possible
            mask_shape = self.input_shape
            if any(s is None for s in mask_shape):
                mask_shape = input.shape

            # apply dropout, respecting shared axes
            if self.shared_axes:
                shared_axes = tuple(a if a >= 0 else a + input.ndim
                                    for a in self.shared_axes)
                mask_shape = tuple(1 if a in shared_axes else s
                                   for a, s in enumerate(mask_shape))
            mask = self._srng.binomial(mask_shape, p=retain_prob,
                                       dtype=input.dtype)
            if self.shared_axes:
                bcast = tuple(bool(s == 1) for s in mask_shape)
                mask = T.patternbroadcast(mask, bcast)
            return input * mask
    def reinit(self):
        r = get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(r)
        print(self, r)
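# --- Illustrative sketch (not part of the original layer above) ---
# How shared_axes shapes the dropout mask, in isolation: axes listed in
# shared_axes get mask dimension 1 and are broadcast, so shared_axes=(2, 3)
# reuses one Bernoulli draw per (sample, channel) across all spatial positions.
# The input shape below is an assumption for the sketch.
input_shape = (32, 16, 28, 28)           # (batch, channels, rows, cols)
shared_axes = (2, 3)
mask_shape = tuple(1 if a in shared_axes else s
                   for a, s in enumerate(input_shape))
print(mask_shape)                        # (32, 16, 1, 1) -> one value per feature map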
Exemplo n.º 53
0
class GRU(Model):
    """ A standard GRU model with no output layer.

    See GRU_softmax or GRU_regression for implementations with an output layer.

    The output is simply the state of the last hidden layer.
    """
    def __init__(self,
                 input_size,
                 hidden_sizes,
                 activation='tanh',
                 use_layer_normalization=False,
                 drop_prob=0.,
                 use_zoneout=False,
                 use_skip_connections=False,
                 seed=1234):
        """
        Parameters
        ----------
        input_size : int
            Number of units each element Xi in the input sequence X has.
        hidden_sizes : int, list of int
            Number of hidden units each GRU should have.
        activation : str
            Activation function to apply on the "cell candidate"
        use_layer_normalization : bool
            Use LayerNormalization to normalize preactivations and stabilize hidden layer evolution
        drop_prob : float
            Dropout/Zoneout probability for recurrent networks. See: https://arxiv.org/pdf/1512.05287.pdf & https://arxiv.org/pdf/1606.01305.pdf
        use_zoneout : bool
            Use a zoneout implementation instead of dropout (a different zoneout mask will be used at each timestep)
        use_skip_connections : bool
            Use skip connections from the input to all hidden layers in the network, and from all hidden layers to the output layer
        seed : int
            Random seed used to generate the dropout/zoneout masks
        """
        self.graph_updates = OrderedDict()
        self._gen = None

        self.input_size = input_size
        self.hidden_sizes = [hidden_sizes
                             ] if type(hidden_sizes) is int else hidden_sizes
        self.activation = activation
        self.use_layer_normalization = use_layer_normalization
        self.drop_prob = drop_prob
        self.use_zoneout = use_zoneout
        self.use_skip_connections = use_skip_connections
        self.seed = seed
        self.srng = MRG_RandomStreams(self.seed)

        layer_class = LayerGRU
        if self.use_layer_normalization:
            layer_class = LayerGruNormalized

        self.layers = []
        last_hidden_size = self.input_size
        for i, hidden_size in enumerate(self.hidden_sizes):
            self.layers.append(
                layer_class(last_hidden_size,
                            hidden_size,
                            activation=activation,
                            name="GRU{}".format(i)))
            last_hidden_size = hidden_size + (input_size if
                                              self.use_skip_connections else 0)

        self.dropout_vectors = {}
        if self.drop_prob and not self.use_zoneout:
            p = 1 - self.drop_prob
            for layer in self.layers:
                self.dropout_vectors[layer.name] = self.srng.binomial(
                    size=(layer.hidden_size, ), n=1, p=p, dtype=floatX) / p

    def initialize(self, weights_initializer=initer.UniformInitializer(1234)):
        for layer in self.layers:
            layer.initialize(weights_initializer)

    @property
    def updates(self):
        return self.graph_updates

    @property
    def hyperparameters(self):
        hyperparameters = {
            'version': 2,
            'input_size': self.input_size,
            'hidden_sizes': self.hidden_sizes,
            'activation': self.activation,
            'use_layer_normalization': self.use_layer_normalization,
            'drop_prob': self.drop_prob,
            'use_zoneout': self.use_zoneout,
            'use_skip_connections': self.use_skip_connections,
            'seed': self.seed
        }

        return hyperparameters

    @property
    def parameters(self):
        parameters = []
        for layer in self.layers:
            parameters += layer.parameters

        return parameters

    def get_init_states(self, batch_size):
        states_h = []
        for i, hidden_size in enumerate(self.hidden_sizes):
            state_h = np.zeros((batch_size, hidden_size), dtype=floatX)
            states_h.append(state_h)

        return states_h

    def _fprop(self, Xi, *args):
        layers_h = []

        input = Xi
        for i, layer in enumerate(self.layers):
            drop_states = None
            drop_value = None
            if self.drop_prob:
                if self.use_zoneout:
                    drop_value = 1.
                    drop_states = self.srng.binomial((layer.hidden_size, ),
                                                     n=1,
                                                     p=1 - self.drop_prob,
                                                     dtype=floatX)
                else:
                    drop_value = 0.
                    drop_states = self.dropout_vectors[layer.name]

            last_h = args[i]
            h = layer.fprop(input, last_h, drop_states, drop_value)
            layers_h.append(h)
            if self.use_skip_connections:
                input = T.concatenate([h, Xi], axis=-1)
            else:
                input = h

        return tuple(layers_h)

    def get_output(self, X):

        outputs_info_h = []
        for hidden_size in self.hidden_sizes:
            outputs_info_h.append(T.zeros((X.shape[0], hidden_size)))

        results, updates = theano.scan(
            fn=self._fprop,
            outputs_info=outputs_info_h,
            sequences=[
                T.transpose(X, axes=(1, 0, 2))
            ])  # We want to scan over sequence elements, not the examples.

        self.graph_updates = updates
        # Put back the examples so they are in the first dimension.
        self.h = T.transpose(results[0], axes=(1, 0, 2))
        return self.h

    def save(self, path):
        savedir = smartutils.create_folder(pjoin(path, type(self).__name__))
        smartutils.save_dict_to_json_file(pjoin(savedir, "hyperparams.json"),
                                          self.hyperparameters)

        params = {param.name: param.get_value() for param in self.parameters}
        assert len(self.parameters) == len(
            params)  # Implies names are all unique.
        np.savez(pjoin(savedir, "params.npz"), **params)

        state = {
            "version":
            1,
            "_srng_rstate":
            self.srng.rstate,
            "_srng_state_updates": [
                state_update[0].get_value()
                for state_update in self.srng.state_updates
            ]
        }
        np.savez(pjoin(savedir, "state.npz"), **state)

    def load(self, path):
        loaddir = pjoin(path, type(self).__name__)

        parameters = np.load(pjoin(loaddir, "params.npz"))
        for param in self.parameters:
            param.set_value(parameters[param.name])

        state = np.load(pjoin(loaddir, 'state.npz'))
        self.srng.rstate[:] = state['_srng_rstate']
        for state_update, saved_state in zip(self.srng.state_updates,
                                             state["_srng_state_updates"]):
            state_update[0].set_value(saved_state)

    @classmethod
    def create(cls, path, **kwargs):
        loaddir = pjoin(path, cls.__name__)
        hyperparams = smartutils.load_dict_from_json_file(
            pjoin(loaddir, "hyperparams.json"))
        hyperparams.update(kwargs)

        if hyperparams['version'] < 2:
            hyperparams['drop_prob'] = hyperparams['dropout_prob']
            del hyperparams['dropout_prob']

        model = cls(**hyperparams)
        model.load(path)
        return model
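# --- Illustrative sketch (not part of the original model above) ---
# The two recurrent regularizers used above, contrasted on a single hidden
# state: dropout zeroes units (drop_value = 0.), zoneout copies the previous
# timestep's value instead (drop_value = 1. keeps h_{t-1}). The keep probability
# of 0.75 is an assumption for the sketch.
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(1234)
h_prev = T.vector('h_prev')
h_new = T.vector('h_new')
keep = srng.binomial(size=h_new.shape, n=1, p=0.75, dtype=h_new.dtype)

h_dropout = h_new * keep / 0.75                    # dropped units become 0 (inverted dropout)
h_zoneout = h_new * keep + h_prev * (1 - keep)     # dropped units keep h_prev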
Exemplo n.º 54
0
def test_undefined_grad():
    srng = MRG_RandomStreams(seed=1234)

    # checking uniform distribution
    low = tensor.scalar()
    out = srng.uniform((), low=low)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low)

    high = tensor.scalar()
    out = srng.uniform((), low=0, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high)

    out = srng.uniform((), low=low, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    out = srng.binomial((), p=prob)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), prob1)

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), (prob1, prob2))

    # checking choice
    p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  prob1)

    # checking normal distribution
    avg = tensor.scalar()
    out = srng.normal((), avg=avg)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)

    out = srng.normal((), avg=avg, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))
Exemplo n.º 55
0
    def __init__(self, nam, W=0, maxlen=0, load=False, training=False):

        self.W = W
        # Create 2 LSTM units (parameters W, U, b), put them into a dictionary,
        # and initialize the parameters with Gaussian initialization
        # Type: Dictionary
        self.maxlen = maxlen
        newp = creatrnnx()
        self.model_name = nam
        # Make the initial weights (W, U, b) of both LSTM units identical
        for i in newp.keys():
            if i[0] == '1':
                newp['2' + i[1:]] = newp[i]

        # Create 5 symbolic tensor variables (y, mask11, mask21, emb11, emb21)
        # Here, config.floatX = 'float32'
        y = T.vector('y', dtype=config.floatX)
        mask11 = T.matrix('mask11', dtype=config.floatX)
        mask21 = T.matrix('mask21', dtype=config.floatX)
        emb11 = T.ftensor3('emb11')
        emb21 = T.ftensor3('emb21')  # 3-D float-type tensor

        # Load the existed model (pre-trained weights) if needed
        if load == True:
            newp = pickle.load(open(nam, 'rb'))

        # Convert 'newp' to shared-tensor-type dictionary 'tnewp'
        # Shared tensor variables
        self.tnewp = init_tparams(newp)

        # Set tensor-type noise
        use_noise = theano.shared(numpy_floatX(0.))

        # Set tensor-type random number generator
        # rng -> random number generator
        trng = RandomStreams(1234)

        # rrng: a 3-D binary random tensor used as the dropout mask
        rate = 0.5
        rrng = trng.binomial(emb11.shape, p=1 - rate, n=1, dtype=emb11.dtype)
        # print "rrng:"
        # print "type of rrng:", type(rrng)
        # print rrng

        # Instantiate the LSTM structure and parameters (core part); proj holds
        # the output produced after feeding in one mini-batch.
        # Implement the LSTM module;
        # Here 'False' -> do NOT apply DROPOUT layers;
        # The input is in the format: (max no. of words in batch, no. of samples, 300)
        # Note that the 1st and 2nd dimensions are exchanged!
        # Only the last timestep of the scan loop returned by getpl2() is needed;
        # the earlier LSTM outputs are discarded.
        # proj11[-1] -> (no. of samples [N], hidden unit dimension) -> (N, 50)
        # proj11 takes the embedding matrix emb11 as input and gives the output of LSTM_A
        proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50,
                        self.tnewp)[-1]
        proj21 = getpl2(emb21, '2lstm1', mask21, False, rrng, 50,
                        self.tnewp)[-1]

        # Define the cost function
        dif = (proj21 - proj11).norm(L=1, axis=1)
        s2 = T.exp(-dif)
        sim = T.clip(s2, 1e-7, 1.0 - 1e-7)  # Similarity
        lr = tensor.scalar(name='lr')  # learning rate
        ys = T.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)
        cost = T.mean((sim - ys)**2)
        ns = emb11.shape[1]
        self.f2sim = theano.function([emb11, mask11, emb21, mask21],
                                     sim,
                                     allow_input_downcast=True)
        self.f_proj11 = theano.function([emb11, mask11],
                                        proj11,
                                        allow_input_downcast=True)  # NOT used
        self.f_cost = theano.function([emb11, mask11, emb21, mask21, y],
                                      cost,
                                      allow_input_downcast=True)  # NOT used

        # Prepare for the backpropogation and gradiant descend
        if training == True:

            # Compute the derivatives of the cost w.r.t. the parameters and
            # average the gradients of the two LSTM models.
            # gradi is the list of gradients of the cost used to update the weights.
            # We average out the gradients by appending them to another list, grads[]:
            # the gradients wrt LSTM_A and wrt LSTM_B are combined so that the same
            # averaged gradient is applied to both LSTMs, keeping them symmetric.
            # wrt: (variable or list of variables) - term(s) for which we want gradients
            gradi = tensor.grad(
                cost, wrt=self.tnewp.values())  # T.grad -> differential
            grads = []
            l = len(gradi)
            for i in range(0, l / 2):
                gravg = (gradi[i] + gradi[i + l / 2]) / (4.0)
                #print i,i+9
                grads.append(gravg)
            for i in range(0, len(self.tnewp.keys()) / 2):
                grads.append(grads[i])

            # Here, the f_grad_shared and f_update are theano functions
            self.f_grad_shared, self.f_update = adadelta(
                lr, self.tnewp, grads, emb11, mask11, emb21, mask21, y, cost)
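# --- Illustrative sketch (not part of the original example above) ---
# The similarity used above compares the two sentence encodings with an L1
# (Manhattan) distance squashed through exp(-d), so identical encodings give
# sim = 1 and distant ones approach 0; gold scores on a 1-5 scale are mapped to
# (y - 1) / 4 before the squared-error cost. A minimal standalone version:
import theano
import theano.tensor as T

proj_a = T.matrix('proj_a')      # (n_samples, hidden_dim) encoding from LSTM_A
proj_b = T.matrix('proj_b')      # encoding from LSTM_B
y = T.vector('y')                # gold relatedness scores in [1, 5]

dif = abs(proj_a - proj_b).sum(axis=1)          # L1 distance per sample
sim = T.clip(T.exp(-dif), 1e-7, 1.0 - 1e-7)
ys = T.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)
cost = T.mean((sim - ys) ** 2)
f_cost = theano.function([proj_a, proj_b, y], cost, allow_input_downcast=True)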
Exemplo n.º 56
0
 def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None,
              sparse=False, cost_scale=1.0, input_scale=1.0,
              L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
              output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
              with_bias=True,
              mask="unity", dropout=0.0, batch_drop=False, batch_norm=False, bn_use_sample=False, layer_drop=0.0, residual=False,
              carry=False,
              sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None,
              dtype='float32',
              **kwargs):
   """
   :param list[NetworkBaseLayer.Layer] sources: list of source layers
   :param int n_out: output dim of W_in and dim of bias
   :param float L1: l1-param-norm regularization
   :param float L2: l2-param-norm regularization
   :param str mask: "unity" or "dropout"
   :type dropout: float
   """
   super(Layer, self).__init__(**kwargs)
   self.index = index
   self.sources = sources; ":type: list[Layer]"
   self.num_sources = len(sources)
   self.D = max([s.D for s in sources if isinstance(s,Layer)] + [0])
   if mask is None: mask = 'none'
   self.set_attr('mask', mask)
   self.set_attr('dropout', dropout)
   self.set_attr('sparse', sparse)
   self.set_attr('bn_use_sample', bn_use_sample)
   self.set_attr('sparse_filtering', sparse_filtering)
   if not trainable:
     self.set_attr('trainable', trainable)  # only store if not default
     self.gradient_scale = 0.0  # just to be sure
   else:
     self.gradient_scale = gradient_scale
   if gradient_scale != 1.0:
     self.set_attr('gradient_scale', gradient_scale)
   self.set_attr('layer_drop', layer_drop)
   assert not carry, "not supported anymore"
   self.set_attr('residual', residual)
   self.set_attr('n_out', n_out)
   self.set_attr('L1', L1)
   self.set_attr('L2', L2)
   if L2_eye:
     self.set_attr('L2_eye', L2_eye)
   self.device = device # if device else str(theano.config.device)
   for s in self.sources:
     s.transfer_output(self.device)
   self.set_attr('varreg', varreg)
   if output_L2_reg:
     self.set_attr('output_L2_reg', output_L2_reg)
   if output_entropy_reg:
     self.set_attr('output_entropy_reg', output_entropy_reg)
   if output_entropy_exp_reg:
     self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
   self.set_attr('batch_norm', batch_norm)
   self.set_attr('input_scale', input_scale)
   if y_in is not None:
     self.y_in = {}
     for k in y_in:
       if not isinstance(y_in[k], T.Variable): continue
       self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
       self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
   else:
     self.y_in = None
   self.constraints = T.constant(0)
   if target:
     self.set_attr('target', target)
   if target_index:
     self.set_attr('target_index', target_index)
     assert target_index in self.network.j
     self.index = index = self.network.j[target_index]
   if cost_scale != 1:
     self.set_attr("cost_scale", cost_scale)
   if with_bias:
     self.b = self.add_param(self.create_bias(n_out), 'b_%s'%self.name)
   else:
     self.set_attr('with_bias', False)
     self.b = numpy.float32(0)
   self.mass = T.constant(1., name = "mass_%s" % self.name, dtype='float32')
   self.masks = [None] * len(self.sources)
   assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
   if mask == "dropout" or (mask == 'none' and dropout > 0):
     assert 0.0 < dropout < 1.0
     # If we apply this mass during training then we don't need any mask or mass for testing.
      # The expected scale of each kept unit should be 1, i.e.
      #   E[mask] * mass = (1 - dropout) * mass = 1,
      # so mass has to be 1 / (1 - dropout).
     self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
     srng = RandomStreams(self.rng.randint(1234) + 1)
     if self.depth > 1:
       self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],self.depth)), theano.config.floatX) for s in self.sources]
     else:
       if batch_drop:
         self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=s.output.shape), theano.config.floatX) for s in self.sources]
       else:
         self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX) for s in self.sources]
Exemplo n.º 57
0
def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(1, 10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, p=p, dtype=dtype)
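# --- Illustrative sketch (not part of the original helper above) ---
# Usage note for the helper above: in rng.binomial, p is the probability of
# drawing a 1, so the default p=0.0 yields an all-zero tensor. A quick check,
# assuming random_binomial and its imports (RandomStreams, _FLOATX) are in scope:
import theano

mask = random_binomial((2, 3), p=0.75, dtype=theano.config.floatX)
f = theano.function([], mask)
print(f())   # each entry is 1.0 with probability 0.75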
Exemplo n.º 58
0
class Generator(object):
    def __init__(self, args, embedding_layer, nclasses, encoder):
        self.args = args
        self.embedding_layer = embedding_layer
        self.nclasses = nclasses
        self.encoder = encoder

    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)
            elif layer_type == "lstm":
                l = LSTM(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation)
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*bacth*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we compute an approximate gradient by sampling z, so the sampled z
        # must be marked as not being part of the gradient propagation path
        # (a short sketch of this estimator follows the class definition below)
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                      axis=0,
                      dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print "cost.dtype", cost.dtype

        self.cost_e = loss * 10 + encoder.l2_cost
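# --- Illustrative sketch (not part of the original generator above) ---
# The trick used above is the score-function (REINFORCE) estimator: z is
# sampled, cut out of the gradient graph with disconnected_grad, and the loss is
# paired with log p(z), so d/dtheta E[f(z)] is estimated from
# f(z) * d/dtheta log p(z). A minimal version with a single Bernoulli gate
# (all values are assumptions for the sketch):
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(3)
theta = theano.shared(0.3, name='theta')             # Bernoulli parameter
z = rng.binomial(size=(), p=theta, dtype=theano.config.floatX)
z = theano.gradient.disconnected_grad(z)             # do not backprop through the sample

f_z = (z - 1.0) ** 2                                 # arbitrary loss of the sample
logpz = z * T.log(theta) + (1 - z) * T.log(1 - theta)
surrogate = f_z * logpz                              # its gradient estimates grad of E[f(z)]
grad_theta = T.grad(surrogate, theta)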
Exemplo n.º 59
0
def apply_dropout(computation_graph,
                  variables,
                  drop_prob,
                  rng=None,
                  seed=None,
                  custom_divisor=None):
    """Apply dropout to specified variables in a graph.

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    variables : list of :class:`~tensor.TensorVariable`
        Variables to be dropped out.
    drop_prob : float
        Probability of dropping out. If you want to apply the dropout
        with different probabilities for different layers, call it
        several times.
    rng : :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams`
        Random number generator.
    seed : int
        Random seed to be used if `rng` was not specified.
    custom_divisor : float or None, optional
        Divide dropped variables by a given scalar value. If `None`,
        (default) dropped variables will be divided by `(1 - drop_prob)`
        which is equivalent to scaling by `(1 - drop_prob)` at test
        time as recommended in [DROPOUT]_.

    Returns
    -------
    dropped_computation_graph : instance of :class:`ComputationGraph`
        A new computation graph with dropout applied to the specified
        variables. In order to train with, or monitor, the outputs
        of the original computation graph with dropout applies, use
        the variables contained in `dropped_computation_graph.outputs`.

    Notes
    -----
    For more information, see [DROPOUT]_.

    .. [DROPOUT] Hinton et al. *Improving neural networks by preventing
       co-adaptation of feature detectors*, arXiv:1207.0580.

    Examples
    --------
    >>> import numpy
    >>> from theano import tensor, function
    >>> from blocks.bricks import MLP, Identity
    >>> from blocks.filter import VariableFilter
    >>> from blocks.initialization import Constant
    >>> from blocks.roles import INPUT
    >>> linear = MLP([Identity(), Identity()], [2, 10, 2],
    ...              weights_init=Constant(1), biases_init=Constant(2))
    >>> x = tensor.matrix('x')
    >>> y = linear.apply(x)
    >>> cg = ComputationGraph(y)

    We are going to drop out all the input variables

    >>> inputs = VariableFilter(roles=[INPUT])(cg.variables)

    Here we apply dropout with default setting to our computation graph

    >>> cg_dropout = apply_dropout(cg, inputs, 0.5)

    Dropped out variables have role `DROPOUT` and are tagged with
    `replacement_of` tag. Let's filter these variables and check if they
    have the links to original ones.

    >>> dropped_out = VariableFilter(roles=[DROPOUT])(cg_dropout.variables)
    >>> inputs_referenced = [var.tag.replacement_of for var in dropped_out]
    >>> set(inputs) == set(inputs_referenced)
    True

    Compiling theano functions to forward propagate in original and dropped
    out graphs

    >>> fprop = function(cg.inputs, cg.outputs[0])
    >>> fprop_dropout = function(cg_dropout.inputs, cg_dropout.outputs[0])

    Initialize an MLP and apply these functions

    >>> linear.initialize()
    >>> fprop(numpy.ones((3, 2),
    ...       dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[ 42.,  42.],
           [ 42.,  42.],
           [ 42.,  42.]]...
    >>> fprop_dropout(numpy.ones((3, 2),
    ...               dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[ 0.,  0.],
           [ 0.,  0.],
           [ 0.,  0.]]...

    And after the second run answer is different

    >>> fprop_dropout(numpy.ones((3, 2),
    ...               dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[   0.,   52.],
           [ 100.,    0.],
           [   0.,    0.]]...

    """
    if not rng and not seed:
        seed = 1
    if not rng:
        rng = MRG_RandomStreams(seed)
    if custom_divisor is None:
        divisor = (1 - drop_prob)
    else:
        divisor = custom_divisor
    replacements = [
        (var, var *
         rng.binomial(var.shape, p=1 - drop_prob, dtype=theano.config.floatX) /
         divisor) for var in variables
    ]
    for variable, replacement in replacements:
        add_role(replacement, DROPOUT)
        replacement.tag.replacement_of = variable

    return computation_graph.replace(replacements)
Exemplo n.º 60
0
    def fprop(self, state_below):
        """
        :development:
            (1) what is the shape of state_below? Does it account for batches?
                - assume it uses the (time, batch, data) layout of the original
                  code, so some changes are needed
            (2) do the _scan_updates do anything important?
        """

        z0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid)
        c0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid)
        # z0 = T.alloc(np.cast[theano.config.floatX](0), state_below.shape[0], self.n_hid)
        # c0 = T.alloc(np.cast[theano.config.floatX](0), state_below.shape[0], self.n_hid)

        if state_below.shape[0] == 1:
            z0 = T.unbroadcast(z0, 0)
            c0 = T.unbroadcast(c0, 0)

        Wxh = self.Wxh
        Whh = self.Whh
        bxh = self.bxh
        state_below_input = T.dot(state_below, self.I_x) + self.I_b
        state_below_forget = T.dot(state_below, self.F_x) + self.F_b
        state_below_output = T.dot(state_below, self.O_x) + self.O_b
        state_below = T.dot(state_below, Wxh) + bxh

        # the 'p' parameter to binomial is the probability of returning a 1;
        # where the mask value is 1 the newly computed state is kept, and where
        # it is 0 the previous state is carried over instead, so with this
        # parameterization the probability of dropping a unit is 1 - self.dropout_prob
        theano_rng = MRG_RandomStreams(max(self.rng.randint(2**15), 1))
        mask = theano_rng.binomial(p=self.dropout_prob,
                                   size=state_below.shape,
                                   dtype=state_below.dtype)

        def fprop_step(state_below, state_below_input, state_below_forget,
                       state_below_output, mask, state_before, cell_before,
                       Whh):

            i_on = T.nnet.sigmoid(state_below_input +
                                  T.dot(state_before, self.I_h) +
                                  T.dot(cell_before, self.I_c))

            f_on = T.nnet.sigmoid(state_below_forget +
                                  T.dot(state_before, self.F_h) +
                                  T.dot(cell_before, self.F_c))

            c_t = state_below + T.dot(state_before, Whh)
            c_t = f_on * cell_before + i_on * T.tanh(c_t)

            o_on = T.nnet.sigmoid(state_below_output +
                                  T.dot(state_before, self.O_h) +
                                  T.dot(c_t, self.O_c))
            z = o_on * T.tanh(c_t)

            # either carry the new values (z) or carry the old values (state_before)
            z = z * mask + (1 - mask) * state_before

            return z, c_t

        ((z, c), updates) = scan(fn=fprop_step,
                                 sequences=[
                                     state_below, state_below_input,
                                     state_below_forget, state_below_output,
                                     mask
                                 ],
                                 outputs_info=[z0, c0],
                                 non_sequences=[Whh])

        if self.return_indices is not None:
            if len(self.return_indices) > 1:
                return [z[i] for i in self.return_indices]
            else:
                return z[self.return_indices[0]]
        else:
            return z