Example #1
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2).astype('int64')
    u = uw[:l]
    w = uw[l:]

    # Callable tensor
    logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))

    return elbo
Example #2
 def compute_output(self, network, in_vw):
     deterministic = network.find_hyperparameter(["deterministic"])
     sigma = network.find_hyperparameter(["sigma"], None)
     if sigma is None:
         p = network.find_hyperparameter(["dropout_probability", "probability", "p"], 0)
         if p == 0:
             sigma = 0
         else:
             # derive gaussian dropout variance from bernoulli dropout
             # probability
             sigma = T.sqrt(p / (1 - p))
     if deterministic or sigma == 0:
         network.copy_vw(name="default", previous_vw=in_vw, tags={"output"})
     else:
         mask_shape = in_vw.shape
         if any(s is None for s in mask_shape):
             # NOTE: this uses symbolic shape - can be an issue with
             # theano.clone and random numbers
             # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
             warnings.warn("using symbolic shape for dropout mask, " "which can be an issue with theano.clone")
             mask_shape = in_vw.variable.shape
         # TODO save this state so that we can seed the rng
         srng = MRG_RandomStreams()
         mask = srng.normal(mask_shape, avg=1.0, std=sigma, dtype=fX)
         network.create_vw("default", variable=in_vw.variable * mask, shape=in_vw.shape, tags={"output"})
Example #3
File: base.py  Project: EderSantana/seya
class GaussianProd(MaskedLayer):
    '''
        Multiply by Gaussian noise.
        Similar to dropout but with gaussians instead of binomials.
        The way they have this at Keras is not the way we need for
        Variational AutoEncoders.
    '''
    def __init__(self, avg=0., std=1., **kwargs):
        super(GaussianProd, self).__init__(**kwargs)
        self.std = std
        self.avg = avg
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        X = self.get_input(train)
        X *= self.srng.normal(size=X.shape,
                              avg=self.avg,
                              std=self.std,
                              dtype=floatX)
        return X

    def get_config(self):
        return {"name": self.__class__.__name__,
                "avg": self.avg,
                "std": self.std}
Example #4
def compare_speed():
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'

    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100

    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    mrg_u = theano.function([], [], updates={dest: mrg.uniform((N,))},
            profile='mrg uniform')
    crn_u = theano.function([], [], updates={dest: crn.uniform((N,))},
            profile='crn uniform')
    mrg_n = theano.function([], [], updates={dest: mrg.normal((N,))},
            profile='mrg normal')
    crn_n = theano.function([], [], updates={dest: crn.normal((N,))},
            profile='crn normal')

    for f in mrg_u, crn_u, mrg_n, crn_n:
        # don't time the first call, it has some startup cost
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(f)

    for i in range(100):
        for f in mrg_u, crn_u, mrg_n, crn_n:
            # don't time the first call, it has some startup cost
            f.fn.time_thunks = (i > 0)
            f()
Example #5
 def compute_output(self, network, mu_vw, sigma_vw):
     deterministic = network.find_hyperparameter(["deterministic"], False)
     if deterministic:
         res = mu_vw.variable
     else:
         # TODO look at shape of both mu and sigma
         shape = mu_vw.shape
         if any(s is None for s in shape):
             # NOTE: this uses symbolic shape - can be an issue with
             # theano.clone and random numbers
             # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
             warnings.warn("using symbolic shape for random number shape, "
                           "which can be an issue with theano.clone")
             shape = mu_vw.variable.shape
         # TODO save this state so that we can seed the rng
         srng = MRG_RandomStreams()
         res = srng.normal(shape,
                           avg=mu_vw.variable,
                           std=sigma_vw.variable,
                           dtype=fX)
     network.create_vw(
         "default",
         variable=theano.gradient.disconnected_grad(res),
         shape=mu_vw.shape,
         tags={"output"},
     )
Example #6
File: noise.py  Project: CaptainAL/Spyder
class GaussianDropout(MaskedLayer):
    '''
        Multiplicative Gaussian Noise
        Reference:
            Dropout: A Simple Way to Prevent Neural Networks from Overfitting
            Srivastava, Hinton, et al. 2014
            http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
    '''
    def __init__(self, p, **kwargs):
        super(GaussianDropout, self).__init__(**kwargs)
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train):
        X = self.get_input(train)
        if train:
            # self.p refers to drop probability rather than retain probability (as in paper) to match Dropout layer syntax
            X *= self.srng.normal(size=X.shape, avg=1.0, std=T.sqrt(self.p / (1.0 - self.p)), dtype=theano.config.floatX)
        return X

    def get_config(self):
        config = {"name": self.__class__.__name__,
                  "p": self.p}
        base_config = super(GaussianDropout, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
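The std above, sqrt(p / (1 - p)), is chosen so the multiplicative Gaussian mask has the same mean (1) and variance (p / (1 - p)) as an inverted Bernoulli dropout mask with drop probability p. A quick numerical check, illustrative only and not part of the Keras source:

import numpy as np

p = 0.3
# inverted Bernoulli dropout mask: 1/(1-p) with probability (1-p), 0 with probability p
mask = (np.random.rand(1000000) > p) / (1.0 - p)
print(mask.mean(), mask.var())   # approximately 1.0 and p/(1-p)
print(1.0, p / (1.0 - p))        # mean and variance of the Gaussian mask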
Example #7
File: graph.py  Project: Fdenpc/blocks
def apply_noise(computation_graph, variables, level, seed=None):
    """Add Gaussian noise to certain variable of a computation graph.

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    level : float
        Noise level.
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized;
        defaults to 1.

    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)
    replace = {}
    for variable in variables:
        replace[variable] = (variable +
                             rng.normal(variable.shape, std=level))
    return computation_graph.replace(replace)
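A hedged usage sketch: apply_noise is typically called on a Blocks ComputationGraph built from a cost expression. The variable, noise level, and explicit seed below are illustrative choices, not taken from the blocks source:

import numpy as np
import theano
import theano.tensor as tt
from blocks.graph import ComputationGraph

x = tt.vector('x')
W = theano.shared(np.ones(3, dtype=theano.config.floatX), name='W')
cost = (x * W).sum()

cg = ComputationGraph(cost)
# perturb W with zero-mean Gaussian noise of std 0.01
noisy_cg = apply_noise(cg, [W], level=0.01, seed=1)
noisy_cost, = noisy_cg.outputs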
Example #8
File: sample.py  Project: dribnet/parmesan
class SimpleSampleLayer(lasagne.layers.MergeLayer):
    """
    Simple sampling layer drawing a single Monte Carlo sample to approximate
    E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_.

    Parameters
    ----------
    mu, log_var : :class:`Layer` instances
        Parameterizing the mean and log(variance) of the distribution to sample
        from as described in [KINGMA]_. The code assumes that these have the
        same number of dimensions

    References
    ----------
        ..  [KINGMA] Kingma, Diederik P., and Max Welling.
            "Auto-encoding variational bayes."
            arXiv preprint arXiv:1312.6114 (2013).
    """
    def __init__(self, mu, log_var, **kwargs):
        super(SimpleSampleLayer, self).__init__([mu, log_var], **kwargs)

        self._srng = RandomStreams(
            lasagne.random.get_rng().randint(1, 2147462579))

    def get_output_shape_for(self, input_shapes):
        return input_shapes[0]

    def get_output_for(self, input, **kwargs):
        mu, log_var = input
        eps = self._srng.normal(mu.shape)
        z = mu + T.exp(0.5 * log_var) * eps
        return z
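A hedged Lasagne usage sketch for the reparameterization layer above; the encoder sizes are illustrative and not taken from the parmesan examples:

import lasagne
import theano.tensor as T

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 784), input_var=x)
l_enc = lasagne.layers.DenseLayer(l_in, num_units=128)
l_mu = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_z = SimpleSampleLayer(mu=l_mu, log_var=l_log_var)

z = lasagne.layers.get_output(l_z)   # z = mu + exp(0.5 * log_var) * eps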
Example #9
File: mnd.py  Project: vlb/pylearn
class AdditiveDiagonalMND:

    def __init__(self, init_beta, nvis):
        """ A conditional distribution that adds
        gaussian noise with diagonal precision
        matrix beta to another variable that it
        conditions on
        """

        self.__dict__.update(locals())
        del self.self

        self.beta = sharedX(np.ones((nvis,))*init_beta)
        assert self.beta.ndim == 1

        self.s_rng = RandomStreams(17)

    def random_design_matrix(self, X):
        """ X: a theano variable containing a design matrix
        of observations of the random vector to condition on."""
        Z = self.s_rng.normal(size=X.shape,
                              avg=X, std=1./T.sqrt(self.beta), dtype=config.floatX)
        return Z

    def is_symmetric(self):
        """ A property of conditional distributions
        P(Y|X)
        Return true if P(y|x) = P(x|y) for all x,y
        """

        return True
Example #10
File: ff_layers.py  Project: mbartoli/nmt
class GaussianNoise(Layer):
    """
    Adds Gaussian noise to its input, either freshly sampled or taken from a
    precomputed noise term (when shape_fn is given).
    """
    def __init__(self, rng, std = 0.1, ndim=0, avg =0, shape_fn=None):
        """
        """
        assert rng is not None, "random number generator should not be empty!"
        super(GaussianNoise, self).__init__(0, 0, rng)

        self.std = std
        self.avg = avg
        self.ndim = ndim
        self.shape_fn = shape_fn
        if self.shape_fn:
            # Name is not important as it is not a parameter of the model
            self.noise_term = theano.shared(numpy.zeros((2,)*ndim,
                                                    dtype=theano.config.floatX),
                                        name='ndata')
            self.noise_params += [self.noise_term]
            self.noise_params_shape_fn += [shape_fn]
        self.trng = RandomStreams(rng.randint(1e5))

    def fprop(self, x):
        self.out = x
        if self.std:
            if self.shape_fn:
                self.out += self.noise_term
            else:
                self.out += self.trng.normal(self.out.shape, std=self.std,
                                             avg=self.avg,
                                             dtype=self.out.dtype)
        return self.out
Example #11
class NoiseInputLayer(Layer):
    def __init__(self, shape, input_var=None, name=None, **kwargs):
        self.shape = shape
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))

        if any(d is not None and d <= 0 for d in self.shape):
            raise ValueError((
                "Cannot create InputLayer with a non-positive shape "
                "dimension. shape=%r, self.name=%r") % (
                    self.shape, name))

        ndim = len(shape)
        if input_var is None:
            # create the right TensorType for the given number of dimensions
            input_var_type = T.TensorType(theano.config.floatX, [False] * ndim)
            var_name = ("%s.input" % name) if name is not None else "input"
            input_var = input_var_type(var_name)
        else:
            # ensure the given variable has the correct dimensionality
            if input_var.ndim != ndim:
                raise ValueError("shape has %d dimensions, but variable has "
                                 "%d" % (ndim, input_var.ndim))
        self.input_var = self._srng.normal(self.shape, avg = 0., std = 0.1)
        self.name = name
        self.params = OrderedDict()

    @Layer.output_shape.getter
    def output_shape(self):
        return self.shape
Example #12
class GaussianBandit(Environment):
    """
    An n-armed bandit whose rewards are drawn from a different Gaussian
    distribution for each arm.
    The mean and standard deviation of the reward for each arm is drawn
    at initialization time from N(0, <corresponding std arg>).
    (For the standard deviation we use the absolute value of the Gaussian
    sample)
    """

    def __init__(self, num_arms, mean_std = 1.0, std_std = 1.0):
        self.rng = np.random.RandomState([2013, 11, 12])
        self.means = sharedX(self.rng.randn(num_arms) * mean_std)
        self.stds = sharedX(np.abs(self.rng.randn(num_arms) * std_std))
        self.theano_rng = MRG_RandomStreams(self.rng.randint(2 ** 16))

    def get_action_func(self):
        """
        Returns a theano function that takes an action and returns a reward.
        """

        action = T.iscalar()
        reward_mean = self.means[action]
        reward_std = self.stds[action]
        reward = self.theano_rng.normal(avg=reward_mean, std=reward_std,
                dtype=config.floatX, size=reward_mean.shape)
        rval = function([action], reward)
        return rval
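An illustrative way to exercise the bandit, assuming the pylearn2-style imports used above (sharedX, MRG_RandomStreams, function) are available:

bandit = GaussianBandit(num_arms=5)
reward_fn = bandit.get_action_func()

# pull each arm a few times; rewards are drawn from that arm's Gaussian
for arm in range(5):
    print(arm, [float(reward_fn(arm)) for _ in range(3)])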
Example #13
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
            self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        v = T.arange(0, mean_x.shape[0])
        m_x = mean_x[v, mode]
        m_y = mean_y[v, mode]
        s_x = std_x[v, mode]
        s_y = std_y[v, mode]
        r = rho[v, mode]
        # cov = r * (s_x * s_y)

        normal = srng.normal((h.shape[0], 2))
        x = normal[:, 0]
        y = normal[:, 1]

        # x_n = T.shape_padright(s_x * x + cov * y + m_x)
        # y_n = T.shape_padright(s_y * y + cov * x + m_y)

        x_n = T.shape_padright(m_x + s_x * x)
        y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2)))

        uniform = srng.uniform((h.shape[0],))
        pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

        return T.concatenate([x_n, y_n, pin], axis=1)
Example #14
File: dNDF.py  Project: diogo149/treeano
    def compute_output(self, network, in_vw):
        axis = network.find_hyperparameter(["axis"])
        deterministic = network.find_hyperparameter(["deterministic"], False)

        # calculate output shape
        output_shape = list(in_vw.shape)
        output_shape.pop(axis)

        if deterministic:
            out_var = in_vw.variable.mean(axis=axis)
        else:
            # TODO save this state so that we can seed the rng
            srng = MRG_RandomStreams()
            if in_vw.shape[axis] is None:
                # NOTE: this uses symbolic shape - can be an issue with
                # theano.clone and random numbers
                # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
                warnings.warn("using symbolic shape for random variable size "
                              "which can be an issue with theano.clone")
            idx = T.argmax(srng.normal([in_vw.symbolic_shape()[axis]]))
            slices = tuple([slice(None) for _ in range(axis)] + [idx])
            out_var = in_vw.variable[slices]

        network.create_vw(
            "default",
            variable=out_var,
            shape=tuple(output_shape),
            tags={"output"},
        )
Example #15
File: mnd.py  Project: JakeMick/pylearn2
class MND(object):
    """A Multivariate Normal Distribution"""
    def __init__(self, sigma, mu, seed=42):
        """
        .. todo::

            WRITEME properly
        
        Parameters
        ----------
        sigma : numpy ndarray of shape (n, n)
        mu : numpy ndarray of shape (n,)
        seed : int
            Seed for the Theano random number generator used to sample
            from this distribution.
        """
        self.sigma = sigma
        self.mu = mu
        if not (len(mu.shape) == 1):
            raise Exception('mu has shape ' + str(mu.shape) +
                            ' (it should be a vector)')

        self.sigma_inv = solve(self.sigma, N.identity(mu.shape[0]),
                               sym_pos=True)
        self.L = cholesky(self.sigma)
        self.s_rng = RandomStreams(seed)

        #Compute logZ
        #log Z = log 1/( (2pi)^(-k/2) |sigma|^-1/2 )
        # = log 1 - log( (2pi)^(-k/2) |sigma|^(-1/2) )
        # = 0 - log (2pi)^(-k/2) - log |sigma|^-1/2
        # = (k/2) * log(2pi) + (1/2) * log |sigma|
        k = float(self.mu.shape[0])
        self.logZ = 0.5 * (k * N.log(2. * N.pi) + N.log(det(sigma)))

    def free_energy(self, X):
        """
        .. todo::

            WRITEME
        """
        #design matrix format
        return .5 * T.sum(T.dot(X - self.mu,
                                T.dot(self.sigma_inv,
                                      T.transpose(X - self.mu))))

    def log_prob(self, X):
        """
        .. todo::

            WRITEME
        """
        return - self.free_energy(X) - self.logZ

    def random_design_matrix(self, m):
        """
        .. todo::

            WRITEME
        """
        Z = self.s_rng.normal(size=(m, self.mu.shape[0]),
                              avg=0., std=1., dtype=config.floatX)
        return self.mu + T.dot(Z, self.L.T)
Example #16
def gaussian_noise(input_shape):
    import theano
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    _srng = RandomStreams()
    # symbolic standard-normal mask
    mask = _srng.normal(input_shape, avg=0.0, std=1.0, dtype=theano.config.floatX)
    # evaluate the symbolic variable to get a concrete array
    mask = mask.eval()
    noise = mask.reshape((-1, 1, 28, 28))
    return noise
Example #17
    def construct_graph_ref(self, args, x, length, popstats=None):

        p = self.allocate_parameters(args)

        if args.baseline:
            def bn(x, gammas, betas):
                return x + betas
        else:
            def bn(x, gammas, betas):
                mean, var = x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True)
                # if only
                mean.tag.batchstat, var.tag.batchstat = True, True
                #var = T.maximum(var, args.epsilon)
                var = var + args.epsilon
                return (x - mean) / T.sqrt(var) * gammas + betas

        def stepfn(x, dummy_h, dummy_c, h, c):
            # a_mean, b_mean, c_mean,
            # a_var, b_var, c_var):

            a_mean, b_mean, c_mean = 0, 0, 0
            a_var, b_var, c_var = 0, 0, 0

            atilde = T.dot(h, p.Wa)
            btilde = x
            a_normal = bn(atilde, p.a_gammas, p.ab_betas)
            b_normal = bn(btilde, p.b_gammas, 0)
            ab = a_normal + b_normal
            g, f, i, o = [fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden])
                          for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid])]
            c = dummy_c + f * c + i * g
            c_normal = bn(c, p.c_gammas, p.c_betas)
            h = dummy_h + o * self.activation(c_normal)
            return h, c, atilde, btilde, c_normal



        xtilde = T.dot(x, p.Wx)

        if args.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise)
        elif args.summarize:
            # prime h with mean of example
            h_prime = x.mean(axis=[0, 2])[:, None]
        else:
            h_prime = 0

        dummy_states = dict(h=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)),
                            c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)))

        [h, c, atilde, btilde, htilde], _ = theano.scan(
            stepfn,
            sequences=[xtilde, dummy_states["h"], dummy_states["c"]],
            outputs_info=[T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                          T.repeat(p.c0[None, :], xtilde.shape[1], axis=0),
                          None, None, None])
        return dict(h=h, c=c,
                    atilde=atilde, btilde=btilde, htilde=htilde), [], dummy_states, popstats
Example #18
def test_normal0():

    steps = 50
    std = 2.
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            config.mode == 'Mode' and config.linker in ['py']):
        sample_size = (25, 30)
        default_rtol = .02
    else:
        sample_size = (999, 50)
        default_rtol = .01
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()

    for size, const_size, var_input, input, avg, rtol, std_tol in [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)],
         -5., default_rtol, default_rtol),
        # test odd value
        (x.shape, sample_size_odd, [x],
         [np.zeros(sample_size_odd, dtype=config.floatX)],
         -5., default_rtol, default_rtol),
        (sample_size, sample_size, [], [],
         np.arange(np.prod(sample_size),
                   dtype='float32').reshape(sample_size),
         10. * std / np.sqrt(steps), default_rtol),
        # test empty size (scalar)
        ((), (), [], [], -5., default_rtol, 0.02),
        # test with few samples at the same time
        ((1,), (1,), [], [], -5., default_rtol, 0.02),
        ((3,), (3,), [], [], -5., default_rtol, 0.02),
            ]:

        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size, avg=avg, std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n)
        f(*input)

        # Increase the number of steps if size implies only a few samples
        if np.prod(const_size) < 10:
            steps_ = steps * 50
        else:
            steps_ = steps
        basictest(f, steps_, const_size, target_avg=avg, target_std=std,
                  prefix='mrg ', allow_01=True, inputs=input,
                  mean_rtol=rtol, std_tol=std_tol)

        sys.stdout.flush()

        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        nn = RR.normal(size=size, avg=avg, std=std)
        ff = theano.function(var_input, nn)

        basictest(ff, steps_, const_size, target_avg=avg, target_std=std,
                  prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
Example #19
File: advi.py  Project: 2php/pymc3
def sample_vp(vparams, draws=1000, model=None, random_seed=20090425, 
              hide_transformed=True):
    """Draw samples from variational posterior.

    Parameters
    ----------
    vparams : dict or pymc3.variational.ADVIFit
        Estimated variational parameters of the model.
    draws : int
        Number of random samples.
    model : pymc3.Model
        Probabilistic model.
    random_seed : int
        Seed of random number generator.
    hide_transformed : bool
        If False, transformed variables are also sampled. Default is True. 

    Returns
    -------
    trace : pymc3.backends.base.MultiTrace
        Samples drawn from the variational posterior.
    """
    model = modelcontext(model)

    if isinstance(vparams, ADVIFit):
        vparams = {
            'means': vparams.means,
            'stds': vparams.stds
        }

    # Make dict for replacements of random variables
    r = MRG_RandomStreams(seed=random_seed)
    updates = {}
    for var in model.free_RVs:
        u = theano.shared(vparams['means'][str(var)]).ravel()
        w = theano.shared(vparams['stds'][str(var)]).ravel()
        n = r.normal(size=u.tag.test_value.shape)
        updates.update({var: (n * w + u).reshape(var.tag.test_value.shape)})
    vars = model.free_RVs

    # Replace some nodes of the graph with variational distributions
    samples = theano.clone(vars, updates)
    f = theano.function([], samples)

    # Random variables which will be sampled
    vars_sampled = [v for v in model.unobserved_RVs if not str(v).endswith('_')] \
                   if hide_transformed else \
                   [v for v in model.unobserved_RVs]

    varnames = [str(var) for var in model.unobserved_RVs]
    trace = NDArray(model=model, vars=vars_sampled)
    trace.setup(draws=draws, chain=0)

    for i in range(draws):
        # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...}
        point = {varname: value for varname, value in zip(varnames, f())}
        trace.record(point)

    return MultiTrace([trace])
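A hedged end-to-end sketch against the old pymc3 ADVI interface this function belongs to; module paths and argument names may differ across pymc3 versions, and the toy model is illustrative:

import numpy as np
import pymc3 as pm

data = np.random.randn(100)

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('obs', mu=mu, sd=1., observed=data)
    v_params = pm.variational.advi(n=5000)   # fitted means and stds (an ADVIFit)
    trace = sample_vp(v_params, draws=500, model=model)

print(trace['mu'].mean())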
Example #20
class NormalApproximation(object):
    def __init__(self, mu=0, std=np.exp(-3),seed=None):
        """
        Approximation that samples network weights from factorized normal distribution.
        
        :param mu: prior mean for gaussian weights
        :param std: prior std for gaussian weights
        :param seed: random seed
        """
        self.prior_mu = mu
        self.prior_std = std
        self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579))
        
    def log_normal(self,x, mean, std, eps=0.0):
        """computes log-proba of normal distribution"""
        std += eps
        return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2)

    def log_prior(self, weights):
        """
        Logarithm of prior probabilities for weights: 
        log P(weights) aka log P(theta)
        """
        return self.log_normal(weights, self.prior_mu, self.prior_std)

    def log_posterior_approx(self,weights, mean, rho):
        """
        Logarithm of ELBO on posterior probabilities:
        log q(weights|learned mu and rho) aka log q(theta|x)
        """
        std = T.log1p(T.exp(rho))  #rho to std
        return self.log_normal(weights, mean, std)

    def __call__(self, layer, spec, shape, name=None, **tags):
        # case when user uses default init specs
        assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion"
        
        if not isinstance(spec, dict):
            initial_rho = np.log(np.expm1(self.prior_std))   #std to rho
            assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\
                                            " initializer (dict with {'mu':mu_init, 'rho':rho_init})."
            spec = {'mu': spec,'rho':init.Constant(initial_rho)}
            

        mu_spec,rho_spec = spec['mu'],spec['rho']
        
        rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags)
        mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags)

        #Reparameterization trick
        e = self.srng.normal(shape, std=1)  
        W = mean + T.log1p(T.exp(rho)) * e 

        #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost
        q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W))
            
        #accumulate variational cost
        layer._bbwrap_var_cost += q_p
        return W
Example #21
    def fprop(self, state_below, add_noise=True):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)
        
        self.x = state_below
        
        # linear part
        if isinstance(self.x, S.SparseVariable):
            z = S.dot(self.x,self.W[0]) + self.b[0]
        else:
            z = T.dot(self.x,self.W[0]) + self.b[0]
        
        self.z = self.activate(z, self.expert_activation)
        
        # first layer non-linear part
        if isinstance(self.x, S.SparseVariable):
            h = S.dot(self.x,self.W[1]) + self.b[1]
        else:
            h = T.dot(self.x,self.W[1]) + self.b[1]
        
        # activate hidden units of non-linear part
        self.h = self.activate(h, self.hidden_activation)
            
        noise = 0.
        if add_noise:
            rng = MRG_RandomStreams(self.mlp.rng.randint(2**15))
            noise = rng.normal(size = self.z.shape, 
                                    std=self.noise_stdev ,
                                    dtype=self.z.type.dtype) 
        
        # second layer non-linear part
        self.a = T.dot(self.h,self.W[2]) + self.b[2] + noise
        
        # activate non-linear part
        self.m_mean = self.activate(self.a, self.gater_activation)
        
        # how many are over 0:
        self.effective_sparsity = T.cast(T.gt(self.m_mean, 0), 
                                         theano.config.floatX).mean()
           
        # mix output of linear part with output of non-linear part
        self.p = self.m_mean * self.z
        
        if self.layer_name is not None:
            self.z.name = self.layer_name + '_z'
            self.h.name = self.layer_name + '_h'
            self.a.name = self.layer_name + '_a'
            self.m_mean.name = self.layer_name + '_m_mean'
            self.p.name = self.layer_name + '_p'
        
        return self.p
Example #22
File: sample.py  Project: dribnet/parmesan
class SampleLayer(lasagne.layers.MergeLayer):
    """
    Samplelayer supporting importance sampling as described in [BURDA]_ and
    multiple monte carlo samples for the approximation of
    E_q [log( p(x,z) / q(z|x) )]

    Parameters
    ----------
    mu, log_var : :class:`Layer` instances
        Parameterizing the mean and log(variance) of the distribution to sample
        from as described in [BURDA]. The code assumes that these have the same
        number of dimensions

    eq_samples: Int or T.scalar
        Number of Monte Carlo samples used to estimate the expectation over
        q(z|x) in eq. (8) in [BURDA]

    iw_samples: Int or T.scalar
        Number of importance samples in the sum over k in eq. (8) in [BURDA]

    References
    ----------
        ..  [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov.
            "Importance Weighted Autoencoders."
            arXiv preprint arXiv:1509.00519 (2015).

    """

    def __init__(self, mu, log_var, eq_samples=1, iw_samples=1, **kwargs):
        super(SampleLayer, self).__init__([mu, log_var], **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples

        self._srng = RandomStreams(
            lasagne.random.get_rng().randint(1, 2147462579))


    def get_output_shape_for(self, input_shapes):
        batch_size, num_latent = input_shapes[0]
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, **kwargs):
        mu, log_var = input
        batch_size, num_latent = mu.shape
        eps = self._srng.normal(
            [batch_size, self.eq_samples, self.iw_samples, num_latent],
             dtype=theano.config.floatX)

        z = mu.dimshuffle(0,'x','x',1) + \
            T.exp(0.5 * log_var.dimshuffle(0,'x','x',1)) * eps

        return z.reshape((-1,num_latent))
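Usage mirrors SimpleSampleLayer above; the point of interest is that the Monte Carlo and importance samples are folded into the batch axis of the output (sizes below are illustrative):

import lasagne

l_in = lasagne.layers.InputLayer((None, 784))
l_mu = lasagne.layers.DenseLayer(l_in, num_units=32, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_in, num_units=32, nonlinearity=None)
l_z = SampleLayer(mu=l_mu, log_var=l_log_var, eq_samples=2, iw_samples=5)

z = lasagne.layers.get_output(l_z)   # shape: (batch_size * 2 * 5, 32)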
Example #23
File: random.py  Project: mthrok/luchador
class NormalRandom(object):
    """Implements normal random sampling in Tensorflow"""
    def __init__(self):
        self._rng = RandomStreams(seed=self.seed or 123456)

    def _sample(self, shape, dtype):
        return self._rng.normal(
            size=shape, avg=self.mean, std=self.std, dtype=dtype)
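The class reads self.seed, self.mean and self.std but never sets them, so it is evidently used as a mixin. A hedged sketch of a host class; the name and defaults below are hypothetical, not from luchador:

class ConstantNormal(NormalRandom):
    """Hypothetical host class supplying the attributes NormalRandom expects."""
    def __init__(self, mean=0.0, std=1.0, seed=None):
        self.mean, self.std, self.seed = mean, std, seed
        super(ConstantNormal, self).__init__()

sampler = ConstantNormal(mean=0.0, std=0.1, seed=123)
noise = sampler._sample(shape=(4, 4), dtype='float32')   # symbolic Theano variable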
Example #24
    def get_samples_and_objectives(self, model, data):
        space, sources = self.get_data_specs(model)
        space.validate(data)
        assert isinstance(model, AdversaryPair)
        g = model.generator
        d = model.discriminator

        # Note: this assumes data is design matrix
        X = data
        m = data.shape[space.get_batch_axis()]
        y1 = T.alloc(1, m, 1)
        y0 = T.alloc(0, m, 1)
        # NOTE: if this changes to optionally use dropout, change the inference
        # code below to use a non-dropped-out version.
        S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None))

        if self.noise_both != 0.:
            rng = MRG_RandomStreams(2014 / 6 + 2)
            S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
            X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

        y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)
        y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)

        d_obj =  0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0))

        if self.no_drop_in_d_for_g:
            y_hat0_no_drop = d.dropout_fprop(S)
            g_obj = d.layers[-1].cost(y1, y_hat0_no_drop)
        else:
            g_obj = d.layers[-1].cost(y1, y_hat0)

        if self.blend_obj:
            g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / (self.zurich_coeff + self.minimax_coeff)

        if model.inferer is not None:
            # Change this if we ever switch to using dropout in the
            # construction of S.
            S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
            pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
                                                self.inference_input_include_probs,
                                                self.inference_default_input_scale,
                                                self.inference_input_scales)
            if self.infer_layer is None:
                target = z
            else:
                target = other_layers[self.infer_layer]
            i_obj = model.inferer.layers[-1].cost(target, pred)
        else:
            i_obj = 0

        return S, d_obj, g_obj, i_obj
Example #25
    def apply(self, x):

        # lazy hack
        h0 = self.parameters[0]
        c0 = self.parameters[1]
        Wa = self.parameters[2]
        Wx = self.parameters[3]
        if self.baseline:
            ab_betas = self.parameters[4]
            h_betas = self.parameters[5]
            a_gammas = None
            b_gammas = None
            h_gammas = None
        else:
            a_gammas = self.parameters[4]
            b_gammas = self.parameters[5]
            h_gammas = self.parameters[6]
            ab_betas = self.parameters[7]
            h_betas = self.parameters[8]

        xtilde = tensor.dot(x, Wx)

        if self.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], self.state_dim), std=self.noise)
        #elif args.summarize:
        #    # prime h with summary of example
        #    Winit = theano.shared(orthogonal((nclasses, self.state_dim)), name="Winit")
        #    parameters.append(Winit)
        #    h_prime = tensor.dot(x, Winit).mean(axis=0)
        else:
            h_prime = 0

        dummy_states = dict(h=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)),
                            c=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)))

        def stepfn(xtilde, dummy_h, dummy_c, h, c):
            atilde = tensor.dot(h, Wa)
            btilde = xtilde
            a = self.bn(atilde, a_gammas, ab_betas)
            b = self.bn(btilde, b_gammas, 0)
            ab = a + b
            g, f, i, o = [fn(ab[:, j * self.state_dim:(j + 1) * self.state_dim])
                          for j, fn in enumerate([self.children[0].apply] + 3 * [tensor.nnet.sigmoid])]
            c = dummy_c + f * c + i * g
            htilde = c
            h = dummy_h + o * self.children[0].apply(self.bn(htilde, h_gammas, h_betas))
            return h, c, atilde, btilde, htilde

        [h, c, atilde, btilde, htilde], _ = theano.scan(
            stepfn,
            sequences=[xtilde, dummy_states["h"], dummy_states["c"]],
            outputs_info=[tensor.repeat(h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                          tensor.repeat(c0[None, :], xtilde.shape[1], axis=0),
                          None, None, None])
        #return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), dummy_states, parameters
        return h
Example #26
def apply_noise(computation_graph, variables, level, seed=None):
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)
    replace = {}
    for variable in variables:
        replace[variable] = (variable +
                             level*rng.normal(variable.shape))
    return computation_graph.replace(replace)
Example #27
class ESGD(RmsProp):
    r'''Equilibrated SGD computes a diagonal preconditioner for gradient descent.

    The ESGD method uses the same general strategy as SGD, in the sense that all
    gradient-based methods make small parameter adjustments using local
    derivative information. The difference here is that as gradients are
    computed during each parameter update, an exponential moving average of
    diagonal preconditioner values is maintained as well. At each update, the
    EWMA is used to compute the root-mean-square (RMS) diagonal preconditioner
    value that's been seen in the recent past. The actual gradient is normalized
    by this preconditioner before being applied to update the parameters.

    .. math::
        \begin{eqnarray*}
        r &\sim& \mathcal{N}(0, 1) \\
        Hr &=& \frac{\partial^2 \mathcal{L}}{\partial^2\theta}r \\
        D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
        v_{t+1} &=& \mu v_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}} \frac{\partial\mathcal{L}}{\partial\theta} \\
        \theta_{t+1} &=& \theta_t + v_{t+1}
        \end{eqnarray*}

    Like :class:`Rprop` and the :class:`ADADELTA`--:class:`RmsProp` family, this
    learning method effectively maintains a sort of parameter-specific momentum
    value. The primary difference between this method and :class:`RmsProp` is
    that ESGD treats the normalizing fraction explicitly as a preconditioner for
    the diagonal of the Hessian, and estimates this diagonal by drawing a vector
    of standard normal values at every training step. The primary difference
    between this implementation and the algorithm described in the paper (see
    below) is the use of an EWMA to decay the diagonal values over time, while
    in the paper the diagonal is divided by the training iteration.

    In this implementation, :math:`\epsilon` is set to 1e-4. The weight
    parameter :math:`\gamma` for the EWMA window is computed from the
    ``rms_halflife`` keyword argument, such that the actual EWMA weight varies
    inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
    2}{h}}`.

    The implementation here is modeled after Dauphin, de Vries, Chung & Bengio
    (2014), "RMSProp and equilibrated adaptive learning rates for non-convex
    optimization," http://arxiv.org/pdf/1502.04390.pdf.
    '''

    def __init__(self, *args, **kwargs):
        self.rng = RandomStreams()
        super(ESGD, self).__init__(*args, **kwargs)

    def learning_updates(self):
        eps = 1e-4  # more or less from the paper
        for param, grad in zip(self.params, self.clipped_gradients()):
            D_tm1 = self.shared_like(param, 'D_ewma')
            vel_tm1 = self.shared_like(param, 'vel')
            Hv = TT.Rop(grad, param, self.rng.normal(param.shape))
            D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
            vel_t = self.momentum * vel_tm1 - grad * self.learning_rate / TT.sqrt(D_t + eps)
            yield D_tm1, D_t
            yield param, param + vel_t
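The key ingredient of learning_updates above is TT.Rop, which yields the Hessian-vector product Hr without ever forming the Hessian. A standalone illustration of the R-operator, not taken from theanets:

import numpy as np
import theano
import theano.tensor as TT

w = theano.shared(np.array([1.0, 2.0]), name='w')
loss = TT.sum(w ** 2)            # Hessian is 2 * I
grad = TT.grad(loss, w)
r = TT.dvector('r')
Hr = TT.Rop(grad, w, r)          # directional derivative of grad along r, i.e. H.dot(r)

f = theano.function([r], Hr)
print(f(np.array([1.0, 0.0])))   # approximately [2., 0.]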
Example #28
File: VPPD.py  Project: deworrall92/VPPD
def SGLD(loss, params, learning_rate, log_prior, N):
    """Apply the SGLD MCMC sampler"""
    g_lik = get_or_compute_grads(-N*loss, params)
    g_prior = get_or_compute_grads(log_prior, params)
    smrg = MRG_RandomStreams()
    updates = OrderedDict()
    for param, gl, gp in zip(params, g_lik, g_prior):
        eta = T.sqrt(learning_rate)*smrg.normal(size=param.shape)
        delta = 0.5*learning_rate*(gl + gp) + eta
        updates[param] = param + delta
    return updates
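A hedged usage sketch: SGLD above relies on Lasagne's get_or_compute_grads, so a small Lasagne model is assumed; the architecture, prior, and hyperparameters are illustrative:

import lasagne
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
l_in = lasagne.layers.InputLayer((None, 10), input_var=x)
l_out = lasagne.layers.DenseLayer(l_in, num_units=2,
                                  nonlinearity=lasagne.nonlinearities.softmax)
pred = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(pred, y).mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
log_prior = sum(-0.5 * T.sum(p ** 2) for p in params)   # standard normal prior

updates = SGLD(loss, params, learning_rate=1e-5, log_prior=log_prior, N=50000)
train_fn = theano.function([x, y], loss, updates=updates)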
Example #29
def Santa(tparams, cost, inps, lr, eidx, nframes, max_epoch, rho=0.95, anne_rate=0.5, e=1e-8, clip_norm=5):
    """ The implementation of Santa algorithm.
        tparams: theano shared variables, params that we need to optimize
        cost: cost function, the cross-entropy loss in our case
        inps: input theano variables
        lr: learning rate, in our case, we choose it to be 1.*1e-3, or 2.*1e-4
        eidx: the current epoch we are running, used to decide when to change
            from exploration to refinement
        nframes: how many time-steps we have in the training dataset.
        max_epoch: the maximum number of epochs we run
        rho, anne_rate, e, clip_norm: hyper-parameters we used in all the algorithms.
    """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
    
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    
    updates = []
    
    i = theano.shared(numpy_floatX(0.))    
    i_t = i + 1.

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        alpha = theano.shared(np.ones(p.get_value().shape)*.5)
        
        alpha_t = alpha + (m**2 - lr/(i_t ** anne_rate)) * tensor.lt(eidx, 0.15*max_epoch) 
        v_t = rho * v + (1.-rho) * (g ** 2) 
        pcder = tensor.sqrt(tensor.sqrt(v_t)+e) 
            
        eps = trng.normal(p.get_value().shape, avg = 0.0, std = 1.0, 
                          dtype=theano.config.floatX)
            
        m_t = -lr*g/pcder + (1. - alpha_t) * m + (tensor.sqrt(2*lr*v_t/(i_t ** anne_rate)/nframes) *eps) * tensor.lt(eidx, 0.15*max_epoch)
        p_t = p + (m_t/ pcder)
        
        updates.append((alpha, alpha_t))
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    
    f_update = theano.function([lr,eidx,nframes,max_epoch], [], updates=updates)
    
    return f_grad_shared, f_update
Example #30
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed):
    """Return expression of approximate ELBO based on Monte Carlo sampling.
    """
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)

    if uw_l is not None:
        l_g = (uw_g.size / 2).astype('int64')
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]
        l_l = (uw_l.size / 2).astype('int64')
        u_l = uw_l[:l_l]
        w_l = uw_l[l_l:]

        def logp_(z_g, z_l):
            return theano.clone(logp, {inarray_g: z_g, inarray_l: z_l}, strict=False)
        if n_mcsamples == 1:
            n_g = r.normal(size=inarray_g.tag.test_value.shape)
            z_g = n_g * tt.exp(w_g) + u_g
            n_l = r.normal(size=inarray_l.tag.test_value.shape)
            z_l = n_l * tt.exp(w_l) + u_l
            elbo = logp_(z_g, z_l) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \
                tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi))
        else:
            ns_g = r.normal(size=inarray_g.tag.test_value.shape)
            zs_g = ns_g * tt.exp(w_g) + u_g
            ns_l = r.normal(size=inarray_l.tag.test_value.shape)
            zs_l = ns_l * tt.exp(w_l) + u_l
            logps, _ = theano.scan(fn=lambda z_g, z_l: logp_(z_g, z_l),
                                   outputs_info=None,
                                   sequences=zip(zs_g, zs_l))
            elbo = tt.mean(logps) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi)) + \
                tt.sum(w_l) + 0.5 * l_l * (1 + np.log(2.0 * np.pi))
    else:
        l_g = (uw_g.size / 2).astype('int64')
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]

        def logp_(z_g):
            return theano.clone(logp, {inarray_g: z_g}, strict=False)

        if n_mcsamples == 1:
            n_g = r.normal(size=inarray_g.tag.test_value.shape)
            z_g = n_g * tt.exp(w_g) + u_g
            elbo = logp_(z_g) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi))
        else:
            n_g = r.normal(size=(n_mcsamples, u_g.tag.test_value.shape[0]))
            zs_g = n_g * tt.exp(w_g) + u_g
            logps, _ = theano.scan(fn=lambda q: logp_(q),
                                   outputs_info=None,
                                   sequences=[zs_g])
            elbo = tt.mean(logps) + \
                tt.sum(w_g) + 0.5 * l_g * (1 + np.log(2.0 * np.pi))

    return elbo
Example #31
class LadderAE():
    def __init__(self, p):
        self.p = p
        self.init_weights_transpose = False
        self.default_lr = p.lr
        self.shareds = OrderedDict()
        self.rstream = RandomStreams(seed=p.seed)
        self.rng = np.random.RandomState(seed=p.seed)

        n_layers = len(p.encoder_layers)
        assert n_layers > 1, "Need to define encoder layers"
        assert n_layers == len(p.denoising_cost_x), (
            "Number of denoising costs does not match with %d layers: %s" %
            (n_layers, str(p.denoising_cost_x)))

        def one_to_all(x):
            """ (5.,) -> 5 -> (5., 5., 5.)
                ('relu',) -> 'relu' -> ('relu', 'relu', 'relu')
            """
            if type(x) is tuple and len(x) == 1:
                x = x[0]

            if type(x) is float:
                x = (np.float32(x), ) * n_layers

            if type(x) is str:
                x = (x, ) * n_layers
            return x

        p.decoder_spec = one_to_all(p.decoder_spec)
        p.f_local_noise_std = one_to_all(p.f_local_noise_std)
        acts = one_to_all(p.get('act', 'relu'))

        assert n_layers == len(p.decoder_spec), "f and g need to match"
        assert (n_layers == len(acts)), (
            "Not enough activations given. Requires %d. Got: %s" %
            (n_layers, str(acts)))
        acts = acts[:-1] + ('softmax', )

        def parse_layer(spec):
            """ 'fc:5' -> ('fc', 5)
                '5'    -> ('fc', 5)
                5      -> ('fc', 5)
                'convv:3:2:2' -> ('convv', [3,2,2])
            """
            if type(spec) is not str:
                return "fc", spec
            spec = spec.split(':')
            l_type = spec.pop(0) if len(spec) >= 2 else "fc"
            spec = list(map(int, spec))
            spec = spec[0] if len(spec) == 1 else spec
            return l_type, spec

        enc = list(map(parse_layer, p.encoder_layers))
        self.layers = list(enumerate(zip(enc, p.decoder_spec, acts)))

    def weight(self, init, name, cast_float32=True, for_conv=False):
        weight = self.shared(init, name, cast_float32, role=WEIGHT)
        if for_conv:
            return weight.dimshuffle('x', 0, 'x', 'x')
        return weight

    def bias(self, init, name, cast_float32=True, for_conv=False):
        b = self.shared(init, name, cast_float32, role=BIAS)
        if for_conv:
            return b.dimshuffle('x', 0, 'x', 'x')
        return b

    def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs):
        p = self.shareds.get(name)
        if p is None:
            p = shared_param(init, name, cast_float32, role, **kwargs)
            self.shareds[name] = p
        return p

    def counter(self):
        name = 'counter'
        p = self.shareds.get(name)
        update = []
        if p is None:
            p_max_val = np.float32(10)
            p = self.shared(np.float32(1), name, role=BNPARAM)
            p_max = self.shared(p_max_val, name + '_max', role=BNPARAM)
            update = [(p, T.clip(p + np.float32(1), np.float32(0), p_max)),
                      (p_max, p_max_val)]
        return (p, update)

    def noise_like(self, x):
        noise = self.rstream.normal(size=x.shape, avg=0.0, std=1.0)
        return T.cast(noise, dtype=floatX)

    def rand_init(self, in_dim, out_dim):
        """ Random initialization for fully connected layers """
        W = self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim)
        return W

    def rand_init_conv(self, dim):
        """ Random initialization for convolution filters """
        fan_in = np.prod(dtype=floatX, a=dim[1:])
        bound = np.sqrt(3. / max(1.0, (fan_in)))
        W = np.asarray(self.rng.uniform(low=-bound, high=bound, size=dim),
                       dtype=floatX)
        return W

    def new_activation_dict(self):
        return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}})

    def annotate_update(self, update, tag_to):
        a = Annotation()
        for (var, up) in update:
            a.updates[var] = up
        add_annotation(tag_to, a)

    def apply(self, input_labeled, target_labeled, input_unlabeled):
        self.layer_counter = 0
        input_dim = self.p.encoder_layers[0]

        # Store the dimension tuples in the same order as layers.
        layers = self.layers
        self.layer_dims = {0: input_dim}

        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

        self.costs = costs = AttributeDict()
        self.costs.denois = AttributeDict()

        self.act = AttributeDict()
        self.error = AttributeDict()

        top = len(layers) - 1

        N = input_labeled.shape[0]
        self.join = lambda l, u: T.concatenate([l, u], axis=0)
        self.labeled = lambda x: x[:N] if x is not None else x
        self.unlabeled = lambda x: x[N:] if x is not None else x
        self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

        input_concat = self.join(input_labeled, input_unlabeled)

        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):

            h = input_
            logger.info('  0: noise %g' % input_noise_std)

            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim

            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h,
                                              prev_dim,
                                              spec,
                                              i,
                                              act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim

            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)

            return d

        # Clean, supervised
        logger.info('Encoder: clean, labeled')
        clean = self.act.clean = encoder(input_concat, 'clean')

        # Corrupted, supervised
        logger.info('Encoder: corr, labeled')
        corr = self.act.corr = encoder(input_concat,
                                       'corr',
                                       input_noise_std=self.p.super_noise_std,
                                       noise_std=self.p.f_local_noise_std)
        est = self.act.est = self.new_activation_dict()

        # Decoder path in opposite order
        logger.info('Decoder: z_corr -> z_est')
        for i, ((_, spec), l_type, act_f) in layers[::-1]:
            z_corr = corr.unlabeled.z[i]
            z_clean = clean.unlabeled.z[i]
            z_clean_s = clean.unlabeled.s.get(i)
            z_clean_m = clean.unlabeled.m.get(i)
            fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None,
                                                                     None)

            if i == top:
                ver = corr.unlabeled.h[i]
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           l_type=l_type,
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            if z_est is not None:
                # Denoising cost

                if z_clean_s and self.p.zestbn == 'bugfix':
                    z_est_norm = (z_est - z_clean_m
                                  ) / T.sqrt(z_clean_s + np.float32(1e-10))
                elif z_clean_s is None or self.p.zestbn == 'no':
                    z_est_norm = z_est
                else:
                    assert False, 'Not supported path'

                se = SquaredError('denois' + str(i))
                costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                           z_clean.flatten(2)) \
                    / np.prod(self.layer_dims[i], dtype=floatX)
                costs.denois[i].name = 'denois' + str(i)
                denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
            else:
                denois_print = ''

            # Store references for later use
            est.h[i] = self.apply_act(z_est, act_f)
            est.z[i] = z_est
            est.s[i] = None
            est.m[i] = None
            logger.info('  g%d: %10s, %s, dim %s -> %s' %
                        (i, l_type, denois_print, self.layer_dims.get(i + 1),
                         self.layer_dims.get(i)))

        # Costs
        y = target_labeled.flatten()

        costs.class_clean = CategoricalCrossEntropy().apply(
            y, clean.labeled.h[top])
        costs.class_clean.name = 'cost_class_clean'

        costs.class_corr = CategoricalCrossEntropy().apply(
            y, corr.labeled.h[top])
        costs.class_corr.name = 'cost_class_corr'

        # This will be used for training
        costs.total = costs.class_corr * 1.0
        for i in range(top + 1):
            if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
                costs.total += costs.denois[i] * self.p.denoising_cost_x[i]

        costs.total.name = 'cost_total'

        # Classification error
        mr = MisclassificationRate()
        self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
        self.error.clean.name = 'error_rate_clean'

    def apply_act(self, input, act_name):
        if input is None:
            return input
        act = {
            'relu': lambda x: T.maximum(0, x),
            'leakyrelu': lambda x: T.switch(x > 0., x, 0.1 * x),
            'linear': lambda x: x,
            'softplus': lambda x: T.log(1. + T.exp(x)),
            'sigmoid': lambda x: T.nnet.sigmoid(x),
            'softmax': lambda x: T.nnet.softmax(x),
        }.get(act_name)
        assert act, 'unknown act %s' % act_name
        if act_name == 'softmax':
            input = input.flatten(2)
        return act(input)

    def annotate_bn(self, var, id, var_type, mb_size, size, norm_ax):
        var_shape = np.array((1, ) + size)
        out_dim = np.prod(var_shape) / np.prod(var_shape[list(norm_ax)])
        # Flatten the var - shared variable updating is not trivial otherwise,
        # as theano seems to believe a row vector is a matrix and will complain
        # about the updates
        orig_shape = var.shape
        var = var.flatten()
        # Here we add the name and role, the variables will later be identified
        # by these values
        var.name = id + '_%s_clean' % var_type
        add_role(var, BNPARAM)
        shared_var = self.shared(np.zeros(out_dim),
                                 name='shared_%s' % var.name,
                                 role=None)

        # Update running average estimates. When the counter is reset to 1, it
        # will clear its memory
        cntr, c_up = self.counter()
        one = np.float32(1)
        run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old
        if var_type == 'mean':
            new_value = run_avg(var, shared_var)
        elif var_type == 'var':
            mb_size = T.cast(mb_size, 'float32')
            new_value = run_avg(mb_size / (mb_size - one) * var, shared_var)
        else:
            raise NotImplementedError('Unknown batch norm var %s' % var_type)
        # Add the counter update to the annotated update if it is the first
        # instance of a counter
        self.annotate_update([(shared_var, new_value)] + c_up, var)

        return var.reshape(orig_shape)

    def f(self, h, in_dim, spec, num, act_f, path_name, noise_std=0):
        assert path_name in ['clean', 'corr']
        # Generates identifiers used for referencing shared variables.
        # E.g. clean and corrupted encoders will end up using the same
        # variable name and hence sharing parameters
        gen_id = lambda s: '_'.join(['f', str(num), s])
        layer_type, _ = spec

        # Pooling
        if layer_type in ['maxpool', 'globalmeanpool']:
            z, output_size = self.f_pool(h, spec, in_dim)
            norm_ax = (0, -2, -1)
            # after pooling, no activation func for now unless it's softmax
            act_f = "linear" if act_f != "softmax" else act_f

        # Convolution
        elif layer_type in ['convv', 'convf']:
            z, output_size = self.f_conv(h, spec, in_dim, gen_id('W'))
            norm_ax = (0, -2, -1)

        # Fully connected
        elif layer_type == "fc":
            h = h.flatten(2) if h.ndim > 2 else h
            _, dim = spec
            W = self.weight(self.rand_init(np.prod(in_dim), dim), gen_id('W'))
            z, output_size = T.dot(h, W), (dim, )
            norm_ax = (0, )
        else:
            raise ValueError("Unknown layer spec: %s" % layer_type)

        m = s = None
        is_normalizing = True
        if is_normalizing:
            keep_dims = True
            z_l = self.labeled(z)
            z_u = self.unlabeled(z)
            m = z_u.mean(norm_ax, keepdims=keep_dims)
            s = z_u.var(norm_ax, keepdims=keep_dims)

            m_l = z_l.mean(norm_ax, keepdims=keep_dims)
            s_l = z_l.var(norm_ax, keepdims=keep_dims)
            if path_name == 'clean':
                # Batch normalization estimates the mean and variance of
                # validation and test sets based on the training set
                # statistics. The following annotates the computation of
                # running average to the graph.
                m_l = self.annotate_bn(m_l, gen_id('bn'), 'mean', z_l.shape[0],
                                       output_size, norm_ax)
                s_l = self.annotate_bn(s_l, gen_id('bn'), 'var', z_l.shape[0],
                                       output_size, norm_ax)
            z = self.join((z_l - m_l) / T.sqrt(s_l + np.float32(1e-10)),
                          (z_u - m) / T.sqrt(s + np.float32(1e-10)))

        if noise_std > 0:
            z += self.noise_like(z) * noise_std

        # z for lateral connection
        z_lat = z
        b_init, c_init = 0.0, 1.0
        b_c_size = output_size[0]

        # Add bias
        if act_f != 'linear':
            z += self.bias(b_init * np.ones(b_c_size),
                           gen_id('b'),
                           for_conv=len(output_size) > 1)

        if is_normalizing:
            # Add free parameter (gamma in original Batch Normalization paper)
            # if needed by the activation. For instance ReLU doesn't need one
            # and we only add it to softmax if hyperparameter top_c is set.
            if (act_f not in ['relu', 'leakyrelu', 'linear', 'softmax']
                    or (act_f == 'softmax' and self.p.top_c is True)):
                c = self.weight(c_init * np.ones(b_c_size),
                                gen_id('c'),
                                for_conv=len(output_size) > 1)
                z *= c

        h = self.apply_act(z, act_f)

        logger.info('  f%d: %s, %s,%s noise %.2f, params %s, dim %s -> %s' %
                    (num, layer_type, act_f, ' BN,' if is_normalizing else '',
                     noise_std, spec[1], in_dim, output_size))
        return output_size, z_lat, m, s, h

    def f_pool(self, x, spec, in_dim):
        layer_type, dims = spec
        num_filters = in_dim[0]
        if "globalmeanpool" == layer_type:
            y, output_size = global_meanpool_2d(x, num_filters)
            # scale the variance to match normal conv layers with xavier init
            y = y * np.float32(in_dim[-1]) * np.float32(np.sqrt(3))
        else:
            assert dims[0] != 1 or dims[1] != 1
            y, output_size = maxpool_2d(x,
                                        in_dim,
                                        poolsize=(dims[1], dims[1]),
                                        poolstride=(dims[0], dims[0]))
        return y, output_size

    def f_conv(self, x, spec, in_dim, weight_name):
        layer_type, dims = spec
        num_filters = dims[0]
        filter_size = (dims[1], dims[1])
        stride = (dims[2], dims[2])

        bm = 'full' if 'convf' in layer_type else 'valid'

        num_channels = in_dim[0]

        W = self.weight(
            self.rand_init_conv((num_filters, num_channels) + filter_size),
            weight_name)

        if stride != (1, 1):
            f = GpuCorrMM(subsample=stride, border_mode=bm, pad=(0, 0))
            y = f(gpu_contiguous(x), gpu_contiguous(W))
        else:
            assert self.p.batch_size == self.p.valid_batch_size
            y = conv2d(x,
                       W,
                       image_shape=(2 * self.p.batch_size, ) + in_dim,
                       filter_shape=((num_filters, num_channels) +
                                     filter_size),
                       border_mode=bm)
        output_size = (
            (num_filters, ) +
            ConvOp.getOutputShape(in_dim[1:], filter_size, stride, bm))

        return y, output_size

    def g(self, z_lat, z_ver, in_dims, out_dims, l_type, num, fspec, top_g):
        f_layer_type, dims = fspec
        is_conv = f_layer_type is not None and ('conv' in f_layer_type
                                                or 'pool' in f_layer_type)
        gen_id = lambda s: '_'.join(['g', str(num), s])

        in_dim = np.prod(dtype=floatX, a=in_dims)
        out_dim = np.prod(dtype=floatX, a=out_dims)
        num_filters = out_dims[0] if is_conv else out_dim

        if l_type[-1] in ['0']:
            g_type, u_type = l_type[:-1], l_type[-1]
        else:
            g_type, u_type = l_type, None

        # Mapping from layer above: u
        if u_type in ['0'] or z_ver is None:
            if z_ver is None and u_type not in ['0']:
                logger.warn('Decoder %d:%s without vertical input' %
                            (num, g_type))
            u = None
        else:
            if top_g:
                u = z_ver
            elif is_conv:
                u = self.g_deconv(z_ver, in_dims, out_dims, gen_id('W'), fspec)
            else:
                W = self.weight(self.rand_init(in_dim, out_dim), gen_id('W'))
                u = T.dot(z_ver, W)

        # Batch-normalize u
        if u is not None:
            norm_ax = (0, ) if u.ndim <= 2 else (0, -2, -1)
            keep_dims = True
            u -= u.mean(norm_ax, keepdims=keep_dims)
            u /= T.sqrt(u.var(norm_ax, keepdims=keep_dims) + np.float32(1e-10))

        # Define the g function
        if not is_conv:
            z_lat = z_lat.flatten(2)
        bi = lambda inits, name: self.bias(
            inits * np.ones(num_filters), gen_id(name), for_conv=is_conv)
        wi = lambda inits, name: self.weight(
            inits * np.ones(num_filters), gen_id(name), for_conv=is_conv)

        if g_type == '':
            z_est = None

        elif g_type == 'i':
            z_est = z_lat

        elif g_type in ['sig']:
            sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat
            if u is not None:
                sigval += wi(0., 'c3') * u + wi(0., 'c4') * z_lat * u
            sigval = T.nnet.sigmoid(sigval)

            z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval
            if u is not None:
                z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u

        elif g_type in ['lin']:
            a1 = wi(1.0, 'a1')
            b = bi(0.0, 'b')

            z_est = a1 * z_lat + b

        elif g_type in ['relu']:
            assert u is not None
            b = bi(0., 'b')
            x = u + b
            z_est = self.apply_act(x, 'relu')

        elif g_type in ['sigmoid']:
            assert u is not None
            b = bi(0., 'b')
            c = wi(1., 'c')
            z_est = self.apply_act((u + b) * c, 'sigmoid')

        elif g_type in ['comparison_g2']:
            # sig without the uz cross term
            sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat
            if u is not None:
                sigval += wi(0., 'c3') * u
            sigval = T.nnet.sigmoid(sigval)

            z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval
            if u is not None:
                z_est += wi(0., 'a3') * u

        elif g_type in ['comparison_g3']:
            # sig without the sigmoid nonlinearity
            z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat
            if u is not None:
                z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u

        elif g_type in ['comparison_g4']:
            # No mixing between z_lat and u before final sum, otherwise similar
            # to sig
            def nonlin(inp, in_name='input', add_bias=True):
                w1 = wi(1., 'w1_%s' % in_name)
                b1 = bi(0., 'b1')
                w2 = wi(1., 'w2_%s' % in_name)
                b2 = bi(0., 'b2') if add_bias else 0
                w3 = wi(0., 'w3_%s' % in_name)
                return w2 * T.nnet.sigmoid(b1 + w1 * inp) + w3 * inp + b2

            z_est = nonlin(z_lat, 'lat') if u is None else \
                nonlin(z_lat, 'lat') + nonlin(u, 'ver', False)

        elif g_type in ['comparison_g5', 'gauss']:
            # Gaussian assumption on z: (z - mu) * v + mu
            if u is None:
                b1 = bi(0., 'b1')
                w1 = wi(1., 'w1')
                z_est = w1 * z_lat + b1
            else:
                a1 = bi(0., 'a1')
                a2 = wi(1., 'a2')
                a3 = bi(0., 'a3')
                a4 = bi(0., 'a4')
                a5 = bi(0., 'a5')

                a6 = bi(0., 'a6')
                a7 = wi(1., 'a7')
                a8 = bi(0., 'a8')
                a9 = bi(0., 'a9')
                a10 = bi(0., 'a10')

                mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5
                v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10

                z_est = (z_lat - mu) * v + mu

        else:
            raise NotImplementedError("unknown g type: %s" % str(g_type))

        # Reshape the output if z is for conv but u from fc layer
        if (z_est is not None and isinstance(out_dims, tuple)
                and len(out_dims) > 1 and z_est.ndim < 4):
            z_est = z_est.reshape((z_est.shape[0], ) + out_dims)

        return z_est

    def g_deconv(self, z_ver, in_dims, out_dims, weight_name, fspec):
        """ Inverse operation for each type of f used in convnets """
        f_type, f_dims = fspec
        assert z_ver is not None
        num_channels = in_dims[0] if in_dims is not None else None
        num_filters, width, height = out_dims[:3]

        if f_type in ['globalmeanpool']:
            u = T.addbroadcast(z_ver, 2, 3)
            assert in_dims[1] == 1 and in_dims[2] == 1, \
                "global pooling needs in_dims (1,1): %s" % str(in_dims)

        elif f_type in ['maxpool']:
            sh, str, size = z_ver.shape, f_dims[0], f_dims[1]
            assert str == size, "depooling requires stride == size"
            u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str),
                        dtype=z_ver.dtype)
            for x in range(str):
                for y in range(str):
                    u = T.set_subtensor(u[:, :, x::str, y::str], z_ver)
            u = u[:, :, :width, :height]

        elif f_type in ['convv', 'convf']:
            filter_size, str = (f_dims[1], f_dims[1]), f_dims[2]
            W_shape = (num_filters, num_channels) + filter_size
            W = self.weight(self.rand_init_conv(W_shape), weight_name)
            if str > 1:
                # upsample if strided version
                sh = z_ver.shape
                u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str),
                            dtype=z_ver.dtype)
                u = T.set_subtensor(u[:, :, ::str, ::str], z_ver)
            else:
                u = z_ver  # no strides, only deconv
            u = conv2d(u,
                       W,
                       filter_shape=W_shape,
                       border_mode='valid' if 'convf' in f_type else 'full')
            u = u[:, :, :width, :height]
        else:
            raise NotImplementedError('Layer %s has no convolutional decoder' %
                                      f_type)

        return u
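For reference, the math that the 'gauss' combinator and the denoising cost above implement, reconstructed from the code (a reader's note, not text from the source project). With u the batch-normalized vertical signal and z_lat the corrupted lateral signal:

\mu(u) = a_1\,\sigma(a_2 u + a_3) + a_4 u + a_5
v(u) = a_6\,\sigma(a_7 u + a_8) + a_9 u + a_{10}
\hat{z} = (z_{\mathrm{lat}} - \mu(u)) \odot v(u) + \mu(u)

Each layer's denoising cost is the squared error between the (optionally batch-normalized) estimate and the clean activation, normalized by the layer dimensionality, and the training objective adds these to the corrupted classification cost with the per-layer weights:

C_{\mathrm{total}} = \mathrm{CE}(y, \tilde{y}_{\mathrm{corr}}) + \sum_l \frac{\lambda_l}{d_l}\,\big\lVert \hat{z}^{(l)}_{\mathrm{BN}} - z^{(l)}_{\mathrm{clean}} \big\rVert^2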
Example #32
def random_normal(shape, mean=0.0, std=1.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.normal(size=shape, avg=mean, std=std, dtype=dtype)
Example #33
else:
    params = lasagne.layers.get_all_params([l_dec_x_mu, l_dec_x_log_var],
                                           trainable=True)
    for p in params:
        print p, p.get_value().shape
    params_count = lasagne.layers.count_params([l_dec_x_mu, l_dec_x_log_var],
                                               trainable=True)
print 'Number of parameters:', params_count

# random generation for visualization
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
srng_ran = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
srng_ran_share = theano.tensor.shared_randomstreams.RandomStreams(1234)
sym_nimages = T.iscalar('nimages')

ran_z = srng_ran.normal((sym_nimages, latent_size))
if dataset in ['sample', 'fixed', 'caltech', 'ocr_letter', 'omniglot']:
    random_x_mean = lasagne.layers.get_output(l_dec_x_mu, {l_z: ran_z},
                                              deterministic=True)
    random_x = srng_ran_share.binomial(n=1,
                                       p=random_x_mean,
                                       dtype=theano.config.floatX)
else:
    random_x_mean, random_x_log_var = lasagne.layers.get_output(
        [l_dec_x_mu, l_dec_x_log_var], {l_z: ran_z}, deterministic=True)
    random_x = srng_ran_share.normal(size=(sym_nimages, num_features),
                                     avg=random_x_mean,
                                     std=T.exp(0.5 * random_x_log_var))
generate_model = theano.function(inputs=[sym_nimages],
                                 outputs=[random_x_mean, random_x])
Example #34
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed):
    """Return expression of approximate ELBO based on Monte Carlo sampling.
    """
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)

    normal_const = floatX(1 + np.log(2.0 * np.pi))

    elbo = 0

    # Sampling local variational parameters
    if uw_l is not None:
        l_l = (uw_l.size / 2).astype('int64')
        l_l_ = (uw_l.size / 2).astype(floatX_str)
        u_l = uw_l[:l_l]
        w_l = uw_l[l_l:]
        ns_l = r.normal(size=(n_mcsamples, inarray_l.tag.test_value.shape[0]))
        zs_l = ns_l * tt.exp(w_l) + u_l
        elbo += tt.sum(w_l) + 0.5 * l_l_ * normal_const
    else:
        zs_l = None

    # Sampling global variational parameters
    if uw_g is not None:
        l_g = (uw_g.size / 2).astype('int64')
        l_g_ = (uw_g.size / 2).astype(floatX_str)
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]
        ns_g = r.normal(size=(n_mcsamples, inarray_g.tag.test_value.shape[0]))
        zs_g = ns_g * tt.exp(w_g) + u_g
        elbo += tt.sum(w_g) + 0.5 * l_g_ * normal_const
    else:
        zs_g = None

    if (zs_l is not None) and (zs_g is not None):

        def logp_(z_g, z_l):
            return theano.clone(logp,
                                OrderedDict({
                                    inarray_g: z_g,
                                    inarray_l: z_l
                                }),
                                strict=False)

        sequences = [zs_g, zs_l]

    elif zs_l is not None:

        def logp_(z_l):
            return theano.clone(logp,
                                OrderedDict({inarray_l: z_l}),
                                strict=False)

        sequences = [zs_l]

    else:

        def logp_(z_g):
            return theano.clone(logp,
                                OrderedDict({inarray_g: z_g}),
                                strict=False)

        sequences = [zs_g]

    logps, _ = theano.scan(fn=logp_, outputs_info=None, sequences=sequences)
    elbo += tt.mean(logps)

    return elbo
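The constants added to the ELBO above are the entropy of the diagonal-Gaussian approximation; spelled out (a reader's note derived from the code, with u the variational means, w the log standard deviations and d the number of latent dimensions):

\mathrm{ELBO} = \mathbb{E}_{q}[\log p(x, z)] + H[q], \qquad q(z) = \mathcal{N}(u, \operatorname{diag}(e^{2w}))
H[q] = \sum_i w_i + \tfrac{d}{2}(1 + \log 2\pi)
\mathbb{E}_{q}[\log p(x, z)] \approx \tfrac{1}{S} \sum_{s=1}^{S} \log p(x,\, u + e^{w} \odot \epsilon_s), \qquad \epsilon_s \sim \mathcal{N}(0, I)

which is exactly the tt.sum(w) + 0.5 * l * (1 + log(2*pi)) terms plus the tt.mean(logps) computed by the scan.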
Example #35
class GaussianDropoutLayer(lasagne.layers.Layer):
    '''
        Puts a gaussian prior on the weights of the previous layer
    '''
    def __init__(self, incoming, p=lasagne.init.Constant(-10), log_alpha=None,
                 mask=None, n_samples=None, shared_axes=(), **kwargs):
        super(GaussianDropoutLayer, self).__init__(
            incoming, **kwargs)

        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.shared_axes = tuple(shared_axes)

        if log_alpha is None:
            if isinstance(p, Number):
                p = np.atleast_1d(p)
            if callable(p):
                p_shape = self.input_shape[1:]
            else:
                p_shape = p.shape
            p = lasagne.utils.create_param(p, p_shape, name='p')
            p = p.get_value()
            log_alpha = np.log(p/(1-p))

        # add log_alpha as trainable parameter
        if isinstance(log_alpha, Number):
            log_alpha = np.atleast_1d(log_alpha)
        if callable(log_alpha):
            log_alpha_shape = self.input_shape[1:]
        elif isinstance(log_alpha, tt.sharedvar.SharedVariable):
            log_alpha_shape = log_alpha.get_value().shape
        else:
            log_alpha_shape = log_alpha.shape

        self.log_alpha = self.add_param(
            log_alpha, log_alpha_shape, name='log_alpha', regularizable=False)

        # init mask to shape compatible with log_alpha
        mask_shape = [2] + list(self.input_shape[1:])
        # the mask should be drawn from a normal (1, alpha) distribution
        sq_alpha = np.exp(0.5*self.log_alpha.get_value())
        mask = sq_alpha*np.random.normal(1, 1, mask_shape).astype(floatX)

        self.mask = self.add_param(
            mask, mask_shape, name='mask', trainable=False, regularizable=False)
        self.mask_updates = None

    def get_output_for(self, input, deterministic=False,
                       fixed_dropout_masks=False, **kwargs):
        if deterministic:
            return input
        else:
            # use nonsymbolic shape for dropout mask if possible
            mask_shape = self.input_shape
            if any(s is None for s in mask_shape):
                mask_shape = input.shape

            # apply dropout, respecting shared axes
            if self.shared_axes:
                shared_axes = tuple(a if a >= 0 else a + input.ndim
                                    for a in self.shared_axes)
                mask_shape = tuple(1 if a in shared_axes else s
                                   for a, s in enumerate(mask_shape))

            mask = self._srng.normal(
                mask_shape, avg=0, std=1,
                dtype=input.dtype)

            if self.shared_axes:
                bcast = tuple(bool(s == 1) for s in mask_shape)
                mask = tt.patternbroadcast(mask, bcast)

            if self.mask is not None and fixed_dropout_masks:
                # the user may update the shared mask value however they want,
                # but here we provide an update expression. note that if the
                # batch size changes, the update will only have an effect at
                # the next call causing a shape mis-match in the elementwise
                # product. To avoid this, the user should update the masks
                # before performing a forward pass on this layer.
                self.mask_updates = mask

                # make sure that we use the local shared variable as the mask
                mask = self.mask
            sq_alpha = tt.exp(0.5*self.log_alpha)
            return input * (1 + sq_alpha * mask)
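The log_alpha = log(p / (1 - p)) conversion above uses the standard correspondence between Bernoulli and Gaussian dropout: a multiplicative mask 1 + sqrt(alpha) * eps with eps ~ N(0, 1) has the same mean (1) and variance (alpha = p / (1 - p)) as a rescaled Bernoulli mask with drop probability p. A small standalone numpy sketch (not part of the layer above) that checks this numerically:

import numpy as np

p = 0.2                        # Bernoulli drop probability
alpha = p / (1.0 - p)          # matched Gaussian noise variance
rng = np.random.RandomState(0)
n = 1000000

# rescaled Bernoulli dropout mask: keep with prob 1-p, divide by 1-p
bern = rng.binomial(1, 1.0 - p, size=n) / (1.0 - p)
# multiplicative Gaussian mask of the kind used by the layer above
gauss = 1.0 + np.sqrt(alpha) * rng.normal(size=n)

print("bernoulli mask: mean %.3f, var %.3f" % (bern.mean(), bern.var()))
print("gaussian  mask: mean %.3f, var %.3f" % (gauss.mean(), gauss.var()))
# both report mean ~1.000 and var ~0.250 for p = 0.2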
Example #36
                   latents)


embedded = lib.ops.embedding.Embedding('Embedding', 256, CONV_DIM, images)
embedded = embedded.dimshuffle(0, 1, 4, 2, 3)
embedded = embedded.reshape(
    (embedded.shape[0], embedded.shape[1] * embedded.shape[2],
     embedded.shape[3], embedded.shape[4]))

mu_and_logsig1 = E1(embedded)
mu1, logsig1 = split(mu_and_logsig1)

if VANILLA:
    latents1 = mu1
else:
    eps = T.cast(theano_srng.normal(mu1.shape), theano.config.floatX)
    latents1 = mu1 + (eps * T.exp(logsig1))

outputs1 = D1(latents1)

reconst_cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(
        outputs1.reshape(
            (-1, 256, N_CHANNELS, HEIGHT, WIDTH)).dimshuffle(0, 2, 3, 4,
                                                             1).reshape(
                                                                 (-1, 256))),
    images.flatten()).mean()

# Layer 2

Example #37
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_out,
                 num_MC,
                 num_FF,
                 Domain_number=None,
                 number="1",
                 Domain_consideration=True):
        #input is also expected to come in with shape num_MC*N*D (e.g. 100*N*D).
        #DATA=input
        #N=DATA.shape[1]
        #n_in_D=DATA.shape[2]
        srng = RandomStreams(seed=234)

        #Define hyperparameters
        lhyp_values = np.zeros(n_in + 1, dtype=theano.config.floatX)
        self.lhyp = theano.shared(value=lhyp_values,
                                  name='lhyp' + number,
                                  borrow=True)
        self.sf2, self.l = T.exp(self.lhyp[0]), T.exp(self.lhyp[1:1 + n_in])

        if Domain_consideration:
            ls_value = np.zeros(Domain_number,
                                dtype=theano.config.floatX) + np.log(
                                    0.1, dtype=theano.config.floatX)
        else:
            ls_value = np.zeros(1, dtype=theano.config.floatX) + np.log(
                0.1, dtype=theano.config.floatX)

        self.ls = theano.shared(value=ls_value,
                                name='ls' + number,
                                borrow=True)
        self.beta = T.exp(self.ls)

        #Define prior omega
        #prior_mean_Omega.append(tf.zeros([self.d_in[i],1]))
        log_prior_var_Omega = T.tile(1 / (self.l)**0.5, (num_FF, 1)).T

        #Define posterior omega

        #get samples from  omega
        sample_value = np.random.randn(1, n_in, num_FF)
        sample_Omega_epsilon_0 = theano.shared(value=sample_value,
                                               name='sample_Omega' + number)
        #sample_Omega_epsilon_0 = srng.normal((1,n_in,num_FF))
        Omega_sample = sample_Omega_epsilon_0 * log_prior_var_Omega[None, :, :]
        Omega_samples = T.tile(Omega_sample, (num_MC, 1, 1))

        #Define prior W
        prior_mean_W = T.zeros(2 * num_FF)
        log_prior_var_W = T.ones(2 * num_FF)

        #Define posterior W
        mean_mu_value = np.random.randn(2 * num_FF, n_out) * 1e-2
        self.mean_mu = theano.shared(value=mean_mu_value,
                                     name='mean_mu' + number,
                                     borrow=True)

        log_var_value = np.zeros((2 * num_FF, n_out))
        self.log_var_W = theano.shared(value=log_var_value,
                                       name='q_W' + number,
                                       borrow=True)

        #get samples from W
        sample_Omega_epsilon = srng.normal((num_MC, 2 * num_FF, n_out))
        W_samples = sample_Omega_epsilon * (T.exp(
            self.log_var_W)**0.5)[None, :, :] + self.mean_mu[None, :, :]

        # calculate lyaer N_MC*N*D_out
        F_next, updates = theano.scan(
            fn=lambda a, b, c: self.passage(a, b, c, num_FF),
            sequences=[input, Omega_samples, W_samples])

        #output
        self.output = F_next

        #KL-divergence
        #Omega

        #W
        self.KL_W = self.DKL_gaussian(self.mean_mu, self.log_var_W,
                                      prior_mean_W, log_prior_var_W)

        #parameter_setting
        self.all_params = [self.lhyp, self.ls, self.mean_mu, self.log_var_W]
        self.hyp_params = [self.lhyp, self.ls]
        self.variational_params = [self.mean_mu, self.log_var_W]
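DKL_gaussian is not shown in this snippet, but given its arguments (variational mean and log-variance, prior mean and log-variance) it presumably evaluates the closed-form KL divergence between diagonal Gaussians; for reference:

\mathrm{KL}\big(\mathcal{N}(\mu, \operatorname{diag}(\sigma^2)) \,\|\, \mathcal{N}(m, \operatorname{diag}(s^2))\big) = \frac{1}{2} \sum_i \left[ \log\frac{s_i^2}{\sigma_i^2} + \frac{\sigma_i^2 + (\mu_i - m_i)^2}{s_i^2} - 1 \right]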
Example #38
def sample_vp(vparams,
              draws=1000,
              model=None,
              local_RVs=None,
              random_seed=None,
              hide_transformed=True,
              progressbar=True):
    """Draw samples from variational posterior.

    Parameters
    ----------
    vparams : dict or pymc3.variational.ADVIFit
        Estimated variational parameters of the model.
    draws : int
        Number of random samples.
    model : pymc3.Model
        Probabilistic model.
    random_seed : int or None
        Seed of random number generator.  None to use current seed.
    hide_transformed : bool
        If False, transformed variables are also sampled. Default is True.

    Returns
    -------
    trace : pymc3.backends.base.MultiTrace
        Samples drawn from the variational posterior.
    """
    import warnings
    warnings.warn(
        'Old ADVI interface and sample_vp is deprecated and will '
        'be removed in future, use pm.fit and pm.sample_approx instead',
        DeprecationWarning,
        stacklevel=2)
    model = pm.modelcontext(model)

    if isinstance(vparams, ADVIFit):
        vparams = {'means': vparams.means, 'stds': vparams.stds}

    ds = model.deterministics

    def get_transformed(v):
        return v if v not in ds else v.transformed

    def rvs(x):
        return [get_transformed(v) for v in x] if x is not None else []

    global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs)))

    # Make dict for replacements of random variables
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)
    updates = {}
    for v in global_RVs:
        u = theano.shared(vparams['means'][str(v)]).ravel()
        w = theano.shared(vparams['stds'][str(v)]).ravel()
        n = r.normal(size=u.tag.test_value.shape)
        updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)})

    if local_RVs is not None:
        for v_, (uw, _) in local_RVs.items():
            v = get_transformed(v_)
            u = uw[0].ravel()
            w = uw[1].ravel()
            n = r.normal(size=u.tag.test_value.shape)
            updates.update(
                {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)})

    # Replace some nodes of the graph with variational distributions
    vars = model.free_RVs
    samples = theano.clone(vars, updates)
    f = theano.function([], samples)

    # Random variables which will be sampled
    vars_sampled = pm.util.get_default_varnames(
        model.unobserved_RVs, include_transformed=not hide_transformed)

    varnames = [str(var) for var in model.unobserved_RVs]
    trace = pm.sampling.NDArray(model=model, vars=vars_sampled)
    trace.setup(draws=draws, chain=0)

    range_ = trange(draws) if progressbar else range(draws)

    for _ in range_:
        # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...}
        point = {varname: value for varname, value in zip(varnames, f())}
        trace.record(point)

    return MultiTrace([trace])
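A rough usage sketch for the deprecated interface above. The model and the parameter values are invented for illustration; the dict form follows the 'means'/'stds' keys (indexed by variable name) that sample_vp reads, and exact behavior depends on the old PyMC3 version this snippet targets:

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=1.)

# variational parameters keyed by free-RV name, as sample_vp expects
vparams = {'means': {'mu': np.array(0.1)},
           'stds': {'mu': np.array(0.5)}}

with model:
    trace = sample_vp(vparams, draws=500, progressbar=False)
print(trace['mu'].mean())   # roughly 0.1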
Example #39
class VAELayer(Layer):
    def __init__(self,
                 incoming,
                 encoder,
                 decoder,
                 x_distribution='bernoulli',
                 pz_distribution='gaussian',
                 qz_distribution='gaussian',
                 latent_size=50,
                 W=init.Normal(0.01),
                 b=init.Normal(0.01),
                 **kwargs):
        super(VAELayer, self).__init__(incoming, **kwargs)
        num_batch, n_features = self.input_shape
        self.num_batch = num_batch
        self.n_features = n_features
        self.x_distribution = x_distribution
        self.pz_distribution = pz_distribution
        self.qz_distribution = qz_distribution
        self.encoder = encoder
        self.decoder = decoder
        self._srng = RandomStreams()

        if self.x_distribution not in ['gaussian', 'bernoulli']:
            raise NotImplementedError
        if self.pz_distribution not in ['gaussian', 'gaussianmarg']:
            raise NotImplementedError
        if self.qz_distribution not in ['gaussian', 'gaussianmarg']:
            raise NotImplementedError

        self.params_encoder = lasagne.layers.get_all_params(encoder)
        self.params_decoder = lasagne.layers.get_all_params(decoder)
        for p in self.params_encoder:
            p.name = "VAELayer encoder :" + p.name
        for p in self.params_decoder:
            p.name = "VAELayer decoder :" + p.name

        self.num_hid_enc = encoder.output_shape[1]
        self.num_hid_dec = decoder.output_shape[1]
        self.latent_size = latent_size

        self.W_enc_to_z_mu = self.add_param(W, (self.num_hid_enc, latent_size))
        self.b_enc_to_z_mu = self.add_param(b, (latent_size, ))

        self.W_enc_to_z_logsigma = self.add_param(
            W, (self.num_hid_enc, self.latent_size))
        self.b_enc_to_z_logsigma = self.add_param(b, (latent_size, ))

        self.W_dec_to_x_mu = self.add_param(
            W, (self.num_hid_dec, self.n_features))
        self.b_dec_to_x_mu = self.add_param(b, (self.n_features, ))

        self.W_params = [
            self.W_enc_to_z_mu, self.W_enc_to_z_logsigma, self.W_dec_to_x_mu
        ] + self.params_encoder + self.params_decoder
        self.bias_params = [
            self.b_enc_to_z_mu, self.b_enc_to_z_logsigma, self.b_dec_to_x_mu
        ]

        params_tmp = []
        if self.x_distribution == 'gaussian':
            self.W_dec_to_x_logsigma = self.add_param(
                W, (self.num_hid_dec, self.n_features))
            self.b_dec_to_x_logsigma = self.add_param(b, (self.n_features, ))
            self.W_params += [self.W_dec_to_x_logsigma]
            self.bias_params += [self.b_dec_to_x_logsigma]
            self.W_dec_to_x_logsigma.name = "VAE: W_dec_to_x_logsigma"
            self.b_dec_to_x_logsigma.name = "VAE: b_dec_to_x_logsigma"
            params_tmp = [self.W_dec_to_x_logsigma, self.b_dec_to_x_logsigma]

        self.params = self.params_encoder + [self.W_enc_to_z_mu,
                                             self.b_enc_to_z_mu,
                                             self.W_enc_to_z_logsigma,
                                             self.b_enc_to_z_logsigma] + self.params_decoder + \
                      [self.W_dec_to_x_mu, self.b_dec_to_x_mu] + params_tmp

        self.W_enc_to_z_mu.name = "VAELayer: W_enc_to_z_mu"
        self.W_enc_to_z_logsigma.name = "VAELayer: W_enc_to_z_logsigma"
        self.W_dec_to_x_mu.name = "VAELayer: W_dec_to_x_mu"
        self.b_enc_to_z_mu.name = "VAELayer: b_enc_to_z_mu"
        self.b_enc_to_z_logsigma.name = "VAELayer: b_enc_to_z_logsigma"
        self.b_dec_to_x_mu.name = "VAELayer: b_dec_to_x_mu"

    def get_params(self):
        return self.params

    def get_output_shape_for(self, input_shape):
        dec_out_shp = self.decoder.get_output_shape_for(
            (self.num_batch, self.num_hid_dec))
        if self.x_distribution == 'bernoulli':
            return dec_out_shp
        elif self.x_distribution == 'gaussian':
            return [dec_out_shp, dec_out_shp]

    def _encoder_output(self, x, *args, **kwargs):
        return lasagne.layers.get_output(self.encoder, x, **kwargs)

    def decoder_output(self, z, *args, **kwargs):
        h_decoder = lasagne.layers.get_output(self.decoder, z, **kwargs)
        if self.x_distribution == 'gaussian':
            mu_decoder = T.dot(h_decoder,
                               self.W_dec_to_x_mu) + self.b_dec_to_x_mu
            log_sigma_decoder = T.dot(
                h_decoder, self.W_dec_to_x_logsigma) + self.b_dec_to_x_logsigma
            decoder_out = mu_decoder, log_sigma_decoder
        elif self.x_distribution == 'bernoulli':
            # TODO: Finish writing the output of the decoder for a bernoulli distributed x.
            decoder_out = T.nnet.sigmoid(
                T.dot(h_decoder, self.W_dec_to_x_mu) + self.b_dec_to_x_mu)
        else:
            raise NotImplementedError
        return decoder_out

    def get_z_mu_sigma(self, x, *args, **kwargs):
        h_encoder = self._encoder_output(x, *args, **kwargs)
        mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu
        log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) +
                             self.b_enc_to_z_logsigma)
        eps = self._srng.normal(log_sigma_encoder.shape)
        # TODO: Calculate the sampled z.
        z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps
        return z, mu_encoder, log_sigma_encoder

    def get_log_distributions(self, x, *args, **kwargs):
        # sample z from q(z|x).
        h_encoder = self._encoder_output(x, *args, **kwargs)
        mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu
        log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) +
                             self.b_enc_to_z_logsigma)
        eps = self._srng.normal(log_sigma_encoder.shape)
        z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps

        # forward pass z through decoder to generate p(x|z).
        decoder_out = self.decoder_output(z, *args, **kwargs)
        if self.x_distribution == 'bernoulli':
            x_mu = decoder_out
            log_px_given_z = -T.nnet.binary_crossentropy(x_mu, x)
        elif self.x_distribution == 'gaussian':
            x_mu, x_logsigma = decoder_out
            log_px_given_z = normal2(x, x_mu, x_logsigma)

        # sample prior distribution p(z).
        if self.pz_distribution == 'gaussian':
            log_pz = standard_normal(z)
        elif self.pz_distribution == 'gaussianmarg':
            log_pz = -0.5 * (T.log(2 * np.pi) +
                             (T.sqr(mu_encoder) + T.exp(log_sigma_encoder)))

        # variational approximation distribution q(z|x)
        if self.qz_distribution == 'gaussian':
            log_qz_given_x = normal2(z, mu_encoder, log_sigma_encoder)
        elif self.qz_distribution == 'gaussianmarg':
            log_qz_given_x = -0.5 * (T.log(2 * np.pi) + 1 + log_sigma_encoder)

        # sum over dim 1 to get shape (,batch_size)
        log_px_given_z = log_px_given_z.sum(
            axis=1, dtype=theano.config.floatX)  # sum over x
        log_pz = log_pz.sum(axis=1,
                            dtype=theano.config.floatX)  # sum over latent vars
        log_qz_given_x = log_qz_given_x.sum(
            axis=1, dtype=theano.config.floatX)  # sum over latent vars

        return log_pz, log_qz_given_x, log_px_given_z

    def draw_sample(self, z=None, *args, **kwargs):
        if z is None:  # draw random z
            z = self._srng.normal((self.num_batch, self.latent_size))
        return self.decoder_output(z, *args, **kwargs)
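The three log-densities returned by get_log_distributions assemble the usual variational bound; as a reader's note (matching the 'gaussianmarg' expressions in the code, where log_sigma denotes log sigma^2):

\log p(x) \ge \mathbb{E}_{q(z|x)}[\log p(x|z)] + \mathbb{E}_{q(z|x)}[\log p(z) - \log q(z|x)]

with, per latent dimension,

\mathbb{E}_q[\log p(z)] = -\tfrac{1}{2}\big(\log 2\pi + \mu^2 + \sigma^2\big), \qquad \mathbb{E}_q[\log q(z|x)] = -\tfrac{1}{2}\big(\log 2\pi + 1 + \log\sigma^2\big)

and z drawn by the reparameterization z = \mu + \exp(\tfrac{1}{2}\log\sigma^2)\,\epsilon with \epsilon \sim \mathcal{N}(0, I).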
Example #40
class SampleLayer(lasagne.layers.MergeLayer):
    """
    Sampling layer supporting importance sampling as described in [BURDA]_ and
    multiple Monte Carlo samples for the approximation of
    E_q [log( p(x,z) / q(z|x) )].

    Parameters
    ----------
    mu : class:`Layer` instance
        Parameterizing the mean of the distribution to sample
        from as described in [BURDA]_.

    log_var : class:`Layer` instance
        By default assumed to parametrize log(sigma^2) of the distribution to
        sample from as described in [BURDA]_ which is transformed to sigma using
        the nonlinearity function as described below. Effectively this means
        that the nonlinearity function controls what log_var parametrizes. A few
        common examples:
        -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default]
        -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2
        -nonlinearity = lambda x: x => log_var = sigma

    eq_samples : int or T.scalar
        Number of Monte Carlo samples used to estimate the expectation over
        q(z|x) in eq. (8) in [BURDA]_.

    iw_samples : int or T.scalar
        Number of importance samples in the sum over k in eq. (8) in [BURDA]_.

    nonlinearity : callable or None
        The nonlinearity that is applied to the log_var input layer to transform
        it into a standard deviation. By default we assume that
        log_var = log(sigma^2) and hence the corresponding nonlinearity is
        f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma

    seed : int
        seed to random stream

    Methods
    ----------
    seed : Helper function to change the random seed after init is called

    References
    ----------
        ..  [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov.
            "Importance Weighted Autoencoders."
            arXiv preprint arXiv:1509.00519 (2015).
    """
    def __init__(self,
                 mean,
                 log_var,
                 eq_samples=1,
                 iw_samples=1,
                 nonlinearity=lambda x: T.exp(0.5 * x),
                 seed=lasagne.random.get_rng().randint(1, 2147462579),
                 **kwargs):
        super(SampleLayer, self).__init__([mean, log_var], **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples
        self.nonlinearity = nonlinearity

        self._srng = RandomStreams(seed)

    def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)):
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shapes):
        batch_size, num_latent = input_shapes[0]
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size * self.eq_samples * self.iw_samples,
                       num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, **kwargs):
        mu, log_var = input
        batch_size, num_latent = mu.shape
        eps = self._srng.normal(
            [batch_size, self.eq_samples, self.iw_samples, num_latent],
            dtype=theano.config.floatX)

        z = mu.dimshuffle(0,'x','x',1) + \
            self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps

        return z.reshape((-1, num_latent))
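A minimal usage sketch for SampleLayer, assuming the class above is in scope and using standard Lasagne layers for the encoder (the layer sizes here are made up):

import theano.tensor as T
import lasagne

x_sym = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 784), input_var=x_sym)
l_enc = lasagne.layers.DenseLayer(l_in, num_units=256)
# linear outputs parameterizing mu and log(sigma^2) of q(z|x)
l_mu = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)

# one posterior sample and five importance samples per input
l_z = SampleLayer(l_mu, l_log_var, eq_samples=1, iw_samples=5)
z = lasagne.layers.get_output(l_z)   # shape: (batch_size * 1 * 5, 32)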
Example #41
File: vae.py Project: fr42k/gap-wgan-gp
)

l1_mu_and_log_sigma = lib.ops.mlp.MLP(
    'L1Encoder',
    input_dim=FRAME_SIZE*EMBED_DIM,
    hidden_dim=L1_DIM,
    output_dim=2*L1_LATENT,
    n_layers=N_LAYERS,
    inputs=embedded.reshape((-1, FRAME_SIZE*EMBED_DIM))
)
l1_mu, l1_log_sigma = l1_mu_and_log_sigma[:,::2], l1_mu_and_log_sigma[:,1::2]

if VANILLA:
    l1_latents = l1_mu
else:
    eps = T.cast(theano_srng.normal(l1_mu.shape), theano.config.floatX)
    l1_latents = l1_mu + (eps * T.exp(l1_log_sigma))

def L1Decoder(latents):
    outputs = lib.ops.mlp.MLP(
        'L1Decoder',
        input_dim=L1_LATENT,
        hidden_dim=L1_DIM,
        output_dim=FRAME_SIZE*EMBED_DIM,
        n_layers=N_LAYERS,
        inputs=latents
    )
    outputs = outputs.reshape((-1, FRAME_SIZE, EMBED_DIM))
    outputs = lib.ops.linear.Linear('L1DecoderOutput', 
            input_dim=EMBED_DIM, 
            output_dim=Q_LEVELS, 
Example #42
class PosteriorGP(object):
    def __init__(self,
                 inducing_pts,
                 x_test,
                 kernel,
                 symbolic_kernel,
                 init_params=None,
                 random_seed=101):
        self.rng = RandomStreams(seed=random_seed)
        self.len_test = len(x_test)
        self.cov_vec = CovVec(inducing_pts, kernel, symbolic_kernel)
        self.post_mean = PosteriorMean(inducing_pts, kernel, symbolic_kernel)

        # input symbolic variables
        self.t_idx_train = T.imatrix()
        self.t_w_train = T.matrix()
        self.t_idx_test = T.imatrix()
        self.t_w_test = T.matrix()
        self.t_y_train = T.vector()

        if init_params is None:
            #init_params = [np.log(np.array([2., 10.])), np.log(0.3)]
            init_params = [np.array([-0.7, 5]), -2.5]
        log_gp_params, log_indep_noise = init_params
        self.log_gp_params = theano.shared(log_gp_params)
        self.log_indep_noise = theano.shared(log_indep_noise)

        self.gp_params = T.exp(self.log_gp_params)
        self.indep_noise = T.exp(self.log_indep_noise)

        # collection of symbolic variables derived from data
        self.data_variables = [
            self.t_idx_train, self.t_w_train, self.t_idx_test, self.t_w_test,
            self.t_y_train
        ]
        # GP hyperparameters and noise parameter
        self.params = [self.log_gp_params, self.log_indep_noise]

    def set_params(self, params):
        log_gp_params, log_indep_noise = params
        self.log_gp_params.set_value(log_gp_params)
        self.log_indep_noise.set_value(log_indep_noise)

    def mean(self):
        mu = self.post_mean(self.t_idx_train, self.t_w_train, self.t_idx_test,
                            self.t_w_test, self.gp_params, self.indep_noise,
                            self.t_y_train)
        return mu

    def cov_rand_proj(self, n_sample=10, n_lanczos_basis=10):
        cov_vec = self.cov_vec
        if n_sample == 1:
            cov_vec.use_single_sample()

        def linear_op(zs):
            return cov_vec(self.t_idx_train, self.t_w_train, self.t_idx_test,
                           self.t_w_test, self.gp_params, self.indep_noise, zs)

        eps = self.rng.normal(size=(n_sample, self.len_test))
        cov_zs = lanczos(linear_op, eps, n_lanczos_basis, n_sample)
        return cov_zs

    def cov_proj(self, eps, n_sample=10, n_lanczos_basis=10):
        cov_vec = self.cov_vec

        def linear_op(zs):
            return cov_vec(self.t_idx_train, self.t_w_train, self.t_idx_test,
                           self.t_w_test, self.gp_params, self.indep_noise, zs)

        cov_zs = lanczos(linear_op, eps, n_lanczos_basis, n_sample)
        return cov_zs
Example #43
    def construct_graph_popstats(self,
                                 args,
                                 x,
                                 drops_state,
                                 drops_cell,
                                 length,
                                 popstats=None):
        p = self.allocate_parameters(args)

        def stepfn(x, drops_state, drops_cell, dummy_h, dummy_c, pop_means_a,
                   pop_means_b, pop_means_c, pop_vars_a, pop_vars_b,
                   pop_vars_c, h, c):

            atilde = T.dot(h, p.Wa)
            btilde = x
            if args.baseline:
                a_normal, a_mean, a_var = bn(atilde, 1.0, p.ab_betas,
                                             pop_means_a, pop_vars_a, args)
                b_normal, b_mean, b_var = bn(btilde, 1.0, 0, pop_means_b,
                                             pop_vars_b, args)
            else:
                a_normal, a_mean, a_var = bn(atilde, p.a_gammas, p.ab_betas,
                                             pop_means_a, pop_vars_a, args)
                b_normal, b_mean, b_var = bn(btilde, p.b_gammas, 0,
                                             pop_means_b, pop_vars_b, args)
            ab = a_normal + b_normal
            g, f, i, o = [
                fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden])
                for j, fn in enumerate([self.activation] +
                                       3 * [T.nnet.sigmoid])
            ]

            if args.elephant:
                c_n = dummy_c + f * c + drops_cell * (i * g)
            else:
                c_n = dummy_c + f * c + i * g

            if args.zoneout:
                c_n_z = c_n * drops_cell + (1 - drops_cell) * c
            else:
                c_n_z = c_n

            if args.baseline:
                c_normal, c_mean, c_var = bn(c_n, 1.0, p.c_betas, pop_means_c,
                                             pop_vars_c, args)
            else:
                c_normal, c_mean, c_var = bn(c_n, p.c_gammas, p.c_betas,
                                             pop_means_c, pop_vars_c, args)

            h_n = dummy_h + o * self.activation(c_normal)

            ## Zoneout
            if args.zoneout:
                h = h_n * drops_state + (1 - drops_state) * h
                c = c_n_z
            else:
                h = h_n
                c = c_n

            return (h, c, atilde, btilde, c, a_mean, b_mean, c_mean, a_var,
                    b_var, c_var)

        xtilde = T.dot(x, p.Wx)
        if args.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], args.num_hidden),
                                  std=args.noise)
        elif args.summarize:
            # prime h with mean of example
            h_prime = x.mean(axis=[0, 2])[:, None]
        else:
            h_prime = 0

        dummy_states = dict(h=T.zeros(
            (xtilde.shape[0], xtilde.shape[1], args.num_hidden)),
                            c=T.zeros((xtilde.shape[0], xtilde.shape[1],
                                       args.num_hidden)))

        if popstats is None:
            popstats = OrderedDict()
            for key, size in zip(
                    "abc",
                [4 * args.num_hidden, 4 * args.num_hidden, args.num_hidden]):
                for stat, init in zip("mean var".split(), [0, 1]):
                    name = "%s_%s" % (key, stat)
                    popstats[name] = theano.shared(init + np.zeros(
                        (
                            length,
                            size,
                        ), dtype=theano.config.floatX),
                                                   name=name)
        popstats_seq = [
            popstats['a_mean'], popstats['b_mean'], popstats['c_mean'],
            popstats['a_var'], popstats['b_var'], popstats['c_var']
        ]

        [
            h, c, atilde, btilde, htilde, batch_mean_a, batch_mean_b,
            batch_mean_c, batch_var_a, batch_var_b, batch_var_c
        ], _ = theano.scan(
            stepfn,
            sequences=[
                xtilde, drops_cell, drops_state, dummy_states["h"],
                dummy_states["c"]
            ] + popstats_seq,
            outputs_info=[
                T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None,
                None, None, None, None, None, None, None
            ])

        batchstats = OrderedDict()
        batchstats['a_mean'] = batch_mean_a
        batchstats['b_mean'] = batch_mean_b
        batchstats['c_mean'] = batch_mean_c
        batchstats['a_var'] = batch_var_a
        batchstats['b_var'] = batch_var_b
        batchstats['c_var'] = batch_var_c

        updates = OrderedDict()
        if not args.use_population_statistics:
            alpha = 1e-2
            for key in "abc":
                for stat, init in zip("mean var".split(), [0, 1]):
                    name = "%s_%s" % (key, stat)
                    print name
                    popstats[name].tag.estimand = batchstats[name]
                    updates[popstats[name]] = (alpha * batchstats[name] +
                                               (1 - alpha) * popstats[name])
        return dict(h=h, c=c), updates, dummy_states, popstats
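Two of the tricks in the step function above, written out (a reader's summary of the code): zoneout keeps each unit's previous state wherever the drop mask is zero, and the population statistics used at evaluation time are an exponential moving average of the per-timestep batch statistics:

h_t = d_t \odot \tilde{h}_t + (1 - d_t) \odot h_{t-1} \qquad \text{(and likewise for } c_t\text{)}
\mathrm{popstat} \leftarrow \alpha\,\mathrm{batchstat} + (1 - \alpha)\,\mathrm{popstat}, \qquad \alpha = 10^{-2} \text{ above}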
Example #44
def test_grad_strided():
    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 9
    cols = 9
    channels = 3
    filter_rows = 3
    filter_cols = filter_rows
    num_filters = 16
    stride = 3

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs(stride=stride)(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01,
                           filters_bc01,
                           border_mode='valid',
                           subsample=(stride, stride))
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    checker = function([], [output, output_conv2d])
    output_numpy, output_conv2d_numpy = checker()
    if output_numpy.shape != output_conv2d_numpy.shape:
        raise AssertionError(
            "theano and cuda convnet follow different conventions for this input size, so we can't test cuda convnet by matching it against theano for these inputs"
        )

    # Proper random projection, like verify_grad does.
    theano_rng = MRG_RandomStreams(2013 * 5 * 4)
    cost_weights = theano_rng.normal(size=output_conv2d.shape,
                                     dtype=output_conv2d.dtype)
    cost = (cost_weights * output).sum()

    # XXX: use verify_grad
    images_grad, filters_grad = grad(cost, [images, filters])
    reference_cost = (cost_weights * output_conv2d).sum()
    images_conv2d_grad, filters_conv2d_grad = grad(reference_cost,
                                                   [images, filters])

    f = function(
        [],
        [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad])

    images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f()

    warnings.warn(
        """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    # XXX: Refactor
    if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5:
        print "=== IMAGES GRADIENT ==="
        assert type(images_grad) == type(images_conv2d_grad)
        assert images_grad.dtype == images_conv2d_grad.dtype
        if images_grad.shape != images_conv2d_grad.shape:
            print 'cuda-convnet shape: ', images_grad.shape
            print 'theano shape: ', images_conv2d_grad.shape
            assert False
        err = np.abs(images_grad - images_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (images_grad.min(),
                                             images_grad.max())
        print 'theano value range: ', (images_conv2d_grad.min(),
                                       images_conv2d_grad.max())
        assert False
    if np.abs(filters_grad - filters_conv2d_grad).max() > 1e-5:
        print "=== FILTERS GRADIENT ==="
        assert type(filters_grad) == type(filters_conv2d_grad)
        assert filters_grad.dtype == filters_conv2d_grad.dtype
        if filters_grad.shape != filters_conv2d_grad.shape:
            print 'cuda-convnet shape: ', filters_grad.shape
            print 'theano shape: ', filters_conv2d_grad.shape
            assert False
        err = np.abs(filters_grad - filters_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (filters_grad.min(),
                                             filters_grad.max())
        print 'theano value range: ', (filters_conv2d_grad.min(),
                                       filters_conv2d_grad.max())
        assert False
Example #45
    def __init__(self, N_tot, D_in, D_out, M, Domain_number, Ydim,
                 Hiddenlayerdim1, Hiddenlayerdim2, num_MC):
        ########################################
        # set type
        self.Xlabel = T.matrix('Xlabel')
        self.X = T.matrix('X')
        self.Y = T.matrix('Y')
        self.Weight = T.matrix('Weight')

        Ydim = self.Y.shape[1]
        N = self.X.shape[0]
        self.Ntot = N_tot
        #############################################
        #Set up the back-constrained (BC) X. Turn this into a layer later; here we generate num_MC samples.

        self.hiddenLayer_x = HiddenLayer(rng=rng,
                                         input=self.X,
                                         n_in=D_in,
                                         n_out=Hiddenlayerdim1,
                                         activation=T.nnet.relu,
                                         number='_x')
        self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                              input=self.hiddenLayer_x.output,
                                              n_in=Hiddenlayerdim1,
                                              n_out=Hiddenlayerdim2,
                                              activation=T.nnet.relu,
                                              number='_h')
        self.hiddenLayer_m = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=D_out,
                                         activation=T.nnet.relu,
                                         number='_m')
        self.hiddenLayer_S = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=D_out,
                                         activation=T.nnet.relu,
                                         number='_S')

        self.loc_params = []
        self.loc_params.extend(self.hiddenLayer_x.params)
        self.loc_params.extend(self.hiddenLayer_hidden.params)
        self.loc_params.extend(self.hiddenLayer_m.params)
        self.loc_params.extend(self.hiddenLayer_S.params)

        self.local_params = {}
        for i in self.loc_params:
            self.local_params[str(i)] = i

        #when we use the back constrained model....
        srng = RandomStreams(seed=234)
        sample_latent_epsilon = srng.normal((num_MC, N, D_out))
        latent_samples = sample_latent_epsilon * (
            T.exp(self.hiddenLayer_S.output)**
            0.5)[None, :, :] + self.hiddenLayer_m.output[None, :, :]

        #For the ordinary supervised case, just copy the input num_MC times.
        #self.Data_input=T.tile(self.X,(num_MC,1,1))
        self.Data_input = latent_samples
        ##########################################
        ####Inference for the X side
        #self.Gaussian_layer_X=KernelLayer(self.Data_input, D_in=D_out, D_out=D_in,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_X')

        self.Gaussian_layer_X = KernelLayer(self.Data_input,
                                            D_in=D_out,
                                            D_out=D_in,
                                            num_MC=num_MC,
                                            inducing_number=M,
                                            Domain_number=Domain_number,
                                            Domain_consideration=True,
                                            number='_X')

        self.params = self.Gaussian_layer_X.params
        self.Z_params_list = self.Gaussian_layer_X.Z_params_list
        self.global_param_list = self.Gaussian_layer_X.global_params_list
        self.hyp_list = self.Gaussian_layer_X.hyp_params_list

        self.hidden_layer = self.Gaussian_layer_X.output

        ##############################################################################################
        ###Computation on the Y side
        #self.Gaussian_layer_Y=KernelLayer(self.hidden_layer,D_in=D_out,D_out=Ydim,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_Y')

        #self.params.extend(self.Gaussian_layer_Y.params)
        #self.Z_params_list.extend(self.Gaussian_layer_Y.Z_params_list)
        #self.global_param_list.extend(self.Gaussian_layer_Y.global_params_list)
        #self.hyp_list.extend(self.Gaussian_layer_Y.hyp_params_list)

        ###########################################
        ###Objective function

        #self.LL = self.Gaussian_layer_X.liklihood_nodomain(self.X)*N_tot/(N)
        self.LL = self.Gaussian_layer_X.likelihood_domain(
            self.X, self.Xlabel) * N_tot / (N)
        self.KL_U = self.Gaussian_layer_X.KL_U
        #self.KL_UY=self.Gaussian_layer_Y.KL_U
        #y=self.Gaussian_layer_Y.softmax_class()
        #self.LLY = -T.mean(T.nnet.categorical_crossentropy(y, self.Y))*N
        #self.LLY=T.sum(T.log(T.maximum(T.sum(self.Y * y, 1), 1e-16)))
        #self.error = self.Gaussian_layer_Y.error_classification(self.Y)

        self.KL_latent_dim = self.KLD_X(
            self.hiddenLayer_m.output, T.exp(
                self.hiddenLayer_S.output)) * N_tot / (N)

        #pred = T.mean(self.Gaussian_layer_X.output,0)
        #self.error = (T.mean((self.Y - pred)**2,0))**0.5

        ###########################################
        #domain checker: MMD and class classification
        #self.MMD=self.Gaussian_layer_Y.MMD_class_penalty(self.Y,self.Xlabel)

        ##########################################
        #Store the parameters
        self.hyp_params = {}
        for i in self.hyp_list:
            self.hyp_params[str(i)] = i

        self.Z_params = {}
        for i in self.Z_params_list:
            self.Z_params[str(i)] = i

        self.global_params = {}
        for i in self.global_param_list:
            self.global_params[str(i)] = i

        self.params.extend(self.loc_params)

        self.wrt = {}
        for i in self.params:
            self.wrt[str(i)] = i
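
Example #45 draws its latent inputs with the reparameterization trick: a standard-normal tensor of shape (num_MC, N, D_out) is scaled by the encoder's predicted standard deviation exp(S)**0.5 and shifted by the predicted mean. Below is a minimal sketch of that sampling step in isolation, assuming a working Theano install, small fixed shapes, and zero-valued encoder outputs purely for illustration.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

num_MC, N, D_out = 5, 3, 2
srng = RandomStreams(seed=234)

m = T.matrix('m')          # encoder mean, shape (N, D_out)
log_var = T.matrix('lv')   # encoder log-variance, shape (N, D_out)

# reparameterization: eps ~ N(0, 1), sample = eps * std + mean
eps = srng.normal((num_MC, N, D_out))
samples = eps * (T.exp(log_var) ** 0.5)[None, :, :] + m[None, :, :]

f = theano.function([m, log_var], samples)
out = f(np.zeros((N, D_out), dtype=theano.config.floatX),
        np.zeros((N, D_out), dtype=theano.config.floatX))
print(out.shape)  # (5, 3, 2)
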
예제 #46
0
class SLmodel():

    #This is a test of my idea to adapt the proposal distribution by
    #maximizing the entropy of the weights

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):

        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')

        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        #parameters for proposal distribution
        init_D = np.asarray(np.eye(ns), dtype='float32')
        init_E = np.asarray(np.random.randn(nx, ns) / 100.0, dtype='float32')
        init_k = np.asarray(np.zeros(ns), dtype='float32')
        init_sig = np.asarray(np.ones(ns), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)

        self.D = theano.shared(init_D)
        self.E = theano.shared(init_E)
        self.k = theano.shared(init_k)
        self.sig = theano.shared(init_sig)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        self.rel_lrates = np.asarray([0.1, 1.0, 0.01, 10.0, 0.1, 1.0],
                                     dtype='float32')

        self.meta_params = [self.D, self.E, self.k, self.sig]
        self.meta_rel_lrates = [1.0, 1.0, 1.0, 1.0]

    def sample_proposal_s(self, s, h, xp):

        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        prop_mean = T.dot(s_pred, self.D) + T.reshape(T.dot(xp, self.E),
                                                      (1, self.ns)) + self.k

        s_prop = prop_mean + n * T.reshape(T.exp(self.sig / 2.0), (1, self.ns))

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):

        #this function takes an np by ns matrix of s samples
        #and returns an np by nh matrix of h probabilities

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))

        #re-centering for numerical stability (the shift cancels after normalization)
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def proposal_loss(self, s_pred, s_samps, xp, weights):

        #estimates the KL divergence between the proposal distribution
        #and the true posterior (minus one term, which we assume does not
        #depend on the proposal distribution).

        #prop means should be symbolic variables since we need to
        #compute the derivatives of D and E through this function

        prop_means = T.dot(s_pred, self.D) + T.reshape(T.dot(
            xp, self.E), (1, self.ns)) + self.k  #np by ns

        diffs = (prop_means - s_samps)
        scl_diffs = diffs * T.reshape(T.exp(-self.sig), (1, self.ns))
        energies = 0.5 * T.sum(diffs * scl_diffs, axis=1)
        tot = T.sum(energies * weights) + 0.5 * T.sum(self.sig)
        return tot

    def forward_filter_step(self, xp):

        #need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 *
                                                                  self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights

        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        #gradient updates for the proposal distribution parameters
        lrate = 1e-2

        loss = self.proposal_loss(s_pred, s_samps, xp, new_weights)

        gparams = T.grad(loss,
                         self.meta_params,
                         consider_constant=[s_pred, s_samps, xp, new_weights])
        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.meta_params,
                                         self.meta_rel_lrates):
            updates[param] = T.cast(param - gparam * lrate * rel_lr, 'float32')

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, h):

        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns
        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):

        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):

    #sp, updates = theano.scan(fn=self.get_prediction,
    #outputs_info=[None],
    #sequences=[self.s_past, self.h_past],
    #n_steps=self.npcl)

    ##sp should be np by ns

    #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
    #outputs_info=[None, None, None, None],
    #non_sequences=[sp],
    #n_steps=n_samps)

    #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):

        #helper function for self.calc_mean_h_energy

        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
                         (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        ph_i = T.sum(self.ph * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -0.5 * T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i + ph_i

        return energy

    def calc_mean_h_energy(self, s, h):

        #you give this function a set of samples of s and h,
        #it gives you the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  #np by nh

        energies = T.sum(h * exp_terms, axis=1) + T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):

        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred = self.get_prediction(s1_samps, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm2 + sterm

        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):

        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):

        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):

        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):

        s = T.reshape(s, (1, self.ns))
        #get h probabilities
        h_probs = self.calc_h_probs(s)

        #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):

        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
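
The reweighting inside forward_filter_step avoids overflow by subtracting the maximum log-weight ("energy") before exponentiating; the shift cancels once the weights are renormalized. A small numpy sketch of that step, with made-up energies and uniform previous weights:

import numpy as np

def reweight(prev_weights, energies):
    # Importance reweighting with re-centering for numerical stability;
    # the subtracted maximum cancels when the weights are renormalized.
    energies_recentered = energies - energies.max()
    alpha = np.exp(energies_recentered)      # reweighting factors
    new_weights = prev_weights * alpha
    return new_weights / new_weights.sum()

prev = np.ones(4) / 4.0
energies = np.array([1000.0, 999.0, 998.0, 990.0])  # would overflow without the shift
print(reweight(prev, energies))
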
예제 #47
0
class RNNCluster(rnn.RNNBase):
    """RNNCluster combines sampling-based RNN with item clustering.

	Parameters
	----------
	n_clusters: int
		Number of clusters

	loss: "Blackout", "CCE", "BPR", "BPRelu" or "TOP1"
		Determines the loss function, among:
			- BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016
			- TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016
			- Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6)
			- BPRelu, approximation of BPR based on relu/hinge non-linearities
			- CCE, categorical cross-entropy computed on the set of samples

	cluster_type: "mix", "softmax" or "sigmoid"
		Determines whether items can belong to multiple clusters.
			- mix, items belong to at least one cluster, possibly many.
			- softmax, items belong to one and only one cluster.
			- sigmoid, items belong to zero, one or multiple clusters.

	sampling: int
		Number of samples.

	cluster_sampling: int
		If cluster_sampling > 0, the recommendation loss and the clustering loss use different samples.
		In that case, cluster_sampling is the number of samples used by the clustering loss.

	sampling_bias: float
		Items are sampled with a probability proportional to their frequency to the power of the sampling_bias.

	predict_with_clusters: bool
		Set to false during testing if you want to ignore the clustering.

	cluster_selection_noise: float
		If cluster_selection_noise > 0, a random gaussian noise (whose std is cluster_selection_noise) is added to the cluster selection output during training.
		Can help to explore a large number of clusters.
	
	init_scale: float
		Initial scale of the softmax and sigmoid functions used in the cluster selection process.

	scale_growing_rate: float
		After each training epoch, the scale of the softmax and sigmoid functions is multiplied by the scale_growing_rate.

	max_scale: float
		Maximum allowed scale.

	See classes SequenceNoise, RecurrentLayers, SelectTargets and update manager for options common to the other RNN methods.
	"""
    def __init__(self,
                 n_clusters=10,
                 loss="Blackout",
                 cluster_type='mix',
                 sampling=100,
                 cluster_sampling=-1,
                 sampling_bias=0.,
                 predict_with_clusters=True,
                 cluster_selection_noise=0.,
                 init_scale=1.,
                 scale_growing_rate=1.,
                 max_scale=50,
                 **kwargs):
        super(RNNCluster, self).__init__(**kwargs)

        self.n_clusters = n_clusters
        self.init_scale = np.cast[theano.config.floatX](init_scale)
        self.effective_scale = np.cast[theano.config.floatX](init_scale)
        self.scale_growing_rate = np.cast[theano.config.floatX](
            scale_growing_rate)
        self.max_scale = np.cast[theano.config.floatX](max_scale)
        self.cluster_type = cluster_type
        self.sampling_bias = sampling_bias
        self.loss = loss
        self.cluster_selection_noise = cluster_selection_noise

        self.predict_with_clusters = predict_with_clusters

        if self.loss == "Blackout":
            self._loss = self._blackout_loss
        elif self.loss == 'lin':
            self._loss = self._lin_loss
        elif self.loss == 'BPRelu':
            self._loss = self._BPRelu_loss
        elif self.loss == 'BPR':
            self._loss = self._BPR_loss
        elif self.loss == 'TOP1':
            self._loss = self._TOP1_loss
        elif self.loss == 'CCE':
            self._loss = self._cce_loss
        else:
            raise ValueError('Unknown cluster loss')

        self.n_samples = int(sampling)
        self.n_cluster_samples = int(cluster_sampling)

        self._srng = MRG_RandomStreams(lasagne.random.get_rng().randint(
            1, 2147462579))

        self.name = "RNN Cluster with categorical cross entropy"

        self.metrics = {
            'recall': {
                'direction': 1
            },
            'cluster_recall': {
                'direction': 1
            },
            'sps': {
                'direction': 1
            },
            'cluster_sps': {
                'direction': 1
            },
            'ignored_items': {
                'direction': -1
            },
            'assr': {
                'direction': 1
            },
            'cluster_use': {
                'direction': 1
            },
            'cluster_use_std': {
                'direction': -1
            },
            'cluster_size': {
                'direction': 1
            }
        }

    def _get_model_filename(self, epochs):
        '''Return the name of the file to save the current model
		'''
        filename = "rnn_clusters" + str(self.n_clusters) + "_sc" + str(
            self.init_scale)

        if self.scale_growing_rate != 1.:
            filename += "-" + str(self.scale_growing_rate) + "-" + str(
                self.max_scale)

        filename += "_"
        if self.sampling_bias > 0.:
            filename += "p" + str(self.sampling_bias)
        filename += "s" + str(self.n_samples)

        if self.n_cluster_samples > 0:
            filename += "_"
            if self.sampling_bias > 0.:
                filename += "p" + str(self.sampling_bias)
            filename += "cs" + str(self.n_cluster_samples)

        if self.cluster_type == 'softmax':
            filename += "_softmax"
        elif self.cluster_type == 'mix':
            filename += "_mix"

        if self.cluster_selection_noise > 0.:
            filename += '_n' + str(self.cluster_selection_noise)

        filename += "_c" + self.loss

        return filename + "_" + self._common_filename(epochs)

    def _blackout_loss(self, predictions, n_targets):
        targets = np.arange(n_targets)
        predictions = T.nnet.softmax(predictions)
        pos = T.nnet.categorical_crossentropy(predictions, targets)
        neg = T.log(1 - predictions)
        return pos - neg[:, targets.shape[0]:].sum(axis=-1)

    def _cce_loss(self, predictions, n_targets):
        targets = np.arange(n_targets)
        predictions = T.nnet.softmax(predictions)
        pos = T.nnet.categorical_crossentropy(predictions, targets)
        return pos

    def _lin_loss(self, predictions, n_targets):
        neg = predictions[:, n_targets:].sum(axis=-1)
        pos = T.diag(predictions)
        return neg - pos

    def _BPR_loss(self, predictions, n_targets):
        diff = (predictions -
                T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:]
        return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1)

    def _BPRelu_loss(self, predictions, n_targets):
        diff = (predictions -
                T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:]
        return lasagne.nonlinearities.leaky_rectify(diff + 0.5).mean(axis=-1)

    def _TOP1_loss(self, predictions, n_targets):
        diff = (predictions -
                T.diag(predictions).dimshuffle([0, 'x']))[:, n_targets:]
        reg = T.sqr(predictions[:, n_targets:])
        return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1)

    def _create_ini_clusters(self):
        c = 0.1 * np.random.randn(self.n_items, self.n_clusters)
        # c = -2 * np.random.random((self.n_items, self.n_clusters)) - 1
        # for i, j in enumerate(np.random.choice(self.n_clusters, self.n_items)):
        # 	c[i,j] *= -1

        # print(np.round(c[:5, :], 2))
        return c.astype(theano.config.floatX)

    def _prepare_networks(self, n_items):
        ''' Prepares the building blocks of the RNN, but does not compile them:
		self.l_in : input layer
		self.l_mask : mask of the input layer
		self.target : target of the network
		self.l_out : output of the network
		self.cost : cost function
		'''

        self.n_items = n_items
        # The input is composed of two parts: the one-hot encoding of the movie, and the features of the movie
        self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size,
                                                     self.max_length,
                                                     self._input_size()))
        # The input is completed by a mask to inform the LSTM of the length of the sequence
        self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size,
                                                       self.max_length))

        # recurrent layer
        if not self.use_movies_features:
            l_recurrent = self.recurrent_layer(self.l_in,
                                               self.l_mask,
                                               true_input_size=self.n_items +
                                               self._n_optional_features(),
                                               only_return_final=True)
        else:
            l_recurrent = self.recurrent_layer(self.l_in,
                                               self.l_mask,
                                               true_input_size=None,
                                               only_return_final=True)

        # Theano tensor for the targets
        self.target = T.ivector('target_output')
        self.exclude = T.fmatrix('excluded_items')
        self.samples = T.ivector('samples')
        self.cluster_samples = T.ivector('cluster_samples')

        self.user_representation_layer = l_recurrent

        # The sliced output is then passed through linear layer to obtain the right output size
        self.l_out = BlackoutLayer(l_recurrent,
                                   num_units=self.n_items,
                                   num_outputs=self.n_samples,
                                   nonlinearity=None,
                                   W=lasagne.init.GlorotUniform())

        # lasagne.layers.get_output produces a variable for the output of the net
        network_output = lasagne.layers.get_output(self.l_out,
                                                   targets=self.target,
                                                   samples=self.samples)

        # loss function
        self.cost = self._loss(network_output, self.batch_size).mean()

        # Cluster learning
        self.T_scale = theano.shared(self.effective_scale)
        scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x * self.T_scale)

        self.cluster_selection_layer = lasagne.layers.DenseLayer(
            l_recurrent, b=None, num_units=self.n_clusters, nonlinearity=None)
        cluster_selection = lasagne.layers.get_output(
            self.cluster_selection_layer)
        if self.cluster_selection_noise > 0.:
            cluster_selection = cluster_selection + self._srng.normal(
                cluster_selection.shape,
                avg=0.0,
                std=self.cluster_selection_noise)
        cluster_selection = scaled_softmax(cluster_selection)

        self.cluster_repartition = theano.shared(self._create_ini_clusters())
        if self.cluster_type == 'softmax':
            target_and_samples_clusters = scaled_softmax(
                self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :])
        elif self.cluster_type == 'mix':
            target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \
             T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :])
        else:
            target_and_samples_clusters = T.nnet.sigmoid(
                self.T_scale * self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :])
        cluster_score = cluster_selection.dot(target_and_samples_clusters.T)

        self.cost_clusters = self._loss(cluster_score, self.batch_size).mean()

    def _compile_train_function(self):
        ''' Compile self.train. 
		self.train receives a sequence and a target for every step of the sequence,
		computes the error at every step, updates the parameters, and returns the global cost (i.e. the error).
		'''
        print("Compiling train...")
        # Compute AdaGrad updates for training
        all_params = lasagne.layers.get_all_params(self.l_out, trainable=True)
        updates = self.updater(self.cost, all_params)

        params_clusters = self.cluster_selection_layer.get_params(
            trainable=True)
        params_clusters.append(self.cluster_repartition)
        updates.update(self.updater(self.cost_clusters, params_clusters))
        # Compile network
        self.train_function = theano.function([
            self.l_in.input_var, self.l_mask.input_var, self.target,
            self.samples, self.cluster_samples, self.exclude
        ],
                                              self.cost,
                                              updates=updates,
                                              allow_input_downcast=True,
                                              name="Train_function",
                                              on_unused_input='ignore')
        print("Compilation done.")

    def _get_hard_clusters(self):
        if self.cluster_type == 'softmax':
            return lasagne.nonlinearities.softmax(100. *
                                                  self.cluster_repartition)
        elif self.cluster_type == 'mix':
            # Clipping is used to prevent the sum of sigmoid and softmax from producing a cluster indicator of 2
            return (lasagne.nonlinearities.softmax(
                100. * self.cluster_repartition) +
                    T.nnet.sigmoid(100. * self.cluster_repartition)).clip(
                        0, 1)
        else:
            return T.nnet.sigmoid(100. * self.cluster_repartition)

    def _compile_predict_function(self):
        ''' Compile self.predict, the deterministic rnn that outputs the prediction at the end of the sequence
		'''
        print("Compiling predict...")
        if self.predict_with_clusters:
            cluster_selection = lasagne.layers.get_output(
                self.cluster_selection_layer,
                deterministic=True)[0, :].argmax()
            user_representation = lasagne.layers.get_output(
                self.user_representation_layer, deterministic=True)
            theano_predict_function = theano.function(
                [self.l_in.input_var, self.l_mask.input_var],
                [user_representation, cluster_selection],
                allow_input_downcast=True,
                name="Predict_function",
                on_unused_input='ignore')

            def cluster_predict_function(sequence, mask, k, exclude):
                u, c = theano_predict_function(sequence, mask)
                scores = u[0].dot(
                    self.clusters_embeddings[c]) + self.clusters_bias[c]

                cluster_index_exclude = []
                for i in exclude:
                    if i in self.clusters_reverse_index[c]:
                        cluster_index_exclude.append(
                            self.clusters_reverse_index[c][i])
                scores[cluster_index_exclude] = -np.inf

                # find top k according to output
                effective_k = min(k, len(self.clusters[c]))
                return list(self.clusters[c][np.argpartition(
                    -scores,
                    range(effective_k))[:effective_k]]), len(self.clusters[c])

            self.predict_function = cluster_predict_function
        else:
            items_score = lasagne.nonlinearities.softmax(
                lasagne.layers.get_output(self.l_out, deterministic=True))

            user_representation = lasagne.layers.get_output(
                self.user_representation_layer, deterministic=True)
            theano_predict_function = theano.function(
                [self.l_in.input_var, self.l_mask.input_var],
                user_representation,
                allow_input_downcast=True,
                name="Predict_function",
                on_unused_input='ignore')

            def no_cluster_predict_function(sequence, mask, k, exclude):
                u = theano_predict_function(sequence, mask)
                scores = u[0].dot(self.l_out.W.get_value(
                    borrow=True)) + self.l_out.b.get_value(borrow=True)

                scores[exclude] = -np.inf

                # find top k according to output
                return list(np.argpartition(-scores,
                                            range(k))[:k]), self.n_items

            self.predict_function = no_cluster_predict_function

        print("Compilation done.")

    def _compile_test_function(self):
        ''' Compile self.test_function, the deterministic rnn that outputs the precision@10
		'''
        print("Compiling test...")

        items_score1 = lasagne.nonlinearities.softmax(
            lasagne.layers.get_output(self.l_out, deterministic=True))

        cluster_selection = lasagne.layers.get_output(
            self.cluster_selection_layer, deterministic=True)[0, :].argmax()
        items_clusters = self._get_hard_clusters()
        used_items = items_clusters[:, cluster_selection]
        items_score2 = items_score1 * used_items

        if self.interactions_are_unique:
            items_score1 *= (1 - self.exclude)
            items_score2 *= (1 - self.exclude)

        theano_test_function = theano.function([
            self.l_in.input_var, self.l_mask.input_var, self.target,
            self.samples, self.cluster_samples, self.exclude
        ], [items_score1, items_score2, cluster_selection,
            used_items.sum()],
                                               allow_input_downcast=True,
                                               name="Test_function",
                                               on_unused_input='ignore')

        def precision_test_function(theano_inputs):
            k = 10
            scores1, scores2, c_select, n_used_items = theano_test_function(
                *theano_inputs)
            ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k]
            ids2 = np.argpartition(-scores2, range(k), axis=-1)[0, :k]

            return ids1, ids2, c_select, n_used_items

        self.test_function = precision_test_function

        print("Compilation done.")

    def _popularity_sample(self):
        if not hasattr(self, '_cumsum'):
            self._cumsum = np.cumsum(
                np.power(self.dataset.item_popularity, self.sampling_bias))

        return bisect(self._cumsum, random.uniform(0, self._cumsum[-1]))

    def _prepare_input(self, sequences):
        ''' Sequences is a list of [user_id, input_sequence, targets]
		'''

        batch_size = len(sequences)

        # Shape return variables
        X = np.zeros((batch_size, self.max_length, self._input_size()),
                     dtype=self._input_type)  # input of the RNN
        mask = np.zeros(
            (batch_size, self.max_length)
        )  # mask of the input (to deal with sequences of different length)
        Y = np.zeros((batch_size, ), dtype='int32')  # output target
        exclude = np.zeros((batch_size, self.n_items),
                           dtype=theano.config.floatX)

        for i, sequence in enumerate(sequences):
            user_id, in_seq, target = sequence
            seq_features = np.array(
                map(lambda x: self._get_features(x, user_id), in_seq))
            X[i, :len(in_seq), :] = seq_features  # Copy sequences into X
            mask[i, :len(in_seq)] = 1
            Y[i] = target[0][0]  # id of the first and only target
            exclude[i, [j[0] for j in in_seq]] = 1

        if self.sampling_bias > 0.:
            samples = np.array(
                [self._popularity_sample() for i in range(self.n_samples)],
                dtype='int32')
            if self.n_cluster_samples > 0:
                cluster_samples = np.array([
                    self._popularity_sample()
                    for i in range(self.n_cluster_samples)
                ],
                                           dtype='int32')
            else:
                cluster_samples = samples
        else:
            samples = np.random.choice(self.n_items,
                                       self.n_samples).astype('int32')
            if self.n_cluster_samples > 0:
                cluster_samples = np.random.choice(
                    self.n_items, self.n_cluster_samples).astype('int32')
            else:
                cluster_samples = samples

        # scale
        if not hasattr(self, '_last_epoch'):
            self._last_epoch = self.dataset.training_set.epochs
        else:
            if self.dataset.training_set.epochs > self._last_epoch + 1 and self.scale_growing_rate != 1.:
                self.effective_scale *= self.scale_growing_rate**int(
                    self.dataset.training_set.epochs - self._last_epoch)
                self._last_epoch += int(self.dataset.training_set.epochs -
                                        self._last_epoch)
                print("New scale: ", self.effective_scale)
                self.T_scale.set_value(self.effective_scale)

        return (X, mask.astype(theano.config.floatX), Y, samples,
                cluster_samples, exclude)

    def _compute_validation_metrics(self, metrics):
        clusters = np.zeros(self.n_clusters, dtype="int")
        used_items = []
        ev = evaluation.Evaluator(self.dataset, k=10)
        ev_clusters = evaluation.Evaluator(self.dataset, k=10)
        for batch, goal in self._gen_mini_batch(
                self.dataset.validation_set(epochs=1), test=True):
            pred1, pred2, cl, i = self.test_function(batch)
            ev.add_instance(goal, pred1)
            ev_clusters.add_instance(goal, pred2)
            clusters[cl] += 1
            used_items.append(i)

        if self.cluster_type == 'softmax':
            ignored_items = 0
            cluster_size = np.histogram(
                self.cluster_repartition.get_value(borrow=True).argmax(axis=1),
                bins=range(self.n_clusters + 1))[0].tolist()
        elif self.cluster_type == 'mix':
            ignored_items = 0
            sig_clusters = self.cluster_repartition.get_value(borrow=True) > 0.
            softmax_clusters = self.cluster_repartition.get_value(
                borrow=True).argmax(axis=1)
            for i in range(self.n_items):
                sig_clusters[i, softmax_clusters[i]] = True
            cluster_size = sig_clusters.sum(axis=0)
        else:
            ignored_items = (self.cluster_repartition.get_value(
                borrow=True).max(axis=1) < 0.).sum()
            cluster_size = (self.cluster_repartition.get_value(borrow=True) >
                            0.).sum(axis=0)

        metrics['recall'].append(ev.average_recall())
        metrics['cluster_recall'].append(ev_clusters.average_recall())
        metrics['sps'].append(ev.sps())
        metrics['cluster_sps'].append(ev_clusters.sps())
        metrics['assr'].append(self.n_items / np.mean(used_items))
        metrics['ignored_items'].append(ignored_items)
        metrics['cluster_use'].append(clusters)
        metrics['cluster_use_std'].append(np.std(clusters))
        metrics['cluster_size'].append(cluster_size)

        return metrics

    def _print_progress(self, iterations, epochs, start_time, train_costs,
                        metrics, validation_metrics):
        '''Print learning progress in terminal
		'''
        print(self.name, iterations, "batches, ", epochs, " epochs in",
              time() - start_time, "s")
        print("Last train cost : ", train_costs[-1])
        for m in self.metrics.keys():
            print(m, ': ', metrics[m][-1])
            if m in validation_metrics:
                print(
                    'Best ', m, ': ',
                    max(np.array(metrics[m]) * self.metrics[m]['direction']) *
                    self.metrics[m]['direction'])
        print('-----------------')

        # Print on stderr for easier recording of progress
        print(iterations,
              epochs,
              time() - start_time,
              train_costs[-1],
              metrics['sps'][-1],
              metrics['cluster_sps'][-1],
              metrics['recall'][-1],
              metrics['cluster_recall'][-1],
              metrics['assr'][-1],
              metrics['ignored_items'][-1],
              metrics['cluster_use_std'][-1],
              file=sys.stderr)

    def prepare_tests(self):
        '''Take the soft clustering and make actual clusters.
		'''
        cluster_membership = self.cluster_repartition.get_value(borrow=True)
        item_embeddings = self.l_out.W.get_value(borrow=True)
        item_bias = self.l_out.b.get_value(borrow=True)
        self.clusters = [[] for i in range(self.n_clusters)]
        for i in range(cluster_membership.shape[0]):
            no_cluster = True
            best_cluster = 0
            best_val = cluster_membership[i, 0]
            for j in range(self.n_clusters):
                if cluster_membership[i, j] > 0:
                    self.clusters[j].append(i)
                    no_cluster = False
                elif cluster_membership[i, j] > best_val:
                    best_val = cluster_membership[i, j]
                    best_cluster = j
            if no_cluster:
                self.clusters[best_cluster].append(i)

        self.clusters = [np.array(c) for c in self.clusters]
        self.clusters_reverse_index = []
        for c in self.clusters:
            self.clusters_reverse_index.append(
                {c[j]: j
                 for j in range(len(c))})
        self.clusters_embeddings = [
            item_embeddings[:, c] for c in self.clusters
        ]
        self.clusters_bias = [item_bias[c] for c in self.clusters]

    def top_k_recommendations(self,
                              sequence,
                              user_id=None,
                              k=10,
                              exclude=None):
        ''' Receives a sequence of (id, rating), and produces k recommendations (as a list of ids)
		'''

        if exclude is None:
            exclude = []

        # Compile network if needed
        if not hasattr(self, 'predict_function'):
            self._compile_predict_function()

        # Prepare RNN input
        max_length_seq = sequence[-min(self.max_length, len(sequence)):]
        X = np.zeros((1, self.max_length, self._input_size()),
                     dtype=self._input_type)  # input of the RNN
        X[0, :len(max_length_seq), :] = np.array(
            map(lambda x: self._get_features(x, user_id), max_length_seq))
        mask = np.zeros(
            (1, self.max_length)
        )  # mask of the input (to deal with sequences of different length)
        mask[0, :len(max_length_seq)] = 1

        # Run RNN
        if self.interactions_are_unique:
            should_exclude = [i[0] for i in sequence]
        else:
            should_exclude = []
        should_exclude.extend(exclude)
        return self.predict_function(X, mask.astype(theano.config.floatX), k,
                                     should_exclude)

    def save(self, filename):
        '''Save the parameters of a network into a file
		'''
        print('Save model in ' + filename)
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        param = lasagne.layers.get_all_param_values(self.l_out)
        param.append(self.cluster_repartition.get_value(borrow=True))
        param.append([
            p.get_value(borrow=True)
            for p in self.cluster_selection_layer.get_params()
        ])
        f = file(filename, 'wb')
        cPickle.dump(param, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    def load(self, filename):
        '''Load parameter values from a file
		'''
        f = file(filename, 'rb')
        param = cPickle.load(f)
        f.close()
        lasagne.layers.set_all_param_values(
            self.l_out, [i.astype(theano.config.floatX) for i in param[:-2]])
        self.cluster_repartition.set_value(param[-2])
        for p, v in zip(self.cluster_selection_layer.get_params(), param[-1]):
            p.set_value(v)

        self.prepare_tests()
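
Among the losses defined above, _BPR_loss assumes a score matrix with one row per example in the batch, whose diagonal (within the first n_targets columns) holds each example's own target score while the remaining columns hold sampled negatives; the loss is then the mean of -log sigmoid(target - negative). A numpy sketch of the same computation on a toy score matrix (the function name and numbers are illustrative):

import numpy as np

def bpr_loss(predictions, n_targets):
    # predictions: (n_targets, n_targets + n_samples); the diagonal holds the
    # target scores, the columns after n_targets hold the sampled negatives.
    target_scores = np.diag(predictions)[:, None]
    neg_scores = predictions[:, n_targets:]
    diff = neg_scores - target_scores
    return -np.log(1.0 / (1.0 + np.exp(diff))).mean(axis=-1)  # -log sigmoid(target - neg)

scores = np.array([[2.0, 0.1, -1.0, 0.5],
                   [0.3, 1.5,  0.2, -0.4]])  # 2 targets, 2 negative samples each
print(bpr_loss(scores, n_targets=2))
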
예제 #48
0
class Conv2DVarDropOutARD(ConvLayer):
    def __init__(self,
                 incoming,
                 num_filters,
                 filter_size,
                 stride=(1, 1),
                 pad=0,
                 untie_biases=False,
                 Wconv=GlorotUniform(),
                 b=Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 flip_filters=False,
                 convolution=T.nnet.conv2d,
                 ard_init=-10,
                 **kwargs):
        super(Conv2DVarDropOutARD,
              self).__init__(incoming, num_filters, filter_size, stride, pad,
                             untie_biases, Wconv, b, nonlinearity,
                             flip_filters)
        self.convolution = convolution
        self.reg = True
        self.shape = self.get_W_shape()
        self.log_sigma2 = self.add_param(Constant(ard_init),
                                         self.shape,
                                         name="ls2")
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))

    @staticmethod
    def clip(mtx, to=8):
        mtx = T.switch(T.le(mtx, -to), -to, mtx)
        mtx = T.switch(T.ge(mtx, to), to, mtx)
        return mtx

    def convolve(self,
                 input,
                 deterministic=False,
                 train_clip=False,
                 thresh=3,
                 **kwargs):
        log_alpha = self.clip(self.log_sigma2 - T.log(self.W**2 + 1e-8))
        conv_mode = 'conv' if self.flip_filters else 'cross'
        border_mode = self.pad
        clip_mask = T.ge(log_alpha, thresh)
        if border_mode == 'same':
            border_mode = tuple(s // 2 for s in self.filter_size)

        if deterministic:
            conved = dnn.dnn_conv(img=input,
                                  kerns=T.switch(T.ge(log_alpha, thresh), 0,
                                                 self.W),
                                  subsample=self.stride,
                                  border_mode=border_mode,
                                  conv_mode=conv_mode)
        else:
            W = self.W
            if train_clip:
                W = T.switch(clip_mask, 0, W)
            conved_mu = dnn.dnn_conv(img=input,
                                     kerns=W,
                                     subsample=self.stride,
                                     border_mode=border_mode,
                                     conv_mode=conv_mode)
            conved_si = T.sqrt(1e-8 +
                               dnn.dnn_conv(img=input * input,
                                            kerns=T.exp(log_alpha) * W * W,
                                            subsample=self.stride,
                                            border_mode=border_mode,
                                            conv_mode=conv_mode))
            conved = conved_mu + conved_si * self._srng.normal(
                conved_mu.shape, avg=0, std=1)
        return conved

    def eval_reg(self, **kwargs):
        k1, k2, k3 = 0.63576, 1.8732, 1.48695
        C = -k1
        log_alpha = self.clip(self.log_sigma2 - T.log(self.W**2))
        mdkl = k1 * T.nnet.sigmoid(k2 + k3 * log_alpha) - 0.5 * T.log1p(
            T.exp(-log_alpha)) + C
        return -T.sum(mdkl)

    def get_ard(self, thresh=3, **kwargs):
        log_alpha = self.log_sigma2.get_value() - 2 * np.log(
            np.abs(self.W.get_value()))
        return '%.4f' % (np.sum(log_alpha > thresh) * 1.0 / log_alpha.size)

    def get_reg(self):
        log_alpha = self.log_sigma2.get_value() - 2 * np.log(
            np.abs(self.W.get_value()))
        return '%.1f, %.1f' % (log_alpha.min(), log_alpha.max())
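
eval_reg in Conv2DVarDropOutARD is the closed-form approximation to the negative KL term used in sparse variational dropout (Molchanov et al., 2017), evaluated element-wise on log alpha = log sigma^2 - log W^2. A small numpy sketch of that expression, useful for checking how the regularizer behaves as log alpha grows (the helper name is illustrative):

import numpy as np

def neg_kl_approx(log_alpha):
    # Element-wise approximation of -KL(q||p) as a function of log alpha,
    # matching the k1/k2/k3 expression used in eval_reg above.
    k1, k2, k3 = 0.63576, 1.8732, 1.48695
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    return k1 * sigmoid(k2 + k3 * log_alpha) - 0.5 * np.log1p(np.exp(-log_alpha)) - k1

for la in (-10.0, 0.0, 3.0, 8.0):
    print(la, neg_kl_approx(la))
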
예제 #49
0
class NeuralNetSvi:
    """Implements a feedforward neural network trained using stochastic variational inference.
    Supports various types of layers and loss functions."""
    def __init__(self, n_inputs):
        """Constructs a net with a given number of inputs and no layers."""

        assert isposint(
            n_inputs), 'Number of inputs must be a positive integer.'

        self.n_inputs = n_inputs
        self.n_outputs = n_inputs
        self.n_units = [n_inputs]
        self.n_layers = 0
        self.n_params = 0

        self.mWs = []
        self.mbs = []
        self.sWs = []
        self.sbs = []
        self.uas = []
        self.mas = []
        self.zas = []
        self.hs = [tt.matrix('x')]

        self.mps = self.mWs + self.mbs
        self.sps = self.sWs + self.sbs
        self.parms = self.mps + self.sps
        self.input = self.hs[0]
        self.output = self.hs[-1]

        self.srng = RandomStreams()

        self.eval_f = None
        self.eval_f_rand = None

    def addLayer(self, n_units, type):
        """Adds a new layer to the network,
        :param n_units: number of units in the layer
        :param type: a string specification of the activation function
        """

        # check number of units
        assert isposint(n_units), 'Number of units must be a positive integer.'

        # choose activation function
        if type == 'logistic':
            if dtype == 'float32':
                clipvalue = 15.0
            else:
                clipvalue = 19.0
            actfun = lambda t: tt.nnet.sigmoid(
                tt.clip(t, -clipvalue, clipvalue))
        elif type == 'tanh':
            if dtype == 'float32':
                clipvalue = 9.0
            else:
                clipvalue = 19.0
            actfun = lambda t: tt.tanh(tt.clip(t, -clipvalue, clipvalue))
        elif type == 'linear':
            actfun = lambda t: t
        elif type == 'relu':
            actfun = tt.nnet.relu
        elif type == 'softplus':
            actfun = tt.nnet.softplus
        elif type == 'softmax':
            actfun = tt.nnet.softmax
        else:
            raise ValueError(type +
                             ' is not a supported activation function type.')

        n_prev_units = self.n_outputs
        self.n_outputs = n_units
        self.n_units.append(n_units)
        self.n_layers += 1
        self.n_params += 2 * (n_prev_units + 1) * n_units

        mW = theano.shared((rng.randn(n_prev_units, n_units) /
                            np.sqrt(n_prev_units + 1)).astype(dtype),
                           name='mW' + str(self.n_layers))
        mb = theano.shared(np.zeros(n_units, dtype=dtype),
                           name='mb' + str(self.n_layers))
        sW = theano.shared(-5.0 *
                           np.ones([n_prev_units, n_units], dtype=dtype),
                           name='sW' + str(self.n_layers))
        sb = theano.shared(-5.0 * np.ones(n_units, dtype=dtype),
                           name='sb' + str(self.n_layers))
        ua = self.srng.normal((self.hs[-1].shape[0], n_units), dtype=dtype)
        ma = tt.dot(self.hs[-1], mW) + mb
        sa = tt.dot(self.hs[-1]**2, tt.exp(2 * sW)) + tt.exp(2 * sb)
        za = tt.sqrt(sa) * ua + ma
        h = actfun(za)
        h.name = 'h' + str(self.n_layers)

        self.mWs.append(mW)
        self.mbs.append(mb)
        self.sWs.append(sW)
        self.sbs.append(sb)
        self.uas.append(ua)
        self.mas.append(ma)
        self.zas.append(za)
        self.hs.append(h)

        self.mps = self.mWs + self.mbs
        self.sps = self.sWs + self.sbs
        self.parms = self.mps + self.sps
        self.output = self.hs[-1]

        self.eval_f = None
        self.eval_f_rand = None

    def removeLayer(self):
        """Removes a layer from the network."""

        assert self.n_layers > 0, 'There is no layer to remove.'

        n_params_to_rem = 2 * self.n_outputs * (self.n_units[-2] + 1)
        self.n_outputs = self.n_units[-2]
        self.n_units.pop()
        self.n_layers -= 1
        self.n_params -= n_params_to_rem

        self.mWs.pop()
        self.mbs.pop()
        self.sWs.pop()
        self.sbs.pop()
        self.uas.pop()
        self.mas.pop()
        self.zas.pop()
        self.hs.pop()

        self.mps = self.mWs + self.mbs
        self.sps = self.sWs + self.sbs
        self.parms = self.mps + self.sps
        self.output = self.hs[-1]

        self.eval_f = None
        self.eval_f_rand = None

    def eval(self, x, rand=False):
        """Evaluate net at locations in x."""

        if rand:

            # compile theano computation graph, if haven't already done so
            if self.eval_f_rand is None:

                n_data = tt.iscalar('n_data')
                uas = [
                    tt.tile(self.srng.normal((n_units, ), dtype=dtype),
                            [n_data, 1]) for n_units in self.n_units[1:]
                ]

                self.eval_f_rand = theano.function(inputs=[self.hs[0], n_data],
                                                   outputs=self.hs[-1],
                                                   givens=zip(self.uas, uas))

            return self.eval_f_rand(x.astype(dtype), x.shape[0])

        else:

            # compile theano computation graph, if haven't already done so
            if self.eval_f is None:
                self.eval_f = theano.function(inputs=[self.hs[0]],
                                              outputs=self.hs[-1],
                                              givens=zip(self.zas, self.mas))

            return self.eval_f(x.astype(dtype))

    def printInfo(self):
        """Prints some useful info about the net."""

        print 'Number of inputs  =', self.n_inputs
        print 'Number of outputs =', self.n_outputs
        print 'Number of units   =', self.n_units
        print 'Number of layers  =', self.n_layers
        print 'Number of params  =', self.n_params
        print 'Data type =', dtype

    def visualize_weights(self, layer, imsize, layout):
        """
        Displays the weights of a specified layer as images.
        :param layer: the layer whose weights to display
        :param imsize: the image size
        :param layout: number of rows and columns for each page
        :return: none
        """

        helper.disp_imdata(self.mWs[layer].get_value().T, imsize, layout)
        plt.show(block=False)

    def visualize_activations(self, x, layers=None):
        """
        Visualizes the activations of specified layers caused by a given data minibatch.
        :param x: a minibatch of data
        :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer
        :return: none
        """

        if layers is None:
            layers = xrange(self.n_layers)

        forwprop = theano.function(inputs=[self.hs[0]], outputs=self.hs[1:])
        hs = forwprop(x.astype(dtype))

        for l in layers:

            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.imshow(hs[l], cmap='gray', interpolation='none')
            ax.set_title('Layer ' + str(l))
            ax.set_xlabel('layer units')
            ax.set_ylabel('data points')

        plt.show(block=False)

    def param_hist(self, layers=None):
        """
        Displays a histogram of weights and biases for specified layers.
        :param layers: list of layers to show histograms for; defaults to the whole net
        :return: none
        """

        if layers is None:
            layers = xrange(self.n_layers)

        for l in layers:

            fig, axs = plt.subplots(2, 2)

            nbins = int(np.sqrt(self.mWs[l].get_value().size))
            axs[0, 0].hist(self.mWs[l].get_value().flatten(),
                           nbins,
                           normed=True)
            axs[0, 0].set_title('weight means, layer ' + str(l))
            axs[1, 0].hist(self.sWs[l].get_value().flatten(),
                           nbins,
                           normed=True)
            axs[1, 0].set_title('weight log stds, layer ' + str(l))

            nbins = int(np.sqrt(self.mbs[l].get_value().size))
            axs[0, 1].hist(self.mbs[l].get_value(), nbins, normed=True)
            axs[0, 1].set_title('bias means, layer ' + str(l))
            axs[1, 1].hist(self.sbs[l].get_value(), nbins, normed=True)
            axs[1, 1].set_title('bias log stds, layer ' + str(l))

        plt.show(block=False)
예제 #50
0
def test_undefined_grad():
    srng = MRG_RandomStreams(seed=1234)

    # checking uniform distribution
    low = tensor.scalar()
    out = srng.uniform((), low=low)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low)

    high = tensor.scalar()
    out = srng.uniform((), low=0, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high)

    out = srng.uniform((), low=low, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    out = srng.binomial((), p=prob)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), prob1)

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), (prob1, prob2))

    # checking choice
    p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  prob1)

    # checking normal distribution
    avg = tensor.scalar()
    out = srng.normal((), avg=avg)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)

    out = srng.normal((), avg=avg, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))
예제 #51
0
class LadderAE():
    def __init__(self):
        self.input_dim = 784
        # self.denoising_cost_x = (500.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        # self.denoising_cost_x = (4000.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        self.denoising_cost_x = (1000, 10, 0.1, 0.1, 0.1, 0.1, 0.1)
        self.noise_std = (0.3,) * 7
        # self.noise_std = (0.55, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        self.default_lr = 0.002
        self.shareds = OrderedDict()
        self.rstream = RandomStreams(seed=1)
        self.rng = np.random.RandomState(seed=1)
        self.layers = [(0, (('fc', 784), 'relu')),
                       (1, (('fc', 1000), 'relu')),
                       (2, (('fc', 500), 'relu')),
                       (3, (('fc', 250), 'relu')),
                       (4, (('fc', 250), 'relu')),
                       (5, (('fc', 250), 'relu')),
                       (6, (('fc', 10), 'softmax'))]

    def counter(self):
        name = 'counter'
        p = self.shareds.get(name)
        update = []
        if p is None:
            p_max_val = np.float32(10)
            p = self.shared(np.float32(1), name, role=BNPARAM)
            p_max = self.shared(p_max_val, name + '_max', role=BNPARAM)
            update = [(p, T.clip(p + np.float32(1),
                                 np.float32(0),
                                 p_max)),
                      (p_max, p_max_val)]
        return (p, update)

    def annotate_bn(self, var, id, var_type, mb_size, size):
        var_shape = np.array((1, size))
        out_dim = np.prod(var_shape) / np.prod(var_shape[0])
        # Flatten the var - shared variable updating is not trivial otherwise,
        # as theano seems to believe a row vector is a matrix and will complain
        # about the updates
        orig_shape = var.shape
        var = var.flatten()
        # Here we add the name and role, the variables will later be identified
        # by these values
        var.name = id + '_%s_clean' % var_type
        add_role(var, BNPARAM)
        shared_var = self.shared(np.zeros(out_dim),
                                 name='shared_%s' % var.name, role=None)

        # Update running average estimates. When the counter is reset to 1, it
        # will clear its memory
        cntr, c_up = self.counter()
        one = np.float32(1)
        run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old
        if var_type == 'mean':
            new_value = run_avg(var, shared_var)
        elif var_type == 'var':
            mb_size = T.cast(mb_size, 'float32')
            new_value = run_avg(mb_size / (mb_size - one) * var, shared_var)
        else:
            raise NotImplementedError('Unknown batch norm var %s' % var_type)

        def annotate_update(update, tag_to):
            a = Annotation()
            for (var, up) in update:
                a.updates[var] = up
            add_annotation(tag_to, a)

        # Add the counter update to the annotated update if it is the first
        # instance of a counter
        annotate_update([(shared_var, new_value)] + c_up, var)

        return var.reshape(orig_shape)

    def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs):
        p = self.shareds.get(name)
        if p is None:
            p = shared_param(init, name, cast_float32, role, **kwargs)
            self.shareds[name] = p
        return p

    def new_activation_dict(self):
        return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}})

    def encoder(self, input_, noise_std):
        z = input_
        d = self.new_activation_dict()
        z = z + (self.rstream.normal(size=z.shape).astype(floatX) *
                 noise_std[0])
        d.z[0] = z
        h = z
        d.h[0] = h

        prev_dim = self.input_dim
        for i, (spec, act_f) in self.layers[1:]:
            layer_type, dim = spec
            noise = noise_std[i] if i < len(noise_std) else 0.
            z, m, s, h = self.f(h, prev_dim, layer_type, dim,
                                i, act_f, noise)
            self.layer_dims[i] = dim
            d.z[i] = z
            d.s[i] = s
            d.m[i] = m
            d.h[i] = h
            prev_dim = dim

        return d

    def decoder(self, clean, corr, batch_size):
        get_unlabeled = lambda x: x[batch_size:] if x is not None else x
        est = self.new_activation_dict()
        costs = AttributeDict()
        costs.denois = AttributeDict()
        for i, ((_, spec), act_f) in self.layers[::-1]:
            z_corr = get_unlabeled(corr.z[i])
            z_clean = get_unlabeled(clean.z[i])
            z_clean_s = get_unlabeled(clean.s.get(i))
            z_clean_m = get_unlabeled(clean.m.get(i))

            # It's the last layer
            if i == len(self.layers) - 1:
                fspec = (None, None)
                ver = get_unlabeled(corr.h[i])
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                fspec = self.layers[i + 1][1][0]
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            # For semi-supervised version
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est
            # NOTE: the next line overrides the branch above, so the denoising
            # cost below is computed on the unnormalized z_est
            z_est_norm = z_est

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)

            # Store references for later use
            est.z[i] = z_est
            est.h[i] = apply_act(z_est, act_f)
            est.s[i] = None
            est.m[i] = None
        return est, costs

    def apply(self, input_lb, input_un, target):
        batch_size = input_lb.shape[0]
        get_labeled = lambda x: x[:batch_size] if x is not None else x
        input = T.concatenate([input_lb, input_un], axis=0)
        self.layer_dims = {0: self.input_dim}
        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
        top = len(self.layers) - 1

        clean = self.encoder(input, noise_std=[0])
        corr = self.encoder(input, noise_std=self.noise_std)

        ests, costs = self.decoder(clean, corr, batch_size)

        # Costs
        y = target.flatten()

        costs.class_clean = CategoricalCrossEntropy().apply(
            y, get_labeled(clean.h[top]))
        costs.class_clean.name = 'CE_clean'

        costs.class_corr = CategoricalCrossEntropy().apply(
            y, get_labeled(corr.h[top]))
        costs.class_corr.name = 'CE_corr'

        costs.total = costs.class_corr * 1.0
        for i in range(len(self.layers)):
            costs.total += costs.denois[i] * self.denoising_cost_x[i]
        costs.total.name = 'Total_cost'

        self.costs = costs

        # Classification error
        mr = MisclassificationRate()
        self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.)
        self.error.name = 'Error_rate'

    def rand_init(self, in_dim, out_dim):
        return self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim)

    def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name):
        # Since we pass this path twice (clean and corr encoder), we
        # want to make sure that parameters of both layers are shared.
        layer = self.shareds.get(layer_name)
        if layer is None:
            if layer_type == 'fc':
                linear = Linear(use_bias=False,
                                name=layer_name,
                                input_dim=in_dim,
                                output_dim=out_dim,
                                seed=1)
                linear.weights_init = Glorot(self.rng, in_dim, out_dim)
                linear.initialize()
                layer = linear
                self.shareds[layer_name] = layer

        return layer.apply(input_)

    def f(self, h, in_dim, layer_type, dim, num, act_f, noise_std):
        layer_name = 'f_' + str(num) + '_'

        z = self.apply_layer(layer_type, h, in_dim, dim, layer_name)

        m = s = None
        m = z.mean(0, keepdims=True)
        s = z.var(0, keepdims=True)

        # if noise_std == 0:
        #     m = self.annotate_bn(m, layer_name + 'bn', 'mean',
        #                          z.shape[0], dim)
        #     s = self.annotate_bn(s, layer_name + 'bn', 'var',
        #                          z.shape[0], dim)

        z = (z - m) / T.sqrt(s + np.float32(1e-10))

        z_lat = z + self.rstream.normal(size=z.shape).astype(
            floatX) * noise_std
        z = z_lat

        # Add bias
        if act_f != 'linear':
            z += self.shared(0.0 * np.ones(dim), layer_name + 'b',
                             role=BIAS)

        # Add Gamma parameter if necessary. (Not needed for all act_f)
        if (act_f in ['sigmoid', 'tanh', 'softmax']):
            c = self.shared(1.0 * np.ones(dim), layer_name + 'c',
                            role=WEIGHT)
            z *= c

        h = apply_act(z, act_f)

        return z_lat, m, s, h

    def g(self, z_lat, z_ver, in_dims, out_dims, num, fspec, top_g):
        f_layer_type, dims = fspec
        layer_name = 'g_' + str(num) + '_'

        in_dim = np.prod(dtype=floatX, a=in_dims)
        out_dim = np.prod(dtype=floatX, a=out_dims)

        if top_g:
            u = z_ver
        else:
            u = self.apply_layer(f_layer_type, z_ver,
                                 in_dim, out_dim, layer_name)

        u -= u.mean(0, keepdims=True)
        u /= T.sqrt(u.var(0, keepdims=True) + np.float32(1e-10))

        z_lat = z_lat.flatten(2)
        bi = lambda inits, name: self.shared(inits * np.ones(out_dim),
                                             layer_name + name, role=BIAS)
        wi = lambda inits, name: self.shared(inits * np.ones(out_dim),
                                             layer_name + name, role=WEIGHT)

        type_ = 'weird'

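        # Default combinator: an affine function of z_lat, u and z_lat * u,
        # plus a learned gate b1 * sigmoid(affine(z_lat, u, z_lat * u))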
        if type_ == 'weird':
            sigval = (bi(0., 'c1') +
                      wi(1., 'c2') * z_lat +
                      wi(0., 'c3') * u +
                      wi(0., 'c4') * z_lat * u)
            sigval = T.nnet.sigmoid(sigval)
            z_est = (bi(0., 'a1') +
                     wi(1., 'a2') * z_lat +
                     wi(0., 'a3') * u +
                     wi(0., 'a4') * z_lat * u +
                     wi(1., 'b1') * sigval)

        elif type_ == 'simple':
            # if num != 6:
            #     z_lat = z_lat * 0.0
            #     wu = wi(1., 'a3') * u
            # else:
            wu = wi(0., 'a3') * u
            wz = wi(1., 'a2') * z_lat
            wzu = wi(0., 'a4') * z_lat * u

            z_est = (bi(0., 'a1') +
                     wz +
                     wu +
                     wzu)

        elif type_ == 'yoshua':
            wz = wi(1., 'a2') * z_lat
            wu = wi(0., 'a3') * u
            b = wi(1., 'b1')

            batch_size = u[:, 0:1].shape
            srng = T.shared_randomstreams.RandomStreams(
                self.rng.randint(999999))
            mask = srng.binomial(n=1, p=0.5, size=batch_size)
            mask = T.addbroadcast(mask, 1)
            z_est = (mask * wz + (1 - mask) * wu) + b

        if (type(out_dims) == tuple and
                len(out_dims) > 1.0 and z_est.ndim < 4):
            z_est = z_est.reshape((z_est.shape[0],) + out_dims)

        return z_est
예제 #52
0
def test_normal0():

    steps = 50
    std = 2.
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
            or config.mode == 'Mode' and config.linker in ['py']):
        sample_size = (25, 30)
        default_rtol = .02
    else:
        sample_size = (999, 50)
        default_rtol = .01
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()

    for size, const_size, var_input, input, avg, rtol, std_tol in [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size,
                   dtype=config.floatX)], -5., default_rtol, default_rtol),
            # test odd value
        (x.shape, sample_size_odd, [x],
         [np.zeros(sample_size_odd,
                   dtype=config.floatX)], -5., default_rtol, default_rtol),
        (sample_size, sample_size, [], [],
         np.arange(np.prod(sample_size), dtype='float32').reshape(sample_size),
         10. * std / np.sqrt(steps), default_rtol),
            # test empty size (scalar)
        ((), (), [], [], -5., default_rtol, 0.02),
            # test with few samples at the same time
        ((1, ), (1, ), [], [], -5., default_rtol, 0.02),
        ((3, ), (3, ), [], [], -5., default_rtol, 0.02),
    ]:

        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size,
                     avg=avg,
                     std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n)
        f(*input)

        # Increase the number of steps if size implies only a few samples
        if np.prod(const_size) < 10:
            steps_ = steps * 50
        else:
            steps_ = steps
        basictest(f,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='mrg ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol,
                  std_tol=std_tol)

        sys.stdout.flush()

        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        nn = RR.normal(size=size, avg=avg, std=std)
        ff = theano.function(var_input, nn)

        basictest(ff,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='numpy ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol)
예제 #53
0
class vdrvc(object):
    def __init__(self):
        self._srng = RandomStreams(42)
        self.theta = None
        self.log_alpha = None

    def score(self, X, t):
        return acc(np.argmax(X.dot(self.theta.T), axis=1), t)

    def predict(self, X):
        return np.argmax(X.dot(self.theta.T), axis=1)

    def fit(self,
            X,
            t,
            num_classes,
            batch_size,
            max_iter=1000,
            display_each=100,
            lr=1e-2,
            beta=0.95):
        N, d = X.shape

        def create_theano_loss(d):
            X, t = T.dmatrix('X'), T.dvector('t')
            log_sigma2 = theano.shared(np.ones((num_classes, d)))
            theta = theano.shared(np.random.randn(num_classes, d))

            # Change parametrization
            log_alpha = log_sigma2 - T.log(theta**2)
            la, alpha = log_alpha, T.exp(log_alpha)

            # -KL(q || prior)
            mD_KL = -(0.5 * T.log1p(T.exp(-la)) -
                      (0.03 + 1.0 /
                       (1.0 + T.exp(-(1.5 * (la + 1.3)))) * 0.64)).sum()

            # NLL through Local Reparametrization
            mu, si = T.dot(X, theta.T), T.sqrt(
                T.dot(X * X, (alpha * theta * theta).T))
            activation = mu + self._srng.normal(mu.shape, avg=0, std=1) * si
            predictions = T.nnet.softmax(activation)
            ell = -T.sum(
                categorical_crossentropy(predictions, one_hot(t, num_classes)))

            # Objective Negative SGVLB
            nlb = -(N / batch_size * ell + mD_KL)

            # Optimization Method and Function Compiling
            opt = lasagne.updates.adam(nlb, [log_sigma2, theta],
                                       learning_rate=lr,
                                       beta1=beta)
            lbf = function([X, t], nlb, updates=opt)

            return lbf, theta, log_sigma2

        lbf, theta, log_sigma2 = create_theano_loss(d)

        # Main loop
        for i in range(max_iter):
            if batch_size != N:
                idx = np.random.choice(X.shape[0], batch_size)
                loss = lbf(X[idx], t[idx])
            else:
                loss = lbf(X, t)

            if display_each and i % display_each == 0:
                self.theta = theta.get_value()
                self.log_alpha = log_sigma2.get_value() - 2 * np.log(
                    np.abs(self.theta))
                acc_ = acc(self.predict(X), t)
                ard_ = np.sum(self.log_alpha > 5) * 1.0 / self.log_alpha.size
                print('iter = %.4f' % i, 'vlb = %.4f' % loss,
                      'acc = %.4f' % acc_, 'ard = %.4f' % ard_)

        return self
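The fit method above estimates the expected log-likelihood with the local reparameterization trick (the "# NLL through Local Reparametrization" block): instead of sampling weight matrices, it samples the pre-softmax activations directly from a Gaussian whose mean is X.dot(theta.T) and whose variance is (X * X).dot((alpha * theta * theta).T). A minimal NumPy sketch of just that sampling step, with illustrative names and no Theano, might look like this:

import numpy as np

def sample_activations(X, theta, log_alpha, rng=np.random):
    """Local reparameterization: draw the noisy activations directly.

    X: (batch, d) inputs; theta: (classes, d) weight means;
    log_alpha: (classes, d) log dropout rates, alpha = sigma^2 / theta^2.
    """
    alpha = np.exp(log_alpha)
    mu = X.dot(theta.T)                                   # activation means
    si = np.sqrt((X * X).dot((alpha * theta * theta).T))  # activation stds
    return mu + rng.standard_normal(mu.shape) * si

X = np.random.randn(4, 3)
theta = np.random.randn(2, 3)
log_alpha = np.zeros((2, 3))
print(sample_activations(X, theta, log_alpha).shape)  # (4, 2)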
    def get_samples_and_objectives(self, model, data):
        space, sources = self.get_data_specs(model)
        space.validate(data)
        assert isinstance(model, AdversaryPair)
        g = model.generator
        d = model.discriminator

        # Note: this assumes data is design matrix
        X = data
        m = data.shape[space.get_batch_axis()]
        y1 = T.alloc(1, m, 1)
        y0 = T.alloc(0, m, 1)
        # NOTE: if this changes to optionally use dropout, change the inference
        # code below to use a non-dropped-out version.
        S, z, other_layers = g.sample_and_noise(
            m,
            default_input_include_prob=self.generator_default_input_include_prob,
            default_input_scale=self.generator_default_input_scale,
            all_g_layers=(self.infer_layer is not None))

        if self.noise_both != 0.:
            rng = MRG_RandomStreams(2014 / 6 + 2)
            S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
            X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

        y_hat1 = d.dropout_fprop(X,
                                 self.discriminator_default_input_include_prob,
                                 self.discriminator_input_include_probs,
                                 self.discriminator_default_input_scale,
                                 self.discriminator_input_scales)
        y_hat0 = d.dropout_fprop(S,
                                 self.discriminator_default_input_include_prob,
                                 self.discriminator_input_include_probs,
                                 self.discriminator_default_input_scale,
                                 self.discriminator_input_scales)

        # d_obj =  0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0))

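        # Hinge-style cutoff: only examples that the discriminator does not
        # already classify correctly by a margin of d_eps contribute to its cost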
        pos_mask = y_hat1 < .5 + self.d_eps
        neg_mask = y_hat0 > .5 - self.d_eps

        pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1)
        neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0)

        pos_cost = (pos_mask * pos_cost_matrix).mean()
        neg_cost = (neg_mask * neg_cost_matrix).mean()

        d_obj = 0.5 * (pos_cost + neg_cost)

        if self.no_drop_in_d_for_g:
            y_hat0_no_drop = d.dropout_fprop(S)
            g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop)
        else:
            g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0)
        assert g_cost_mat.ndim == 2
        assert y_hat0.ndim == 2

        mask = y_hat0 < 0.5 + self.g_eps
        masked_cost = g_cost_mat * mask
        g_obj = masked_cost.mean()

        if model.inferer is not None:
            # Change this if we ever switch to using dropout in the
            # construction of S.
            S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
            pred = model.inferer.dropout_fprop(
                S_nograd, self.inference_default_input_include_prob,
                self.inference_input_include_probs,
                self.inference_default_input_scale,
                self.inference_input_scales)
            if self.infer_layer is None:
                target = z
            else:
                target = other_layers[self.infer_layer]
            i_obj = model.inferer.layers[-1].cost(target, pred)
        else:
            i_obj = 0

        return S, d_obj, g_obj, i_obj
class QueuelessVariationalQueueManager(QueueManager):
    """
    A variational-autoencoder-based manager which does not use the queue, with a configurable loss
    """
    def __init__(self, feature_size, period=None, variational_loss_scale=1):
        """
        Initialize the manager.

        Parameters:
            feature_size: The width of a feature
            period: Period for queue activations
            variational_loss_scale: Factor by which to scale variational loss
        """
        self._feature_size = feature_size
        self._period = period
        self._srng = MRG_RandomStreams(np.random.randint(0, 1024))
        self._variational_loss_scale = np.array(variational_loss_scale,
                                                np.float32)

    @property
    def activation_width(self):
        return self.feature_size * 2

    @property
    def feature_size(self):
        return self._feature_size

    def helper_sample(self, input_activations):
        n_batch, n_time, _ = input_activations.shape
        means = input_activations[:, :, :self.feature_size]
        stdevs = abs(input_activations[:, :,
                                       self.feature_size:]) + constants.EPSILON
        wiggle = self._srng.normal(means.shape)

        vects = means + (stdevs * wiggle)

        strengths = T.zeros((n_batch, n_time))
        if self._period is None:
            strengths = T.set_subtensor(strengths[:, -1], 1)
        else:
            strengths = T.set_subtensor(
                strengths[:, self._period - 1::self._period], 1)

        return strengths, vects, means, stdevs, {}

    def get_strengths_and_vects(self, input_activations):
        strengths, vects, means, stdevs, _ = self.helper_sample(
            input_activations)
        return strengths, vects

    def process(self, input_activations, extra_info=False):

        strengths, vects, means, stdevs, sample_info = self.helper_sample(
            input_activations)

        means_sq = means**2
        variance = stdevs**2
        loss_parts = 1 + T.log(variance) - means_sq - variance
        if self._period is None:
            loss_parts = loss_parts[:, -1]
        else:
            loss_parts = loss_parts[:, self._period - 1::self._period]
        variational_loss = -0.5 * T.sum(
            loss_parts) * self._variational_loss_scale

        info = {"variational_loss": variational_loss}
        info.update(sample_info)
        if extra_info:
            return variational_loss, strengths, vects, info
        else:
            return variational_loss, strengths, vects
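For reference, the variational loss computed in process above is the closed-form KL divergence between the approximate posterior N(means, stdevs^2) and a standard normal prior, summed over the selected time steps and scaled by variational_loss_scale. A small NumPy sketch of the same quantity, together with the reparameterized sample drawn in helper_sample (illustrative names, not part of the class):

import numpy as np

def kl_and_sample(means, stdevs, rng=np.random):
    """KL(N(mu, sigma^2) || N(0, 1)) summed over all entries, plus one
    reparameterized sample, mirroring helper_sample/process above."""
    kl = -0.5 * np.sum(1 + np.log(stdevs ** 2) - means ** 2 - stdevs ** 2)
    sample = means + stdevs * rng.standard_normal(means.shape)
    return kl, sample

kl, _ = kl_and_sample(np.zeros((2, 3)), np.ones((2, 3)))
print(kl)  # 0.0: a standard-normal posterior carries no KL penalty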
class Generator(Model):
    def __init__(self,
                 mlp,
                 noise="gaussian",
                 monitor_ll=False,
                 ll_n_samples=100,
                 ll_sigma=0.2):
        Model.__init__(self)
        self.__dict__.update(locals())
        del self.self
        self.theano_rng = MRG_RandomStreams(2014 * 5 + 27)

    def get_input_space(self):
        return self.mlp.get_input_space()

    def dropout_fprop(self,
                      sample_data,
                      default_input_include_prob=1.,
                      default_input_scale=1.,
                      all_g_layers=False):
        if all_g_layers:
            rval = self.mlp.dropout_fprop(
                sample_data,
                default_input_include_prob=default_input_include_prob,
                default_input_scale=default_input_scale,
                return_all=all_g_layers)
            other_layers, rval = rval[:-1], rval[-1]
        else:
            rval = self.mlp.dropout_fprop(
                sample_data,
                default_input_include_prob=default_input_include_prob,
                default_input_scale=default_input_scale)
            other_layers = None

        return rval, other_layers

    def sample_and_noise(self,
                         num_samples,
                         default_input_include_prob=1.,
                         default_input_scale=1.,
                         all_g_layers=False):
        n = self.mlp.get_input_space().get_total_dimension()
        noise = self.get_noise((num_samples, n))
        formatted_noise = VectorSpace(n).format_as(noise,
                                                   self.mlp.get_input_space())
        rval, other_layers = self.dropout_fprop(
            formatted_noise,
            default_input_include_prob=default_input_include_prob,
            default_input_scale=default_input_scale,
            all_g_layers=all_g_layers)

        return rval, formatted_noise, other_layers

    def sample(self,
               num_samples,
               default_input_include_prob=1.,
               default_input_scale=1.):
        sample, _, _ = self.sample_and_noise(num_samples,
                                             default_input_include_prob,
                                             default_input_scale)
        return sample

    def get_monitoring_channels(self, data):
        if data is None:
            m = 100
        else:
            m = data.shape[0]
        n = self.mlp.get_input_space().get_total_dimension()
        noise = self.get_noise((m, n))
        rval = OrderedDict()

        try:
            rval.update(self.mlp.get_monitoring_channels((noise, None)))
        except Exception:
            warnings.warn(
                "something went wrong with generator.mlp's monitoring channels"
            )

        if self.monitor_ll:
            rval['ll'] = T.cast(
                self.ll(data, self.ll_n_samples, self.ll_sigma),
                theano.config.floatX).mean()
            rval['nll'] = -rval['ll']
        return rval

    def get_noise(self, size):

        # Allow just requesting batch size
        if isinstance(size, int):
            size = (size, self.get_input_space().get_total_dimension())

        if not hasattr(self, 'noise'):
            self.noise = "gaussian"
        if self.noise == "uniform":
            return self.theano_rng.uniform(low=-np.sqrt(3),
                                           high=np.sqrt(3),
                                           size=size,
                                           dtype='float32')
        elif self.noise == "gaussian":
            return self.theano_rng.normal(size=size, dtype='float32')
        elif self.noise == "spherical":
            noise = self.theano_rng.normal(size=size, dtype='float32')
            noise = noise / T.maximum(1e-7, T.sqrt(
                T.sqr(noise).sum(axis=1))).dimshuffle(0, 'x')
            return noise
        else:
            raise NotImplementedError(self.noise)

    def get_params(self):
        return self.mlp.get_params()

    def get_output_space(self):
        return self.mlp.get_output_space()

    def ll(self, data, n_samples, sigma):

        samples = self.sample(n_samples)
        output_space = self.mlp.get_output_space()
        if 'Conv2D' in str(output_space):
            samples = output_space.convert(samples, output_space.axes,
                                           ('b', 0, 1, 'c'))
            samples = samples.flatten(2)
            data = output_space.convert(data, output_space.axes,
                                        ('b', 0, 1, 'c'))
            data = data.flatten(2)
        parzen = theano_parzen(data, samples, sigma)
        return parzen

    def _modify_updates(self, updates):
        self.mlp.modify_updates(updates)

    def get_lr_scalers(self):
        return self.mlp.get_lr_scalers()

    def __setstate__(self, state):
        self.__dict__.update(state)
        if 'monitor_ll' not in state:
            self.monitor_ll = False
예제 #57
0
class ESGD(RMSProp):
    r'''Equilibrated SGD computes a diagonal Hessian preconditioner.

    Notes
    -----

    The ESGD method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information.

    The difference here is that as gradients are computed during each parameter
    update, an exponentially-weighted moving average (EWMA) of estimates of the
    diagonal of the Hessian (the matrix of second derivatives) is maintained as
    well. At each update, the EWMA is used to compute the root-mean-square (RMS)
    diagonal value that's been seen in the recent past. The actual gradient is
    scaled by the inverse of this diagonal preconditioner before being applied
    to update the parameters. Intuitively, this causes the algorithm to
    "reshape" the loss function in parameter space, such that directions of
    steep gradient (i.e., large diagonal values) and directions of shallow
    gradient (i.e., small diagonal values) are scaled to be approximately the
    same slope.

    The diagonal estimates are computed using a nice trick: A vector :math:`r
    \sim \mathcal{N}(0, 1)` consisting of standard normal values is sampled
    randomly at each update step, and the value of :math:`Hr` is computed
    symbolically. These vector values tend to approximate the diagonal of the
    Hessian. Because :math:`Hr` is itself a vector, the full Hessian :math:`H`
    does not need to be computed or stored.

    .. math::
        \begin{eqnarray*}
        r &\sim& \mathcal{N}(0, 1) \\
        Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2} r \\
        D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1} + \epsilon}}
           \frac{\partial\mathcal{L}}{\partial p}
        \end{eqnarray*}

    Like :class:`Rprop` and the :class:`ADADELTA`--:class:`RMSProp` family, this
    learning method effectively maintains a sort of parameter-specific learning
    rate for each parameter in the loss.

    In this implementation, :math:`\epsilon` regularizes the RMS values; it
    is specified using the ``rms_regularizer`` parameter.

    The weight parameter :math:`\gamma` for the EWMA is computed from the
    ``rms_halflife`` keyword argument, such that the actual EWMA weight varies
    inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
    2}{h}}`.

    The primary difference between this implementation and the algorithm
    described in the paper (see below) is the use of an EWMA to decay the
    diagonal values over time, while in the paper the diagonal is divided by the
    training iteration. The EWMA halflife should be set to something reasonably
    large to ensure that this method emulates the method described in the
    original paper.

    References
    ----------

    .. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014) "RMSProp
       and equilibrated adaptive learning rates for non-convex optimization."
       http://arxiv.org/abs/1502.04390
    '''
    def __init__(self, *args, **kwargs):
        self.rng = RandomStreams()
        super(ESGD, self).__init__(*args, **kwargs)

    def _get_updates_for(self, param, grad):
        D_tm1 = shared_like(param, 'D_ewma')
        Hv = TT.Rop(grad, param, self.rng.normal(param.shape))
        D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
        den = TT.sqrt(D_t) + self.epsilon
        yield D_tm1, D_t
        yield param, param - grad * self.learning_rate / den
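To make the docstring concrete: the EWMA weight is derived from the halflife as gamma = exp(-ln 2 / h), and the preconditioner is the running root-mean-square of the Hessian-vector products Hr. Below is a minimal NumPy sketch of that update on a toy quadratic loss, where Hr can be formed exactly. The names are illustrative, this is not the Theano implementation above, and D is started at ones rather than zeros simply to keep the first few divisions well behaved.

import numpy as np

def ewma_weight(halflife):
    """gamma = exp(-ln 2 / h): after h updates the old estimate's weight halves."""
    return np.exp(-np.log(2.0) / halflife)

def esgd_step(p, grad, Hr, D, gamma, lr=1e-3, eps=1e-4):
    """One equilibrated update given a Hessian-vector product Hr = H.dot(r)."""
    D = gamma * D + (1 - gamma) * Hr ** 2    # EWMA of squared Hv products
    p = p - lr * grad / (np.sqrt(D) + eps)   # preconditioned gradient step
    return p, D

rng = np.random.RandomState(0)
H = np.diag([100.0, 1.0])            # badly conditioned quadratic 0.5 * p'Hp
p, D = np.ones(2), np.ones(2)
gamma = ewma_weight(14)
for _ in range(100):
    r = rng.standard_normal(2)       # r ~ N(0, 1)
    p, D = esgd_step(p, H.dot(p), H.dot(r), D, gamma)
print(p)                             # both coordinates have moved toward 0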
예제 #58
0
def test_match_grad_valid_conv():

    # Tests that weightActs is the gradient of FilterActs
    # with respect to the weights.

    for partial_sum in [0, 1, 4]:
        rng = np.random.RandomState([2012, 10, 9])

        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        images = shared(rng.uniform(
            -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                        name='images')
        filters = shared(rng.uniform(-1., 1.,
                                     (channels, filter_rows, filter_cols,
                                      num_filters)).astype('float32'),
                         name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters)
        output = host_from_gpu(output)

        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

        output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

        output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

        theano_rng = MRG_RandomStreams(2013 + 1 + 31)

        coeffs = theano_rng.normal(avg=0.,
                                   std=1.,
                                   size=output_conv2d.shape,
                                   dtype='float32')

        cost_conv2d = (coeffs * output_conv2d).sum()

        weights_grad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * output).sum()
        hid_acts_grad = T.grad(cost, output)

        weights_grad = WeightActs(partial_sum=partial_sum)(
            gpu_images, gpu_from_host(hid_acts_grad), as_tensor_variable(
                (4, 4)))[0]
        weights_grad = host_from_gpu(weights_grad)

        f = function(
            [], [output, output_conv2d, weights_grad, weights_grad_conv2d])

        output, output_conv2d, weights_grad, weights_grad_conv2d = f()

        if np.abs(output - output_conv2d).max() > 8e-6:
            assert type(output) == type(output_conv2d)
            assert output.dtype == output_conv2d.dtype
            if output.shape != output_conv2d.shape:
                print 'cuda-convnet shape: ', output.shape
                print 'theano shape: ', output_conv2d.shape
                assert False
            err = np.abs(output - output_conv2d)
            print 'absolute error range: ', (err.min(), err.max())
            print 'mean absolute error: ', err.mean()
            print 'cuda-convnet value range: ', (output.min(), output.max())
            print 'theano value range: ', (output_conv2d.min(),
                                           output_conv2d.max())
            assert False

        warnings.warn(
            """test_match_grad_valid_conv success criterion is not very strict. Can we verify that this is OK?
                         One possibility is that theano is numerically unstable and Alex's code is better.
                         Probably theano CPU 64 bit is OK but it's worth checking the others."""
        )

        if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6:
            if type(weights_grad) != type(weights_grad_conv2d):
                raise AssertionError("weights_grad is of type " +
                                     str(weights_grad))
            assert weights_grad.dtype == weights_grad_conv2d.dtype
            if weights_grad.shape != weights_grad_conv2d.shape:
                print 'cuda-convnet shape: ', weights_grad.shape
                print 'theano shape: ', weights_grad_conv2d.shape
                assert False
            err = np.abs(weights_grad - weights_grad_conv2d)
            print 'absolute error range: ', (err.min(), err.max())
            print 'mean absolute error: ', err.mean()
            print 'cuda-convnet value range: ', (weights_grad.min(),
                                                 weights_grad.max())
            print 'theano value range: ', (weights_grad_conv2d.min(),
                                           weights_grad_conv2d.max())
            assert False
예제 #59
0
def test_normal0():

    steps = 50
    std = 2.
    if mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']:
        sample_size = (25, 30)
        default_rtol = .02
    else:
        sample_size = (999, 50)
        default_rtol = .01
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()
    for size, const_size, var_input, input, avg, rtol in [
        (sample_size, sample_size, [], [], -5., default_rtol),
        (x.shape, sample_size, [x],
         [numpy.zeros(sample_size, dtype=config.floatX)], -5., default_rtol),
            #test odd value
        (sample_size_odd, sample_size_odd, [], [], -5., default_rtol),
            #test odd value
        (x.shape, sample_size_odd, [x],
         [numpy.zeros(sample_size_odd,
                      dtype=config.floatX)], -5., default_rtol),
        (sample_size, sample_size, [], [],
         numpy.arange(numpy.prod(sample_size),
                      dtype='float32').reshape(sample_size),
         10. * std / numpy.sqrt(steps)),
    ]:
        #print ''
        #print 'ON CPU:'

        R = MRG_RandomStreams(234, use_cuda=False)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size,
                     avg=avg,
                     std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n, mode=mode)
        #theano.printing.debugprint(f)
        out = f(*input)
        #print 'random?[:10]\n', out[0, 0:10]
        basictest(f,
                  steps,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='mrg ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol)

        sys.stdout.flush()

        if mode != 'FAST_COMPILE' and cuda_available:
            #print ''
            #print 'ON GPU:'
            R = MRG_RandomStreams(234, use_cuda=True)
            n = R.normal(size=size,
                         avg=avg,
                         std=std,
                         dtype='float32',
                         nstreams=rng_mrg.guess_n_streams(size, warn=False))
            #well, it's really that this test w GPU doesn't make sense otw
            assert n.dtype == 'float32'
            f = theano.function(
                var_input,
                theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                           borrow=True),
                mode=mode_with_gpu)

            #theano.printing.debugprint(f)
            sys.stdout.flush()
            gpu_out = numpy.asarray(f(*input))
            #print 'random?[:10]\n', gpu_out[0, 0:10]
            #print '----'
            sys.stdout.flush()
            basictest(f,
                      steps,
                      const_size,
                      target_avg=avg,
                      target_std=std,
                      prefix='gpu mrg ',
                      allow_01=True,
                      inputs=input,
                      mean_rtol=rtol)
            # Need to allow some rounding error as there are float
            # computations done on the gpu vs cpu
            assert numpy.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)

        #print ''
        #print 'ON CPU w NUMPY:'
        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        nn = RR.normal(size=size, avg=avg, std=std)
        ff = theano.function(var_input, nn)

        basictest(ff,
                  steps,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='numpy ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol)
예제 #60
0
class SGHMCSampler(object):
    def __init__(self, rng=None, precondition=False, ignore_burn_in=False):
        if rng:
            self._srng = rng
        else:
            self._srng = RandomStreams(np.random.randint(1, 2147462579))
        self.precondition = precondition
        self.prepared = False
        self.ignore_burn_in = ignore_burn_in
        self.steps_burn_in = 0
        self.requires_burn_in = self.precondition
        self.optim_params = []
        self.initial_values = []

    def _store_initial_values(self, *params):
        self.optim_params = []
        self.initial_values = []
        for param in params:
            self.optim_params.append(param)
            self.initial_values.append(param.get_value())

    def prepare_updates(self,
                        cost,
                        params,
                        epsilon,
                        mdecay=0.05,
                        inputs=[],
                        scale_grad=1.,
                        A=None,
                        **kwargs):
        self.updates = []
        self.burn_in_updates = []
        grads = T.grad(cost, params)
        self.params = params
        self.cost = cost
        self.count = sharedX(0)
        self.epsilon = sharedX(np.float32(epsilon))
        self.mdecay = sharedX(np.float32(mdecay))
        self.inputs = inputs
        self.scale_grad = theano.shared(np.float32(scale_grad))
        if A is not None:
            # calculate mdecay based on A
            #raise NotImplementedError("TODO")
            eps_scaled = epsilon / np.sqrt(scale_grad)
            new_mdecay = A * eps_scaled
            self.mdecay.set_value(np.float32(new_mdecay))
            print("You specified A of {} -> changing mdecay to {}".format(
                A, mdecay))

        for theta, grad in zip(params, grads):
            xi = sharedX(theta.get_value() * 0. + 1,
                         broadcastable=theta.broadcastable)
            g = sharedX(theta.get_value() * 0. + 1,
                        broadcastable=theta.broadcastable)
            g2 = sharedX(theta.get_value() * 0. + 1,
                         broadcastable=theta.broadcastable)
            p = sharedX(theta.get_value() * 0.,
                        broadcastable=theta.broadcastable)
            r_t = 1. / (xi + 1.)
            self._store_initial_values(xi, g, g2, p)
            if self.precondition:
                g_t = (1. - r_t) * g + r_t * grad
                g2_t = (1. - r_t) * g2 + r_t * grad**2
                xi_t = 1. + xi * (1. - g * g / (g2 + 1e-16))
                Minv = 1. / (T.sqrt(g2 + 1e-16) + 1e-16)
                self.burn_in_updates.append((g, g_t))
                self.burn_in_updates.append((g2, g2_t))
                self.burn_in_updates.append((xi, xi_t))
                noise = 0.
            else:
                Minv = 1.
                noise = 0.
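            # noise_scale below is the variance of the Gaussian injected into
            # the momentum update; `noise` above is a placeholder (zero)
            # estimate of the minibatch gradient noise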
            self.epsilon_scaled = self.epsilon / T.sqrt(self.scale_grad)
            noise_scale = 2. * self.epsilon_scaled**2 * self.mdecay * Minv - 2. * self.epsilon_scaled**3 * T.square(
                Minv) * noise
            sigma = T.sqrt(T.maximum(noise_scale, 1e-16))
            sample_t = self._srng.normal(size=theta.shape) * sigma
            p_t = p - self.epsilon**2 * Minv * grad - self.mdecay * p + sample_t
            theta_t = theta + p_t
            self.updates.append((theta, theta_t))
            self.updates.append((p, p_t))
        self.prepared = True
        if self.ignore_burn_in:
            self.updates += self.burn_in_updates
            return self.updates
        else:
            return self.updates, self.burn_in_updates

    def step(self, *inp):
        if not self.prepared:
            raise RuntimeError(
                "You called step() without a prior call to prepare_updates()")
        if not hasattr(self, "step_fun"):
            print("... compiling theano function")

            self.step_fun = theano.function(self.inputs,
                                            self.cost,
                                            updates=self.updates)
        if not self.ignore_burn_in and self.steps_burn_in < 1 and self.requires_burn_in:
            raise RuntimeError(
                "Your sampler requires a burn-in; please run step_burn_in() for a few steps"
            )
        nll = self.step_fun(*inp)
        return self.params, nll

    def step_burn_in(self, *inp):
        if not self.prepared:
            raise RuntimeError(
                "You called step_burn_in() without a prior call to prepare_updates()"
            )
        if not hasattr(self, "step_fun_burn_in"):
            print("... compiling theano function")
            if self.ignore_burn_in:
                self.step_fun_burn_in = theano.function(self.inputs,
                                                        self.cost,
                                                        updates=self.updates)
            else:
                self.step_fun_burn_in = theano.function(self.inputs,
                                                        self.cost,
                                                        updates=self.updates +
                                                        self.burn_in_updates)

        nll = self.step_fun_burn_in(*inp)
        self.steps_burn_in += 1
        return self.params, nll

    def reset(self, n_samples, epsilon, reset_opt_params=False, **kwargs):
        if self.prepared:
            self.epsilon.set_value(np.float32(epsilon))
            self.scale_grad.set_value(np.float32(n_samples))
            if hasattr(self, "mdecay"):
                if "mdecay" in kwargs:
                    self.mdecay.set_value(np.float32(kwargs["mdecay"]))
                elif "A" in kwargs:
                    eps_scaled = self.epsilon.get_value() / np.sqrt(n_samples)
                    new_mdecay = kwargs["A"] * eps_scaled
                    self.mdecay.set_value(np.float32(new_mdecay))
            if reset_opt_params:
                for param, value in zip(self.optim_params,
                                        self.initial_values):
                    param.set_value(value)
        else:
            raise RuntimeError("reset called before prepare")