Example #1
    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one training
        step of the dA """

        # this is how if-then-else is written in Theano
        tilde_x = T.switch(T.gt(corruption_level, 0), self.get_corrupted_input(self.x, corruption_level), self.x)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        act = T.dot(tilde_x, self.W) + self.b
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in minibatch
        # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the
        #        cross-entropy cost of the reconstruction of the
        #        corresponding example of the minibatch. We need to
        #        compute the average of all these to get the cost of
        #        the minibatch
        
        L = T.sqrt(T.sum(T.sqr(T.sub(self.x, z)), axis=1))
        reg = T.sum(y, axis=0) / T.shape(y)[0] # mean hidden activation over the minibatch
        rho = T.constant(0.05)
        beta = T.constant(self.beta)
        reg1 = T.sum(rho * T.log(rho / reg) + (1-rho) * T.log((1-rho) / (1-reg)))
        cost = T.mean(L) + beta * reg1

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - learning_rate * gparam
        
        return (cost, collections.OrderedDict(updates.items()))
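A minimal usage sketch of how the returned pair plugs into a Theano training function (hypothetical wiring: `da`, `train_set_x`, `batch_size`, and `n_train_batches` are not part of the example above):

    # hypothetical training loop, assuming `da` is a dA instance and
    # `train_set_x` is a shared variable holding the training data
    index = T.lscalar('index')
    cost, updates = da.get_cost_updates(corruption_level=0.3, learning_rate=0.1)
    train_da = theano.function(
        [index], cost, updates=updates,
        givens={da.x: train_set_x[index * batch_size:(index + 1) * batch_size]})
    for minibatch_index in range(n_train_batches):
        minibatch_cost = train_da(minibatch_index)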
Example #2
    def compute_output(self, network):
        hyperparameter_name = network.find_hyperparameter(["hyperparameter"])
        # TODO add default hyperparameter
        res = network.find_hyperparameter([hyperparameter_name])
        if utils.is_number(res):
            var = T.constant(res)
            shape = ()
        elif utils.is_ndarray(res):
            var = T.constant(res)
            shape = res.shape
        elif utils.is_shared_variable(res):
            var = res
            shape = res.get_value().shape
        elif utils.is_nonshared_variable(res):
            var = res
            if res.ndim == 0:
                shape = ()
            else:
                shape = network.find_hyperparameter(["shape"])
        else:
            raise ValueError("Unknown hyperparameter type of %s" % res)

        network.create_vw(
            "default",
            variable=var,
            shape=shape,
            tags={"output"},
        )
Example #3
def test_alloc_memset_0():
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
Example #4
    def __init__(self, input_dim, N, init_scale=2.0):
        """
        A zoomable attention window for 1-dimensional inputs.

        Parameters
        ----------
        input_dim : int
            length of the input vectors
        N : int
            length of the attention window
        init_scale : float
            initial scaling for inputs vs. attention window
        """
        self.input_dim = input_dim
        self.N = N
        self.init_scale = init_scale
        # make offsets for internal dispersal of grid points.
        #   -- internal grid coordinates range over [-1...+1]
        offsets = np.arange(N) - (N / 2.0) + 0.5
        offsets = offsets / np.max(offsets)
        offsets = offsets.astype(theano.config.floatX)
        self.grid_offsets = T.constant(offsets)
        # make coordinate vectors for location in the input.
        #   -- coordinates for the smallest dimension are scaled to range over
        #      [-init_scale....init_scale].
        x_coords = (np.arange(input_dim) - (input_dim / 2.0) + 0.5)
        x_coords = (init_scale / np.max(x_coords)) * x_coords
        x_coords = x_coords.astype(theano.config.floatX)
        self.x_coords = T.constant(x_coords)
        return
Example #5
def hard_sigmoid(x):
    out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype=x.dtype))[0].dtype
    slope = T.constant(0.2, dtype=out_dtype)
    shift = T.constant(0.5, dtype=out_dtype)
    x = (x * slope) + shift
    x = T.clip(x, 0, 1)
    return x
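A quick sanity check of the piecewise-linear shape (a sketch; assumes `theano` is imported and `T` is `theano.tensor` as in the snippet above):

    import numpy as np
    x = T.vector('x')
    f = theano.function([x], hard_sigmoid(x))
    print(f(np.array([-5.0, -2.5, 0.0, 2.5, 5.0], dtype='float32')))
    # clipped to [0, 1] outside [-2.5, 2.5], slope 0.2 in between:
    # -> [0.0, 0.0, 0.5, 1.0, 1.0]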
Example #6
    def generate_forward_diffusion_sample(self, X_noiseless):
        """
        Corrupt a training image with t steps worth of Gaussian noise, and
        return the corrupted image, as well as the mean and covariance of the
        posterior q(x^{t-1}|x^t, x^0).
        """

        X_noiseless = X_noiseless.reshape(
            (-1, self.n_colors, self.spatial_width, self.spatial_width))

        n_images = X_noiseless.shape[0].astype('int16')
        rng = Random().theano_rng
        # choose a timestep in [1, self.trajectory_length-1].
        # note the reverse process is fixed for the very
        # first timestep, so we skip it.
        # TODO for some reason random_integer is missing from the Blocks
        # theano random number generator.
        t = T.floor(rng.uniform(size=(1,1), low=1, high=self.trajectory_length,
            dtype=theano.config.floatX))
        t_weights = self.get_t_weights(t)
        N = rng.normal(size=(n_images, self.n_colors, self.spatial_width, self.spatial_width),
            dtype=theano.config.floatX)

        # noise added this time step
        beta_forward = self.get_beta_forward(t)
        # decay in noise variance due to original signal this step
        alpha_forward = 1. - beta_forward
        # compute total decay in the fraction of the variance due to X_noiseless
        alpha_arr = 1. - self.beta_arr
        alpha_cum_forward_arr = T.extra_ops.cumprod(alpha_arr).reshape((self.trajectory_length,1))
        alpha_cum_forward = T.dot(t_weights.T, alpha_cum_forward_arr)
        # total fraction of the variance due to noise being mixed in
        beta_cumulative = 1. - alpha_cum_forward
        # total fraction of the variance due to noise being mixed in one step ago
        beta_cumulative_prior_step = 1. - alpha_cum_forward/alpha_forward

        # generate the corrupted training data
        X_uniformnoise = X_noiseless + (rng.uniform(size=(n_images, self.n_colors, self.spatial_width, self.spatial_width),
            dtype=theano.config.floatX)-T.constant(0.5,dtype=theano.config.floatX))*T.constant(self.uniform_noise,dtype=theano.config.floatX)
        X_noisy = X_uniformnoise*T.sqrt(alpha_cum_forward) + N*T.sqrt(1. - alpha_cum_forward)

        # compute the mean and covariance of the posterior distribution
        mu1_scl = T.sqrt(alpha_cum_forward / alpha_forward)
        mu2_scl = 1. / T.sqrt(alpha_forward)
        cov1 = 1. - alpha_cum_forward/alpha_forward
        cov2 = beta_forward / alpha_forward
        lam = 1./cov1 + 1./cov2
        mu = (
                X_uniformnoise * mu1_scl / cov1 +
                X_noisy * mu2_scl / cov2
            ) / lam
        sigma = T.sqrt(1./lam)
        sigma = sigma.reshape((1,1,1,1))

        mu.name = 'mu q posterior'
        sigma.name = 'sigma q posterior'
        X_noisy.name = 'X_noisy'
        t.name = 't'

        return X_noisy, t, mu, sigma
Example #7
def lcn_std_diff(x,size=9):
    # Function borrowed from bengioe_util
    p = x.reshape((1,1,48,48))
    #p = (p-TT.mean(p))/T.std(p)
    g = gaussian(size,1.591/size)
    g/=g.sum()
    g = numpy.float32(g.reshape((1,1,size,size)))
    mean = TT.nnet.conv.conv2d(p,TT.constant(g),
                              (1,1,48,48),
                              (1,1,size,size),
                              'full').reshape((48+size-1,)*2)
    mean = mean[size/2:48+size/2,
                size/2:48+size/2]
    meansq = TT.nnet.conv.conv2d(TT.sqr(p),TT.constant(g),
                                (1,1,48,48),
                                (1,1,size,size),
                                'full').reshape((48+size-1,)*2)
    meansq = meansq[size/2:48+size/2,
                    size/2:48+size/2]
    var = meansq - TT.sqr(mean)
    var = TT.clip(var, 0, 1e30)
    std = TT.sqrt(var)
    std = TT.clip(std, TT.mean(std), 1e30)
    out = (p - mean) / std
    return out - out.min()
Example #8
    def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
        clip = params["grad_clip"]
        decay_rate = tensor.constant(params["decay_rate"], dtype=theano.config.floatX)
        smooth_eps = tensor.constant(params["smooth_eps"], dtype=theano.config.floatX)
        zipped_grads = [theano.shared(np.zeros_like(p.get_value()), name="%s_grad" % k) for k, p in tparams.iteritems()]
        running_grads2 = [
            theano.shared(np.zeros_like(p.get_value()), name="%s_rgrad2" % k) for k, p in tparams.iteritems()
        ]
        zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
        if clip > 0.0:
            rg2up = [
                (
                    rg2,
                    tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf),
                )
                for rg2, g in zip(running_grads2, grads)
            ]
        else:
            rg2up = [
                (rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf))
                for rg2, g in zip(running_grads2, grads)
            ]

        f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up, name="rmsprop_f_grad_shared")

        updir = [theano.shared(p.get_value() * numpy_floatX(0.0), name="%s_updir" % k) for k, p in tparams.iteritems()]
        updir_new = [
            (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)) for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)
        ]
        param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
        f_update = theano.function(
            [lr], [], updates=updir_new + param_up, on_unused_input="ignore", name="rmsprop_f_update"
        )

        return f_grad_shared, f_update, zipped_grads, running_grads2, updir
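The two compiled functions are meant to be called in alternation each minibatch; a sketch of the call pattern with hypothetical names (`solver`, `minibatches`, and the `x`/`mask`/`y` inputs stand in for whatever `inp_list` declares):

    # hypothetical call pattern, not from the original code
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update, zg, rg2, ud = solver.rmsprop(
        lr, tparams, grads, inp_list, cost, params)
    for x, mask, y in minibatches:
        cost_val = f_grad_shared(x, mask, y)  # accumulate grads and RMS statistics
        f_update(0.001)                       # apply the RMSProp parameter step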
Example #9
def lcn(x,ishape,size=9):
    # Function borrowed from bengioe_util
    """
    expects x to be tensor{3|4}, the first dimension being the number
    of images, and the last two the shape of the image (which should be
    given anyway, for optimization purposes)
    """
    inshape = (x.shape[0],1,ishape[0],ishape[1])
    p = x.reshape(inshape)
    #p = (p-TT.mean(p))/T.std(p)
    g = gaussian(size,1.591/size)
    g/=g.sum()
    g = numpy.float32(g.reshape((1,1,size,size)))
    mean = TT.nnet.conv.conv2d(p,TT.constant(g),
                              None,
                              (1,1,size,size),
                              'full').reshape(
                                  (x.shape[0],1)+(ishape[0]+size-1,)*2)
    mean = mean[:,:,
                size/2:ishape[0]+size/2,
                size/2:ishape[1]+size/2]
    v = (p - mean)#.dimshuffle('x','x',0,1)
    var = TT.nnet.conv.conv2d(TT.sqr(v),TT.constant(g),
                             None,
                             (1,1,size,size),
                             'full').reshape(
                                  (x.shape[0],1)+(ishape[0]+size-1,)*2)
    var = var[:,:,
              size/2:ishape[0]+size/2,
              size/2:ishape[1]+size/2]
    std = TT.sqrt(var)
    std_mean = TT.mean(TT.mean(std,axis=3),axis=2).dimshuffle(0,1,'x','x')
    out = v / TT.maximum(std,std_mean)
    return (out + 2.5) / 5  # - out.min()
Example #10
 def __init__(self, img_height, img_width, obj_type='circle', obj_scale=0.2):
     """
     A class for drawing a few simple objects with subpixel resolution.
     """
     self.img_height = img_height
     self.img_width = img_width
     self.obj_type = obj_type
     self.obj_scale = obj_scale
     # make coordinate system for points in the object to render
     obj_x_coords, obj_y_coords = self._construct_obj_coords( \
                 obj_type=self.obj_type, obj_scale=self.obj_scale)
     self.obj_x = T.constant(obj_x_coords)
     self.obj_y = T.constant(obj_y_coords)
     self.obj_x_range = [np.min(obj_x_coords), np.max(obj_x_coords)]
     self.obj_y_range = [np.min(obj_y_coords), np.max(obj_y_coords)]
     # make coordinate system for x and y location in the image.
     #   -- image coordinates for the smallest dimension range over
     #      [-init_scale....init_scale], and coordinates for the largest
     #      dimension are at the same scale, but over a larger range.
     img_x_coords, img_y_coords = self._construct_img_coords( \
                 x_dim=self.img_width, y_dim=self.img_height)
     self.img_x = T.constant(img_x_coords)
     self.img_y = T.constant(img_y_coords)
     self.img_x_range = [np.min(img_x_coords), np.max(img_x_coords)]
     self.img_y_range = [np.min(img_y_coords), np.max(img_y_coords)]
     return
Example #11
    def _init_params_(self, kbm, kbm_mask, emb, word_size=100, hidden_size=400, prefix='KBMN_'):
        # L2-normalize the embedding matrix
        emb_ = np.sqrt(np.sum(emb ** 2, axis=1))
        emb = emb / np.dot(emb_.reshape(-1, 1), np.ones((1, emb.shape[1])))
        emb[0, :] = 0.

        self.emb = theano.shared(
            value=np.asarray(emb, dtype=theano.config.floatX),
            name=prefix + 'emb',
            borrow=True
        )

        self.kbm = T.constant(
            x=kbm,
            name=prefix + 'kbm',
            ndim=2,
            dtype='int32'
        )

        self.kbm_mask = T.constant(
            x=kbm_mask,
            name=prefix + 'kbm_mask',
            ndim=2,
            dtype=theano.config.floatX
        )

        def _random_weights(x_dim, y_dim):
            return np.random.uniform(
                low=-np.sqrt(6. / (x_dim + y_dim)),
                high=np.sqrt(6. / (x_dim + y_dim)),
                size=(x_dim, y_dim)
            ).astype(theano.config.floatX)

        self.gru_W = theano.shared(
            value=np.concatenate(
                [_random_weights(word_size, hidden_size),
                 _random_weights(word_size, hidden_size),
                 _random_weights(word_size, hidden_size)],
                axis=1
            ).astype(theano.config.floatX),
            name=prefix+'gru_W',
            borrow=True
        )

        self.gru_U = theano.shared(
            value=np.concatenate(
                [_random_weights(hidden_size, hidden_size),
                 _random_weights(hidden_size, hidden_size),
                 _random_weights(hidden_size, hidden_size)],
                axis=1
            ).astype(theano.config.floatX),
            name=prefix+'gru_U',
            borrow=True
        )

        self.gru_B = theano.shared(
            value=np.zeros((3 * hidden_size,)).astype(theano.config.floatX),
            name=prefix+'b',
            borrow=True
        )
Example #12
 def test_constant(self):
     ## Re-init counter
     Variable.__count__ = count(0)
     r1 = tensor.constant(1.5)
     r2 = tensor.constant(1.5)
     assert r1.auto_name == "auto_0"
     assert r2.auto_name == "auto_1"
Example #13
def test_mixture_api():
    # Check basic API
    p1 = Normal(mu=0.0, sigma=T.constant(1.0))
    p2 = Normal(mu=1.0, sigma=2.0)
    m = Mixture(components=[p1, p2], weights=[0.25])

    assert len(m.components) == 2
    assert len(m.weights) == 2

    assert len(m.parameters_) == 4
    assert len(m.constants_) == 1
    assert len(m.observeds_) == 0

    assert p1.mu in m.parameters_
    assert p1.sigma in m.constants_
    assert p2.mu in m.parameters_
    assert p2.sigma in m.parameters_
    assert m.X == p1.X
    assert m.X == p2.X
    assert m.ndim == p1.ndim
    assert m.ndim == p2.ndim

    m = Mixture(components=[p1, p2])
    w = m.compute_weights()
    assert_array_equal(w, [0.5, 0.5])

    y = T.dscalar(name="y")
    w1 = T.constant(0.25)
    w2 = y * 2
    m = Mixture(components=[p1, p2], weights=[w1, w2])
    assert y in m.observeds_

    # Check errors
    assert_raises(ValueError, Mixture,
                  components=[p1, p1, p1], weights=[1.0])
Example #14
    def test_transform_thin_plate_spline_variable_input(self):
        import lasagne
        from lasagne.utils import floatX
        from theano.tensor import constant

        x = np.random.random((10, 3, 28, 28)).astype('float32')
        x_sym = theano.tensor.tensor4()

        l_in = lasagne.layers.InputLayer((None, 3, None, 28))
        l_loc = lasagne.layers.DenseLayer(
                lasagne.layers.ReshapeLayer(l_in, ([0], 3*28*28)),
                num_units=32)
        l_trans = lasagne.layers.TPSTransformerLayer(
                l_in, l_loc, precompute_grid='auto')

        # check that shape propagation works
        assert l_trans.output_shape[0] is None
        assert l_trans.output_shape[1] == 3
        assert l_trans.output_shape[2] is None
        assert l_trans.output_shape[3] == 28

        # check that data propagation works
        dest_offset = np.zeros(shape=(10, 32))
        inputs = floatX(np.arange(np.prod(x.shape)).reshape(x.shape))
        outputs = l_trans.get_output_for([constant(inputs),
                                          constant(dest_offset)]).eval()
        np.testing.assert_allclose(inputs, outputs, atol=5e-4)
Example #15
 def softmax(self, D, I):
   D = D * T.constant(self.attrs['sharpening'], 'float32')
   if self.attrs['norm'] == 'exp':
     E = T.exp(-D) * I
     E = E / T.maximum(T.sum(E,axis=0,keepdims=True),T.constant(1e-20,'float32'))
   elif self.attrs['norm'] == 'sigmoid':
     E = (numpy.float32(1) - T.tanh(D)**2) * I
   elif self.attrs['norm'] == 'lstm':
     n_out = self.attrs['template']
     def lstm(z, i_t, s_p, h_p):
       z += T.dot(h_p, self.N_re)
       i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
       ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out])
       forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out])
       outgate = T.nnet.sigmoid(z[:,3 * n_out:])
       input = T.tanh(z[:,:n_out])
       s_t = input * ingate + s_p * forgetgate
       h_t = T.tanh(s_t) * outgate
       return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
     E, _ = theano.scan(lstm, sequences=[D,I], outputs_info=[T.zeros((n_out,), 'float32'), T.zeros((n_out,), 'int32')])
     E = T.nnet.sigmoid(T.dot(E,self.N_out))
   else:
     raise NotImplementedError()
   if self.attrs['nbest'] > 1:
     opt = T.minimum(self.attrs['nbest'], E.shape[0])
     score = (T.sort(E, axis=0)[-opt]).dimshuffle('x',0).repeat(E.shape[0],axis=0)
     E = T.switch(T.lt(E,score), T.zeros_like(E), E)
   return E
Example #16
    def _allocate(self):
        input_dim = ((self.input_dim,)
                     if not isinstance(self.input_dim, collections.Sequence)
                     else self.input_dim)
        broadcastable = (tuple(False for _ in input_dim)
                         if self.broadcastable is None else self.broadcastable)
        if len(input_dim) != len(broadcastable):
            raise ValueError("input_dim and broadcastable must be same length")
        var_dim = tuple(1 if broadcast else dim for dim, broadcast in
                        equizip(input_dim, broadcastable))
        broadcastable = broadcastable

        # "beta", from the Ioffe & Szegedy manuscript.
        if self.learn_shift:
            self.shift = shared_floatx_nans(var_dim, name='batch_norm_shift',
                                            broadcastable=broadcastable)
            add_role(self.shift, BATCH_NORM_SHIFT_PARAMETER)
            self.parameters.append(self.shift)
        else:
            self.shift = tensor.constant(0, dtype=theano.config.floatX)

        if self.learn_scale and not self.mean_only:
            # "gamma", from the Ioffe & Szegedy manuscript.
            self.scale = shared_floatx_nans(var_dim, name='batch_norm_scale',
                                            broadcastable=broadcastable)

            add_role(self.scale, BATCH_NORM_SCALE_PARAMETER)
            self.parameters.append(self.scale)
        else:
            self.scale = tensor.constant(1., dtype=theano.config.floatX)

        self._allocate_population_statistics(var_dim, broadcastable)
Example #17
 def test_transform_thin_plate_spline_shift(self):
     from lasagne.layers import InputLayer, TPSTransformerLayer
     from theano.tensor import constant
     batchsize = 5
     num_control_points = 16
     dest_offset = np.ones(shape=(batchsize, 2*num_control_points))
     l_in = InputLayer((batchsize, 3, 28, 28))
     l_loc = InputLayer((batchsize, 2*num_control_points))
     layer = TPSTransformerLayer(
             l_in, l_loc, control_points=num_control_points
     )
     image = np.zeros(shape=(28, 28))
     image[[0, -1], :] = 1
     image[:, [0, -1]] = 1
     inputs = np.tile(image, (batchsize, 3, 1, 1))
     shifted_input = np.ones(shape=(28, 28))
     shifted_input[:13, :13] = 0
     shifted_input[13, :13] = 0.50000271
     shifted_input[:13, 13] = 0.50000271
     shifted_input[13, 13] = 0.75000271
     shifted_input = np.tile(shifted_input, (batchsize, 3, 1, 1))
     outputs = layer.get_output_for([constant(inputs),
                                     constant(dest_offset)]).eval()
     np.testing.assert_allclose(shifted_input,
                                outputs, atol=1e-5)
Example #18
File: model.py Project: DrDark/pymc
    def __init__(self, name, data, distribution, model):
        """
        Parameters
        ----------

        type : theano type (optional)
        owner : theano owner (optional)

        name : str
        distribution : Distribution
        model : Model
        """
        self.name = name
        data = getattr(data, 'values', data) #handle pandas
        args = as_iterargs(data)

        if len(args) > 1:
            params = getargspec(distribution.logp).args
            args = [t.constant(d, name=name + "_" + param)
                    for d,param in zip(args,params) ]
        else:
            args = [t.constant(args[0], name=name)]

        self.logp_elemwiset = distribution.logp(*args)
        self.model = model
Example #19
def test_binary_hinge_loss():
    x = np.array([[-1.5, -1, -0.5, 0, 0.5, 1, 1.5]] * 2, dtype=fX)
    y = np.array([[0] * 7, [1] * 7], dtype=fX)
    res = treeano.utils.binary_hinge_loss(T.constant(x), T.constant(y)).eval()
    ans = np.array([[0, 0, 0.5, 1, 1.5, 2, 2.5], [2.5, 2, 1.5, 1, 0.5, 0, 0]],
                   dtype=fX)
    np.testing.assert_equal(res, ans)
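The expected values follow the usual hinge loss with {0, 1} labels mapped to {-1, +1}; a NumPy cross-check of the same numbers (independent of treeano, shown only to make the convention explicit):

    import numpy as np
    x = np.array([[-1.5, -1, -0.5, 0, 0.5, 1, 1.5]] * 2)
    y = np.array([[0] * 7, [1] * 7])
    sign = 2 * y - 1                      # {0, 1} -> {-1, +1}
    hinge = np.maximum(0, 1 - sign * x)   # reproduces `ans` row for row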
Example #20
def test_draw_value():
    npt.assert_equal(_draw_value(np.array([5, 6])), [5, 6])
    npt.assert_equal(_draw_value(np.array(5.)), 5)

    npt.assert_equal(_draw_value(tt.constant([5., 6.])), [5, 6])
    assert _draw_value(tt.constant(5)) == 5
    npt.assert_equal(_draw_value(2 * tt.constant([5., 6.])), [10, 12])

    val = theano.shared(np.array([5., 6.]))
    npt.assert_equal(_draw_value(val), [5, 6])
    npt.assert_equal(_draw_value(2 * val), [10, 12])

    a = tt.scalar('a')
    a.tag.test_value = 6
    npt.assert_equal(_draw_value(2 * a, givens=[(a, 1)]), 2)

    assert _draw_value(5) == 5
    assert _draw_value(5.) == 5
    assert isinstance(_draw_value(5.), type(5.))
    assert isinstance(_draw_value(5), type(5))

    with pm.Model():
        mu = 2 * tt.constant(np.array([5., 6.])) + theano.shared(np.array(5))
        a = pm.Normal('a', mu=mu, sd=5, shape=2)

    val1 = _draw_value(a)
    val2 = _draw_value(a)
    assert np.all(val1 != val2)

    with pytest.raises(ValueError) as err:
        _draw_value([])
    err.match('Unexpected type')
Example #21
 def add_param(self, param, name="", constraints=True,
               custom_update=None, custom_update_normalized=False, custom_update_exp_average=0,
               custom_update_condition=None, custom_update_accumulate_batches=None):
   """
   :type param: theano.SharedVariable
   :type name: str
   :rtype: theano.SharedVariable
   """
   param = super(Layer, self).add_param(param, name)
   if custom_update:
     # Handled in Device and Updater.
     param.custom_update = custom_update
     param.custom_update_normalized = custom_update_normalized
     param.custom_update_exp_average = custom_update_exp_average
     param.custom_update_condition = custom_update_condition
     param.custom_update_accumulate_batches = custom_update_accumulate_batches
   if constraints:
     if 'L1' in self.attrs and self.attrs['L1'] > 0:
       self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
     if 'L2' in self.attrs and self.attrs['L2'] > 0:
       self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param**2).sum()
     if self.attrs.get('L2_eye', 0) > 0:
       L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
       if param.ndim == 2:
         eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
         self.constraints += L2_eye * ((param - eye)**2).sum()
       else:  # standard L2
         self.constraints += L2_eye * (param**2).sum()
     if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
       self.constraints += self.attrs['varreg'] * (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape))**2
   return param
Example #22
    def get_updates(self, cost, learning_rate, momentum):
        if not self.params:
            self.learning_rate = T.constant(0)
            return {}

        if self.grads is None:
            self.grads = [theano.shared(np.zeros_like(p.get_value()))
                          for p in self.params]

        # compute the gradients of the cost with respect to the parameters
        gparams = T.grad(cost, self.params, disconnected_inputs='ignore')
        grad_mult = self.conf.geteval('grad_mult', None)
        if grad_mult is not None:
            grad_mult = T.constant(grad_mult, dtype=floatX)
            gparams = [g * grad_mult for g in gparams]

        clip = self.conf.getfloat('grad_clip', None)
        if clip is not None:
            gparams = [T.clip(g, -clip, clip) for g in gparams]

        self.gparams = gparams

        # generate the list of updates
        gupdates = OrderedDict()
        pupdates = OrderedDict()

        self.learning_rate = self.conf.getfloat('learning_rate', None)
        if self.learning_rate:
            self.learning_rate = T.constant(self.learning_rate)
        else:
            self.learning_rate = learning_rate
        for (gparam, param, gold) in zip(gparams, self.params, self.grads):
            lrscale = self.conf.getfloat(
                                'learning_rate_scale_%s' % param.name,
                                None)
            if lrscale is None:
                lrscale = self.conf.getfloat('learning_rate_scale', 1.0)
            decay = self.conf.getfloat('weight_decay_%s' % param.name, 0.0)

            lr = self.learning_rate
            if lrscale != 1.0:
                lr *= lrscale

            if decay:
                gparam += decay * param

            if momentum:
                gnew = momentum * gold + gparam
                gupdates[gold] = gnew
                pupdates[param] = param - lr * gnew
            else:
                gupdates[gold] = gparam
                pupdates[param] = param - lr * gparam

        # apply update constraints
        for (p, constraint) in self.constraints.iteritems():
            pupdates[p] = constraint(pupdates[p])

        return OrderedDict(gupdates.items() + pupdates.items())
Example #23
 def test_constant(self):
     # Get counter value
     autoname_id = next(Variable.__count__)
     Variable.__count__ = count(autoname_id)
     r1 = tensor.constant(1.5)
     r2 = tensor.constant(1.5)
     assert r1.auto_name == "auto_" + str(autoname_id)
     assert r2.auto_name == "auto_" + str(autoname_id + 1)
Example #24
 def __init__(self, incoming, means, covariances, weights, patch_size, 
              pool_func=T.sum, **kwargs):
     self.means = T.constant(means, dtype=theano.config.floatX)
     self.covariances = T.constant(covariances, dtype=theano.config.floatX)
     self.weights = T.constant(weights, dtype=theano.config.floatX)
     self.patch_size = patch_size
     self.pool_func = pool_func
     super(GaussianMixtureSimilarityLayer,self).__init__(incoming, **kwargs)
Example #25
 def compute_output(self, network, state_vw, sampled_vw):
     W = T.constant(TARGET_WEIGHT)
     b = T.constant(TARGET_BIAS)
     target = T.dot(state_vw.variable, W) + b.dimshuffle("x", 0)
     reward = -T.sqr(sampled_vw.variable - target).sum(axis=1)
     network.create_vw("raw_reward", variable=T.mean(reward), shape=())
     baseline_reward = 100
     network.create_vw("default", variable=reward + baseline_reward, shape=(state_vw.shape[0],), tags={"output"})
Example #26
    def test_dtype_normal_uniform_687(self):
        # Regression test for #687.
        rng_R = random_state_type()
        assert uniform(rng_R, low=tensor.constant(0, dtype='float64'),
                       dtype='float32')[1].dtype == 'float32'

        assert normal(rng_R, avg=tensor.constant(0, dtype='float64'),
                      dtype='float32')[1].dtype == 'float32'
Example #27
    def _build_expression(self, input_expression=None):
        if self.pool_type not in ['max', 'avg']:
            raise NotImplementedError(
                'Pooling only implemented for max and avg')

        if input_expression is None:
            self.input_ = T.tensor4(dtype=self.input_dtype)
        else:
            self.input_ = input_expression

        # Replicating caffe style pooling means zero padding
        # then strided pooling with ignore_border=True
        if self.padding in [0, (0, 0)]:
            padded_input = self.input_
        else:
            zero_padder = ZeroPad(padding=self.padding)
            zero_padder._build_expression(self.input_)
            padded_input = zero_padder.expression_
        if self.pool_type == 'max':
            pooled = fancy_max_pool(padded_input,
                                    self.pool_shape, self.pool_stride,
                                    ignore_border=False)
        elif self.pool_type == 'avg':
            # self.pool_shape needs to be a tuple
            avg_kernel = T.cast(T.ones((1, 1) + self.pool_shape,
                                dtype=self.input_.dtype
                                ) / np.prod(self.pool_shape),
                                self.input_.dtype)
            n_imgs = self.input_.shape[0]
            n_channels = self.input_.shape[1]
            conv_output = T.nnet.conv2d(
                padded_input.reshape((n_imgs * n_channels, 1,
                                      padded_input.shape[2],
                                      padded_input.shape[3])),
                avg_kernel, subsample=self.pool_stride)
            pooled = conv_output.reshape((n_imgs, n_channels,
                                         conv_output.shape[2],
                                         conv_output.shape[3]))

        # A caffe quirk: The output shape is (for width, analogous for h:)
        # ceil((w + 2 * pad_w - kernel_w) / stride_w) + 1, instead of floor
        # With floor, ignore_border=True would have yielded the exact result
        # With ceil, sometimes we need an extra column and/or line. So we do
        # ignore_border=False and then crop to the right shape. Since the
        # shape is dynamic we need to first calculate it:

        # padding gotta be a tuple too
        pad = T.constant(self.padding)
        # pad = T.constant(zero_padder.padding_)
        # supposing here that self.pool_shape is a tuple. Should check
        pool_shape = T.constant(self.pool_shape)
        # stride hopefully a tuple, too
        pool_stride = T.constant(self.pool_stride, dtype='float64')
        float_shape = (self.input_.shape[2:4] + 2 * pad
                       - pool_shape) / pool_stride + 1
        output_shape = T.cast(T.ceil(float_shape), dtype='int64')
        self.expression_ = pooled[:, :, 0:output_shape[0],
                                        0:output_shape[1]]
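A small worked instance of that quirk (hypothetical numbers, not taken from the snippet): with width 28, padding 0, kernel 3, and stride 2, Caffe's formula gives ceil((28 + 0 - 3) / 2) + 1 = ceil(12.5) + 1 = 14 output columns, whereas the floor variant would give 13; pooling with ignore_border=False and then cropping to `output_shape` reproduces the 14-column result.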
Example #28
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)
    if shape_as_symbolic:
        # instantiate a TensorConstant with the value (10, 10)
        shape = constant((10, 10))
    else:
        # Only one dimension is symbolic, with the others known
        if dim_as_symbolic:
            shape = (10, constant(10))
        else:
            shape = (10, 10)
    u0 = rng.uniform(shape)
    u1 = rng.uniform(shape)

    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    v0list = [f0() for i in range(3)]
    v1list = [f1() for i in range(3)]

    # print v0list
    # print v1list
    # assert that elements are different in a few ways
    assert numpy.all(v0list[0] != v0list[1])
    assert numpy.all(v1list[0] != v1list[1])
    assert numpy.all(v0list[0] != v1list[0])

    for v in v0list:
        assert v.shape == (10, 10)
        assert v.min() >= 0
        assert v.max() <= 1
        assert v.min() < v.max()
        assert .25 <= v.mean() <= .75
Example #29
 def dtw(i, q_p, b_p, Q, D, inf):
   i0 = T.eq(i, 0)
   # inf = T.cast(1e10,'float32') * T.cast(T.switch(T.eq(self.n,0), T.switch(T.eq(i,0), 0, 1), 1), 'float32')
   penalty = T.switch(T.and_(T.neg(n0), i0), big, T.constant(0.0, 'float32'))
   loop = T.constant(0.0, 'float32') + q_p
   forward = T.constant(0.0, 'float32') + T.switch(T.or_(n0, i0), 0, Q[i - 1])
   opt = T.stack([loop, forward])
   k_out = T.cast(T.argmin(opt, axis=0), 'int32')
   return opt[k_out, T.arange(opt.shape[1])] + D[i] + penalty, k_out
Example #30
def test_multiclass_hinge_loss():
    x = np.array([[0, 1], [1, 0], [0.5, 1.5], [0, 0.5]] * 2, dtype=fX)
    y = np.array([0] * 4 + [1] * 4, dtype="int32")
    res = treeano.utils.multiclass_hinge_loss(T.constant(x),
                                              T.constant(y)).eval()
    ans = np.array(
        [[1, 2], [1, 0], [1, 2], [1, 1.5], [0, 1], [2, 1], [0, 1], [0.5, 1]],
        dtype=fX)
    np.testing.assert_equal(res, ans)
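The reference values are consistent with an elementwise margin of max(0, 1 + x_j - x_correct); a NumPy cross-check (again independent of treeano, shown only to make the convention explicit):

    import numpy as np
    x = np.array([[0, 1], [1, 0], [0.5, 1.5], [0, 0.5]] * 2)
    y = np.array([0] * 4 + [1] * 4)
    x_correct = x[np.arange(len(y)), y][:, None]
    hinge = np.maximum(0, 1 + x - x_correct)  # matches `ans` above row for row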
Example #31
    def _run(self, num_features, num_timesteps, batch_size, mode):
        # determine shapes of inputs and targets depending on the batch size
        if batch_size == 1:
            inputs_size = (num_timesteps, num_features)
            targets_size = (num_timesteps, 1)
        else:
            inputs_size = (num_timesteps, batch_size, num_features)
            targets_size = (num_timesteps, batch_size, 1)

        # make inputs and targets shared variables
        inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
            config.floatX),
                               borrow=True)
        targets = theano.shared(self.rng.uniform(size=targets_size).astype(
            config.floatX),
                                borrow=True)

        # create symbolic inputs and targets variables
        if batch_size == 1:
            x = T.matrix('inputs')
            t = T.matrix('targets')
        else:
            x = T.tensor3('inputs')
            t = T.tensor3('targets')
        x.tag.test_value = inputs.get_value(borrow=True)
        t.tag.test_value = targets.get_value(borrow=True)

        # create a set of parameters for a simple RNN
        W_xh = theano.shared(
            (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
                config.floatX),
            borrow=True)
        W_hh = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
            borrow=True)
        W_hy = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
            borrow=True)
        b_h = theano.shared(numpy.zeros(10).astype(config.floatX), borrow=True)
        b_y = theano.shared(numpy.zeros(1).astype(config.floatX), borrow=True)

        params = [W_xh, W_hh, W_hy, b_h, b_y]

        # recurrent function
        def step(x_t, h_tm1):
            h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
            return h

        # build recurrent graph
        if batch_size == 1:
            h_0 = T.alloc(0.0, 10).astype(config.floatX)
        else:
            h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
        h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])
        # network output
        y = T.dot(h, W_hy) + b_y

        # Create Gauss-Newton-Matrix object. Not really of any use here, but I
        # need it for Hessian-Free optimization.
        gn = GaussNewtonMatrix(y)

        # compute MSE
        cost = ((t - y)**2).sum(axis=1).mean()

        # Compute the cost at some other point in the parameter
        # space. Not really of any use here, but this is how I do it
        # during certain iterations of CG in the HF algorithm. There,
        # it's in fact `pi + current update proposal`.  For simplicity,
        # I just multiply by 2 here.
        cost_ = theano.clone(cost,
                             replace=dict([(pi, 2 * pi) for pi in params]))

        # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
        # but for simplicity, I just take the parameters vector because it's
        # already there.
        Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

        # compile Theano function
        f = theano.function([], [cost_] + Gv,
                            givens={
                                x: inputs,
                                t: targets
                            },
                            mode=mode)
        # execute
        f()
Example #32
def const(value):
    return TT.constant(numpy.asarray(value, dtype=theano.config.floatX))
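Usage is a one-liner; a small sketch (assuming `theano.config.floatX` is 'float32'):

    c = const([1, 2, 3])
    print(c.dtype)   # 'float32'
    print(c.eval())  # array([1., 2., 3.], dtype=float32)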
Example #33
    def __init__(self,
                 key_index,
                 label_num,
                 pretrain_name=None,
                 encoder='lstm',
                 word_dim=300,
                 hidden='100_100',
                 dropout=0.5,
                 regularization_weight=0.0001,
                 optimizer_name='adagrad',
                 lr=0.1,
                 norm_lim=-1,
                 label2index_filename=None):
        self.label2index, self.index2label = self.load_label_index(
            label2index_filename, label_num)

        self.indexs = T.imatrix()  # (batch, max_len)
        self.golden = T.ivector()  # (batch, )
        self.max_len = T.iscalar()  # max length

        self.s1_mask = self.indexs[:, :self.max_len] > 0
        self.s1_mask = self.s1_mask * T.constant(1.0,
                                                 dtype=theano.config.floatX)

        if pretrain_name is None:
            self.embedding = WordEmbedding(
                key_index,
                dim=word_dim,
                initializer=UniformInitializer(scale=0.01))
        else:
            self.embedding = WordEmbedding(key_index,
                                           filename=pretrain_name,
                                           normalize=False,
                                           binary=True)
            assert self.embedding.dim == word_dim

        self.word_embeddings = self.embedding[self.indexs[:, :self.max_len]]

        if type(hidden) is str:
            hidden_dims = [int(hid) for hid in hidden.split('_')]
        else:
            hidden_dims = [hidden]

        if encoder == 'lstm':
            encoder_layer = LSTMEncoder(in_dim=word_dim,
                                        hidden_dim=hidden_dims[0],
                                        pooling='final',
                                        prefix="LSTM_",
                                        dropout=dropout)
        elif encoder == 'bilstm':
            encoder_layer = BiLSTMEncoder(in_dim=word_dim,
                                          hidden_dim=hidden_dims[0],
                                          pooling='final',
                                          prefix="BiLSTM_",
                                          bidirection_shared=True,
                                          dropout=dropout)
        elif encoder == 'recurrent':
            encoder_layer = RecurrentEncoder(in_dim=word_dim,
                                             hidden_dim=hidden_dims[0],
                                             pooling='final',
                                             prefix="Recurrent_",
                                             dropout=dropout)
        elif encoder == 'birecurrent':
            encoder_layer = BiRecurrentEncoder(in_dim=word_dim,
                                               hidden_dim=hidden_dims[0],
                                               pooling='final',
                                               prefix="BiRecurrent_",
                                               bidirection_shared=True,
                                               dropout=dropout)
        elif encoder == 'gru':
            encoder_layer = GRUEncoder(in_dim=word_dim,
                                       hidden_dim=hidden_dims[0],
                                       pooling='final',
                                       prefix="GRU_",
                                       dropout=dropout)
        elif encoder == 'bigru':
            encoder_layer = BiGRUEncoder(in_dim=word_dim,
                                         hidden_dim=hidden_dims[0],
                                         pooling='final',
                                         prefix="BiGRU_",
                                         bidirection_shared=True,
                                         dropout=dropout)
        elif encoder == 'cbow':
            encoder_layer = CBOWLayer(in_dim=word_dim, )
        elif encoder == 'cnn':
            encoder_layer = MultiFilterConvolutionLayer(
                in_dim=word_dim,
                hidden_dim=hidden_dims[0],
                pooling='max',
                prefix="ConvLayer_",
                kernel_sizes=CONV_FILTER_SIZES)
        else:
            raise NotImplementedError

        self.text_embedding = encoder_layer.forward_batch(
            self.word_embeddings, self.s1_mask)

        if len(hidden_dims) > 1:
            hidden_layer = MultiHiddenLayer(in_dim=encoder_layer.out_dim,
                                            hidden_dims=hidden_dims[1:],
                                            dropout=dropout,
                                            prefix='Full_Connected_Layer_')
            classifier_input = hidden_layer.forward_batch(self.text_embedding)
            classifier_input_dim = hidden_layer.out_dim
        else:
            classifier_input = self.text_embedding
            classifier_input_dim = encoder_layer.out_dim

        self.classifier = SoftmaxClassifier(classifier_input_dim,
                                            label_num,
                                            dropout=dropout)
        self.predict_loss = self.classifier.loss(classifier_input, self.golden)
        self.predict_prob = self.classifier.forward_batch(classifier_input)
        self.predict_label = T.argmax(self.predict_prob, axis=1)
        """Params in TextClassifier"""
        self.params = self.classifier.params + encoder_layer.params
        self.l2_norm = self.classifier.l2_norm + encoder_layer.l2_norm
        if len(hidden_dims) > 1:
            self.params += hidden_layer.params
            self.l2_norm += hidden_layer.l2_norm

        self.l2_loss = regularization_weight * self.l2_norm / 2
        self.loss = self.predict_loss + self.l2_loss
        """Opimizer and Loss"""
        if optimizer_name == 'adagrad':
            sgd_optimizer = AdaGradOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adadelta':
            sgd_optimizer = AdaDeltaOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'sgd':
            sgd_optimizer = SGDOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'momentum':
            sgd_optimizer = SGDMomentumOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adam':
            sgd_optimizer = AdamOptimizer(lr=lr, norm_lim=norm_lim)
        else:
            raise NotImplementedError

        self.train_indexs = T.ivector()
        self.train_data_x = shared_zero_matrix(shape=(5, 5),
                                               name="train_data_x",
                                               dtype=np.int32)
        self.train_data_y = shared_zero_matrix(shape=(5, ),
                                               name="train_data_y",
                                               dtype=np.int32)

        self.model_params = self.params + self.embedding.params
        """Theano Function"""
        if EMBEDDING_LR > 0:
            embedding_updates = SGDOptimizer(lr=EMBEDDING_LR,
                                             norm_lim=-1).get_update(
                                                 self.loss,
                                                 self.embedding.params)
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
            updates.update(embedding_updates)
        elif EMBEDDING_LR < 0:
            # Optimize Embedding using Global Optimizer
            self.params += self.embedding.params
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
        else:
            # Fix Embedding
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)

        self.train_batch = theano.function(
            inputs=[self.train_indexs, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
            updates=updates,
            givens=[(self.indexs, self.train_data_x[self.train_indexs]),
                    (self.golden, self.train_data_y[self.train_indexs])])

        self.loss_batch = theano.function(
            inputs=[self.indexs, self.golden, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
        )

        self.pred_prob_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_prob],
        )

        self.pred_label_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_label],
        )

        self.get_l2_loss = theano.function(
            inputs=[],
            outputs=[self.l2_loss, self.l2_norm],
        )
Example #34
def compile_theano_func_build_G_mtx():
    tau_inter_x, tau_inter_y = TT.scalar('tau_inter_x'), TT.scalar(
        'tau_inter_y')
    M, N = TT.scalar('M'), TT.scalar('N')
    m_grid, n_grid = TT.vector('m_grid'), TT.vector('n_grid')
    cross_beamShape_r, cross_beamShape_i = \
        TT.tensor3('cross_beamShape_r'), TT.tensor3('cross_beamShape_i')
    baseline_x, baseline_y = TT.tensor3('baseline_x'), TT.tensor3('baseline_y')
    pi = TT.constant(np.pi)

    def theano_periodic_sinc(in_sig, bandwidth):
        eps = TT.constant(1e-10)
        denominator = TT.mul(TT.sin(TT.true_div(in_sig, bandwidth)), bandwidth)
        idx_modi = TT.lt(TT.abs_(denominator), eps)
        numerator = TT.switch(idx_modi, TT.cos(in_sig), TT.sin(in_sig))
        denominator = TT.switch(idx_modi,
                                TT.cos(TT.true_div(in_sig,
                                                   bandwidth)), denominator)
        return TT.true_div(numerator, denominator)

    # def theano_periodic_sinc(in_sig, bandwidth):
    #     eps = TT.constant(1e-10)
    #     numerator = TT.sin(in_sig)
    #     denominator = TT.mul(TT.sin(TT.true_div(in_sig, bandwidth)), bandwidth)
    #     out0 = TT.true_div(numerator, denominator)
    #     out1 = TT.true_div(TT.cos(in_sig), TT.cos(TT.true_div(in_sig, bandwidth)))
    #     idx_modi = TT.lt(TT.abs_(denominator), eps)
    #     out = TT.switch(idx_modi, out1, out0)
    #     return out

    # define the function
    def f_inner(cross_beamShape_r, cross_beamShape_i, baseline_x, baseline_y,
                tau_inter_x, tau_inter_y, m_grid, n_grid, M, N):
        periodic_sinc_2d = \
            TT.mul(
                theano_periodic_sinc(
                    0.5 * (TT.shape_padright(tau_inter_x * baseline_x, n_ones=1) -
                           2 * pi * TT.shape_padleft(m_grid, n_ones=2)),
                    M * tau_inter_x
                ),
                theano_periodic_sinc(
                    0.5 * (TT.shape_padright(tau_inter_y * baseline_y, n_ones=1) -
                           2 * pi * TT.shape_padleft(n_grid, n_ones=2)),
                    N * tau_inter_y
                )
            )
        G_mtx_r = TT.tensordot(cross_beamShape_r,
                               periodic_sinc_2d,
                               axes=[[0, 1], [0, 1]])
        G_mtx_i = TT.tensordot(cross_beamShape_i,
                               periodic_sinc_2d,
                               axes=[[0, 1], [0, 1]])

        return G_mtx_r, G_mtx_i

    G_mtx_r, G_mtx_i = theano.map(fn=f_inner,
                                  sequences=(cross_beamShape_r,
                                             cross_beamShape_i, baseline_x,
                                             baseline_y),
                                  non_sequences=(tau_inter_x, tau_inter_y,
                                                 m_grid, n_grid, M, N))[0]

    # compile the function
    func = theano.function([
        tau_inter_x, tau_inter_y, M, N, m_grid, n_grid, baseline_x, baseline_y,
        cross_beamShape_r, cross_beamShape_i
    ], [G_mtx_r, G_mtx_i],
                           allow_input_downcast=True)
    return func
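For reference, a NumPy rendering of the periodic sinc used inside the graph (a sketch mirroring the Theano branches above; the near-zero branch takes the L'Hopital limit cos(x) / cos(x / bandwidth) to avoid 0/0):

    import numpy as np
    def periodic_sinc_np(in_sig, bandwidth, eps=1e-10):
        denominator = np.sin(in_sig / bandwidth) * bandwidth
        near_zero = np.abs(denominator) < eps
        numerator = np.where(near_zero, np.cos(in_sig), np.sin(in_sig))
        denominator = np.where(near_zero, np.cos(in_sig / bandwidth), denominator)
        return numerator / denominator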
Example #35
def run(only_forward=False):
    logger = afs_safe_logger.Logger(
        os.path.join(FLAGS.log_path, FLAGS.experiment_name) + ".log")

    if FLAGS.data_type == "bl":
        data_manager = load_boolean_data
    elif FLAGS.data_type == "sst":
        data_manager = load_sst_data
    elif FLAGS.data_type == "snli":
        data_manager = load_snli_data
    else:
        logger.Log("Bad data type.")
        return

    pp = pprint.PrettyPrinter(indent=4)
    logger.Log("Flag values:\n" + pp.pformat(FLAGS.FlagValuesDict()))

    # Load the data.
    raw_training_data, vocabulary = data_manager.load_data(
        FLAGS.training_data_path)

    # Load the eval data.
    raw_eval_sets = []
    if FLAGS.eval_data_path:
        for eval_filename in FLAGS.eval_data_path.split(":"):
            eval_data, _ = data_manager.load_data(eval_filename)
            raw_eval_sets.append((eval_filename, eval_data))

    # Prepare the vocabulary.
    if not vocabulary:
        logger.Log(
            "In open vocabulary mode. Using loaded embeddings without fine-tuning."
        )
        train_embeddings = False
        vocabulary = util.BuildVocabulary(
            raw_training_data,
            raw_eval_sets,
            FLAGS.embedding_data_path,
            logger=logger,
            sentence_pair_data=data_manager.SENTENCE_PAIR_DATA)
    else:
        logger.Log("In fixed vocabulary mode. Training embeddings.")
        train_embeddings = True

    # Load pretrained embeddings.
    if FLAGS.embedding_data_path:
        logger.Log("Loading vocabulary with " + str(len(vocabulary)) +
                   " words from " + FLAGS.embedding_data_path)
        initial_embeddings = util.LoadEmbeddingsFromASCII(
            vocabulary, FLAGS.word_embedding_dim, FLAGS.embedding_data_path)
    else:
        initial_embeddings = None

    # Trim dataset, convert token sequences to integer sequences, crop, and
    # pad.
    logger.Log("Preprocessing training data.")
    training_data = util.PreprocessDataset(
        raw_training_data,
        vocabulary,
        FLAGS.seq_length,
        data_manager,
        eval_mode=False,
        logger=logger,
        sentence_pair_data=data_manager.SENTENCE_PAIR_DATA,
        for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW")
    training_data_iter = util.MakeTrainingIterator(training_data,
                                                   FLAGS.batch_size)

    eval_iterators = []
    for filename, raw_eval_set in raw_eval_sets:
        logger.Log("Preprocessing eval data: " + filename)
        e_X, e_transitions, e_y, e_num_transitions = util.PreprocessDataset(
            raw_eval_set,
            vocabulary,
            FLAGS.seq_length,
            data_manager,
            eval_mode=True,
            logger=logger,
            sentence_pair_data=data_manager.SENTENCE_PAIR_DATA,
            for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW")
        eval_iterators.append(
            (filename,
             util.MakeEvalIterator(
                 (e_X, e_transitions, e_y, e_num_transitions),
                 FLAGS.batch_size)))

    # Set up the placeholders.

    y = T.vector("y", dtype="int32")
    lr = T.scalar("lr")
    training_mode = T.scalar(
        "training_mode")  # 1: Training with dropout, 0: Eval
    ground_truth_transitions_visible = T.scalar(
        "ground_truth_transitions_visible", dtype="int32")

    logger.Log("Building model.")
    vs = util.VariableStore(default_initializer=util.UniformInitializer(
        FLAGS.init_range),
                            logger=logger)

    if FLAGS.model_type == "CBOW":
        model_cls = spinn.cbow.CBOW
    elif FLAGS.model_type == "RNN":
        model_cls = spinn.plain_rnn.RNN
    else:
        model_cls = getattr(spinn.fat_stack, FLAGS.model_type)

    # Generator of mask for scheduled sampling
    numpy_random = np.random.RandomState(1234)
    ss_mask_gen = T.shared_randomstreams.RandomStreams(
        numpy_random.randint(999999))

    # Training step number
    ss_prob = T.scalar("ss_prob")

    if data_manager.SENTENCE_PAIR_DATA:
        X = T.itensor3("X")
        transitions = T.itensor3("transitions")
        num_transitions = T.imatrix("num_transitions")

        predicted_premise_transitions, predicted_hypothesis_transitions, logits = build_sentence_pair_model(
            model_cls,
            len(vocabulary),
            FLAGS.seq_length,
            X,
            transitions,
            len(data_manager.LABEL_MAP),
            training_mode,
            ground_truth_transitions_visible,
            vs,
            initial_embeddings=initial_embeddings,
            project_embeddings=(not train_embeddings),
            ss_mask_gen=ss_mask_gen,
            ss_prob=ss_prob)
    else:
        X = T.matrix("X", dtype="int32")
        transitions = T.imatrix("transitions")
        num_transitions = T.vector("num_transitions", dtype="int32")

        predicted_transitions, logits = build_sentence_model(
            model_cls,
            len(vocabulary),
            FLAGS.seq_length,
            X,
            transitions,
            len(data_manager.LABEL_MAP),
            training_mode,
            ground_truth_transitions_visible,
            vs,
            initial_embeddings=initial_embeddings,
            project_embeddings=(not train_embeddings),
            ss_mask_gen=ss_mask_gen,
            ss_prob=ss_prob)

    xent_cost, acc = build_cost(logits, y)

    # Set up L2 regularization.
    l2_cost = 0.0
    for var in vs.trainable_vars:
        l2_cost += FLAGS.l2_lambda * T.sum(T.sqr(vs.vars[var]))

    # Compute cross-entropy cost on action predictions.
    if (not data_manager.SENTENCE_PAIR_DATA) and FLAGS.model_type not in [
            "Model0", "RNN", "CBOW"
    ]:
        transition_cost, action_acc = build_transition_cost(
            predicted_transitions, transitions, num_transitions)
    elif data_manager.SENTENCE_PAIR_DATA and FLAGS.model_type not in [
            "Model0", "RNN", "CBOW"
    ]:
        p_transition_cost, p_action_acc = build_transition_cost(
            predicted_premise_transitions, transitions[:, :, 0],
            num_transitions[:, 0])
        h_transition_cost, h_action_acc = build_transition_cost(
            predicted_hypothesis_transitions, transitions[:, :, 1],
            num_transitions[:, 1])
        transition_cost = p_transition_cost + h_transition_cost
        action_acc = (p_action_acc + h_action_acc
                      ) / 2.0  # TODO(SB): Average over transitions, not words.
    else:
        transition_cost = T.constant(0.0)
        action_acc = T.constant(0.0)
    transition_cost = transition_cost * FLAGS.transition_cost_scale

    total_cost = xent_cost + l2_cost + transition_cost

    if ".ckpt" in FLAGS.ckpt_path:
        checkpoint_path = FLAGS.ckpt_path
    else:
        checkpoint_path = os.path.join(FLAGS.ckpt_path,
                                       FLAGS.experiment_name + ".ckpt")
    if os.path.isfile(checkpoint_path):
        logger.Log("Found checkpoint, restoring.")
        step, best_dev_error = vs.load_checkpoint(
            checkpoint_path,
            num_extra_vars=2,
            skip_saved_unsavables=FLAGS.skip_saved_unsavables)
    else:
        assert not only_forward, "Can't run an eval-only run without a checkpoint. Supply a checkpoint."
        step = 0
        best_dev_error = 1.0

    # Do an evaluation-only run.
    if only_forward:
        if FLAGS.eval_output_paths:
            eval_output_paths = FLAGS.eval_output_paths.strip().split(":")
            assert len(eval_output_paths) == len(
                eval_iterators), "Invalid no. of output paths."
        else:
            eval_output_paths = [
                FLAGS.experiment_name + "-" + os.path.split(eval_set[0])[1] +
                "-parse" for eval_set in eval_iterators
            ]

        # Load model from checkpoint.
        logger.Log("Checkpointed model was trained for %d steps." % (step, ))

        # Generate function for forward pass.
        logger.Log("Building forward pass.")
        if data_manager.SENTENCE_PAIR_DATA:
            eval_fn = theano.function([
                X, transitions, y, num_transitions, training_mode,
                ground_truth_transitions_visible, ss_prob
            ], [
                acc, action_acc, logits, predicted_hypothesis_transitions,
                predicted_premise_transitions
            ],
                                      on_unused_input='ignore',
                                      allow_input_downcast=True)
        else:
            eval_fn = theano.function([
                X, transitions, y, num_transitions, training_mode,
                ground_truth_transitions_visible, ss_prob
            ], [acc, action_acc, logits, predicted_transitions],
                                      on_unused_input='ignore',
                                      allow_input_downcast=True)

        # Generate the inverse vocabulary lookup table.
        ind_to_word = {v: k for k, v in vocabulary.iteritems()}

        # Do a forward pass and write the output to disk.
        for eval_set, eval_out_path in zip(eval_iterators, eval_output_paths):
            logger.Log("Writing eval output for %s." % (eval_set[0], ))
            evaluate_expanded(
                eval_fn, eval_set, eval_out_path, logger, step,
                data_manager.SENTENCE_PAIR_DATA, ind_to_word, FLAGS.model_type
                not in ["Model0", "RNN", "CBOW"])
    else:
        # Train

        new_values = util.RMSprop(total_cost, vs.trainable_vars.values(), lr)
        new_values += [(key, vs.nongradient_updates[key])
                       for key in vs.nongradient_updates]
        # Training open-vocabulary embeddings is a questionable idea right now. Disabled:
        # new_values.append(
        #     util.embedding_SGD(total_cost, embedding_params, embedding_lr))

        # Create training and eval functions.
        # Unused variable warnings are suppressed so that num_transitions can be passed in when training Model 0,
        # which ignores it. This yields more readable code that is very slightly slower.
        logger.Log("Building update function.")
        update_fn = theano.function([
            X, transitions, y, num_transitions, lr, training_mode,
            ground_truth_transitions_visible, ss_prob
        ], [total_cost, xent_cost, transition_cost, action_acc, l2_cost, acc],
                                    updates=new_values,
                                    on_unused_input='ignore',
                                    allow_input_downcast=True)
        logger.Log("Building eval function.")
        eval_fn = theano.function([
            X, transitions, y, num_transitions, training_mode,
            ground_truth_transitions_visible, ss_prob
        ], [acc, action_acc],
                                  on_unused_input='ignore',
                                  allow_input_downcast=True)
        logger.Log("Training.")

        # Main training loop.
        for step in range(step, FLAGS.training_steps):
            if step % FLAGS.eval_interval_steps == 0:
                for index, eval_set in enumerate(eval_iterators):
                    acc = evaluate(eval_fn, eval_set, logger, step)
                    if FLAGS.ckpt_on_best_dev_error and index == 0 and (
                            1 - acc) < 0.99 * best_dev_error and step > 1000:
                        best_dev_error = 1 - acc
                        logger.Log(
                            "Checkpointing with new best dev accuracy of %f" %
                            acc)
                        vs.save_checkpoint(checkpoint_path + "_best",
                                           extra_vars=[step, best_dev_error])

            X_batch, transitions_batch, y_batch, num_transitions_batch = training_data_iter.next(
            )
            learning_rate = FLAGS.learning_rate * (
                FLAGS.learning_rate_decay_per_10k_steps**(step / 10000.0))
            ret = update_fn(
                X_batch, transitions_batch, y_batch, num_transitions_batch,
                learning_rate, 1.0, 1.0,
                np.exp(step * np.log(FLAGS.scheduled_sampling_exponent_base)))
            total_cost_val, xent_cost_val, transition_cost_val, action_acc_val, l2_cost_val, acc_val = ret

            if step % FLAGS.statistics_interval_steps == 0:
                logger.Log("Step: %i\tAcc: %f\t%f\tCost: %5f %5f %5f %5f" %
                           (step, acc_val, action_acc_val, total_cost_val,
                            xent_cost_val, transition_cost_val, l2_cost_val))

            if step % FLAGS.ckpt_interval_steps == 0 and step > 0:
                vs.save_checkpoint(checkpoint_path,
                                   extra_vars=[step, best_dev_error])
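
Note: in the training loop above, the learning rate decays smoothly with the step count and the scheduled-sampling probability passed to update_fn is annealed geometrically. A small plain-NumPy check of those two schedules; the flag values below are made up for illustration.

import numpy as np

base_lr = 0.001        # stand-in for FLAGS.learning_rate
decay_per_10k = 0.75   # stand-in for FLAGS.learning_rate_decay_per_10k_steps
ss_base = 0.99999      # stand-in for FLAGS.scheduled_sampling_exponent_base

for step in (0, 10000, 50000):
    lr = base_lr * (decay_per_10k ** (step / 10000.0))
    ss_prob = np.exp(step * np.log(ss_base))   # same as ss_base ** step
    print("step %d: lr=%.6f ss_prob=%.4f" % (step, lr, ss_prob))
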
    def build_model_core(self):

        # gradient clipping function
        self.clipg = lambda x: grad_clip(
            x, -self.conf['GRAD_CLIP_SIZE'], self.conf['GRAD_CLIP_SIZE'])

        shared_layers = {}

        if self.conf['BATCH_NORM']:
            if not hasattr(self, 'gamma_h'):
                gamma_h_val = np.ones(
                    (self.conf['lstm_hidden_size'] * 2,), dtype=theano.config.floatX)
                shared_layers['gamma_h'] = gamma_h_val
            if not hasattr(self, 'beta_h'):
                beta_h_val = np.zeros(
                    (self.conf['lstm_hidden_size'] * 2,), dtype=theano.config.floatX)
                shared_layers['beta_h'] = beta_h_val

        # set the default network weights
        if not hasattr(self, 'wemb'):
            wemb_val = init_layer_k(
                self.conf['vocab_size'], self.conf['emb_size'])
            shared_layers['wemb'] = wemb_val

        if not hasattr(self, 'h0_hidden'):
            h0_hidden_val = np.zeros(
                (self.conf['lstm_hidden_size'], ), dtype=theano.config.floatX)
            shared_layers['h0_hidden'] = h0_hidden_val

        if not hasattr(self, 'h0_cell'):
            h0_cell_val = np.zeros(
                (self.conf['lstm_hidden_size'], ), dtype=theano.config.floatX)
            shared_layers['h0_cell'] = h0_cell_val

        # mapping from visual space to word space
        if not hasattr(self, 'wvm'):
            wvm_val = init_layer_k(
                self.conf['visual_size'], self.conf['emb_size'])
            shared_layers['wvm'] = wvm_val

        if not hasattr(self, 'bmv'):
            bmv_val = np.zeros(
                (self.conf['emb_size'],), dtype=theano.config.floatX)
            shared_layers['bmv'] = bmv_val

        # LSTM layer parameters
        if not hasattr(self, 'w_lstm'):
            w_lstm_val = init_layer_k(
                self.conf['lstm_hidden_size']*2, self.conf['lstm_hidden_size']*4)
            shared_layers['w_lstm'] = w_lstm_val

        # mapping from RNN hidden output to vocabulary
        if not hasattr(self, 'w'):
            w_val = init_layer_k(
                self.conf['lstm_hidden_size'], self.conf['output_size'])
            shared_layers['w'] = w_val

        if not hasattr(self, 'b'):
            b_val = np.zeros(
                (self.conf['output_size'],), dtype=theano.config.floatX)
            if self.conf["INIT_OUTPUT_BIAS"]:
                # set the bias on the last layer to be the log prob of each of the words in the vocab
                wcount = 0
                w2i = self.dp.w2i
                w2c = self.dp.get_word_counts(RNNDataProvider.TRAIN)
                for w in w2i:
                    if w in w2c:
                        wcount += w2c[w]
                wcount += self.X_train.shape[0]
                b_val[w2i[RNNDataProvider.STOP_TOKEN]] = np.log(
                    self.X_train.shape[0]/float(wcount))
                for w in w2i:
                    if w in w2c:
                        b_val[w2i[w]] = np.log(w2c[w]/float(wcount))
                b_val -= np.max(b_val[1:])
            shared_layers['b'] = b_val

        self.build_shared_layers(shared_layers)

        # input variables for training
        self.x = T.imatrix("x")
        self.v = T.matrix("v")
        self.xlen = T.matrix("xlen")

        # input variables for generation
        self.v_single = T.vector("v")
        self.nstep = T.iscalar("nstep")

        # the dropout masks
        self.x_drop = T.tensor3("x_drop")  # drop the input
        self.y_drop = T.tensor3("y_drop")  # drop the output

        self.forced_word = T.imatrix("forced_word")

        h_tm1 = T.vector("h_tm1")  # hidden layer output
        word_t = T.ivector("word_t")  # word indexes
        v_i = T.vector("v")  # visual information

        # Generates the next word based on the previous true word, hidden state & visual features
        # inputs: hidden layer, last predicted word, visual features
        def recurrance(word_t, x_drop_slice, hh_drop_slice, use_v, h_tm1_hidden, h_tm1_cell, v_i):

            # word_t = theano.printing.Print("word_t")(word_t)

            # get the word embedding matrix or the context information
            if self.conf['DECODER']:
                x_t = ifelse(T.eq(use_v, 1), T.dot(
                    v_i, self.wvm) + self.bmv, self.wemb[word_t])
            else:
                x_t = ifelse(T.eq(use_v, 1), T.zeros_like(
                    self.wemb[word_t]), self.wemb[word_t])

            # if we are not doing minibatch training
            if word_t.ndim == 0:
                x_t = x_t.reshape((1, x_t.shape[0]))
                h_tm1_hidden = h_tm1_hidden.reshape((1, h_tm1_hidden.shape[0]))
                h_tm1_cell = h_tm1_cell.reshape((1, h_tm1_cell.shape[0]))

            # dropout on the input embeddings
            if self.conf['DROP_INPUT']:
                x_t *= x_drop_slice

            # clip the gradients so they don't get too large
            h_tm1_hidden_clip = self.clipg(h_tm1_hidden)

            in_state = T.concatenate([x_t, h_tm1_hidden_clip], axis=1)

            if self.conf['BATCH_NORM']:
                mu = T.mean(in_state, axis=0, keepdims=True)
                var = T.var(in_state, axis=0, keepdims=True)
                normed_is = (in_state - mu) / T.sqrt(var +
                                                     T.constant(1e-10, dtype=theano.config.floatX))
                in_state = self.gamma_h * normed_is + self.beta_h

            # calculate 8 dot products in one go
            dot_out = T.dot(in_state, self.w_lstm)

            lstm_hidden_size = self.conf['lstm_hidden_size']
            # input gate
            ig = T.nnet.sigmoid(dot_out[:, :lstm_hidden_size])
            # forget gate
            fg = T.nnet.sigmoid(
                dot_out[:, lstm_hidden_size:lstm_hidden_size*2])
            # output gate
            og = T.nnet.sigmoid(
                dot_out[:, lstm_hidden_size*2:lstm_hidden_size*3])

            # cell memory
            cc = fg * h_tm1_cell + ig * T.tanh(dot_out[:, lstm_hidden_size*3:])

            # hidden state
            hh = og * cc

            # drop the output state
            if self.conf['DROP_OUTPUT']:
                hh_d = hh * hh_drop_slice
            else:
                hh_d = hh

            # the distribution over output words
            if self.conf['SOFTMAX_OUT']:
                s_t = T.nnet.softmax(T.dot(hh_d, self.w) + self.b)
            else:
                s_t = T.nnet.sigmoid(T.dot(hh_d, self.w) + self.b)

            #hh = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_hidden, hh)
            #cc = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_cell, cc)

            if not self.conf['DECODER']:
                keep_idx = T.and_(T.eq(word_t, 0), T.eq(use_v, 0))
                #keep_idx = theano.printing.Print("keep_idx")(keep_idx)
                if word_t.ndim != 0:
                    keep_idx = keep_idx.dimshuffle((0, 'x'))
                #hh_ret = hh
                #hh_ret[keep_idx, :] = h_tm1_hidden[keep_idx, :]
                hh_ret = keep_idx * h_tm1_hidden + (1-keep_idx) * hh
                cc_ret = keep_idx * h_tm1_cell + (1-keep_idx) * cc
            else:
                hh_ret = hh
                cc_ret = cc

            # if we are not doing minibatch training
            if word_t.ndim == 0:
                hh_ret = hh_ret[0]
                cc_ret = cc_ret[0]

            return [hh_ret, cc_ret, s_t]

        # Generates the next word by feeding the old word as input
        # inputs: hidden layer, last predicted word, visual features
        def recurrance_word_feedback(h_tm1_hidden, h_tm1_cell, word_t, use_visual, v_i):
            x_drop_val = T.ones(
                (self.conf['emb_size'],), dtype=theano.config.floatX)
            y_drop_val = T.ones(
                (self.conf['lstm_hidden_size'],), dtype=theano.config.floatX)
            [hh, cc, s_t] = recurrance(
                word_t, x_drop_val, y_drop_val, use_visual, h_tm1_hidden, h_tm1_cell, v_i)

            # the predicted word
            w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32')[0]

            # after the first step, feed back use_visual = 0
            return [hh, cc, s_t[0], w_idx, T.zeros((1,), dtype='int32')[0]]

        def recurrance_partial_word_feedback(word_t_real, x_drop_val, y_drop_val, use_visual, forced_word, h_tm1_hidden, h_tm1_cell, word_t_pred, v_i):
            word_last = T.switch(forced_word, word_t_real, word_t_pred)
            [hh, cc, s_t] = recurrance(
                word_last, x_drop_val, y_drop_val, use_visual, h_tm1_hidden, h_tm1_cell, v_i)

            # the predicted word
            w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32')

            return [hh, cc, s_t, w_idx]

        # build the teacher forcing loop
        use_visual_info = T.concatenate([T.ones((1,), dtype=np.int32), T.zeros(
            (self.conf['MAX_SENTENCE_LEN'],), dtype=np.int32)])
        if self.conf['DECODER']:
            #h0_hidden_matrix = self.encoder.hh_out[self.encoder.conf['MAX_SENTENCE_LEN']]

            h0_hidden_matrix = self.h0_hidden * \
                T.ones((self.x.shape[0], self.h0_hidden.shape[0]))
            v_input = T.concatenate(
                [self.encoder.hh_out[self.encoder.conf['MAX_SENTENCE_LEN']], self.v], axis=1)
            #v_input = T.printing.Print("v_input")(v_input)
        else:
            h0_hidden_matrix = self.h0_hidden * \
                T.ones((self.x.shape[0], self.h0_hidden.shape[0]))
            v_input = self.v
            #v_input = T.printing.Print("v_input_v")(v_input)

        h0_cell_matrix = self.h0_cell * \
            T.ones((self.x.shape[0], self.h0_cell.shape[0]))
        x_adj = T.concatenate(
            [T.zeros((1, self.x.T[0].shape[0]), dtype=self.x.dtype), self.x.T])
        y_adj = T.concatenate(
            [self.x.T, T.zeros((1, self.x.T[0].shape[0]), dtype=self.x.dtype)])
        [self.hh_out, self.cc_out, s], _ = theano.scan(fn=recurrance,
                                                       sequences=[x_adj, self.x_drop.dimshuffle(
                                                           (1, 0, 2)), self.y_drop.dimshuffle((1, 0, 2)), use_visual_info],
                                                       n_steps=self.conf['MAX_SENTENCE_LEN']+1,
                                                       non_sequences=v_input,
                                                       outputs_info=[h0_hidden_matrix, h0_cell_matrix, None])

        # build the semi-forced loop
        [_, _, s_semi, _], _ = theano.scan(fn=recurrance_partial_word_feedback,
                                           sequences=[x_adj, self.x_drop.dimshuffle((1, 0, 2)), self.y_drop.dimshuffle((1, 0, 2)),
                                                      use_visual_info, self.forced_word[:, :self.x.shape[0]]],
                                           n_steps=self.conf['MAX_SENTENCE_LEN']+1,
                                           non_sequences=self.v,
                                           outputs_info=[h0_hidden_matrix, h0_cell_matrix, None, T.zeros((self.x.shape[0],), dtype=np.int32)])

        # build the un-forced loop
        [_, _, _, self.wout_fb, _], _ = theano.scan(fn=recurrance_word_feedback,
                                                    non_sequences=self.v_single,
                                                    outputs_info=[self.h0_hidden, self.h0_cell, None, np.array(
                                                        0, dtype=np.int32), T.ones((1,), dtype=np.int32)[0]],
                                                    n_steps=self.nstep)

        if self.conf['SEMI_FORCED'] < 1:
            s = s_semi

        self.new_s = s.reshape((s.shape[0] * s.shape[1], s.shape[2]))
        softmax_out = self.build_loss_function(self.new_s, y_adj)
        self.softmax_out = softmax_out

        # calculate the perplexity
        ff_small = T.constant(1e-20, dtype=theano.config.floatX)
        ppl_idx = softmax_out.shape[1] * \
            T.arange(softmax_out.shape[0]) + T.flatten(y_adj)
        hsum = -T.log2(T.flatten(softmax_out)[ppl_idx] + ff_small)
        hsum_new = hsum.reshape((s.shape[0], s.shape[1])).T
        self.perplexity_sentence = 2 ** (T.sum(hsum_new,
                                               axis=1) / T.sum(self.xlen, axis=1))
        self.perplexity_batch = 2 ** (T.sum(hsum *
                                            T.flatten(self.xlen.T)) / T.sum(self.xlen))
        self.perplexity_batch_v = T.sum(hsum * T.flatten(self.xlen.T))
        self.perplexity_batch_n = T.sum(self.xlen)

        # build the single step code
        h_hid = T.vector("h_hid")
        h_cell = T.vector("h_cell")
        x_drop_val = T.ones(
            (self.conf['emb_size'],), dtype=theano.config.floatX)
        y_drop_val = T.ones(
            (self.conf['lstm_hidden_size'],), dtype=theano.config.floatX)
        use_v = T.iscalar("use_v")
        word_t_s = T.iscalar("word_t_s")
        one_step_theano = recurrance(
            word_t_s, x_drop_val, y_drop_val, use_v, h_hid, h_cell, v_i)

        if self.conf['DECODER']:
            self.one_step = theano.function(
                [word_t_s, use_v, h_hid, h_cell, v_i], outputs=one_step_theano)
        else:
            tmp_x = T.imatrix("tmp_x")
            tmp_v = T.matrix("tmp_v")
            x_d_tmp = T.ones(
                (1, self.conf['MAX_SENTENCE_LEN'], self.conf['emb_size']), dtype=theano.config.floatX)
            y_d_tmp = T.ones(
                (1, self.conf['MAX_SENTENCE_LEN'], self.conf['lstm_hidden_size']), dtype=theano.config.floatX)
            x_d_tmp.type.broadcastable = (False, False, False)
            y_d_tmp.type.broadcastable = (False, False, False)
            self.start_step = theano.function([tmp_x, tmp_v],
                                              outputs=self.hh_out[self.conf['MAX_SENTENCE_LEN']],
                                              givens={self.x_drop: x_d_tmp,
                                                      self.y_drop: y_d_tmp,
                                                      self.x: tmp_x,
                                                      self.v: tmp_v})
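
    # Illustrative addition (not part of the original class): in decoder mode,
    # one_step maps a word index plus the previous hidden/cell state to the next
    # state and a word distribution, so greedy decoding can be sketched like this.
    # v_feat (a floatX vector matching the model's visual input at generation time)
    # and the stop-token index are assumptions.
    def greedy_decode_sketch(self, v_feat, max_len=20, stop_token=0):
        h = self.h0_hidden.get_value()
        c = self.h0_cell.get_value()
        word, words = 0, []
        for t in range(max_len):
            use_v = 1 if t == 0 else 0   # visual input only on the first step
            h, c, s_t = self.one_step(word, use_v, h, c, v_feat)
            word = int(np.argmax(s_t[0]))
            if word == stop_token:
                break
            words.append(word)
        return words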
    def build_model_trainer(self):
        self.X_sh_train_mask = theano.shared(
            name="X_sh_train_mask", value=self.X_train_mask, borrow=True)
        self.X_sh_train = theano.shared(
            name="X_sh_train", value=self.X_train, borrow=True)
        self.V_sh_train = theano.shared(
            name="V_sh_train", value=self.V_train, borrow=True)
        if self.conf["DECODER"]:
            self.X_sh_train_drop = theano.shared(
                name="X_sh_train_drop", value=self.X_train_drop, borrow=True)
            self.Y_sh_train_drop = theano.shared(
                name="Y_sh_train_drop", value=self.Y_train_drop, borrow=True)
        if self.conf['JOINED_MODEL']:
            self.X_sh_train_lm = theano.shared(
                name="X_sh_train_lm", value=self.X_train_lm, borrow=True)

        params_train = [getattr(self, p)
                        for p in self.conf['param_names_trainable']]

        # build the list of masks (which select which rows may be backpropagated)
        params_bp_mask = []
        for name in self.conf['param_names_trainable']:
            if name in self.conf['params_bp_mask']:
                params_bp_mask.append(self.conf['params_bp_mask'][name])
            else:
                params_bp_mask.append(None)

        if self.conf["DECODER"]:
            encoder_params = [getattr(self.encoder, p)
                              for p in self.encoder.conf['param_names_trainable']]
            params_train = params_train + encoder_params
            for name in self.encoder.conf['param_names_trainable']:
                if name in self.encoder.conf['params_bp_mask']:
                    params_bp_mask.append(
                        self.encoder.conf['params_bp_mask'][name])
                else:
                    params_bp_mask.append(None)

        # storage for historical gradients
        if not self.loaded_model or (not hasattr(self, 'hist_grad') and not hasattr(self, 'delta_grad')):
            self.hist_grad = [theano.shared(value=np.zeros_like(
                var.get_value()), borrow=True) for var in params_train]
            self.delta_grad = [theano.shared(value=np.zeros_like(
                var.get_value()), borrow=True) for var in params_train]

        if not self.conf["DECODER"]:
            return

        # calculate the cost for this minibatch (add L2 reg to loss function)
        regc = T.constant(self.conf['L2_REG_CONST'],
                          dtype=theano.config.floatX)
        self.cost = self.loss + regc * \
            np.sum([(xx ** 2).sum() for xx in params_train])

        # build the SGD weight updates
        batch_size_f = T.constant(
            self.conf['batch_size_val'], dtype=theano.config.floatX)
        comp_grads = T.grad(self.cost, params_train)
        # if self.conf['DECODER']:
        #comp_grads[9] = T.printing.Print("Comp_grads_9")(comp_grads[9]) + 0.0000001*T.printing.Print("params_train_9")(params_train[9])
        comp_grads = [g/batch_size_f for g in comp_grads]
        comp_grads = [T.clip(g, -self.conf['GRAD_CLIP_SIZE'],
                             self.conf['GRAD_CLIP_SIZE']) for g in comp_grads]
        #comp_grads = [g*m if m is not None else g for g,m in zip(comp_grads, params_bp_mask) ]
        weight_updates = get_sgd_weight_updates(self.conf['GRAD_METHOD'], comp_grads, params_train, self.hist_grad, self.delta_grad,
                                                decay=self.conf['DECAY_RATE'], learning_rate=self.conf['LEARNING_RATE'])
        print("Weight updates:", len(weight_updates))
        # if self.conf['DECODER']:
        #    weight_updates[9] = (weight_updates[9][0], T.printing.Print("Comp_grads_9")(weight_updates[9][1]))

        indx = T.iscalar("indx")
        indx_wrap = indx % (
            self.X_sh_train_drop.shape[0] - self.conf['batch_size_val'])
        indx_wrap2 = (
            indx+1) % (self.X_sh_train_drop.shape[0] - self.conf['batch_size_val'])
        if self.conf['JOINED_MODEL']:
            self.train = theano.function([indx],
                                         outputs=[self.loss, self.cost,
                                                  self.perplexity_batch],
                                         updates=weight_updates,
                                         givens={
                self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']],
                self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']],
                self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
                self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                self.mm_rnn.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']],
                self.mm_rnn.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']],
                self.mm_rnn.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
                self.mm_rnn.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                self.mm_rnn.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                self.lm_rnn.x: self.X_sh_train_lm[indx:indx+self.conf['batch_size_val']],
                # self.V_sh_train[indx:indx+self.conf['batch_size_val']],
                self.lm_rnn.v: np.ones((self.conf['batch_size_val'], 1), dtype=theano.config.floatX),
                self.lm_rnn.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
                self.lm_rnn.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                self.lm_rnn.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap + \
                                                         self.conf['batch_size_val']]
            },
                on_unused_input='ignore')

        else:
            if self.conf['SEMI_FORCED'] < 1:
                inputs = [indx, self.forced_word]
            else:
                inputs = [indx]
            if self.conf['DECODER']:
                print(len(comp_grads))
                print(len(params_train))
                print(weight_updates)
                self.train = theano.function(inputs,
                                             outputs=[
                                                 self.loss, self.cost, self.perplexity_batch],
                                             updates=weight_updates,
                                             givens={
                                                 self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']],
                                                 self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']],
                                                 self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
                                                 self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                                                 self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
                                                 self.encoder.x: self.encoder.X_sh_train[indx:indx+self.conf['batch_size_val']],
                                                 self.encoder.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']],
                                                 self.encoder.xlen: self.encoder.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
                                                 self.encoder.x_drop: self.X_sh_train_drop[indx_wrap2:indx_wrap2+self.conf['batch_size_val']],
                                                 self.encoder.y_drop: self.Y_sh_train_drop[indx_wrap2:indx_wrap2+self.conf['batch_size_val']]},
                                             on_unused_input='ignore')
#            else:
#                self.train = theano.function(inputs,
#                            outputs=[self.loss, self.cost, self.perplexity_batch],
#                            updates=weight_updates,
#                            givens={
#                                self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']],
#                                self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']],
#                                self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']],
#                                self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']],
#                                self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']]},
#                            on_unused_input='ignore')
        if not self.conf['DECODER']:
            return None

        return self.train
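
Note: get_sgd_weight_updates itself is not shown in this snippet. As an illustration of the kind of rule the GRAD_METHOD switch might select when it uses the hist_grad accumulators, here is a minimal RMSprop-style sketch; it is a stand-in with a simplified interface, not the project's actual helper.

import theano.tensor as T

def rmsprop_updates_sketch(grads, params, hist_grads, decay=0.95, lr=0.001, eps=1e-8):
    # Accumulate a running average of squared gradients and scale each step by it.
    updates = []
    for p, g, h in zip(params, grads, hist_grads):
        h_new = decay * h + (1.0 - decay) * T.sqr(g)
        updates.append((h, h_new))
        updates.append((p, p - lr * g / T.sqrt(h_new + eps)))
    return updates
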
m_theta = theano.shared(name='moment_theta',
                        value=np.zeros(3, dtype=theano.config.floatX))
m_bias = theano.shared(name='moment_bias',
                       value=np.zeros((1, 1), dtype=theano.config.floatX),
                       broadcastable=(True, True))
v_theta = theano.shared(name='velocity_theta',
                        value=np.zeros(3, dtype=theano.config.floatX))
v_bias = theano.shared(name='velocity_bias',
                       value=np.zeros((1, 1), dtype=theano.config.floatX),
                       broadcastable=(True, True))

params = [theta, bias]
moments = [m_theta, m_bias]
vel = [v_theta, v_bias]

one = T.constant(1.0)

# Feedforward Pass
cost = T.mean((T.dot(theta, X.T) + bias - y)**2) / 2

cost_f = theano.function(inputs=[X, y],
                         outputs=cost,
                         allow_input_downcast=True)

# Backward Pass
gradients = T.grad(cost, params)

grads = theano.function(inputs=[X, y],
                        outputs=gradients,
                        allow_input_downcast=True)
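
Note: the fragment above sets up per-parameter moment and velocity accumulators but stops before the update rule. Those names suggest an Adam-style optimizer; the sketch below shows one way the pieces could be wired together. The beta/epsilon/alpha constants and the step counter t are illustrative assumptions, since the original update code is not shown.

import theano
import theano.tensor as T

beta1, beta2, eps, alpha = 0.9, 0.999, 1e-8, 0.01
t = T.scalar('t')  # 1-based step counter supplied at call time

adam_updates = []
for p, g, m, v in zip(params, gradients, moments, vel):
    m_new = beta1 * m + (1.0 - beta1) * g          # first-moment estimate
    v_new = beta2 * v + (1.0 - beta2) * T.sqr(g)   # second-moment estimate
    m_hat = m_new / (1.0 - beta1 ** t)             # bias correction
    v_hat = v_new / (1.0 - beta2 ** t)
    adam_updates += [(m, m_new), (v, v_new),
                     (p, p - alpha * m_hat / (T.sqrt(v_hat) + eps))]

adam_step = theano.function(inputs=[X, y, t], outputs=cost,
                            updates=adam_updates,
                            allow_input_downcast=True)
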
def fmp_shape(x, op):
    return fmp.DisjointPseudorandomFractionalMaxPooling2DOp(
        alpha=alpha, u=u)(T.constant(x)).eval().shape
Exemplo n.º 40
0
def power_of_2(previous_powers, coefficients):
    new_values = previous_powers * coefficients
    index = T.argmax(new_values)
    return new_values, theano.scan_module.until(T.eq(index, T.constant(0)))
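
Note: theano.scan stops as soon as the until condition returned by the step function is true. A minimal driver for power_of_2, with illustrative inputs:

import numpy as np
import theano
import theano.tensor as T

coefficients = T.vector("coefficients")
powers, _ = theano.scan(fn=power_of_2,
                        outputs_info=T.ones_like(coefficients),
                        non_sequences=coefficients,
                        n_steps=16)
f = theano.function([coefficients], powers)

# argmax lands on index 0 right away, so the loop stops after one step:
print(f(np.array([3., 2., 1.], dtype=theano.config.floatX)).shape)   # (1, 3)
# argmax never reaches index 0, so all 16 steps run:
print(f(np.array([1., 2., 3.], dtype=theano.config.floatX)).shape)   # (16, 3)
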
Exemplo n.º 41
0
import numpy as np
import theano as th
import theano.tensor as T
from theano.tensor import nlinalg

from utils import jitterChol, t_repeat

from GP_LVM_CMF import SGPDV

floatX = th.config.floatX
log2pi = T.constant(np.log(2*np.pi).astype(floatX))

class IBP_Factor(SGPDV):
    def __init__(self,
            numberOfInducingPoints, # Number of inducing ponts in sparse GP
            batchSize,              # Size of mini batch
            dimX,                   # Dimensionality of the latent co-ordinates
            dimZ,                   # Dimensionality of the latent variables
            data,                   # [NxP] matrix of observations
            kernelType='RBF',
            encoderType_qX='FreeForm',  # 'FreeForm', 'MLP', 'Kernel'.
            encoderType_rX='FreeForm',  # 'FreeForm', 'MLP', 'Kernel', 'NoEncoding'.
            encoderType_ru='FreeForm',  # 'FreeForm', 'MLP', 'NoEncoding'
            z_optimise=False,
            numHiddenUnits_encoder=0,
            numHiddentUnits_decoder=10,
            continuous=True

        ):
                       #self, numberOfInducingPoints, batchSize, dimX, dimZ, data, numHiddenUnits, kernelType_='RBF', continuous_=True, encode_qX=True,encode_rX=False, encode_ru=False, encoder_type='kernel' ):
Exemplo n.º 42
0
    def __init__(self,
                 lookup_table,
                 in_dim,
                 hidden_dims,
                 labels_nums,
                 activation,
                 highway=False,
                 batch_size=64,
                 initializer=default_initializer,
                 optimizer=None,
                 dropout=0,
                 verbose=True):
        self.batch_size = batch_size
        self.num_task = len(labels_nums)
        word_index = T.itensor3()  # (batch, num_queries, max_query_len)
        gold_truth = T.ivector()  # (batch, )

        mask_query = (word_index > 0) * T.constant(1,
                                                   dtype=theano.config.floatX)
        mask_user = (T.sum(word_index, axis=2) > 0) * T.constant(
            1, dtype=theano.config.floatX)
        word_embedding = lookup_table.W[word_index]
        # max sum averaging
        hidden = get_pooling_batch_word(word_embedding, mask_query,
                                        "averaging")
        hidden = get_pooling_batch(hidden, mask_user, "averaging")
        # hidden = T.mean(hidden, axis=1)
        if len(hidden_dims) == 0 or hidden_dims[0] == 0:
            nn_output = hidden
            nn_output_dim = in_dim
        elif highway:
            encoder = HighwayLayer(in_dim=in_dim,
                                   activation=activation,
                                   initializer=initializer,
                                   dropout=dropout,
                                   verbose=verbose)
            nn_output = encoder.forward_batch(hidden)
            nn_output_dim = encoder.out_dim
        else:
            encoder = MultiHiddenLayer(in_dim=in_dim,
                                       hidden_dims=hidden_dims,
                                       activation=activation,
                                       initializer=initializer,
                                       dropout=dropout,
                                       verbose=verbose)
            nn_output = encoder.forward_batch(hidden)
            nn_output_dim = encoder.out_dim
        if optimizer is None:
            sgd_optimizer = AdaGradOptimizer(lr=0.95, norm_lim=16)
        else:
            sgd_optimizer = optimizer
        self.train_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.train_y = shared_zero_matrix((1, 1), dtype=np.int32)
        self.dev_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.test_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32)
        self.train_batch_list = list()
        self.pred_train_batch_list = list()
        self.pred_dev_batch_list = list()
        self.pred_test_batch_list = list()
        self.get_y_list = list()
        index = T.ivector()
        classifier_list = list()
        classifier_output_list = list()
        classifier_loss_list = list()
        classifier_param_list = list()
        classifier_updates_list = list()
        for i in xrange(len(labels_nums)):
            classifier = SoftmaxClassifier(num_in=nn_output_dim,
                                           num_out=labels_nums[i],
                                           initializer=initializer)
            classifier_list.append(classifier)
            classifier_output_list.append(
                classifier_list[i].forward(nn_output))
            classifier_loss_list.append(classifier_list[i].loss(
                nn_output, gold_truth))
            if len(hidden_dims) == 0 or hidden_dims[0] == 0:
                classifier_param_list.append(lookup_table.params +
                                             classifier.params)
            else:
                classifier_param_list.append(lookup_table.params +
                                             classifier.params +
                                             encoder.params)
            except_norm_list = [param.name for param in lookup_table.params]
            classifier_updates_list.append(
                sgd_optimizer.get_update(classifier_loss_list[i],
                                         classifier_param_list[i],
                                         except_norm_list))
            train_batch = theano.function(
                inputs=[index],
                outputs=[classifier_output_list[i], classifier_loss_list[i]],
                updates=classifier_updates_list[i],
                givens={
                    word_index: self.train_x[index],
                    gold_truth: self.train_y[index, i]
                })
            self.train_batch_list.append(train_batch)
            pred_train_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.train_x[index]})
            self.pred_train_batch_list.append(pred_train_batch)
            pred_dev_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.dev_x[index]})
            self.pred_dev_batch_list.append(pred_dev_batch)
            pred_test_batch = theano.function(
                inputs=[index],
                outputs=classifier_output_list[i],
                givens={word_index: self.test_x[index]})
            self.pred_test_batch_list.append(pred_test_batch)
            self.get_y_list.append(
                theano.function(inputs=[index], outputs=self.train_y[index,
                                                                     i]))
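
Note: each task i gets its own compiled train/predict functions bound to the shared data matrices. A minimal sketch of one training pass over all tasks; the model handle, the example count, and the assumption that train_x/train_y were already populated via set_value are illustrative.

import numpy as np

def train_one_pass(model, num_examples, batch_size=64):
    order = np.random.permutation(num_examples).astype('int32')
    for task_id in range(model.num_task):
        train_fn = model.train_batch_list[task_id]
        for start in range(0, num_examples - batch_size + 1, batch_size):
            batch_index = order[start:start + batch_size]
            probs, loss = train_fn(batch_index)   # forward + update for this task
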
Exemplo n.º 43
0
)

eval_fn = theano.function(
    [images, total_iters],
    cost.mean()
)

train_data, dev_data, test_data = lib.mnist_binarized.load(
    BATCH_SIZE,
    TEST_BATCH_SIZE
)


#############################################
##############Importance Sampling###########
log2pi = T.constant(np.log(2*np.pi).astype(theano.config.floatX))

k_ = 10

def log_mean_exp(x, axis=1):
    m = T.max(x, axis=axis, keepdims=True)
    return m + T.log(T.sum(T.exp(x - m), axis=axis, keepdims=True)) - T.log(k_)

def log_lik(samples, mean, log_sigma):
    return -log2pi*T.cast(samples.shape[1], 'float32') / 2 -  \
        T.sum(T.sqr((samples-mean)/T.exp(log_sigma)) + 2*log_sigma, axis=1) / 2

vae_bound = reconst_cost + reg_cost
log_lik_latent_prior = log_lik(latents, 0., 0.)
log_lik_latent_posterior = log_lik(latents, mu, log_sigma)
loglikelihood_normal =  log_lik_latent_prior - reconst_cost - log_lik_latent_posterior
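
Note: with k_ samples drawn per image, the importance-sampled log-likelihood is the log-mean-exp of the per-sample weights above. A sketch of how the pieces are usually combined; the row grouping and per-example shapes are assumptions, since the batching code is not shown here.

# One row per image, k_ importance weights per row, averaged in log-space.
weights = loglikelihood_normal.reshape((-1, k_))
log_px = log_mean_exp(weights, axis=1)   # per-image log-likelihood estimate
mean_log_px = T.mean(log_px)
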
Exemplo n.º 44
0
File: utils.py Project: ehfo0/DVAE
import os, sys
import numpy as np
import scipy as sp
import PIL
import theano
import theano.tensor as T
import pickle, cPickle
from sklearn import preprocessing
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from numpy.lib import stride_tricks
import theano.sandbox.rng_mrg as RNG_MRG
rng = np.random.RandomState()
MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2**30))
c = -T.constant(np.log(2 * np.pi)).astype(theano.config.floatX)
c.tag.test_value = -np.log(2 * np.pi).astype(theano.config.floatX)


def unpickle(path):
    ''' For cifar-10 data, it will return dictionary'''
    #Load the cifar 10
    f = open(path, 'rb')
    data = cPickle.load(f)
    f.close()
    return data


def repmat_vec(x, k):

    return T.tile(x.dimshuffle([0, 'x']), [1, k]).T
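
Note: repmat_vec stacks k copies of a vector as rows. A quick shape check:

x = T.vector('x')
print(repmat_vec(x, 3).eval({x: np.arange(4).astype(theano.config.floatX)}).shape)
# (3, 4): three identical rows, each a copy of x
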
Exemplo n.º 45
0
    def normal(self,
               size,
               avg=0.0,
               std=1.0,
               ndim=None,
               dtype=None,
               nstreams=None,
               truncate=False,
               **kwargs):
        """
        Sample a tensor of values from a normal distribution.

        Parameters
        ----------
        size : int_vector_like
            Array dimensions for the output tensor.
        avg : float_like, optional
            The mean value for the truncated normal to sample from (defaults to 0.0).
        std : float_like, optional
            The standard deviation for the truncated normal to sample from (defaults to 1.0).
        truncate : bool, optional
            Truncates the normal distribution at 2 standard deviations if True (defaults to False).
            When this flag is set, the standard deviation of the result will be less than the one specified.
        ndim : int, optional
            The number of dimensions for the output tensor (defaults to None).
            This argument is necessary if the size argument is ambiguous on the number of dimensions.
        dtype : str, optional
            The data-type for the output tensor. If not specified,
            the dtype is inferred from avg and std, but it is at least as precise as floatX.
        kwargs
            Other keyword arguments for random number generation (see uniform).

        Returns
        -------
        samples : TensorVariable
            A Theano tensor of samples randomly drawn from a normal distribution.

        """
        size = _check_size(size)
        avg = undefined_grad(as_tensor_variable(avg))
        std = undefined_grad(as_tensor_variable(std))

        if dtype is None:
            dtype = scal.upcast(config.floatX, avg.dtype, std.dtype)

        avg = tensor.cast(avg, dtype=dtype)
        std = tensor.cast(std, dtype=dtype)

        # generate even number of uniform samples
        # Do manual constant folding to lower optimizer work.
        if isinstance(size, theano.Constant):
            n_odd_samples = size.prod(dtype='int64')
        else:
            n_odd_samples = tensor.prod(size, dtype='int64')
        n_even_samples = n_odd_samples + n_odd_samples % 2
        uniform = self.uniform((n_even_samples, ),
                               low=0.,
                               high=1.,
                               ndim=1,
                               dtype=dtype,
                               nstreams=nstreams,
                               **kwargs)

        # box-muller transform
        u1 = uniform[:n_even_samples // 2]
        u2 = uniform[n_even_samples // 2:]
        r = tensor.sqrt(-2.0 * tensor.log(u1))
        theta = np.array(2.0 * np.pi, dtype=dtype) * u2
        cos_theta, sin_theta = tensor.cos(theta), tensor.sin(theta)
        z0 = r * cos_theta
        z1 = r * sin_theta

        if truncate:
            # use valid samples
            to_fix0 = (z0 < -2.) | (z0 > 2.)
            to_fix1 = (z1 < -2.) | (z1 > 2.)
            z0_valid = z0[tensor.nonzero(~to_fix0)]
            z1_valid = z1[tensor.nonzero(~to_fix1)]

            # re-sample invalid samples
            to_fix0 = tensor.nonzero(to_fix0)[0]
            to_fix1 = tensor.nonzero(to_fix1)[0]
            n_fix_samples = to_fix0.size + to_fix1.size
            lower = tensor.constant(1. / np.e**2, dtype=dtype)
            u_fix = self.uniform((n_fix_samples, ),
                                 low=lower,
                                 high=1.,
                                 ndim=1,
                                 dtype=dtype,
                                 nstreams=nstreams,
                                 **kwargs)
            r_fix = tensor.sqrt(-2. * tensor.log(u_fix))
            z0_fixed = r_fix[:to_fix0.size] * cos_theta[to_fix0]
            z1_fixed = r_fix[to_fix0.size:] * sin_theta[to_fix1]

            # pack everything together to a useful result
            norm_samples = tensor.join(0, z0_valid, z0_fixed, z1_valid,
                                       z1_fixed)
        else:
            norm_samples = tensor.join(0, z0, z1)
        if isinstance(n_odd_samples, theano.Variable):
            samples = norm_samples[:n_odd_samples]
        elif n_odd_samples % 2 == 1:
            samples = norm_samples[:-1]
        else:
            samples = norm_samples
        samples = tensor.reshape(samples, newshape=size, ndim=ndim)
        samples *= std
        samples += avg

        return samples
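
Note: the sampler above is built on the Box-Muller transform. A small plain-NumPy sketch of the same transform, handy for sanity-checking the Theano path:

import numpy as np

rng = np.random.RandomState(0)
u1 = rng.uniform(1e-12, 1.0, size=100000)   # keep u1 away from 0 so log(u1) is finite
u2 = rng.uniform(size=100000)
r = np.sqrt(-2.0 * np.log(u1))
theta = 2.0 * np.pi * u2
z = np.concatenate([r * np.cos(theta), r * np.sin(theta)])
print(z.mean(), z.std())   # close to 0 and 1
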
Exemplo n.º 46
0
def train_conv_net(datasets,
                   U,
                   ofile,
                   cv=0,
                   attr=0,
                   img_w=300,
                   filter_hs=[3, 4, 5],
                   hidden_units=[100, 2],
                   dropout_rate=[0.5],
                   shuffle_batch=True,
                   n_epochs=25,
                   batch_size=50,
                   lr_decay=0.95,
                   conv_non_linear="relu",
                   activations=[Iden],
                   sqr_norm_lim=9,
                   non_static=True):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes
    hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0][0])
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w),
                  ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay),
                  ("conv_non_linear", conv_non_linear),
                  ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim),
                  ("shuffle_batch", shuffle_batch)]
    print parameters

    #define model architecture
    index = T.lscalar()
    x = T.tensor3('x')
    y = T.ivector('y')
    mair = T.fmatrix('mair')
    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[
                                   (Words,
                                    T.set_subtensor(Words[0, :],
                                                    zero_vec_tensor))
                               ],
                               allow_input_downcast=True)

    conv_layers = []

    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng,
                                        image_shape=None,
                                        filter_shape=filter_shape,
                                        poolsize=pool_size,
                                        non_linear=conv_non_linear)
        conv_layers.append(conv_layer)

    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], x.shape[1], x.shape[2], Words.shape[1]))

    def convolve_user_statuses(statuses):
        layer1_inputs = []

        def sum_mat(mat, out):
            z = ifelse(T.neq(T.sum(mat), T.constant(0)), T.constant(1),
                       T.constant(0))
            return out + z, theano.scan_module.until(T.eq(z, T.constant(0)))

        status_count, _ = theano.scan(fn=sum_mat,
                                      sequences=statuses,
                                      outputs_info=T.constant(
                                          0, dtype=theano.config.floatX))

        # Slice-out dummy (zeroed) sentences
        relv_input = statuses[:T.cast(status_count[-1], dtype='int32'
                                      )].dimshuffle(0, 'x', 1, 2)

        for conv_layer in conv_layers:
            layer1_inputs.append(
                conv_layer.set_input(input=relv_input).flatten(2))

        features = T.concatenate(layer1_inputs, axis=1)

        avg_feat = T.max(features, axis=0)  # max-pooling over statuses (despite the "avg" name)

        return avg_feat

    conv_feats, _ = theano.scan(fn=convolve_user_statuses,
                                sequences=layer0_input)

    # Add Mairesse features
    layer1_input = T.concatenate([conv_feats, mair], axis=1)  ##mairesse_change
    hidden_units[0] = feature_maps * len(filter_hs) + datasets[4].shape[
        1]  ##mairesse_change
    classifier = MLPDropout(rng,
                            input=layer1_input,
                            layer_sizes=hidden_units,
                            activations=activations,
                            dropout_rates=dropout_rate)

    svm_data = T.concatenate(
        [classifier.layers[0].output,
         y.dimshuffle(0, 'x')], axis=1)
    #define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        #if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #shuffle dataset and assign to mini batches. if dataset size is not a multiple of mini batches, replicate
    #extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        rand_perm = np.random.permutation(range(len(datasets[0])))
        train_set_x = datasets[0][rand_perm]
        train_set_y = datasets[1][rand_perm]
        train_set_m = datasets[4][rand_perm]
        extra_data_x = train_set_x[:extra_data_num]
        extra_data_y = train_set_y[:extra_data_num]
        extra_data_m = train_set_m[:extra_data_num]
        new_data_x = np.append(datasets[0], extra_data_x, axis=0)
        new_data_y = np.append(datasets[1], extra_data_y, axis=0)
        new_data_m = np.append(datasets[4], extra_data_m, axis=0)
    else:
        new_data_x = datasets[0]
        new_data_y = datasets[1]
        new_data_m = datasets[4]
    rand_perm = np.random.permutation(range(len(new_data_x)))
    new_data_x = new_data_x[rand_perm]
    new_data_y = new_data_y[rand_perm]
    new_data_m = new_data_m[rand_perm]
    n_batches = new_data_x.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))
    #divide train set into train/val sets
    test_set_x = datasets[2]
    test_set_y = np.asarray(datasets[3], "int32")
    test_set_m = datasets[5]
    train_set_x, train_set_y, train_set_m = shared_dataset(
        (new_data_x[:n_train_batches * batch_size],
         new_data_y[:n_train_batches * batch_size],
         new_data_m[:n_train_batches * batch_size]))
    val_set_x, val_set_y, val_set_m = shared_dataset(
        (new_data_x[n_train_batches * batch_size:],
         new_data_y[n_train_batches * batch_size:],
         new_data_m[n_train_batches * batch_size:]))
    n_val_batches = n_batches - n_train_batches
    val_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_y[index * batch_size:(index + 1) * batch_size],
            mair: val_set_m[index * batch_size:(index + 1) * batch_size]
        },  ##mairesse_change
        allow_input_downcast=True)

    #compile theano functions to get train/val/test errors
    test_model = theano.function(
        [index],
        [classifier.errors(y), svm_data],
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            mair: train_set_m[index * batch_size:(index + 1) * batch_size]
        },  ##mairesse_change
        allow_input_downcast=True)
    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            mair: train_set_m[index * batch_size:(index + 1) * batch_size]
        },  ##mairesse_change
        allow_input_downcast=True)

    test_y_pred = classifier.predict(layer1_input)
    test_error = T.sum(T.neq(test_y_pred, y))
    true_p = T.sum(test_y_pred * y)
    false_p = T.sum(test_y_pred *
                    T.mod(y + T.ones_like(y), T.constant(2, dtype='int32')))
    false_n = T.sum(
        y * T.mod(test_y_pred + T.ones_like(y), T.constant(2, dtype='int32')))
    test_model_all = theano.function(
        [
            x,
            y,
            mair  ##mairesse_change
        ],
        [test_error, true_p, false_p, false_n, svm_data],
        allow_input_downcast=True)

    test_batches = test_set_x.shape[0] / batch_size

    #start training over mini-batches
    print '... training'
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    fscore = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean([loss[0] for loss in train_losses])
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        epoch_perf = 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%' % (
            epoch, time.time() - start_time, train_perf * 100.,
            val_perf * 100.)
        print(epoch_perf)
        ofile.write(epoch_perf + "\n")
        ofile.flush()
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss_list = [
                test_model_all(
                    test_set_x[idx * batch_size:(idx + 1) * batch_size],
                    test_set_y[idx * batch_size:(idx + 1) * batch_size],
                    test_set_m[idx * batch_size:(idx + 1) *
                               batch_size]  ##mairesse_change
                ) for idx in xrange(test_batches)
            ]
            if test_set_x.shape[0] > test_batches * batch_size:
                test_loss_list.append(
                    test_model_all(
                        test_set_x[test_batches * batch_size:],
                        test_set_y[test_batches * batch_size:],
                        test_set_m[test_batches *
                                   batch_size:]  ##mairesse_change
                    ))
            test_loss_list_temp = test_loss_list
            test_loss_list = np.asarray([t[:-1] for t in test_loss_list])
            test_loss = np.sum(test_loss_list[:, 0]) / float(
                test_set_x.shape[0])
            test_perf = 1 - test_loss
            tp = np.sum(test_loss_list[:, 1])
            fp = np.sum(test_loss_list[:, 2])
            fn = np.sum(test_loss_list[:, 3])
            tn = test_set_x.shape[0] - (tp + fp + fn)
            fscore = np.mean([
                2 * tp / float(2 * tp + fp + fn),
                2 * tn / float(2 * tn + fp + fn)
            ])
            svm_test = np.concatenate([t[-1] for t in test_loss_list_temp],
                                      axis=0)
            svm_train = np.concatenate([t[1] for t in train_losses], axis=0)
            output = "Test result: accu: " + str(
                test_perf) + ", macro_fscore: " + str(fscore) + "\ntp: " + str(
                    tp) + " tn:" + str(tn) + " fp: " + str(fp) + " fn: " + str(
                        fn)
            print output
            ofile.write(output + "\n")
            ofile.flush()
            # dump train and test features
            cPickle.dump(svm_test,
                         open("cvte" + str(attr) + str(cv) + ".p", "wb"))
            cPickle.dump(svm_train,
                         open("cvtr" + str(attr) + str(cv) + ".p", "wb"))
        updated_epochs = refresh_epochs()
        if updated_epochs is not None and n_epochs != updated_epochs:
            n_epochs = updated_epochs
            print 'Epochs updated to ' + str(n_epochs)
    return test_perf, fscore
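
The loop above derives tn from the test-set size and reports the mean of the positive-class and negative-class F1 scores. A self-contained sketch of the same arithmetic, with made-up counts, is shown below.

import numpy as np

n_examples = 100                     # hypothetical test-set size
tp, fp, fn = 30.0, 10.0, 5.0         # made-up confusion counts
tn = n_examples - (tp + fp + fn)     # same derivation as in the loop above

f1_pos = 2 * tp / (2 * tp + fp + fn)   # F1 with label 1 as the positive class
f1_neg = 2 * tn / (2 * tn + fp + fn)   # F1 with label 0 as the positive class
macro_fscore = np.mean([f1_pos, f1_neg])
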
        def recurrance(word_t, x_drop_slice, hh_drop_slice, use_v, h_tm1_hidden, h_tm1_cell, v_i):

            #word_t = theano.printing.Print("word_t")(word_t)

            # get the word embedding matrix or the context information
            if self.conf['DECODER']:
                x_t = ifelse(T.eq(use_v, 1), T.dot(
                    v_i, self.wvm) + self.bmv, self.wemb[word_t])
            else:
                x_t = ifelse(T.eq(use_v, 1), T.zeros_like(
                    self.wemb[word_t]), self.wemb[word_t])

            # if we are not doing minibatch training
            if word_t.ndim == 0:
                x_t = x_t.reshape((1, x_t.shape[0]))
                h_tm1_hidden = h_tm1_hidden.reshape((1, h_tm1_hidden.shape[0]))
                h_tm1_cell = h_tm1_cell.reshape((1, h_tm1_cell.shape[0]))

            # dropout on the input embeddings
            if self.conf['DROP_INPUT']:
                x_t *= x_drop_slice

            # clip the gradients so they don't get too large
            h_tm1_hidden_clip = self.clipg(h_tm1_hidden)

            in_state = T.concatenate([x_t, h_tm1_hidden_clip], axis=1)

            if self.conf['BATCH_NORM']:
                mu = T.mean(in_state, axis=0, keepdims=True)
                var = T.var(in_state, axis=0, keepdims=True)
                normed_is = (in_state - mu) / T.sqrt(var +
                                                     T.constant(1e-10, dtype=theano.config.floatX))
                in_state = self.gamma_h * normed_is + self.beta_h  # scale/shift the normalised activations

            # calculate 8 dot products in one go
            dot_out = T.dot(in_state, self.w_lstm)

            lstm_hidden_size = self.conf['lstm_hidden_size']
            # input gate
            ig = T.nnet.sigmoid(dot_out[:, :lstm_hidden_size])
            # forget gate
            fg = T.nnet.sigmoid(
                dot_out[:, lstm_hidden_size:lstm_hidden_size*2])
            # output gate
            og = T.nnet.sigmoid(
                dot_out[:, lstm_hidden_size*2:lstm_hidden_size*3])

            # cell memory
            cc = fg * h_tm1_cell + ig * T.tanh(dot_out[:, lstm_hidden_size*3:])

            # hidden state
            hh = og * cc

            # drop the output state
            if self.conf['DROP_OUTPUT']:
                hh_d = hh * hh_drop_slice
            else:
                hh_d = hh

            # the distribution over output words
            if self.conf['SOFTMAX_OUT']:
                s_t = T.nnet.softmax(T.dot(hh_d, self.w) + self.b)
            else:
                s_t = T.nnet.sigmoid(T.dot(hh_d, self.w) + self.b)

            #hh = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_hidden, hh)
            #cc = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_cell, cc)

            if not self.conf['DECODER']:
                keep_idx = T.and_(T.eq(word_t, 0), T.eq(use_v, 0))
                #keep_idx = theano.printing.Print("keep_idx")(keep_idx)
                if word_t.ndim != 0:
                    keep_idx = keep_idx.dimshuffle((0, 'x'))
                #hh_ret = hh
                #hh_ret[keep_idx, :] = h_tm1_hidden[keep_idx, :]
                hh_ret = keep_idx * h_tm1_hidden + (1-keep_idx) * hh
                cc_ret = keep_idx * h_tm1_cell + (1-keep_idx) * cc
            else:
                hh_ret = hh
                cc_ret = cc

            # if we are not doing minibatch training
            if word_t.ndim == 0:
                hh_ret = hh_ret[0]
                cc_ret = cc_ret[0]

            return [hh_ret, cc_ret, s_t]
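
recurrance above computes all LSTM gate pre-activations with a single matrix product against self.w_lstm and then slices the result. A small numpy sketch of that fused-gate slicing follows; the sizes and names are illustrative, not taken from the class.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

n, in_dim, hidden = 2, 5, 4
in_state = np.random.randn(n, in_dim + hidden)          # [x_t, h_{t-1}]
w_lstm = np.random.randn(in_dim + hidden, 4 * hidden)   # all four gates stacked

dot_out = in_state.dot(w_lstm)
ig = sigmoid(dot_out[:, :hidden])                 # input gate
fg = sigmoid(dot_out[:, hidden:2 * hidden])       # forget gate
og = sigmoid(dot_out[:, 2 * hidden:3 * hidden])   # output gate
cand = np.tanh(dot_out[:, 3 * hidden:])           # candidate cell update
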
Exemplo n.º 48
0
 def sum_mat(mat, out):
     z = ifelse(T.neq(T.sum(mat), T.constant(0)), T.constant(1),
                T.constant(0))
     return out + z, theano.scan_module.until(T.eq(z, T.constant(0)))
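
sum_mat above is a scan step function: it returns the updated accumulator together with a theano.scan_module.until condition that stops the loop. A minimal, hypothetical example of the same pattern, wired into a complete theano.scan call:

import theano
import theano.tensor as T

def step(prev):
    new = prev * 2
    # stop the scan once the running value exceeds 100
    return new, theano.scan_module.until(T.gt(new, 100))

start = T.scalar('start')
vals, _ = theano.scan(step, outputs_info=start, n_steps=50)
f = theano.function([start], vals)
# f(1.0) returns the doubled values, ending with the first one above 100
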
Exemplo n.º 49
0
def rollout(x0,
            H,
            gamma0,
            pol,
            dyn,
            cost,
            angle_dims=[],
            z=None,
            mm_state=True,
            mm_cost=True,
            noisy_policy_input=True,
            noisy_cost_input=True,
            truncate_gradient=-1,
            extra_shared=[],
            split_H=2,
            **kwargs):
    ''' Given some initial state particles x0, and a prediction horizon H
    (number of timesteps), returns a set of trajectories sampled from the
    dynamics model and the discounted costs for each step in the
    trajectory.
    '''
    msg = 'Building computation graph for rollout'
    utils.print_with_stamp(msg, 'mc_pilco.rollout')
    msg = 'Moment-matching [state: %s, cost:%s]'
    msg += ', State measurement noise [policy: %s, cost: %s]'
    opts = (mm_state, mm_cost, noisy_policy_input, noisy_cost_input)
    utils.print_with_stamp(msg % opts, 'mc_pilco.rollout')

    # define internal scan computations
    def step_rollout(z1, z2, z2_prev, cumm_cost, x, sn, gamma, *args):
        '''
            Single step of rollout.
        '''
        n = x.shape[0]
        n = n.astype(theano.config.floatX)

        # noisy state measurement for control
        xn = x + z2_prev * (0.5 * sn) if noisy_policy_input else x

        # get next state distribution
        x_next, sn_next = propagate_particles(x, xn, pol, dyn, angle_dims,
                                              **kwargs)

        def eval_cost(xn, mxn=None, Sxn=None):
            c = cost(xn, None)
            # moment-matching for cost
            if mm_cost:
                # compute input moments
                if mxn is None:
                    mxn = xn.mean(0)
                if Sxn is None:
                    Sxn = (xn.T.dot(xn) / n - tt.outer(mxn, mxn))
                # propagate gaussian through cost (should be implemented in
                # cost func)
                mc = cost(mxn, Sxn)[0]
            # no moment-matching
            else:
                mc = c.sum() / n
            return mc, c

        # if resampling (moment-matching for state)
        if mm_state:
            mx_next = x_next.mean(0)
            Sx_next = x_next.T.dot(x_next) / n - tt.outer(mx_next, mx_next)
            x_next = mx_next + z1.dot(tt.slinalg.cholesky(Sx_next).T)
            # noisy state measurement for cost
            xn_next = x_next
            if noisy_cost_input:
                xn_next += z2 * sn_next
                #  get cost of applying action:
                mc_next, c_next = eval_cost(xn_next)
            else:
                mc_next, c_next = eval_cost(xn_next, mx_next, Sx_next)
        # no moment-matching for state
        else:
            # noisy state measurement for cost
            xn_next = x_next + z2 * sn_next if noisy_cost_input else x_next
            #  get cost of applying action:
            mc_next, c_next = eval_cost(xn_next)

        c_next = gamma * c_next
        mc_next = gamma * mc_next
        cumm_cost += mc_next
        return [c_next, cumm_cost, x_next, sn_next, gamma * gamma0]

    # these are the shared variables that will be used in the scan graph.
    # we need to pass them as non_sequences here
    # see: http://deeplearning.net/software/theano/library/scan.html
    nseq = [gamma0]
    nseq.extend(dyn.get_intermediate_outputs())
    nseq.extend(pol.get_intermediate_outputs())
    nseq.extend(extra_shared)

    # loop over the planning horizon
    mode = theano.compile.mode.get_mode('FAST_RUN')
    accum_cost = tt.constant(0, dtype=x0.dtype)
    costs, trajectories = [], []
    # if split_H > 1, this results in truncated BPTT
    H_ = tt.ceil(H * 1.0 / split_H).astype('int32')
    for i in range(1, split_H + 1):
        start_idx = (i - 1) * H_ + 1
        end_idx = start_idx + H_
        output = theano.scan(fn=step_rollout,
                             sequences=[
                                 z[0, start_idx:end_idx], z[1,
                                                            start_idx:end_idx],
                                 z[1, -end_idx:-start_idx]
                             ],
                             outputs_info=[
                                 None, accum_cost, x0, 1e-4 * tt.ones_like(x0),
                                 gamma0
                             ],
                             non_sequences=nseq,
                             strict=True,
                             allow_gc=False,
                             truncate_gradient=truncate_gradient,
                             name="mc_pilco>rollout_scan_%d" % i,
                             mode=mode)

        rollout_output, rollout_updts = output
        costs_i, accum_cost, trajectories_i = rollout_output[:3]
        accum_cost = accum_cost[-1]
        costs.append(costs_i)
        trajectories.append(trajectories_i)
        x0 = trajectories_i[-1, :, :]
        # x0 = theano.gradient.disconnected_grad(x0)

    costs = tt.concatenate(costs)
    trajectories = tt.concatenate(trajectories)

    trajectories.name = 'trajectories'

    # first axis: batch, second axis: time step
    costs = costs.T
    # first axis; batch, second axis: time step
    trajectories = trajectories.transpose(1, 0, 2)

    return [accum_cost, costs, trajectories], rollout_updts
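
For reference, the index arithmetic that splits the horizon into split_H chunks above (each chunk compiled as its own scan, which is what gives the truncated-BPTT behaviour mentioned in the comments) can be checked with plain Python; the numbers below are illustrative.

import math

H, split_H = 10, 2                      # illustrative values
H_ = int(math.ceil(H * 1.0 / split_H))  # steps per chunk
for i in range(1, split_H + 1):
    start_idx = (i - 1) * H_ + 1
    end_idx = start_idx + H_
    print(start_idx, end_idx)           # (1, 6) then (6, 11)
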
Exemplo n.º 50
0
    def setup(self,
              params,
              gparams,
              shapes=None,
              max_norm=5.0,
              lr=0.01,
              eps=1e-6,
              rho=0.95,
              method="ADADELTA",
              beta=0.0,
              count=None,
              weight_l2=0):
        # Setup only once
        assert not self.updates

        if not shapes:
            shapes = params

        if not count:
            count = T.constant(1, dtype=FLOATX)
        else:
            count = T.cast(count, FLOATX)

        gcache = [
            theano.shared(np.zeros_like(param.get_value(borrow=True),
                                        dtype=FLOATX),
                          name="gcache_%s" % param.name) for param in shapes
        ]
        gcache_mean = [g / self.batch_counter for g in gcache]

        optimize_updates = optimize_parameters(params,
                                               gcache_mean,
                                               shapes,
                                               max_norm,
                                               lr,
                                               eps,
                                               rho,
                                               method,
                                               beta,
                                               gsum_regularization=0.0001,
                                               weight_l2=weight_l2,
                                               clip=self.clip)
        self.updates.extend(optimize_updates)
        self.caches.extend(gcache)

        if self.realtime:
            # Realtime update
            needs_update = self.batch_counter >= T.constant(self.batch_size)
            update_dict = OrderedDict()
            for param, update_val in optimize_updates:
                update_dict[param] = ifelse(needs_update, update_val, param)
            for cache, g in zip(gcache, gparams):
                update_dict[cache] = ifelse(needs_update, g, cache + g)
            update_dict[self.batch_counter] = ifelse(
                needs_update, count, self.batch_counter + count)
            return update_dict.items()

        else:
            # Manual update, perhaps at the end of one iteration
            gcache_updates = [(c, c + g) for c, g in zip(gcache, gparams)] + [
                (self.batch_counter, self.batch_counter + count)
            ]
            return gcache_updates
Exemplo n.º 51
0
    def __init__(self,
                 vocab,
                 encoding,
                 units,
                 opt,
                 initializer,
                 srng,
                 layers=1,
                 regularizer=None,
                 activity_reg=0,
                 temporal_activity_reg=0,
                 zoneout=0.5,
                 input_droput=0.1,
                 output_dropout=0.5,
                 eps=1e-9):
        # Parameters
        self.vocab = vocab
        self.encoding = T.constant(np.int32(encoding), name='encoding')
        assert len(encoding.shape) == 2
        x_k = encoding.shape[0]
        code_k = encoding.shape[1]
        self.lstm = LSTMModel(x_k=x_k,
                              srng=srng,
                              initializer=initializer,
                              units=units,
                              layers=layers,
                              activity_reg=activity_reg,
                              temporal_activity_reg=temporal_activity_reg,
                              zoneout=zoneout,
                              input_droput=input_droput,
                              output_dropout=output_dropout)

        yw = K.variable(initializer((units, code_k)))
        yb = K.variable(initializer((code_k, )))
        self.params = self.lstm.params + [yw, yb]

        # Training
        p1 = T.nnet.sigmoid(T.dot(self.lstm.train_y, yw) +
                            yb)  # (depth, n, code)
        xcode = self.encoding[self.lstm.xr, :]  # (depth, n, code)
        assert xcode.ndim == 3
        # nllrp = (xcode * T.log2(eps + p1)) + ((1 - xcode) * (T.log2(eps + 1. - p1))) # (depth, n, code)
        nllrp = T.switch(xcode, p1, 1. - p1)  # (depth, n, code)
        nllr = -T.sum(T.log(eps + nllrp), axis=2)
        nll = T.mean(nllr, axis=None)
        loss_param_reg = T.constant(0.)
        if regularizer:
            for p in self.params:
                if p.ndim > 1:
                    loss_param_reg += regularizer(p)
        loss = nll + self.lstm.loss_activity + self.lstm.loss_temporal_activity + loss_param_reg
        updates = opt.get_updates(loss, self.params)
        self.train_fun = theano.function([self.lstm.input_x], [
            nll, self.lstm.loss_activity, self.lstm.loss_temporal_activity,
            loss_param_reg, loss
        ],
                                         updates=updates)

        # Testing
        # old version
        """
        p1 = T.nnet.sigmoid(T.dot(self.lstm.test_y, yw) + yb)  # (depth, n, code)
        #nllrp = (xcode * T.log(eps + p1)) + ((1 - xcode) * (T.log(eps + 1. - p1)))
        nllrp = T.switch(xcode, p1, 1. - p1)
        nllr = -T.sum(T.log(eps+nllrp), axis=2)  # (depth, n)
        nll_part = T.transpose(nllr, (1, 0))  # (n, depth)
        self.nll_fun = theano.function([self.lstm.input_x], nll_part)
        """
        p1 = T.nnet.sigmoid(T.dot(self.lstm.test_y, yw) +
                            yb)  # (depth, n, code)
        # xcode: (depth, n, code)
        # encoding: (x_k, code)
        h = (T.dot(T.log(eps + p1), T.transpose(self.encoding, (1, 0))) +
             T.dot(T.log(eps + 1. - p1), T.transpose(1 - self.encoding,
                                                     (1, 0)))
             )  # (depth, n, x_k)
        p2 = softmax_nd(h)  # (depth, n, x_k)

        mg = T.mgrid[0:p2.shape[0], 0:p2.shape[1]]
        pt = p2[mg[0], mg[1], self.lstm.xr]  # (depth, n)
        nll_part = T.transpose(-T.log(eps + pt), (1, 0))
        self.nll_fun = theano.function([self.lstm.input_x], nll_part)

        train_headers = [
            'NLL', 'Activity Reg', 'Temporal Reg', 'Weight Reg', 'Loss'
        ]
        val_headers = ['NLL', 'PPL']
        weights = self.params + opt.weights
        super(LSTMSoftmaxSparse, self).__init__(weights=weights,
                                                train_headers=train_headers,
                                                val_headers=val_headers)
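
The test path above scores each class by the log-likelihood of its binary code under the predicted per-bit probabilities, then applies a softmax over classes. A plain numpy sketch of that scoring rule, with illustrative shapes and random values:

import numpy as np

eps = 1e-9
x_k, code_k, n = 6, 4, 3                      # classes, code bits, examples
encoding = np.random.randint(0, 2, (x_k, code_k)).astype('float64')
p1 = np.random.rand(n, code_k)                # predicted P(bit == 1)

h = np.log(eps + p1).dot(encoding.T) + np.log(eps + 1. - p1).dot(1 - encoding.T)
# softmax over classes
p2 = np.exp(h - h.max(axis=1, keepdims=True))
p2 /= p2.sum(axis=1, keepdims=True)           # (n, x_k) class distribution
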
Exemplo n.º 52
0
    class Opt(object):
        merge = theano.gof.MergeOptimizer()
        gemm_opt_1 = theano.gof.TopoOptimizer(theano.tensor_opt.gemm_pattern_1)

        gemm_opt_2 = theano.gof.TopoOptimizer(  # d -= a * (dot()+transpose(dot))
            theano.gof.PatternSub(
                (T.sub_inplace, 'd', (T.mul,
                                      dict(pattern=(T.DimShuffle(
                                          (), ['x', 'x'], inplace=True), 'a'),
                                           allow_multiple_clients=True),
                                      (T.add, (T.dot, 'b', 'c'),
                                       (T.transpose_inplace,
                                        (T.dot, 'f', 'g'))))),
                (T.gemm,
                 (T.gemm, 'd', (T.neg, 'a'), (T.transpose_inplace, 'g'),
                  (T.transpose_inplace, 'f'), T.constant(1.0)),
                 (T.neg, 'a'), 'b', 'c', T.constant(1.0)),
                allow_multiple_clients=False))

        sqr = []
        sqr.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.mul, 'x', 'x'), (T.sqr, 'x'),
                                      allow_multiple_clients=True)))
        sqr.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.pow, 'x', (T.DimShuffle(
                    (), ['x', 'x'], inplace=True), T.constant(2))),
                                      (T.sqr, 'x'),
                                      allow_multiple_clients=True)))

        ident_opt_list = []
        ident_opt_list.append(  # remove explicit copies
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.tensor_copy, 'x'),
                                      'x',
                                      allow_multiple_clients=True)))
        ident_opt_list.append(  # remove double-transpose
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub(
                    (T.transpose_inplace, (T.transpose_inplace, 'x')),
                    'x',
                    allow_multiple_clients=True)))

        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.sqr, (T.sqrt, 'x')),
                                      'x',
                                      allow_multiple_clients=True)))
        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.sqrt, (T.sqr, 'x')),
                                      'x',
                                      allow_multiple_clients=True)))
        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.mul, 'x', (T.div, 'y', 'x')),
                                      'y',
                                      allow_multiple_clients=True)))

        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.mul, (T.div, 'y', 'x'), 'x'),
                                      'y',
                                      allow_multiple_clients=True)))

        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.div, (T.mul, 'y', 'x'), 'x'),
                                      'y',
                                      allow_multiple_clients=True)))

        ident_opt_list.append(
            theano.gof.TopoOptimizer(
                theano.gof.PatternSub((T.div, (T.mul, 'y', 'x'), 'y'),
                                      'x',
                                      allow_multiple_clients=True)))

        def __call__(self, env):
            self.merge(env)
            #eliminate identities
            if 0:
                print 'SKIPPING optimizations'
            else:

                for opt in self.ident_opt_list:
                    opt(env)

                for opt in self.sqr:
                    opt(env)

                self.gemm_opt_1(env)
                self.gemm_opt_2(env)

                self.merge(env)
Exemplo n.º 53
0
def _infer_ndim_bcast(ndim, shape, *args):
    """
    Infer the number of dimensions from the shape or the other arguments.

    Returns
    -------
    (int, variable, tuple) triple, where the variable is an integer vector,
    and the tuple contains Booleans
        The first element returned is the inferred number of dimensions.
        The second element is the shape inferred (combining symbolic and
        constant informations from shape and args).
        The third element is a broadcasting pattern corresponding to that shape.

    """

    # Find the minimum value of ndim required by the *args
    if args:
        args_ndim = max(arg.ndim for arg in args)
    else:
        args_ndim = 0

    if isinstance(shape, (tuple, list)):
        # there is a convention that -1 means the corresponding shape of a
        # potentially-broadcasted symbolic arg
        #
        # This case combines together symbolic and non-symbolic shape
        # information
        shape_ndim = len(shape)
        if ndim is None:
            ndim = shape_ndim
        else:
            if shape_ndim != ndim:
                raise ValueError(
                    'ndim should be equal to len(shape), but\n',
                    'ndim = %s, len(shape) = %s, shape = %s' %
                    (ndim, shape_ndim, shape))

        bcast = []
        pre_v_shape = []
        for i, s in enumerate(shape):
            if hasattr(s, 'type'):  # s is symbolic
                bcast.append(False)  # todo - introspect further
                pre_v_shape.append(s)
            else:
                if s >= 0:
                    pre_v_shape.append(tensor.as_tensor_variable(s))
                    bcast.append((s == 1))
                elif s == -1:
                    n_a_i = 0
                    for a in args:
                        # ndim: _   _   _   _   _   _
                        # ashp:         s0  s1  s2  s3
                        #           i
                        if i >= ndim - a.ndim:
                            n_a_i += 1
                            a_i = i + a.ndim - ndim
                            if not a.broadcastable[a_i]:
                                pre_v_shape.append(a.shape[a_i])
                                bcast.append(False)
                                break
                    else:
                        if n_a_i == 0:
                            raise ValueError(
                                ('Auto-shape of -1 must overlap '
                                 'with the shape of one of the broadcastable '
                                 'inputs'))
                        else:
                            pre_v_shape.append(tensor.as_tensor_variable(1))
                            bcast.append(True)
                else:
                    raise ValueError('negative shape', s)
        # post-condition: shape may still contain both symbolic and
        # non-symbolic things
        if len(pre_v_shape) == 0:
            v_shape = tensor.constant([], dtype='int64')
        else:
            v_shape = tensor.stack(pre_v_shape)

    elif shape is None:
        # The number of drawn samples will be determined automatically,
        # but we need to know ndim
        if not args:
            raise TypeError(('_infer_ndim_bcast cannot infer shape without'
                             ' either shape or args'))
        template = reduce(lambda a, b: a + b, args)
        v_shape = template.shape
        bcast = template.broadcastable
        ndim = template.ndim
    else:
        v_shape = tensor.as_tensor_variable(shape)
        if v_shape.ndim != 1:
            raise TypeError(
                "shape must be a vector or list of scalar, got '%s'" % v_shape)

        if ndim is None:
            ndim = tensor.get_vector_length(v_shape)
        bcast = [False] * ndim

    if v_shape.ndim != 1:
        raise TypeError("shape must be a vector or list of scalar, got '%s'" %
                        v_shape)

    if (not (v_shape.dtype.startswith('int')
             or v_shape.dtype.startswith('uint'))):
        raise TypeError('shape must be an integer vector or list',
                        v_shape.dtype)

    if args_ndim > ndim:
        raise ValueError(
            'ndim should be at least as big as required by args value',
            (ndim, args_ndim), args)

    assert ndim == len(bcast)
    return ndim, tensor.cast(v_shape, 'int64'), tuple(bcast)
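
A hedged usage sketch of _infer_ndim_bcast: with shape=(-1, 3) and a matrix argument, the -1 entry is resolved from the first dimension of that argument, as described in the comments above. The variable names are only for illustration.

import theano.tensor as tensor

x = tensor.matrix('x')
ndim, v_shape, bcast = _infer_ndim_bcast(2, (-1, 3), x)
# ndim == 2, v_shape is the symbolic int64 vector [x.shape[0], 3],
# and bcast == (False, False)
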
Exemplo n.º 54
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store euclidean gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store riemannian gradients
        self.rs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store jacobi diagonal
        self.js = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']
        self.damping = theano.shared(numpy.float32(options['mreg']))
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              Ms=self.js,
                              rtol=options['mrtol'],
                              shift=self.damping,
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc0, acc1):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_cost2 = safe_clone(model.train_cost,
                                  replace=dict(zip(model.inputs, nw_inps)))
            return [_idx + const(1), acc0 + nw_cost, acc1 + nw_cost2]

        acc0 = const([0])
        acc1 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0, acc1],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        cost0 = rvals[2].sum() / const(n_steps)
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]

        denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        rho = (final_cost - cost0) / denom
        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr], [final_cost, rho],
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Exemplo n.º 55
0
Arquivo: updates.py Projeto: pyx123/QA
def adadelta(loss_or_grads, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """ Adadelta updates
    Scale learning rates by the ratio of accumulated gradients to accumulated
    updates, see [1]_ and notes for further description.
    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    rho : float or symbolic scalar
        Squared gradient moving average decay factor
    epsilon : float or symbolic scalar
        Small value added for numerical stability
    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression
    Notes
    -----
    rho should be between 0 and 1. A value of rho close to 1 will decay the
    moving average slowly and a value close to 0 will decay the moving average
    fast.
    rho = 0.95 and epsilon=1e-6 are suggested in the paper and reported to
    work for multiple datasets (MNIST, speech).
    In the paper, no learning rate is considered (so learning_rate=1.0).
    Probably best to keep it at this value.
    epsilon is important for the very first update (so the numerator does
    not become 0).
    Using the step size eta and a decay factor rho the learning rate is
    calculated as:
    .. math::
       r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\
       \\eta_t &= \\eta \\frac{\\sqrt{s_{t-1} + \\epsilon}}
                             {\\sqrt{r_t + \\epsilon}}\\\\
       s_t &= \\rho s_{t-1} + (1-\\rho)*(\\eta_t*g)^2
    References
    ----------
    .. [1] Zeiler, M. D. (2012):
           ADADELTA: An Adaptive Learning Rate Method.
           arXiv Preprint arXiv:1212.5701.
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # accu: accumulate gradient magnitudes
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        # delta_accu: accumulate update magnitudes (recursively!)
        delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        # update accu (as in rmsprop)
        accu_new = rho * accu + (one - rho) * grad**2
        updates[accu] = accu_new

        # compute parameter update, using the 'old' delta_accu
        update = (grad * T.sqrt(delta_accu + epsilon) /
                  T.sqrt(accu_new + epsilon))
        updates[param] = param - learning_rate * update

        # update delta_accu (as accu, but accumulating updates)
        delta_accu_new = rho * delta_accu + (one - rho) * update**2
        updates[delta_accu] = delta_accu_new

    return updates
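
A minimal, hypothetical usage sketch of these adadelta updates on a small least-squares problem, assuming get_or_compute_grads (referenced above) is importable alongside the function.

import numpy as np
import theano
import theano.tensor as T

# hypothetical data and a single parameter vector
x = T.vector('x')
t = T.vector('t')
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')

loss = T.mean((x * w - t) ** 2)
updates = adadelta(loss, [w], learning_rate=1.0, rho=0.95, epsilon=1e-6)
train = theano.function([x, t], loss, updates=updates)
# each call to train(...) applies one adadelta step to w
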
Exemplo n.º 56
0
def adam_vlr(loss_or_grads,
             params,
             lr_map,
             beta1=0.9,
             beta2=0.999,
             epsilon=1e-8):
    """Adam updates with Variable Learning Rates

    Adam updates implemented as in [1]_.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    lr_map : dictionary of floats
        Learning rate map keyed by parameter (shared variable), giving the
        learning rate to use for that parameter
    beta1 : float
        Exponential decay rate for the first moment estimates.
    beta2 : float
        Exponential decay rate for the second moment estimates.
    epsilon : float
        Constant for numerical stability.

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    The paper [1]_ includes an additional hyperparameter lambda. This is only
    needed to prove convergence of the algorithm and has no practical use
    (personal communication with the authors), it is therefore omitted here.

    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.
    """
    all_grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    t = t_prev + 1

    for param, g_t in zip(params, all_grads):
        a_t = lr_map[param] * T.sqrt(one - beta2**t) / (one - beta1**t)
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        v_t = beta2 * v_prev + (one - beta2) * g_t**2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
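
Note that adam_vlr indexes lr_map with the parameter objects themselves (lr_map[param] above), so the dictionary keys must be the shared variables rather than layer-name strings. A hypothetical usage sketch, assuming lasagne and the utils helper used above are available:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
w1 = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w1')
w2 = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w2')
loss = T.sum((x * w1 - 1.0) ** 2) + T.sum((x * w2 - 2.0) ** 2)

# per-parameter learning rates, keyed by the shared variables
lr_map = {w1: 1e-3, w2: 1e-4}
updates = adam_vlr(loss, [w1, w2], lr_map)
train = theano.function([x], loss, updates=updates)
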
Exemplo n.º 57
0
 def batch_norm(self,
                h,
                dim,
                use_shift=True,
                use_std=True,
                use_sample=0.0,
                force_sample=False,
                index=None,
                sample_mean=None,
                gamma=None,
                beta=None,
                depth_norm=False):
     x = h
     if h.ndim == 3:
         if index is None: index = self.index
         x = h.reshape((h.shape[0] * h.shape[1],
                        h.shape[2]))[(index.flatten() > 0).nonzero()]
     elif h.ndim == 4:  # index is sizes here
         assert index is not None
         x = h.reshape((h.shape[0] * h.shape[1] * h.shape[2], h.shape[3]))
         #x = x[(T.gt(x,numpy.float32(0))>0).nonzero()]
     mean = T.mean(x, axis=0)
     std = T.sqrt(T.mean((x - mean)**2, axis=0))
     if sample_mean is None:
         sample_mean = self.add_param(theano.shared(
             numpy.zeros((dim, ), 'float32'),
             '%s_%s_mean' % (self.name, h.name)),
                                      custom_update=mean,
                                      custom_update_normalized=True)
     self.sample_mean = sample_mean
     sample_std = T.sqrt(T.mean((x - sample_mean)**2, axis=0))
     if not self.train_flag and not force_sample:
         use_sample = 1.0
     mean = T.constant(1. - use_sample, 'float32') * mean + T.constant(
         use_sample, 'float32') * sample_mean
     std = T.constant(1. - use_sample, 'float32') * std + T.constant(
         use_sample, 'float32') * sample_std
     if h.ndim == 3:
         mean = mean.dimshuffle('x', 'x',
                                0).repeat(h.shape[0],
                                          axis=0).repeat(h.shape[1], axis=1)
         std = std.dimshuffle('x', 'x', 0).repeat(h.shape[0],
                                                  axis=0).repeat(h.shape[1],
                                                                 axis=1)
     elif h.ndim == 4:
         mean = mean.dimshuffle('x', 'x', 'x', 0).repeat(
             h.shape[0], axis=0).repeat(h.shape[1],
                                        axis=1).repeat(h.shape[2], axis=2)
         std = std.dimshuffle('x', 'x', 'x', 0).repeat(
             h.shape[0], axis=0).repeat(h.shape[1],
                                        axis=1).repeat(h.shape[2], axis=2)
     else:
         mean = mean.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
         std = std.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
     bn = (h - mean) / (std + numpy.float32(1e-10))
     if use_std:
         if gamma is None:
             gamma = self.add_param(
                 self.shared(
                     numpy.zeros((dim, ), 'float32') + numpy.float32(0.1),
                     "%s_%s_gamma" % (self.name, h.name)))
         self.gamma = gamma
         if h.ndim == 3:
             bn *= gamma.dimshuffle('x', 'x',
                                    0).repeat(h.shape[0],
                                              axis=0).repeat(h.shape[1],
                                                             axis=1)
         elif h.ndim == 4:
             bn *= gamma.dimshuffle('x', 'x', 'x', 0).repeat(
                 h.shape[0], axis=0).repeat(h.shape[1],
                                            axis=1).repeat(h.shape[2],
                                                           axis=2)
         else:
             bn *= gamma.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
     if use_shift:
         if beta is None:
             beta = self.add_param(
                 self.shared(numpy.zeros((dim, ), 'float32'),
                             "%s_%s_beta" % (self.name, h.name)))
         self.beta = beta
         bn += beta
     if depth_norm:
         bn = bn / (T.sqrt(2)**self.D)
     return bn
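
A plain numpy sketch of the normalisation performed above: batch statistics and stored sample statistics are mixed by use_sample, and scale (gamma) and shift (beta) are then applied. All values are illustrative.

import numpy as np

x = np.random.randn(8, 5).astype('float32')      # (frames, dim)
sample_mean = np.zeros(5, dtype='float32')        # stored running estimate
gamma, beta, use_sample = 0.1, 0.0, 0.0

mean = np.mean(x, axis=0)
std = np.sqrt(np.mean((x - mean) ** 2, axis=0))
sample_std = np.sqrt(np.mean((x - sample_mean) ** 2, axis=0))

mean = (1. - use_sample) * mean + use_sample * sample_mean
std = (1. - use_sample) * std + use_sample * sample_std
bn = gamma * ((x - mean) / (std + 1e-10)) + beta
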
Exemplo n.º 58
0
    def normal(self, size, avg=0.0, std=1.0, ndim=None,
               dtype=None, nstreams=None):
        """
        :param size:
          Can be a list of integers or Theano variables (ex: the shape
          of another Theano Variable)

        :param dtype:
          The output data type. If dtype is not specified, it will be
          inferred from the dtype of low and high, but will be at
          least as precise as floatX.

        :param nstreams:
          Number of streams.

        """
        # We need an even number of ]0,1[ samples. Then we split them
        # in two halves. First half becomes our U1's for Box-Muller,
        # second half our U2's. See Wikipedia page:
        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
        avg = as_tensor_variable(avg)
        std = as_tensor_variable(std)

        if dtype is None:
            dtype = scal.upcast(config.floatX, avg.dtype, std.dtype)

        avg = cast(avg, dtype)
        std = cast(std, dtype)

        evened = False
        constant = False
        if isinstance(size, tuple) and all([isinstance(i, (numpy.integer, int)) for i in size]):
            constant = True
            # Force dtype because it defaults to float when size is empty
            n_samples = numpy.prod(size, dtype='int64')

            if n_samples % 2 == 1:
                n_samples += 1
                evened = True
        else:
            #if even, don't change, if odd, +1
            n_samples = prod(size) + (prod(size) % 2)
        flattened = self.uniform(size=(n_samples,), dtype=dtype,
                                 nstreams=nstreams)

        if constant:
            U1 = flattened[:n_samples // 2]
            U2 = flattened[n_samples // 2:]
        else:
            U1 = flattened[:prod(flattened.shape) // 2]
            U2 = flattened[prod(flattened.shape) // 2:]

        #normal_samples = zeros_like(flattened)
        sqrt_ln_U1 = sqrt(-2.0 * log(U1))
        # TypeError: 'TensorVariable' object does not support item assignment
        # so this doesn't work...
        #normal_samples[:n_samples/2] = sqrt_ln_U1 * cos(2.0*numpy.pi*U2)
        #normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)

        # so trying this instead
        first_half = sqrt_ln_U1 * cos(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
        second_half = sqrt_ln_U1 * sin(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
        normal_samples = join(0, first_half, second_half)

        final_samples = None
        if evened:
            final_samples = normal_samples[:-1]
        elif constant:
            final_samples = normal_samples
        else:
            final_samples = normal_samples[:prod(size)]

        if not size:
            # Force the dtype to be int64, otherwise reshape complains
            size = tensor.constant(size, dtype='int64')
        final_samples = final_samples.reshape(size)

        final_samples = avg + std * final_samples

        assert final_samples.dtype == dtype
        return final_samples
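
A plain numpy sketch of the Box-Muller transform that normal() relies on: two batches of uniform samples become two batches of standard normals.

import numpy as np

n = 1000
U1 = np.random.uniform(1e-12, 1.0, n)   # avoid log(0)
U2 = np.random.uniform(0.0, 1.0, n)

r = np.sqrt(-2.0 * np.log(U1))
first_half = r * np.cos(2.0 * np.pi * U2)
second_half = r * np.sin(2.0 * np.pi * U2)
normal_samples = np.concatenate([first_half, second_half])
# normal_samples now holds 2*n approximately standard-normal draws
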
Exemplo n.º 59
0
 def __init__(self,
              sources,
              n_out,
              index,
              y_in=None,
              target=None,
              target_index=None,
              sparse=False,
              cost_scale=1.0,
              input_scale=1.0,
              L1=0.0,
              L2=0.0,
              L2_eye=None,
              varreg=0.0,
              output_L2_reg=0.0,
              output_entropy_reg=0.0,
              output_entropy_exp_reg=0.0,
              with_bias=True,
              mask="unity",
              dropout=0.0,
              batch_drop=False,
              batch_norm=False,
              bn_use_sample=False,
              layer_drop=0.0,
              residual=False,
              carry=False,
              sparse_filtering=False,
              gradient_scale=1.0,
              trainable=True,
              device=None,
              dtype='float32',
              **kwargs):
     """
 :param list[NetworkBaseLayer.Layer] sources: list of source layers
 :param int n_out: output dim of W_in and dim of bias
 :param float L1: l1-param-norm regularization
 :param float L2: l2-param-norm regularization
 :param str mask: "unity" or "dropout"
 :type dropout: float
 """
     super(Layer, self).__init__(**kwargs)
     self.index = index
     self.sources = sources
     ":type: list[Layer]"
     self.num_sources = len(sources)
     self.D = max([s.D for s in sources if isinstance(s, Layer)] + [0])
     if mask is None: mask = 'none'
     self.set_attr('mask', mask)
     self.set_attr('dropout', dropout)
     self.set_attr('sparse', sparse)
     self.set_attr('bn_use_sample', bn_use_sample)
     self.set_attr('sparse_filtering', sparse_filtering)
     if not trainable:
         self.set_attr('trainable', trainable)  # only store if not default
         self.gradient_scale = 0.0  # just to be sure
     else:
         self.gradient_scale = gradient_scale
     if gradient_scale != 1.0:
         self.set_attr('gradient_scale', gradient_scale)
     self.set_attr('layer_drop', layer_drop)
     assert not carry, "not supported anymore"
     self.set_attr('residual', residual)
     self.set_attr('n_out', n_out)
     self.set_attr('L1', L1)
     self.set_attr('L2', L2)
     if L2_eye:
         self.set_attr('L2_eye', L2_eye)
     self.device = device  # if device else str(theano.config.device)
     for s in self.sources:
         s.transfer_output(self.device)
     self.set_attr('varreg', varreg)
     if output_L2_reg:
         self.set_attr('output_L2_reg', output_L2_reg)
     if output_entropy_reg:
         self.set_attr('output_entropy_reg', output_entropy_reg)
     if output_entropy_exp_reg:
         self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
     self.set_attr('batch_norm', batch_norm)
     self.set_attr('input_scale', input_scale)
     if y_in is not None:
         self.y_in = {}
         for k in y_in:
             if not isinstance(y_in[k], T.Variable): continue
             self.y_in[k] = time_batch_make_flat(
                 y_in[k])  # TODO: better not flatten here...
             self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
     else:
         self.y_in = None
     self.constraints = T.constant(0)
     if target:
         self.set_attr('target', target)
     if target_index:
         self.set_attr('target_index', target_index)
         assert target_index in self.network.j
         self.index = index = self.network.j[target_index]
     if cost_scale != 1:
         self.set_attr("cost_scale", cost_scale)
     if with_bias:
         self.b = self.add_param(self.create_bias(n_out),
                                 'b_%s' % self.name)
     else:
         self.set_attr('with_bias', False)
         self.b = numpy.float32(0)
     self.mass = T.constant(1., name="mass_%s" % self.name, dtype='float32')
     self.masks = [None] * len(self.sources)
     assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
     if mask == "dropout" or (mask == 'none' and dropout > 0):
         assert 0.0 < dropout < 1.0
         # If we apply this mass during training then we don't need any mask or mass for testing.
         # The expected weight should be 1 in
         #   E[x] = mass * (1-dropout)
         # so mass has to be 1 / (1 - dropout).
         self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
         from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
         srng = RandomStreams(self.rng.randint(1234) + 1)
         if self.depth > 1:
             self.masks = [
                 T.cast(
                     srng.binomial(n=1,
                                   p=1 - dropout,
                                   size=(s.attrs['n_out'], self.depth)),
                     theano.config.floatX) for s in self.sources
             ]
         else:
             if batch_drop:
                 self.masks = [
                     T.cast(
                         srng.binomial(n=1,
                                       p=1 - dropout,
                                       size=s.output.shape),
                         theano.config.floatX) for s in self.sources
                 ]
             else:
                 self.masks = [
                     T.cast(
                         srng.binomial(n=1,
                                       p=1 - dropout,
                                       size=(s.attrs['n_out'], )),
                         theano.config.floatX) for s in self.sources
                 ]
Exemplo n.º 60
0
    def __init__(self, X, n_in, n_out, n_hidden_layers,
                 n_units_in, n_units_hidden,
                 M_lst=None, m_lst=None,
                 sigma_W_params_lst=None, sigma_b_params_lst=None,
                 sigma_W=1e-3, tune_sigma_W=True,
                 sigma_b=1e-6, tune_sigma_b=True,
                 l_W=1e-6, l_b=1e-6,
                 diag_noise=True, approx_cols=False,
                 divide_1st_layer_by_its_n_out=False,
                 b_out_deterministic=False, seed=None):
        assert n_hidden_layers > 0, 'n_hidden_layers must be positive'

        n_layers = n_hidden_layers + 1

        M_lst = [None] * n_layers if M_lst is None else M_lst
        m_lst = [None] * n_layers if m_lst is None else m_lst

        if sigma_W_params_lst is None:
            sigma_W_params_lst = [None] * n_layers
        if sigma_b_params_lst is None:
            sigma_b_params_lst = [None] * n_layers

        assert \
            len(M_lst) == len(m_lst) == len(sigma_W_params_lst) == \
            len(sigma_b_params_lst) == n_layers, \
            'length of all lists must be the same and equal to ' \
            '(n_hidden_layers + 1), where the +1 is for the output layer mapping'

        # set seed to ensure each layer is init differently (cf. seed += 1)
        seed = np.random.randint(int(1e6)) if seed is None else seed
        np.random.seed(seed)

        def activation(x):
            # leaky rectifier with negative slope 0.1
            return T.nnet.relu(x, alpha=0.1)

        self.in_layer = GaussLayer(
            input=X, n_in=n_in, n_out=n_units_in,
            M=M_lst[0], m=m_lst[0],
            sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
            sigma_W_params=sigma_W_params_lst[0],
            sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
            sigma_b_params=sigma_b_params_lst[0],
            l_W=l_W, l_b=l_b, diag_noise=diag_noise,
            activation=activation, approx_cols=approx_cols,
            seed=seed, name='h1'
        )
        self.layers = [self.in_layer]
        seed += 1

        # specific settings necessary for initialisation of deep GPs
        if divide_1st_layer_by_its_n_out:
            sqrt_n_out = T.constant(self.in_layer.n_out ** 0.5, dtype=floatX)
            self.in_layer.output /= sqrt_n_out

        # the first hidden layer was already set up above
        for i in xrange(1, n_hidden_layers):
            prev_layer = self.layers[-1]
            layer = GaussLayer(
                input=prev_layer.output,
                n_in=prev_layer.n_out, n_out=n_units_hidden,
                M=M_lst[i], m=m_lst[i],
                sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
                sigma_W_params=sigma_W_params_lst[i],
                sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
                sigma_b_params=sigma_b_params_lst[i],
                l_W=l_W, l_b=l_b, diag_noise=diag_noise,
                activation=activation, name='h' + str(i + 1),
                approx_cols=approx_cols, seed=seed
            )
            self.layers += [layer]
            seed += 1

        # initialised separately because of the necessary linear activation
        prev_layer = self.layers[-1]
        self.out_layer = GaussLayer(
            input=prev_layer.output, n_in=prev_layer.n_out, n_out=n_out,
            M=M_lst[-1], m=m_lst[-1],
            sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
            sigma_W_params=sigma_W_params_lst[-1],
            sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
            sigma_b_params=sigma_b_params_lst[-1],
            l_W=l_W, l_b=l_b, diag_noise=diag_noise,
            b_is_deterministic=b_out_deterministic,
            approx_cols=approx_cols, name='out', seed=seed
        )
        self.layers += [self.out_layer]

        self.softmax = SoftmaxLayer(
            input=self.out_layer.output, name='softmax'
        )

        # concatenate the trainable parameters of all layers into one flat list
        self.params = reduce(
            lambda x, y: x + y, [layer.grad_params for layer in self.layers]
        )

        self.input = X

        self.p_y_given_x = self.softmax.p_y_given_x
        self.y_pred = self.softmax.y_pred
        self.mean_log_likelihood = self.softmax.mean_log_likelihood
        self.errors = self.softmax.errors

        # self.kl_W = T.sum([layer.kl_W() for layer in self.layers])
        # self.kl_b = T.sum([layer.kl_b() for layer in self.layers])
        # self.kl = self.kl_W + self.kl_b

        self.effect_kl_W = T.sum([layer.effect_kl_W() for layer in self.layers])
        self.effect_kl_b = T.sum([layer.effect_kl_b() for layer in self.layers])
        self.effect_kl = self.effect_kl_W + self.effect_kl_b
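The effect_kl terms above are typically combined with the data-fit term into a variational training objective. A hedged sketch of such a per-datapoint cost, not from the original source: it assumes a model instance `net` built by this class, integer targets `y`, a training-set size `n_train`, and that `mean_log_likelihood` accepts the target vector.

def minus_elbo(net, y, n_train):
    """Negative evidence lower bound per data point (sketch under the assumptions above)."""
    data_fit = net.mean_log_likelihood(y)      # assumed: takes the integer target vector
    kl_penalty = net.effect_kl / float(n_train)  # KL spread over the whole training set
    return -data_fit + kl_penalty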