Example 1
def test_multinomial_n_samples():
    mode_ = mode
    if mode == 'FAST_COMPILE':
        mode_ = 'FAST_RUN'

    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
        mode == 'Mode' and config.linker in ['py']):
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)
    mode_ = theano.compile.mode.get_mode(mode_)

    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    
    for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
        m = R.multinomial(pvals=pvals, n=n_samples, dtype=config.floatX, nstreams=30 * 256)
        f = theano.function([], m, mode=mode_)
        basic_multinomialtest(f, steps, sample_size, pvals, n_samples, prefix='mrg ')
        sys.stdout.flush()
        
        if mode != 'FAST_COMPILE' and cuda_available:
            R = MRG_RandomStreams(234, use_cuda=True)
            pvals = numpy.asarray(pvals, dtype='float32')
            n = R.multinomial(pvals=pvals, n=n_samples, dtype='float32', nstreams=30 * 256)
            assert n.dtype == 'float32'
            f = theano.function(
                [],
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                mode=mode_.including('gpu'))
        
            sys.stdout.flush()
            basic_multinomialtest(f, steps, sample_size, pvals, n_samples, prefix='gpu mrg ')
Example 2
def test_multinomial():
    steps = 100
    mode_ = mode
    if mode == 'FAST_COMPILE':
        mode_ = 'FAST_RUN'

    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            mode == 'Mode' and config.linker in ['py']):
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)
    mode_ = theano.compile.mode.get_mode(mode_)
    # print ''
    # print 'ON CPU:'

    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
    # theano.printing.debugprint(f)
    out = f()
    basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                          prefix='mrg ')

    sys.stdout.flush()

    if mode != 'FAST_COMPILE' and cuda_available:
        # print ''
        # print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
        pvals = numpy.asarray(pvals, dtype='float32')
        # We give the number of streams to avoid a warning.
        n = R.multinomial(pvals=pvals, dtype='float32', nstreams=30 * 256)
        # well, it's really that this test w GPU doesn't make sense otw
        assert n.dtype == 'float32'
        f = theano.function(
            [],
            theano.sandbox.cuda.basic_ops.gpu_from_host(n),
            mode=mode_.including('gpu'))

        # theano.printing.debugprint(f)
        gpu_out = f()
        sys.stdout.flush()
        basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                              prefix='gpu mrg ')
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
Example 3
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
            self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        v = T.arange(0, mean_x.shape[0])
        m_x = mean_x[v, mode]
        m_y = mean_y[v, mode]
        s_x = std_x[v, mode]
        s_y = std_y[v, mode]
        r = rho[v, mode]
        # cov = r * (s_x * s_y)

        normal = srng.normal((h.shape[0], 2))
        x = normal[:, 0]
        y = normal[:, 1]

        # x_n = T.shape_padright(s_x * x + cov * y + m_x)
        # y_n = T.shape_padright(s_y * y + cov * x + m_y)

        x_n = T.shape_padright(m_x + s_x * x)
        y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2)))

        uniform = srng.uniform((h.shape[0],))
        pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

        return T.concatenate([x_n, y_n, pin], axis=1)
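
The T.argmax(srng.multinomial(...), axis=1) idiom above turns a one-hot multinomial draw into one mixture-component index per row. A minimal standalone sketch of just that selection step (the names and toy probabilities below are illustrative assumptions, not part of the original model):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=42)
prop = T.matrix('prop')                    # each row sums to 1
one_hot = srng.multinomial(pvals=prop, dtype=prop.dtype)
component = T.argmax(one_hot, axis=1)      # one component index per row

pick = theano.function([prop], component)
probs = np.asarray([[0.1, 0.9], [0.8, 0.2]], dtype=theano.config.floatX)
print(pick(probs))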
Example 4
File: intro.py Project: FynnBe/ML2
def dropout(X, p_use=1.):
    if p_use < 1.:
        rs = RandomStreams()
        out = rs.multinomial(pvals=[[p_use, 1.-p_use]]*len(X))
        print out

    else:
        return X
Example 5
File: test.py Project: FynnBe/ML2
def dropout(X, p_use=1.):
    if p_use < 1.:
        rs = RandomStreams()
        out = rs.multinomial(pvals=[[p_use, 1.-p_use]])
        print out.flatten()

        print dir(out.T)

    else:
        return X
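
Note that both dropout drafts above only print the symbolic sample and never return a masked X when p_use < 1. A minimal working sketch, assuming X is a Theano matrix and swapping the two-column multinomial for an element-wise binomial (Bernoulli) mask:

import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

def dropout_sketch(X, p_use=1.):
    # Keep each unit with probability p_use and rescale to preserve the expected value.
    if p_use >= 1.:
        return X
    rs = MRG_RandomStreams(12345)
    mask = rs.binomial(size=X.shape, n=1, p=p_use, dtype=X.dtype)
    return X * mask / p_use

X = T.matrix('X')
f = theano.function([X], dropout_sketch(X, p_use=0.8))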
Example 6
def test_multinomial():

    steps = 100
    mode_ = mode
    if mode == "FAST_COMPILE":
        mode_ = "FAST_RUN"

    if mode in ["DEBUG_MODE", "DebugMode", "FAST_COMPILE"]:
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)
    mode_ = theano.compile.mode.get_mode(mode_)
    print ""
    print "ON CPU:"

    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
    theano.printing.debugprint(f)
    out = f()
    basic_multinomialtest(f, steps, sample_size, pvals, prefix="mrg ")

    sys.stdout.flush()

    if mode != "FAST_COMPILE" and cuda_available:
        print ""
        print "ON GPU:"
        R = MRG_RandomStreams(234, use_cuda=True)
        pvals = numpy.asarray(pvals, dtype="float32")
        # We give the number of streams to avoid a warning.
        n = R.multinomial(pvals=pvals, dtype="float32", nstreams=30 * 256)
        assert n.dtype == "float32"  # well, it's really that this test w GPU doesn't make sense otw
        f = theano.function([], theano.sandbox.cuda.basic_ops.gpu_from_host(n), mode=mode_.including("gpu"))

        theano.printing.debugprint(f)
        gpu_out = f()
        sys.stdout.flush()
        basic_multinomialtest(f, steps, sample_size, pvals, prefix="gpu mrg ")
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
Example 7
    def __init__(self, seq_len, emb_size, n_hidden, size_dict, batch_size, lr):

        self.seq_len = seq_len
        self.batch_size = batch_size

        w_emb = shared(np.random.normal(
            0, 0.01, size=(size_dict, emb_size)).astype(dtype=floatX))

        w_in = shared(np.random.normal(
            0, 0.01, size=(emb_size, n_hidden)).astype(dtype=floatX))

        b_in = shared(np.random.normal(
            0, 0.01, size=(n_hidden,)).astype(dtype=floatX))

        # IRNN initialization
        # w_hidden = shared(np.eye(n_hidden).astype(dtype=floatX))
        w_hidden = shared(np.random.normal(
            0, 0.01, size=(n_hidden, n_hidden)).astype(dtype=floatX))

        b_hidden = shared(np.random.normal(
            0, 0.01, size=(n_hidden,)).astype(dtype=floatX))

        w_out = shared(np.random.normal(
            0, 0.01, size=(n_hidden, size_dict)).astype(dtype=floatX))
        b_out = shared(np.random.normal(
            0, 0.01, size=(size_dict,)).astype(dtype=floatX))

        self.params = [w_emb, w_in, b_in, w_hidden, b_hidden, w_out, b_out]

        x = t.imatrix('x')
        y = t.ivector('y')

        self.init_state = shared(np.zeros((batch_size, n_hidden), dtype=floatX))
        buff = self.init_state
        for e in xrange(seq_len):
            emb = w_emb[x[:, e]]
            emb = emb.reshape((x.shape[0], -1))
            buff = relu(t.dot(emb, w_in) + t.dot(buff, w_hidden) + b_hidden)

        y_hat = t.nnet.softmax((t.dot(buff, w_out)) + b_out)

        cost = t.nnet.categorical_crossentropy(y_hat, y).mean()

        params = [w_emb, w_in, w_hidden, b_hidden, w_out, b_out]

        grads = t.grad(cost, params)
        updates = [(self.init_state, buff)] + \
                  [(w, w - lr * p) for w, p in zip(params, grads)]

        self.fun_cost = function([x, y], cost, updates=updates)

        rng = MRG_RandomStreams(42)
        next_char = t.argmax(rng.multinomial(pvals=y_hat), axis=1)
        self.fun_predict = function([x], next_char)
Example 8
def test_target_parameter():
    srng = MRG_RandomStreams()
    pvals = np.array([[.98, .01, .01], [.01, .49, .50]])

    def basic_target_parameter_test(x):
        f = theano.function([], x)
        assert isinstance(f(), np.ndarray)

    basic_target_parameter_test(srng.uniform((3, 2), target='cpu'))
    basic_target_parameter_test(srng.binomial((3, 2), target='cpu'))
    basic_target_parameter_test(srng.multinomial(pvals=pvals.astype('float32'), target='cpu'))
    basic_target_parameter_test(srng.choice(p=pvals.astype('float32'), replace=False, target='cpu'))
    basic_target_parameter_test(srng.multinomial_wo_replacement(pvals=pvals.astype('float32'), target='cpu'))
Example 9
def test_undefined_grad_opt():
    # Make sure that undefined grad get removed in optimized graph.
    random = RandomStreams(np.random.randint(1, 2147462579))
    pvals = theano.shared(np.random.rand(10, 20).astype(theano.config.floatX))
    pvals = pvals / pvals.sum(axis=1)
    pvals = gradient.zero_grad(pvals)
    samples = random.multinomial(pvals=pvals, n=1)
    samples = theano.tensor.cast(samples, pvals.dtype)
    samples = gradient.zero_grad(samples)
    cost = theano.tensor.sum(samples + pvals)
    grad = theano.tensor.grad(cost, samples)
    f = theano.function([], grad)
    theano.printing.debugprint(f)
    assert not any([isinstance(node.op, gradient.UndefinedGrad) for node in f.maker.fgraph.apply_nodes])
Example 10
    def get_decide_func(self):
        """
        Returns a theano function that takes a minibatch
        (num_examples, num_features) of contexts and returns
        a minibatch (num_examples, num_classes) of one-hot codes
        for actions.
        """

        X = T.matrix()
        y_hat = self.mlp.fprop(X)

        theano_rng = MRG_RandomStreams(2013 + 11 + 20)
        if self.stochastic:
            a = theano_rng.multinomial(pvals=y_hat, dtype='float32')
        else:
            mx = T.max(y_hat, axis=1).dimshuffle(0, 'x')
            a = T.eq(y_hat, mx)

        if self.epsilon is not None:
            a = theano_rng.multinomial(pvals = (1. - self.epsilon) * a +
                    self.epsilon * T.ones_like(y_hat) / y_hat.shape[1],
                    dtype = 'float32')

        if self.epsilon_stochastic is not None:
            a = theano_rng.multinomial(pvals = (1. - self.epsilon_stochastic) * a +
                    self.epsilon_stochastic * y_hat,
                    dtype = 'float32')

        print "Compiling classifier agent learning function"
        t1 = time.time()
        f = function([X], a)
        t2 = time.time()

        print "...done, took", t2 - t1

        return f
Example 11
def stochastic_pool(neibs, axis, deterministic):
    """
    NOTE: assumes that inputs are >= 0
    """
    assert axis == 1
    # TODO parameterize
    epsilon = 1e-6
    as_p = neibs / (neibs.sum(axis=axis, keepdims=True) + epsilon)
    if deterministic:
        mask = as_p
    else:
        # FIXME save state in network
        srng = MRG_RandomStreams()
        mask = srng.multinomial(pvals=as_p).astype(fX)
    return (neibs * mask).sum(axis=axis)
Example 12
def softmax_sample_layer(list_of_multinomial_inputs, name, random_state=None):
    theano_seed = random_state.randint(-2147462579, 2147462579)
    # Super edge case...
    if theano_seed == 0:
        print("WARNING: prior layer got 0 seed. Reseeding...")
        theano_seed = random_state.randint(-2**32, 2**32)
    theano_rng = MRG_RandomStreams(seed=theano_seed)
    conc_multinomial = concatenate(list_of_multinomial_inputs, name, axis=1)
    shape = expression_shape(conc_multinomial)
    conc_multinomial /= len(list_of_multinomial_inputs)
    tag_expression(conc_multinomial, name, shape)
    samp = theano_rng.multinomial(pvals=conc_multinomial,
                                  dtype="int32")
    tag_expression(samp, name, (shape[0], shape[1]))
    return samp
Example 13
def test_multinomial():
    steps = 100

    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            config.mode == 'Mode' and config.linker in ['py']):
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)

    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
    R = MRG_RandomStreams(234)
    # Note: we specify `nstreams` to avoid a warning.
    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m)
    f()
    basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                          prefix='mrg ')
Example 14
def test_multinomial_n_samples():
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            config.mode == 'Mode' and config.linker in ['py']):
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)

    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(lambda row: row / np.sum(row), 1, pvals)
    R = MRG_RandomStreams(234)

    for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
        m = R.multinomial(pvals=pvals, n=n_samples,
                          dtype=config.floatX, nstreams=30 * 256)
        f = theano.function([], m)
        basic_multinomialtest(f, steps, sample_size, pvals,
                              n_samples, prefix='mrg ')
        sys.stdout.flush()
Example 15
    def get_cost(self, X, Y, **kwargs):

        # Dream
        theano_rng = MRG_RandomStreams(2012 + 12 + 18)
        exp_y = T.nnet.softmax(T.alloc(0., self.batch_size, self.n_classes) + self.gyb)
        dy = theano_rng.multinomial(pvals = exp_y, dtype='float32')
        dy = block_gradient(dy)
        exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b)
        dh2 = theano_rng.binomial(p = exp_h2, size = exp_h2.shape, dtype='float32')
        dh2 = block_gradient(dh2)
        exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b)
        dh1 = theano_rng.binomial(p = exp_h1, size = exp_h1.shape, dtype='float32')
        dh1 = block_gradient(dh1)
        exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb)
        dv = theano_rng.binomial(p = exp_v, size = exp_v.shape, dtype='float32')
        dv = block_gradient(dv)

        # Explanation of dream
        zh1, rh1 = self.infer_h1(dv)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = T.nnet.sigmoid(zh2)
        zy = T.dot(rh2, self.ryw) + self.ryb

        # Probability of dream
        dream_prob = sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) + softmax_prob(zy, dy)

        # Explanation of reality
        zh1, rh1 = self.infer_h1(X)
        rh1 = block_gradient(rh1)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = theano_rng.binomial(p = T.nnet.sigmoid(zh2), size = zh2.shape, dtype='float32')
        rh2 = block_gradient(rh2)

        # Probability of reality
        real_prob = softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb, Y) + \
                sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) + \
                sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) + \
                sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X)

        return - dream_prob - real_prob + .0001 * (
            T.sqr(self.gvw).sum() + T.sqr(self.gh1w).sum() + \
                    T.sqr(self.gh2w).sum()
                )
Example 16
    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype)

        p_state = sharedX( self.output_space.get_origin_batch(
            num_examples))


        t2 = time.time()

        f = function([], updates = {
            h_state : h_sample
            })

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self)+'.make_state took',t4-t1
        print '\tcompose time:',t2-t1
        print '\tcompile time:',t3-t2
        print '\texecute time:',t4-t3

        h_state.name = 'softmax_sample_shared'

        return h_state
Example 17
def softmax_sample_layer(list_of_multinomial_inputs, graph, name,
                         random_state=None):
    theano_seed = random_state.randint(-2147462579, 2147462579)
    # Super edge case...
    if theano_seed == 0:
        print("WARNING: prior layer got 0 seed. Reseeding...")
        theano_seed = random_state.randint(-2**32, 2**32)
    theano_rng = MRG_RandomStreams(seed=theano_seed)
    conc_multinomial = concatenate(list_of_multinomial_inputs, graph,
                                   name, axis=1)
    conc_multinomial /= len(list_of_multinomial_inputs)
    samp = theano_rng.multinomial(pvals=conc_multinomial,
                                  dtype="int32")
    # We know shape of conc_multinomial == shape of random sample
    shape = calc_expected_dims(graph, conc_multinomial)
    list_of_random = [samp, ]
    list_of_names = [name + "_random", ]
    list_of_shapes = [shape, ]
    add_random_to_graph(list_of_random, list_of_shapes, list_of_names, graph)
    return samp
Example 18
    def __init__(self, seq_len, emb_size, n_hidden, size_dict, lr):
        self.seq_len = seq_len

        # Parameters
        w_emb = shared(np.random.normal(
            0, 0.01, size=(size_dict, emb_size)).astype(dtype=floatX))

        w_hidden = shared(np.random.normal(
            0, 0.01, size=(seq_len * emb_size, n_hidden)).astype(dtype=floatX))
        b_hidden = shared(np.random.normal(
            0, 0.01, size=(n_hidden,)).astype(dtype=floatX))

        w_out = shared(np.random.normal(
            0, 0.01, size=(n_hidden, size_dict)).astype(dtype=floatX))
        b_out = shared(np.random.normal(
            0, 0.01, size=(size_dict,)).astype(dtype=floatX))

        # Graph
        x = t.imatrix('x')
        target = t.ivector('y')

        emb = w_emb[x]
        buff = relu(t.dot(emb.reshape((x.shape[0], -1)), w_hidden) +
                     b_hidden)
        y_hat = t.nnet.softmax((t.dot(buff, w_out)) + b_out)

        cost = t.nnet.categorical_crossentropy(y_hat, target).mean()

        params = [w_emb, w_hidden, b_hidden, w_out, b_out]

        grads = t.grad(cost, params)
        updates = [(w, w - lr * p) for w, p in zip(params, grads)]

        self.fun_cost = theano.function([x, target], cost, updates=updates)

        # Sampling function
        rng = MRG_RandomStreams(42)
        next_char = t.argmax(rng.multinomial(pvals=y_hat), axis=1)
        self.fun_predict = theano.function([x], next_char)
Example 19
def lwta(p, block_size):
    """
    The hard local winner take all non-linearity from "Compete to Compute"
    by Rupesh Srivastava et al
    Our implementation differs slightly from theirs--we break ties randomly,
    they break them by earliest index. This difference is just due to ease
    of implementation in theano.
    """
    batch_size = p.shape[0]
    num_filters = p.shape[1]
    num_blocks = num_filters // block_size
    w = p.reshape((batch_size, num_blocks, block_size))
    block_max = w.max(axis=2).dimshuffle(0, 1, 'x') * T.ones_like(w)
    max_mask = T.cast(w >= block_max, 'float32')
    theano_rng = MRG_RandomStreams(20131206 % (2 ** 16))
    denom = max_mask.sum(axis=2).dimshuffle(0, 1, 'x')
    probs = max_mask / denom
    probs = probs.reshape((batch_size * num_blocks, block_size))
    max_mask = theano_rng.multinomial(pvals=probs, dtype='float32')
    max_mask = max_mask.reshape((batch_size, num_blocks, block_size))
    w = w * max_mask
    w = w.reshape((p.shape[0], p.shape[1]))
    return w
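
A hedged usage sketch for lwta() above (assuming the function and its imports are in scope; the toy input is an illustration): with block_size=2 each row keeps the maximum of every adjacent pair of filters, and exact ties are broken randomly.

import numpy as np
import theano
import theano.tensor as T

p = T.matrix('p', dtype='float32')
winner = theano.function([p], lwta(p, block_size=2))
x = np.asarray([[1., 3., 2., 2.],
                [0., 5., 4., 4.]], dtype='float32')
print(winner(x))   # e.g. [[0. 3. 2. 0.], [0. 5. 0. 4.]]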
Example 20
class SLmodel():
	
	#This is the switched conditional linear model for integrating 
	#action with sensation
	
	def __init__(self, nx, ns, nh, na, npcl, xvar=1.0):
		
		#for this model I assume one linear generative model and a 
		#combination of nh linear dynamical models
		
		#generative matrix
		init_W=np.asarray(np.random.randn(nx,ns)/10.0,dtype='float32')
		
		#observed variable means
		init_c=np.asarray(np.zeros(nx),dtype='float32')
		
		#dynamical matrices
		init_M=np.asarray((np.tile(np.eye(ns),(1,nh))),dtype='float32')  #for state-based predictions
		init_C=np.asarray((np.tile(np.zeros((na,ns)),(1,nh))),dtype='float32')  #for action-based predictions
		
		#state-variable variances
		#(covariance matrix of state variable noise assumed to be diagonal)
		init_b=np.asarray(np.ones(ns)*10.0,dtype='float32')
		
		#Switching parameter matrices
		init_A=np.asarray(np.zeros((ns,nh)),dtype='float32') #associated with the state
		init_B=np.asarray(np.zeros((na,nh)),dtype='float32') #associated with actions
		
		#priors for switching variable
		init_ph=np.asarray(np.zeros(nh),dtype='float32')
		
		init_s_now=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_weights_now=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		init_s_past=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_h_past=np.asarray(np.zeros((npcl,nh)),dtype='float32')
		init_h_past[:,0]=1.0
		init_weights_past=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		init_a_past=np.asarray(np.zeros((1,na)),dtype='float32')
		
		self.W=theano.shared(init_W)
		self.c=theano.shared(init_c)
		self.M=theano.shared(init_M)
		self.C=theano.shared(init_C)
		self.b=theano.shared(init_b)
		self.A=theano.shared(init_A)
		self.B=theano.shared(init_B)
		self.ph=theano.shared(init_ph)
		
		#this is to help vectorize operations
		self.sum_mat=T.as_tensor_variable(np.asarray((np.tile(np.eye(ns),nh)).T,dtype='float32'))
		
		self.s_now=theano.shared(init_s_now)
		self.weights_now=theano.shared(init_weights_now)
		
		self.s_past=theano.shared(init_s_past)
		self.h_past=theano.shared(init_h_past)
		self.h_now=theano.shared(init_h_past)	#assumed fix: h_now is read in sample_joint/resample_step but was never created
		self.a_past=theano.shared(init_a_past)
		self.weights_past=theano.shared(init_weights_past)
		
		self.xvar=np.asarray(xvar,dtype='float32')
		
		self.nx=nx		#dimensionality of observed variables
		self.ns=ns		#dimensionality of latent variables
		self.nh=nh		#number of (linear) dynamical modes
		self.na=na		#dimensionality of action variables
		self.npcl=npcl	#number of particles in particle filter
		
		self.theano_rng = RandomStreams()
		
		self.params=				[self.W, self.M, self.C, self.b, self.A, self.B, self.c, self.ph]
		self.rel_lrates=np.asarray([  0.1,    1.0,    1.0,    0.01,   10.0,    10.0,  0.1,     1.0]   ,dtype='float32')
	
	
	def sample_proposal_s(self, s, a, h, xpred, sig):
		
		s_pred=self.get_prediction(s, a, h)
		
		n=self.theano_rng.normal(size=T.shape(s))
		
		#This is the proposal distribution that arises when one assumes that W'W=I
		
		mean=2.0*(xpred+s_pred*(self.b**2))*sig
		
		s_prop=mean+n*T.sqrt(sig)
		
		#I compute the term inside the exponent for the pdf of the proposal distrib
		prop_term=-T.sum(n**2)/2.0
		
		return T.cast(s_prop,'float32'), T.cast(s_pred,'float32'), T.cast(prop_term,'float32')
	
	
	#This function is required if we allow multiple generative models
	
	#def get_recon(self, s, h):
		
		#W_vec=T.sum(self.W*h, axis=0)
		#W=W.reshape((self.nx, self.ns))
		
		#xr=T.dot(W, s)
		
		#return xr
	
	
	def calc_h_probs(self, s, a):
		
		#this function takes an np by ns matrix of s samples plus
		#an action vector a
		#and returns an np by nh set of h probabilities
		
		exp_terms=T.dot(s, self.A)+ T.reshape(T.dot(a, self.B),(1,self.nh)) + T.reshape(self.ph,(1,self.nh))
		
		#re-centering for numerical stability
		exp_terms_recentered=exp_terms-T.max(exp_terms,axis=1,keepdims=True)
		
		#exponentiation and normalization
		rel_probs=T.exp(exp_terms_recentered)
		probs=rel_probs.T/T.sum(rel_probs, axis=1)
		
		return probs.T
	
		
	
	def forward_filter_step(self, a, xp):
		
		#first sample from h given s and a
		
		h_probs = self.calc_h_probs(self.s_now, a)
		h_samps=self.theano_rng.multinomial(pvals=h_probs)
		
		#need to sample from the proposal distribution
		#these terms are the same for every particle
		xpred=T.dot(self.W.T,(xp-self.c))/(2.0*self.xvar**2)
		sig=(1.0/(self.b**2+1.0/(2.0*self.xvar**2)))/2.0
		
		#sig=1.0/(self.b**2)
		
		#vectorized version
		s_pred=self.get_prediction(self.s_now, a, h_samps)
		
		n=self.theano_rng.normal(size=T.shape(self.s_now))
		
		mean=2.0*(xpred+s_pred*(self.b**2))*sig
		
		#mean=s_pred  #trying out using solely predictive proposal distrib
		
		s_samps=mean+n*T.sqrt(sig)
		
		prop_terms=-T.sum(n**2,axis=1)/2.0
		
		updates={}
		
		#now that we have samples from the proposal distribution, we need to reweight them
		
		recons=T.dot(self.W, s_samps.T) + T.reshape(self.c,(self.nx,1))
		
		x_terms=-T.sum((recons-T.reshape(xp,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)
		s_terms=-T.sum(((s_samps-s_pred)*self.b)**2,axis=1)/2.0
		
		energies=x_terms+s_terms-prop_terms
		
		#to avoid exponentiating large or very small numbers, I 
		#"re-center" the reweighting factors by adding a constant, 
		#as this has no impact on the resulting new weights
		
		energies_recentered=energies-T.max(energies)
		
		alpha=T.exp(energies_recentered) #these are the reweighting factors
		
		new_weights_unnorm=self.weights_now*alpha
		normalizer=T.sum(new_weights_unnorm)
		new_weights=new_weights_unnorm/normalizer  #need to normalize new weights
		
		updates[self.h_past]=T.cast(h_samps,'float32')
		updates[self.s_past]=T.cast(self.s_now,'float32')
		updates[self.a_past]=T.cast(a,'float32')
		updates[self.s_now]=T.cast(s_samps,'float32')
		
		updates[self.weights_past]=T.cast(self.weights_now,'float32')
		updates[self.weights_now]=T.cast(new_weights,'float32')
		
		#return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
		#return normalizer, energies_recentered, updates
		#return h_samps, updates
		return updates
		
	
	def get_prediction(self, s, a, h):
		
		s_dot_M=T.dot(s, self.M)  #this is np by nh*ns
		a_dot_C=T.dot(a, self.C)  #this is 1 by nh*ns
		tot=s_dot_M+a_dot_C  #should be np by nh*ns
		s_pred=T.dot(tot*T.extra_ops.repeat(h,self.ns,axis=1),self.sum_mat) #should be np by ns
		
		return T.cast(s_pred,'float32')
	
	
	def sample_joint(self, sp):
		
		t2_samp=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s2_samp=T.cast(T.sum(self.s_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		h2_samp=T.cast(T.sum(self.h_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		
		diffs=self.b*(s2_samp-sp)
		sqr_term=T.sum(diffs**2,axis=1)
		alpha=T.exp(-sqr_term)
		probs_unnorm=self.weights_past*alpha
		probs=probs_unnorm/T.sum(probs_unnorm)
		
		t1_samp=self.theano_rng.multinomial(pvals=T.reshape(probs,(1,self.npcl))).T
		s1_samp=T.cast(T.sum(self.s_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		h1_samp=T.cast(T.sum(self.h_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		
		return [s1_samp, h1_samp, s2_samp, h2_samp]
	
	
	def calc_mean_h_energy(self, s, a, h):
		
		#you give this function a set of samples of s, a, and h,
		#it gives you the average energy of those samples
		
		exp_terms=T.dot(s, self.A)+ T.reshape(T.dot(a, self.B),(1,self.nh)) + T.reshape(self.ph,(1,self.nh))  #np by nh
		
		energies=T.sum(h*exp_terms,axis=1) - T.log(T.sum(T.exp(exp_terms),axis=1)) #should be np by 1
		
		energy=T.mean(energies)
		
		return energy
	
	
	def update_params(self, x1, x2, n_samps, lrate):
		
		#this function samples from the joint posterior and performs
		# a step of gradient ascent on the log-likelihood
		
		sp=self.get_prediction(self.s_past, self.a_past, self.h_past)
									
		#sp should be np by ns
		
		
		[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
									outputs_info=[None, None, None, None],
									non_sequences=[sp],
									n_steps=n_samps)
		
		
		
		x1_recons=T.dot(self.W, s1_samps.T) + T.reshape(self.c,(self.nx,1))
		x2_recons=T.dot(self.W, s2_samps.T) + T.reshape(self.c,(self.nx,1))
		
		s_pred = self.get_prediction(s1_samps, self.a_past, h1_samps)  #assumed fix: get_prediction also needs the action argument
		
		
		hterm1=self.calc_mean_h_energy(s1_samps, self.a_past, h1_samps)  #assumed fix: calc_mean_h_energy also needs the action argument
		#hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)
		
		sterm=-T.mean(T.sum((self.b*(s2_samps-s_pred))**2,axis=1))/2.0
		
		xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		xterm2=-T.mean(T.sum((x2_recons-T.reshape(x2,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		
		#energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
		energy = hterm1 + xterm1 + xterm2 + sterm 
		
		gparams=T.grad(energy, self.params, consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])
		
		# constructs the update dictionary
		for gparam, param, rel_lr in zip(gparams, self.params, self.rel_lrates):
			#gnat=T.dot(param, T.dot(param.T,param))
			updates[param] = T.cast(param + gparam*lrate*rel_lr,'float32')
		
		
		#make sure W has unit-length columns
		#new_W=updates[self.W]
		#updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')
		
		#MIGHT NEED TO NORMALIZE A
		
		
		return energy, updates
		
	
	def get_ESS(self):
		
		return 1.0/T.sum(self.weights_now**2)
	
	
	def resample_step(self):
		
		idx=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s_samp=T.sum(self.s_now*T.addbroadcast(idx,1),axis=0)
		h_samp=T.sum(self.h_now*T.addbroadcast(idx,1),axis=0)
		
		return T.cast(s_samp,'float32'), T.cast(h_samp,'float32')
	
	
	def resample(self):
		
		[s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
												outputs_info=[None, None],
												n_steps=self.npcl)
		
		updates[self.s_now]=T.cast(s_samps,'float32')
		updates[self.h_now]=T.cast(h_samps,'float32')
		updates[self.weights_now]=T.cast(T.ones_like(self.weights_now)/T.cast(self.npcl,'float32'),'float32') #dtype paranoia
		
		return updates
	
	
	def simulate_step(self, s, a):
		
		s=T.reshape(s,(1,self.ns))
		a=T.reshape(a,(1,self.na))
		#get h probabilities
		h_probs = self.calc_h_probs(s,a)
		h_samp=self.theano_rng.multinomial(pvals=h_probs)
		
		sp=self.get_prediction(s,a,h_samp)
		
		xp=T.dot(self.W, sp.T) + T.reshape(self.c,(self.nx,1))
		
		return T.cast(sp,'float32'), T.cast(xp,'float32'), h_samp
		
	
	def simulate_forward(self, a, n_steps):
		
		#a should be n_steps by na
		
		s0=T.sum(self.s_now*T.reshape(self.weights_now,(self.npcl,1)),axis=0)
		s0=T.reshape(s0,(1,self.ns))
		[sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
										outputs_info=[s0, None, None],
										sequences=[a],
										n_steps=n_steps)
		
		return sp, xp, hs, updates
Example 21
class MultiRBM(RBM):
    def __init__(self,
                 input,
                 n_vis,
                 n_hid,
                 n_cate,
                 W=None,
                 vbias=None,
                 hbias=None):
        '''
        The input should be a 3D tensor with (n_cat, N_sample, n_vis)
        '''
        self.input = input
        self.n_vis = n_vis
        self.n_hid = n_hid
        self.n_cate = n_cate

        if W is None:
            W = theano.shared(np.random.normal(size=(self.n_cate, self.n_vis,
                                                     self.n_hid)).astype(
                                                         theano.config.floatX),
                              borrow=True)
        if vbias is None:
            vbias = theano.shared(np.zeros(shape=(
                self.n_cate,
                self.n_vis,
            )).astype(theano.config.floatX),
                                  borrow=True)

        if hbias is None:
            hbias = theano.shared(np.zeros(shape=(self.n_hid, )).astype(
                theano.config.floatX),
                                  borrow=True)
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = MRG_RandomStreams(self.numpy_rng.randint(2**30))
        self.W = W
        self.vbias = vbias
        self.hbias = hbias

    def free_energy(self, vis):
        vW_b = T.batched_dot(vis, self.W) + T.addbroadcast(self.hbias, 1)
        visible_term = T.batched_dot(vis, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(vW_b)), axis=2)
        return T.sum(-hidden_term - visible_term, axis=0)

    def propup(self, vis):
        x = T.batched_dot(vis, self.W) + self.hbias
        return [x, T.nnet.sigmoid(x)]

    def propdown(self, hid):
        x = T.batched_dot(hid, self.W.dimshuffle(0, 2, 1)) + \
            self.vbias.dimshuffle((0, 'x', 1))
        e_x = T.exp(x - x.max(axis=0, keepdims=True))
        out = e_x / e_x.sum(axis=0, keepdims=True)
        return [x, out]

    def sample_v_given_h(self, hid):
        x, out = self.propdown(hid)
        v_sample = []
        for v in range(self.n_vis):
            v_sample += [
                self.theano_rng.multinomial(n=1, pvals=out[:, :,
                                                           v].T).dimshuffle(
                                                               1, 0, 'x')
            ]
        v_sample = T.concatenate(v_sample, axis=2)
        return [x, out, v_sample]

    def sample_h_given_v(self, vis):
        x, out = self.propup(vis)
        h_sample = self.theano_rng.binomial(n=1, p=out, size=out.shape)
        return [x, out, h_sample]
Example 22
def random_multinomial(shape=None, pvals=None, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.multinomial(size=shape, pvals=pvals, dtype=dtype)
Example 23
def sample(p, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.multinomial(n=1, pvals=p, dtype=theano.config.floatX)
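
A hedged usage sketch for the sample() helper above (the probability matrix and variable names are assumptions): each row of the result is a one-hot draw from the corresponding row of p.

import numpy as np
import theano
import theano.tensor as T

p = T.matrix('p')
draw = theano.function([p], sample(p, seed=123))
probs = np.asarray([[0.7, 0.2, 0.1],
                    [0.1, 0.1, 0.8]], dtype=theano.config.floatX)
print(draw(probs))   # e.g. [[1. 0. 0.], [0. 0. 1.]]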
Example 24
class ImportanceSampler():
    '''Implements importance sampling/resampling'''
    def __init__(self, ndims, n_particles, true_log_probs, proposal_func=None):
        '''
		true_log_probs: a function that returns the true relative log probabilities
		proposal_func: a function that returns (samples, relative_log_probabilities)
		n_particles: the number of particles to use
		'''
        self.true_log_probs = true_log_probs
        self.proposal_func = proposal_func
        self.n_particles = n_particles
        self.ndims = ndims

        init_particles = np.zeros((n_particles, self.ndims))
        init_weights = np.ones(n_particles) / float(n_particles)

        self.particles = theano.shared(init_particles.astype(np.float32))
        self.weights = theano.shared(init_weights.astype(np.float32))

        self.theano_rng = RandomStreams()

        self.get_ESS = None
        self.perform_resampling = None
        self.perform_sampling = None

    def set_proposal_func(self, proposal_func):
        '''You might need to use this if you want to make the proposal
		function depend on the current particles'''
        self.proposal_func = proposal_func
        return

    def sample_reweight(self):
        '''Samples new particles and reweights them'''
        samples, prop_log_probs = self.proposal_func()
        true_log_probs = self.true_log_probs(samples)
        diffs = true_log_probs - prop_log_probs
        weights_unnorm = T.exp(diffs)
        weights = weights_unnorm / T.sum(weights_unnorm)
        updates = OrderedDict()
        updates[self.weights] = T.cast(weights, 'float32')
        updates[self.particles] = T.cast(samples, 'float32')
        return updates

    def compute_ESS(self):
        '''Returns the effective sample size'''
        return 1.0 / T.sum(self.weights**2)

    def resample(self):
        '''Resamples using the current weights'''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.weights.dimshuffle('x', 0), self.n_particles, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        updates = OrderedDict()
        updates[self.particles] = self.particles[idxs]
        updates[self.weights] = T.cast(
            T.ones_like(self.weights) / float(self.n_particles), 'float32')
        return updates

    def compile(self):
        '''Compiles the ESS, resampling, and sampling functions'''
        ess = self.compute_ESS()
        self.get_ESS = theano.function([], ess)
        resample_updates = self.resample()
        self.perform_resampling = theano.function([], updates=resample_updates)
        sample_updates = self.sample_reweight()
        self.perform_sampling = theano.function([], updates=sample_updates)
        return
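
A hedged end-to-end sketch of driving the class above; the standard-normal target and the wider normal proposal below are illustrative assumptions, not part of the original code.

import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

ndims, n_particles = 2, 1000
prop_rng = RandomStreams(seed=0)

def true_log_probs(x):
    # Unnormalized log-density of N(0, I).
    return -T.sum(x ** 2, axis=1) / 2.0

def proposal():
    # Draw from N(0, 4*I) and return the matching unnormalized log-density.
    samples = 2.0 * prop_rng.normal(size=(n_particles, ndims))
    log_q = -T.sum(samples ** 2, axis=1) / 8.0
    return samples, log_q

sampler = ImportanceSampler(ndims, n_particles, true_log_probs, proposal)
sampler.compile()
sampler.perform_sampling()    # draw particles and reweight them
print(sampler.get_ESS())      # effective sample size of the weighted set
sampler.perform_resampling()  # multinomial resampling by weight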
Example 25
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])


if __name__ == "__main__":
    rng = MRG_RandomStreams(numpy.random.randint(2147462579))
    import time
    print theano.__file__
    pvals = theano.tensor.fmatrix()
    for i in range(10):
        t0 = time.time()
        multinomial = rng.multinomial(pvals=pvals)
        print time.time() - t0
Example 26
def test_undefined_grad():
    srng = MRG_RandomStreams(seed=1234)

    # checking uniform distribution
    low = tensor.scalar()
    out = srng.uniform((), low=low)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, low)

    high = tensor.scalar()
    out = srng.uniform((), low=0, high=high)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, high)

    out = srng.uniform((), low=low, high=high)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    out = srng.binomial((), p=prob)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(theano.tensor.sum(out), prob1)

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(theano.tensor.sum(out), (prob1, prob2))

    # checking choice
    p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out[0], (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out[0], (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out[0], prob1)

    # checking normal distribution
    avg = tensor.scalar()
    out = srng.normal((), avg=avg)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, std)

    out = srng.normal((), avg=avg, std=std)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, (avg, std))

    # checking truncated normal distribution
    avg = tensor.scalar()
    out = srng.truncated_normal((), avg=avg)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, avg)

    std = tensor.scalar()
    out = srng.truncated_normal((), avg=0, std=std)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, std)

    out = srng.truncated_normal((), avg=avg, std=std)
    with pytest.raises(theano.gradient.NullTypeGradError):
        theano.grad(out, (avg, std))
Example 27
class VisibleLayer(object):
    def __init__(self, v_dim, h_dim, v_type, mrng=None, rng=None, name=''):

        self.name = name if name != '' else 'v_layer'

        self.v_dim = v_dim
        self.h_dim = h_dim
        self.v_type = v_type

        seed = np.random.randint(1, 2**30)
        self._rng = RandomStreams(seed) if rng is None else rng
        self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng

        self._build_params()

    def set_total_count(self, total_count):
        if not (self.v_type == InputType.poisson):
            raise ValueError(
                "The input type should be Poisson to set total count")
        self.total_count = total_count

    def _build_params(self):
        # W to connect with hidden layer
        self.params = []
        if self.v_type == InputType.poisson:
            init_W = np.random.uniform(low=-1 / self.h_dim,
                                       high=1 / self.h_dim,
                                       size=(self.v_dim, self.h_dim))
            self.W = init_weight(self.v_dim,
                                 self.h_dim,
                                 value=init_W,
                                 name=self.name + '-W')
        else:
            self.W = init_weight(self.v_dim, self.h_dim, name=self.name + '-W')
        self.b_v = init_bias(self.v_dim, name=self.name + '-b_v')

        # For binary, gaussian, and categorical inputs
        self.params.extend([self.W, self.b_v])

        # The gaussian case has an additional sigma
        if self.v_type == InputType.gaussian:
            self.sigma_v = T.ones(shape=(self.v_dim, ),
                                  dtype=theano.config.floatX)
            self.sigma_v.name = self.name + "-sigma_v"

    # Result in a vector of (n, 1)
    def v_free_term(self, v):
        if self.v_type == InputType.poisson:
            return -T.sum(T.gammaln(1 + v), axis=1)
        else:
            return 0

    # Result in a vector of (n, 1)
    def v_bias_term(self, v):
        # Note that for gaussian case, the v_bias should be negative
        if self.v_type == InputType.gaussian:
            return -T.sum((v - self.b_v)**2 / (2 * self.sigma_v**2), axis=1)
        else:
            return T.dot(v, self.b_v)

    # Result in a vector of (n, H)
    def v_weight_term(self, v):
        if self.v_type == InputType.gaussian:
            return T.dot((v / (self.sigma_v**2)), self.W)
        else:
            return T.dot(v, self.W)

    # Only support binary, gaussian and categorical
    def v_given_h(self, h):
        if self.v_type == InputType.binary:
            p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.v_type == InputType.gaussian:
            mu_v = self.b_v + T.dot(h, self.W.T)
            return mu_v

        elif self.v_type == InputType.categorical:
            p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.v_type == InputType.poisson:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError(
                    'Total count should be set for constrained Poisson')

            unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T))
            lmbd_v = unconstrained_lmbd_v * 1.0 / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \
                     * self.total_count
            return lmbd_v

    # Only support binary, gaussian and categorical
    def sample_v_given_h(self, h0_sample):
        if self.v_type == InputType.binary:
            v1_mean = self.v_given_h(h0_sample)
            v1_sample = self._mrng.binomial(size=v1_mean.shape,
                                            n=1,
                                            p=v1_mean,
                                            dtype=theano.config.floatX)
            return [v1_mean, v1_sample]

        elif self.v_type == InputType.gaussian:
            mu_v1 = self.v_given_h(h0_sample)  # Note that mu_v1 is returned

            v1_sample = self._mrng.normal(size=mu_v1.shape,
                                          avg=mu_v1,
                                          std=self.sigma_v,
                                          dtype=theano.config.floatX)
            return [mu_v1, v1_sample]

        # Note that there is constraint in the case of Multinomial
        elif self.v_type == InputType.categorical:
            prob_v1 = self.v_given_h(h0_sample)
            v1_sample = self._mrng.multinomial(pvals=prob_v1,
                                               n=1,
                                               dtype=theano.config.floatX)
            return [prob_v1, v1_sample]

        elif self.v_type == InputType.poisson:
            lmbd_v1 = self.v_given_h(h0_sample)
            # We have to use RandomStreams, not MRG_RandomStreams
            v1_sample = self._rng.poisson(size=lmbd_v1.shape,
                                          lam=lmbd_v1,
                                          dtype=theano.config.floatX)
            return [lmbd_v1, v1_sample]

    def l1_grad(self, l1):
        gW = l1_grad(self.W, l1)
        return [gW, 0]

    def l2_grad(self, l2):
        gW = l2_grad(self.W, l2)
        return [gW, 0]

    def nll_grad_formula(self, v0, vk, h0, hk):
        n_instances = v0.shape[0]

        gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances

        if self.v_type == InputType.gaussian:
            gb_v = T.mean((vk - v0) / (self.sigma_v**2), axis=0)
            grads = [gW, gb_v]
        else:
            gb_v = T.mean(vk - v0, axis=0)
            grads = [gW, gb_v]

        return grads

    def get_viewed_cost(self, v0, vk_stat):
        # Binary cross-entropy
        cost = 0
        if self.v_type == InputType.binary:
            # Clip to avoid log(0)
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat) +
                          (1 - v0) * T.log(1 - clip_vk_stat),
                          axis=1)

        # Sum square error
        elif self.v_type == InputType.gaussian:
            cost = T.sum((v0 - vk_stat)**2, axis=1)

        # Categorical cross-entropy
        elif self.v_type == InputType.categorical:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1)

        elif self.v_type == InputType.poisson:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum(
                -vk_stat + v0 * T.log(clip_vk_stat) - T.gammaln(1 + v0),
                axis=1)

        return cost

    def get_params(self):
        return self.params
Example 28
class RBM(Model):
    def __init__(self, v_dim=784, h_dim=500, input_type=InputType.binary,
                 W=None, b_h=None, b_v=None, sigma=None, input_var=None,
                 mrng=None, rng=None, name='', **kwargs):

        name = 'rbm' if name == '' else name
        super(RBM, self).__init__(name=name)

        model_file = kwargs.get('model_file')
        if model_file is not None:
            self.load(model_file)
            self._load_params()

        else:
            # v_dim is the dimensions of visible variable v. v_dim = D
            self.v_dim = v_dim
            # v_dim is the dimensions of visible variable v. h_dim = H
            self.h_dim = h_dim
            self.input_type = input_type

            seed = np.random.randint(1, 2**30)
            self._rng = RandomStreams(seed) if rng is None else rng
            self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng

            self._build_params(W, b_h, b_v, sigma)

        self.input = input_var if input_var is not None else T.matrix('input')
        if self.input_type == InputType.poisson or self.input_type == InputType.replicated_softmax:
            self.total_count = T.sum(self.input, axis=1, keepdims=True)

    def _load_params(self):
        [self.W, self.b_h, self.b_v] = self.params

    def _build_params(self, W, b_h, b_v, sigma):
        self.params = []

        self.W = W if W is not None else init_weight(self.v_dim, self.h_dim, name=self.name+'-W')
        self.b_h = b_h if b_h is not None else init_bias(self.h_dim, name=self.name+'-b_h')
        self.b_v = b_v if b_v is not None else init_bias(self.v_dim, name=self.name+'-b_v')

        # sigma_v is not considered to be a param
        self.params.extend([self.W, self.b_h, self.b_v])

        # The gaussian case has an additional sigma
        self.sigma_v = None
        if self.input_type == InputType.gaussian:
            print "Your input must be whitened to achieve the desired result."

            if sigma is not None:
                sigma = np.asarray(sigma)
                if sigma.ndim == 0:
                    print "Sigma is set to {} for all input dimensions.".format(sigma)
                    self.sigma_v = theano.shared(sigma * np.ones((self.v_dim, ), dtype=theano.config.floatX))
                    self.sigma_v.name = self.name + "-sigma_v"
                else:
                    assert sigma.ndim == 1 and sigma.shape[0] == self.v_dim, \
                        "Sigma must be 1D array with the length of {}".format(self.v_dim)
                    self.sigma_v = theano.shared(sigma)
                    self.sigma_v.name = self.name + "-sigma_v"
            else:
                print "Default value of sigma is 1.0 for all input dimensions."
                self.sigma_v = theano.shared(np.ones(self.v_dim, dtype=theano.config.floatX))
                self.sigma_v.name = self.name + "-sigma_v"

    def print_model_info(self):
        print "\nInfo of model {}".format(self.name)
        print "v_dims: {} | h_dim: {} | input_type: {}".format(self.v_dim, self.h_dim, self.input_type)

    def get_save(self):
        return [self.name, self.v_dim, self.h_dim, self.input_type,
                self._mrng, self._rng, self.params, self.sigma_v]

    def set_load(self, saved_data):
        [self.name, self.v_dim, self.h_dim, self.input_type,
         self._mrng, self._rng, self.params, self.sigma_v] = saved_data

    def score(self, v_data):
        free_fn = theano.function([self.input], self.free_energy(self.input))
        return free_fn(v_data)

    def reconstruct(self, v_data):
        h = self.h_given_v(self.input)
        rv = self.v_given_h(h)
        rec_fn = theano.function([self.input], rv)
        return rec_fn(v_data)

    def reconstruct_from_hidden(self, h_data):
        h = self.input.type('hidden')
        rv = self.v_given_h(h)
        rec_fn = theano.function([h], rv)
        return rec_fn(h_data)

    def encode(self, v_data):
        h_code = self.h_given_v(self.input)
        fn = theano.function([self.input], h_code)
        return fn(v_data)

    # Energy for many v and one h
    def energy(self, v, h):
        v_free = self.v_free_term(v)
        v_bias = self.v_bias_term(v)
        v_weight = self.v_weight_term(v)

        return -(v_free + v_bias + v_weight * h + T.dot(h, self.b_h))

    def free_energy(self, v):
        v_free = self.v_free_term(v)
        v_bias = self.v_bias_term(v)
        v_weight = self.v_weight_term(v)

        h_term = T.sum(T.log(1 + T.exp(v_weight + self.b_h)), axis=1)
        return -(v_bias + v_free + h_term)

    def v_weight_term(self, v):
        if self.input_type == InputType.gaussian:
            return T.dot(v/(self.sigma_v ** 2), self.W)
        else:
            return T.dot(v, self.W)

    def v_bias_term(self, v):
        # Note that for gaussian case, the v_bias should be negative
        if self.input_type == InputType.gaussian:
            return -T.sum((v - self.b_v) ** 2 / (2 * self.sigma_v ** 2), axis=1)
        else:
            return T.dot(v, self.b_v)

    def v_free_term(self, v):
        if self.input_type == InputType.poisson:
            return -T.sum(T.gammaln(1 + v), axis=1)
        else:
            return 0

    def rv(self, v):
        h = self.h_given_v(v)
        rv = self.v_given_h(h)
        return rv

    def h_given_v(self, v):
        v_weight = self.v_weight_term(v)
        p_h_v = T.nnet.sigmoid(v_weight + self.b_h)
        return p_h_v

    def v_given_h(self, h):
        if self.input_type == InputType.binary:
            p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.input_type == InputType.gaussian:
            mu_v = self.b_v + T.dot(h, self.W.T)
            return mu_v

        elif self.input_type == InputType.categorical or \
             self.input_type == InputType.replicated_softmax:
            p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T))
            return p_v_h

        elif self.input_type == InputType.poisson:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError('Total count should be set for constrained Poisson')

            unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T))
            lmbd_v = unconstrained_lmbd_v * 1.0 / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \
                     * self.total_count
            return lmbd_v

    def sample_h_given_v(self, v0_sample):
        h1_mean = self.h_given_v(v0_sample)
        h1_sample = self._mrng.binomial(size=h1_mean.shape, n=1, p=h1_mean,
                                        dtype=theano.config.floatX)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        if self.input_type == InputType.binary:
            v1_mean = self.v_given_h(h0_sample)
            v1_sample = self._mrng.binomial(size=v1_mean.shape, n=1, p=v1_mean,
                                           dtype=theano.config.floatX)
            return [v1_mean, v1_sample]

        elif self.input_type == InputType.gaussian:
            mu_v1 = self.v_given_h(h0_sample)  # Note that mu_v1 is returned

            v1_sample = self._mrng.normal(size=mu_v1.shape, avg=mu_v1, std=self.sigma_v,
                                          dtype=theano.config.floatX)
            return [mu_v1, v1_sample]
        # Note that there is constraint in the case of Multinomial

        elif self.input_type == InputType.categorical:
            prob_v1 = self.v_given_h(h0_sample)
            # Multinomial with n=1 (It is equal to categorical)
            v1_sample = self._mrng.multinomial(pvals=prob_v1, n=1, dtype=theano.config.floatX)
            return [prob_v1, v1_sample]

        elif self.input_type == InputType.poisson:
            lmbd_v1 = self.v_given_h(h0_sample)
            # We have to use RandomStreams, not MRG_RandomStreams
            v1_sample = self._rng.poisson(size=lmbd_v1.shape, lam=lmbd_v1,
                                          dtype=theano.config.floatX)
            return [lmbd_v1, v1_sample]

        elif self.input_type == InputType.replicated_softmax:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError('Total count should be set for replicated Softmax')

            prob_v1 = self.v_given_h(h0_sample)
            # We have to sample the vocabulary distribution given topic D times and sum over D samples
            v1_sample = self._mrng.multinomial(pvals=prob_v1, n=self.total_count, ndim=prob_v1.shape[1])
            return [prob_v1, v1_sample]

    # One step of gibbs sampling
    def gibbs_hvh(self, h0_sample):
        # Here we use v1_stat to show that it is sufficient statistics of v1
        [v1_stat, v1_sample] = self.sample_v_given_h(h0_sample)
        [h1_mean, h1_sample] = self.sample_h_given_v(v1_sample)

        return [v1_stat, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        [h1_mean, h1_sample] = self.sample_h_given_v(v0_sample)
        [v1_stat, v1_sample] = self.sample_v_given_h(h1_sample)

        return [h1_mean, h1_sample, v1_stat, v1_sample]

    def run_CD_from_h(self, k, data_h):
        start_h = T.matrix("start_h")
        # [v_stats, v_samples, h_means, h_samples], updates \
        outputs, updates \
            = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h],
                          n_steps=k, name="gibbs_hvh")
        # Return the last h_sample after k steps
        CD_fn = theano.function([start_h], outputs=outputs[-1], updates=updates)
        return CD_fn(data_h)

    def run_CD_from_v(self, k, data_v):
        start_v = T.matrix("start_v")
        # [h_means, h_samples, v_stats, v_samples], updates \
        outputs, updates \
            = theano.scan(fn=self.gibbs_vhv, outputs_info=[None, None, None, start_v],
                          n_steps=k, name="gibbs_vhv")
        # Return the last v_sample after k steps
        CD_fn = theano.function([start_v], outputs=outputs[-1], updates=updates)
        return CD_fn(data_v)
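    # A hedged usage note (hypothetical names, not part of the original code):
    # given an instantiated RBM `rbm` and a float32 matrix `batch_v` of shape
    # (n_examples, v_dim), a CD-5 reconstruction could be drawn with
    #     vk = rbm.run_CD_from_v(5, batch_v)
    # run_CD_from_h works the same way but starts the chain from hidden samples.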

    # Return visible variables
    def _gibbs_vhv_to_v_fn(self, steps, persis_v, is_sample=True, name=''):
        [h_means, h_samples, v_stats, v_samples], updates \
            = theano.scan(self.gibbs_vhv,
                          outputs_info=[None, None, None, persis_v],
                          n_steps=steps,  # init_gibbs is used for initialization
                          name='gibbs_vhv')
        updates.update({persis_v: v_samples[-1]})
        if is_sample:
            gibbs_fn = theano.function([], v_samples[-1], updates=updates, name=name)
        else:
            gibbs_fn = theano.function([], v_stats[-1], updates=updates, name=name)
        return gibbs_fn

    # Also return visible variables
    def _gibbs_hvh_to_v_fn(self, steps, persis_h, is_sample=True, name=''):
        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(self.gibbs_hvh,
                          outputs_info=[None, None, None, persis_h],
                          n_steps=steps,  # init_gibbs is used for initialization
                          name='gibbs_hvh')
        updates.update({persis_h: h_samples[-1]})
        if is_sample:
            gibbs_fn = theano.function([], v_samples[-1], updates=updates, name=name)
        else:
            gibbs_fn = theano.function([], v_stats[-1], updates=updates, name=name)
        return gibbs_fn

    def sample_given_data(self, v_data, init_gibbs=1000, betw_gibbs=100, loops=10, is_sample=False):
        print "\nSample data from input using model {}".format(self.name)
        # If the input is 1-D, reshape it to 2-D
        if len(v_data.shape) == 1:
            persis_v = theano.shared(np.asarray(v_data.reshape(1, v_data.shape[0]),
                                                dtype=theano.config.floatX))
        else:
            persis_v = theano.shared(np.asarray(v_data, dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_vhv_to_v_fn(init_gibbs, persis_v,
                                                       is_sample=True, name='init_sampling_fn')
        else:
            init_sampling_fn = None

        sample_fn = self._gibbs_vhv_to_v_fn(betw_gibbs, persis_v,
                                            is_sample=is_sample, name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print "Running sampling loop %d" % idx
            rv_data = sample_fn()
            rvs_data.append(rv_data)

        return np.asarray(rvs_data)

    # Sample randomly
    # We start from h and run gibbs chain until it reaches equilibrium
    def sample(self, init_gibbs=1000, betw_gibbs=100, n_samples=20, loops=10, is_sample=False):
        print "\nSample random data using model {}".format(self.name)
        persis_h = theano.shared(np.zeros((n_samples, self.h_dim), dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_hvh_to_v_fn(init_gibbs, persis_h,
                                                       is_sample=True, name='init_sampling_fn')
        else:
            init_sampling_fn = None
        sample_fn = self._gibbs_hvh_to_v_fn(betw_gibbs, persis_h,
                                            is_sample=is_sample, name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print "Running sampling loop %d" % idx
            rv_data = sample_fn()
            rvs_data.append(rv_data)

        return np.asarray(rvs_data)

    def get_cost_updates(self, lr, k, persis_h, l1, l2, stable_update, store_grad):
        # Run one sample step to get h
        h_mean, h_sample = self.sample_h_given_v(self.input)

        # Run normal CD
        start_h = persis_h if persis_h is not None else h_sample

        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h],
                          n_steps=k, name="gibbs_hvh")

        vk = v_samples[-1]
        v_stat_k = v_stats[-1]

        if persis_h is not None:
            updates[persis_h] = h_samples[-1]

        cost = self.get_viewed_cost(self.input, v_stat_k)
        cost = T.mean(cost)

        # For stable update, use mean value instead of random sampled value
        if stable_update:
            print "\nStable update is set to be True"
            updates = self.params_updates(self.input, v_stat_k, lr, l1, l2, updates, store_grad)
        else:
            print "\nStable update is set to be False"
            updates = self.params_updates(self.input, vk, lr, l1, l2, updates, store_grad)

        return cost, updates

    def get_viewed_cost(self, v0, vk_stat):
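        # Reconstruction costs computed below, per example and summed over
        # dimensions (descriptive note; vk denotes the clipped sufficient
        # statistic, i.e. mean / probability / rate):
        #   binary:             -sum_j [v0 log vk + (1 - v0) log(1 - vk)]
        #   gaussian:            sum_j (v0 - vk)^2
        #   categorical:        -sum_j v0 log vk
        #   poisson:            -sum_j [-vk + v0 log vk - log(v0!)]
        #   replicated softmax: -sum_j (v0 / D) log vk, with D = total_count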
        # Binary cross-entropy
        cost = 0
        if self.input_type == InputType.binary:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat) + (1 - v0) * T.log(1 - clip_vk_stat), axis=1)

        # Sum square error
        elif self.input_type == InputType.gaussian:
            cost = T.sum((v0 - vk_stat) ** 2, axis=1)

        # Categorical cross-entropy
        elif self.input_type == InputType.categorical:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1)

        elif self.input_type == InputType.poisson:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum(-vk_stat + v0 * T.log(clip_vk_stat) - T.gammaln(1 + v0), axis=1)

        elif self.input_type == InputType.replicated_softmax:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum((v0 / self.total_count) * T.log(clip_vk_stat), axis=1)

        return cost

    def params_updates(self, v0, vk, lr, l1, l2, updates, store_grad):
        if updates is None:
            updates = OrderedDict()
        if store_grad:
            self.stored_grads = OrderedDict()

        grads = [0 for _ in xrange(len(self.params))]

        o_grads = self.nll_grad_formula(v0, vk)
        grads = [grads[i] + o_grads[i] for i in xrange(len(self.params))]

        if store_grad:
            print "\nGradients over negative log-likelihood are stored in original_grads"
            o_shared_grads, updates = store_grads_in_update(self.params, o_grads, updates)
            self.stored_grads['original_grads'] = o_shared_grads

        if l1 is not None:
            print "Add L1 regularization ({}) to parameter updates".format(l1)
            l1_gW = l1_grad(self.W, l1)
            grads[0] = grads[0] + l1_gW

            if store_grad:
                print "\nGradients over L1 regularization are stored in l1_grads"
                l1_shared_grads, updates = store_grads_in_update([self.W], [l1_gW], updates)
                self.stored_grads['l1_grads'] = l1_shared_grads

        if l2 is not None:
            print "Add L2 regularization ({}) to parameter updates".format(l2)
            l2_gW = l2_grad(self.W, l2)
            grads[0] = grads[0] + l2_gW

            if store_grad:
                print "\nGradients over L2 regularization are stored in l2_grads"
                l2_shared_grads, updates = store_grads_in_update([self.W], [l2_gW], updates)
                self.stored_grads['l2_grads'] = l2_shared_grads

        if store_grad:
            print "\nGradients over total cost are stored in total_grads"
            t_shared_grads, updates = store_grads_in_update(self.params, grads, updates)
            self.stored_grads['total_grads'] = t_shared_grads

        grads = [grad.astype(theano.config.floatX) for grad in grads]

        if self.check_learning_algor():
            params_updates = self.learning_algor(grads, self.params, lr, **self.learning_config)
            updates.update(params_updates)
        else:
            print "\nSimple SGD is used as training algorithm"
            for grad, param in zip(grads, self.params):
                updates[param] = param - grad * lr

        return updates

    def nll_grad_formula(self, v0, vk):
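        # Descriptive note: these are the standard CD-k gradient estimates of the
        # negative log-likelihood,
        #   dNLL/dW   ~ E_model[v h^T] - E_data[v h^T]   (here (vk^T hk - v0^T h0) / N)
        #   dNLL/db_h ~ E_model[h]     - E_data[h]
        #   dNLL/db_v ~ E_model[v]     - E_data[v]
        # For Gaussian visibles the visible-bias term is rescaled by 1 / sigma_v^2
        # and an extra gradient gz_v is returned for the visible noise parameter.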
        n_instances = v0.shape[0]

        h0 = self.h_given_v(v0)
        hk = self.h_given_v(vk)

        gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances
        gb_h = T.mean(hk - h0, axis=0)

        if self.input_type == InputType.gaussian:
            gb_v = T.mean((vk - v0) / (self.sigma_v ** 2), axis=0)

            ugz_v = (((vk - self.b_v) ** 2 - 2 * vk * T.dot(hk, self.W.T)) - \
                    ((v0 - self.b_v) ** 2 - 2 * v0 * T.dot(h0, self.W.T))) / (self.sigma_v ** 2)
            gz_v = T.mean(ugz_v, axis=0)

            grads = [gW, gb_h, gb_v, gz_v]

        else:
            gb_v = T.mean(vk - v0, axis=0)
            grads = [gW, gb_h, gb_v]

        return grads

    def nll_grad_theano(self, v0, vk):
        cost = T.mean(self.free_energy(v0)) - T.mean(self.free_energy(vk))
        # Note here we have to use consider_constant
        grads = T.grad(cost, self.params, consider_constant=[vk])
        return grads

    def grad_check(self, data_v0, data_vk):
        # data_v0 and data_vk are numpy arrays
        # data_vk is computed by calling CD-k
        v0 = T.matrix('v0')
        vk = T.matrix('vk')

        theano_grads = self.nll_grad_theano(v0, vk)
        formula_grads = self.nll_grad_formula(v0, vk)

        grad_diffs = []
        for t_grad, f_grad in zip(theano_grads, formula_grads):
            grad_diffs.append(abs(t_grad - f_grad))

        grad_test_fn = theano.function([v0, vk], grad_diffs)

        diffs_results = grad_test_fn(data_v0, data_vk)
        for i in xrange(len(self.params)):
            if self.params[i].name is not None:
                name = self.params[i].name
            else:
                name = ""
            print ("Max " + name + " diffs: {}").format(np.max(diffs_results[i]))
            print ("Min " + name + " diffs: {}").format(np.min(diffs_results[i]))
            print ("Average " + name + " diffs: {}").format(np.mean(diffs_results[i]))
            print
        return diffs_results

    def config_train(self, **kwargs):
        k = kwargs.get('CD_k')
        persis_h_data = kwargs.get('persis_h')
        l1 = kwargs.get('L1')
        l2 = kwargs.get('L2')

        if l1 is None:
            print "L1 should be set to enable sparse weight regularization"
        if l2 is None:
            print "L2 should be set to enable sparse weight regularization"

        stable_update = kwargs.get('stable_update')
        if stable_update is None:
            stable_update = False

        store_grad = kwargs.get('store_grad')
        if store_grad is None:
            store_grad = False

        self._build_train(k, persis_h_data, l1, l2, stable_update, store_grad)

    # persis_h_data is a numpy array
    def _build_train(self, k, persis_h_data, l1, l2, stable_update, store_grad):
        print "\nBuild training function of model {}".format(self.name)

        if persis_h_data is not None:
            persis_h = theano.shared(persis_h_data, borrow=True)
        else:
            persis_h = None

        lr = T.scalar('lr')
        cost, updates = self.get_cost_updates(lr, k, persis_h, l1, l2, stable_update, store_grad)
        print "\nBuild computation graph for training function of model {}".format(self.name)
        self.train_fn = theano.function([self.input, lr], cost, updates=updates)

        rv = self.v_given_h(self.h_given_v(self.input))
        test_cost = self.get_viewed_cost(self.input, rv)
        test_cost = T.mean(test_cost)
        print "\nBuild computation graph for validation function of model {}".format(self.name)
        self.valid_fn = theano.function([self.input], test_cost)
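# A minimal, self-contained CD-1 sketch for a binary RBM in raw Theano; this is
# an illustrative, assumption-laden example (hypothetical sizes v_dim=6, h_dim=4,
# plain SGD), not part of the class above. It applies the same gradient formula
# as nll_grad_formula.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

v_dim, h_dim = 6, 4
lr = np.asarray(0.1, dtype=theano.config.floatX)
rng = np.random.RandomState(0)
W = theano.shared((0.01 * rng.randn(v_dim, h_dim)).astype(theano.config.floatX))
b_v = theano.shared(np.zeros(v_dim, dtype=theano.config.floatX))
b_h = theano.shared(np.zeros(h_dim, dtype=theano.config.floatX))
mrng = MRG_RandomStreams(1234)

v0 = T.matrix('v0')
h0_mean = T.nnet.sigmoid(T.dot(v0, W) + b_h)                  # P(h=1 | v0)
h0 = mrng.binomial(size=h0_mean.shape, n=1, p=h0_mean, dtype=theano.config.floatX)
v1_mean = T.nnet.sigmoid(T.dot(h0, W.T) + b_v)                # P(v=1 | h0)
v1 = mrng.binomial(size=v1_mean.shape, n=1, p=v1_mean, dtype=theano.config.floatX)
h1_mean = T.nnet.sigmoid(T.dot(v1, W) + b_h)                  # P(h=1 | v1)

n = T.cast(v0.shape[0], theano.config.floatX)
gW = (T.dot(v1.T, h1_mean) - T.dot(v0.T, h0_mean)) / n        # <vh>_model - <vh>_data
gb_v = T.mean(v1 - v0, axis=0)
gb_h = T.mean(h1_mean - h0_mean, axis=0)

updates = [(W, W - lr * gW), (b_v, b_v - lr * gb_v), (b_h, b_h - lr * gb_h)]
cd1_step = theano.function([v0], [], updates=updates)
# One CD-1 update on a random binary batch of 20 examples.
cd1_step(rng.binomial(1, 0.5, size=(20, v_dim)).astype(theano.config.floatX))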
Exemplo n.º 29
class TextDecoder(EncoderDecoderBase):
    EVALUATION = 1
    SAMPLING = 2
    BEAM_SEARCH = 3

    def __init__(self, state, rng, parent):
        EncoderDecoderBase.__init__(self, state, rng, parent)
        self.trng = MRG_RandomStreams(self.seed)
        self.init_params()

    def init_params(self):
        if self.multiplicative_input_from_encoders:
            if self.bidirectional_encoder:
                self.input_dim = self.qdim * 2
            else:
                self.input_dim = self.qdim
        else:
            if self.bidirectional_encoder:
                self.input_dim = self.qdim * 4
            else:
                self.input_dim = self.qdim * 2

        if self.use_precomputed_features:
            self.input_dim += self.precomputed_features_count
        """ Decoder weights """
        self.Wd_in = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.input_dim,
                                           self.mlp_out_dim),
                          name='Wd_in'))
        self.bd_in = add_to_params(
            self.params,
            theano.shared(value=np.zeros((self.mlp_out_dim, ),
                                         dtype='float32'),
                          name='bd_in'))

        if self.condition_on_previous_speaker_class:
            self.Wd_softmax_first = add_to_params(
                self.params,
                theano.shared(value=NormalInit3D(
                    self.rng, self.segmentation_token_count, self.mlp_out_dim,
                    self.segmentation_token_count),
                              name='Wd_softmax_first'))
            self.bd_softmax_first = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count,
                                              self.segmentation_token_count),
                                             dtype='float32'),
                              name='bd_softmax_first'))

            self.Wd_softmax_second = add_to_params(
                self.params,
                theano.shared(value=NormalInit3D(
                    self.rng, self.segmentation_token_count, self.mlp_out_dim,
                    self.segmentation_token_count),
                              name='Wd_softmax_second'))
            self.bd_softmax_second = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count,
                                              self.segmentation_token_count),
                                             dtype='float32'),
                              name='bd_softmax_second'))

            self.Wd_softmax_third = add_to_params(
                self.params,
                theano.shared(value=NormalInit3D(
                    self.rng, self.segmentation_token_count, self.mlp_out_dim,
                    self.segmentation_token_count),
                              name='Wd_softmax_third'))
            self.bd_softmax_third = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count,
                                              self.segmentation_token_count),
                                             dtype='float32'),
                              name='bd_softmax_third'))
        else:
            self.Wd_softmax_first = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.mlp_out_dim,
                                               self.segmentation_token_count),
                              name='Wd_softmax_first'))
            self.bd_softmax_first = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count, ),
                                             dtype='float32'),
                              name='bd_softmax_first'))

            self.Wd_softmax_second = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.mlp_out_dim,
                                               self.segmentation_token_count),
                              name='Wd_softmax_second'))
            self.bd_softmax_second = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count, ),
                                             dtype='float32'),
                              name='bd_softmax_second'))

            self.Wd_softmax_third = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.mlp_out_dim,
                                               self.segmentation_token_count),
                              name='Wd_softmax_third'))
            self.bd_softmax_third = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.segmentation_token_count, ),
                                             dtype='float32'),
                              name='bd_softmax_third'))

    def build_next_probs_predictor(self, inp, x, prev_state):
        """ 
        Return output probabilities given prev_words x, hierarchical pass hs, and previous hd
        hs should always be the same (and should not be updated).
        """
        return self.build_decoder(inp,
                                  x,
                                  mode=TextDecoder.BEAM_SEARCH,
                                  prev_state=prev_state)

    def build_decoder(self, decoder_inp, y=None, y_prev=None, mode=EVALUATION, prev_state=None):
        # Run the decoder. prev_state is accepted for interface compatibility with
        # the beam-search caller above; this MLP decoder is stateless and ignores it.

        if self.mlp_activation_function == 'tanh':
            hidden_activation = T.tanh(
                T.dot(decoder_inp, self.Wd_in) + self.bd_in)
        elif self.mlp_activation_function == 'rectifier':
            hidden_activation = relu(
                T.dot(decoder_inp, self.Wd_in) + self.bd_in)
        elif self.mlp_activation_function == 'linear':
            hidden_activation = T.dot(decoder_inp, self.Wd_in) + self.bd_in
        else:
            raise Exception("Invalid activation function specified for MLP!")

        if self.condition_on_previous_speaker_class:
            first_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_first[y_prev[0]][
                    0, :, :]) + self.bd_softmax_first[y_prev[0]])

            second_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_second[y_prev[0]][
                    0, :, :]) + self.bd_softmax_second[y_prev[0]])
            third_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_third[y_prev[0]][
                    0, :, :]) + self.bd_softmax_third[y_prev[0]])

            outputs = T.concatenate(
                [first_output, second_output, third_output])

        else:
            first_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_first) +
                self.bd_softmax_first)
            second_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_second) +
                self.bd_softmax_second)
            third_output = T.nnet.softmax(
                T.dot(hidden_activation, self.Wd_softmax_third) +
                self.bd_softmax_third)
            outputs = T.concatenate(
                [first_output, second_output, third_output])

        # EVALUATION  / BEAM SEARCH: Return outputs
        if mode == TextDecoder.EVALUATION:
            first_target_outputs = GrabProbs(first_output, y[0])
            second_target_outputs = GrabProbs(second_output, y[1])
            third_target_outputs = GrabProbs(third_output, y[2])
            target_outputs = T.concatenate([
                first_target_outputs, second_target_outputs,
                third_target_outputs
            ])

            return outputs, target_outputs
        elif mode == TextDecoder.BEAM_SEARCH:
            return outputs
        # SAMPLING    : Return a vector with sample
        elif mode == TextDecoder.SAMPLING:
            first_sample = self.trng.multinomial(pvals=first_output,
                                                 dtype='int64').argmax(axis=-1)
            second_sample = self.trng.multinomial(
                pvals=second_output, dtype='int64').argmax(axis=-1)
            third_sample = self.trng.multinomial(pvals=third_output,
                                                 dtype='int64').argmax(axis=-1)
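            # Each multinomial draw above is a one-hot row over the softmax
            # output, so argmax converts it into the sampled class index.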
            return T.concatenate([first_sample, second_sample, third_sample])
Exemplo n.º 30
def theano_multinomial(n, pvals, seed):
    rng = RandomStreams(seed)
    return rng.multinomial(n=n, pvals=pvals, dtype='float32')
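# A hedged usage sketch (not part of the original example): compile the helper
# above for a fixed 2-D probability matrix and draw counts that sum to n in
# each row; the probabilities and seed below are made up.
import numpy
import theano

probs = numpy.asarray([[0.2, 0.3, 0.5]], dtype='float32')
draw = theano.function([], theano_multinomial(n=10, pvals=probs, seed=42))
print draw()  # one row of counts, e.g. [[ 2.  3.  5.]], summing to 10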
Exemplo n.º 31
class OptionCritic_Network():
    def __init__(self,
                 model_network=None,
                 gamma=0.99,
                 learning_method="rmsprop",
                 actor_lr=0.00025,
                 batch_size=32,
                 input_size=None,
                 learning_params=None,
                 dnn_type=True,
                 clip_delta=0,
                 scale=255.,
                 freeze_interval=100,
                 grad_clip=0,
                 termination_reg=0,
                 num_options=8,
                 double_q=False,
                 temp=1,
                 entropy_reg=0,
                 BASELINE=False,
                 **kwargs):
        x = T.ftensor4()
        next_x = T.ftensor4()
        a = T.ivector()
        o = T.ivector()
        r = T.fvector()
        terminal = T.ivector()
        self.freeze_interval = freeze_interval

        self.theano_rng = MRG_RandomStreams(1000)

        self.x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.next_x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.a_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
        self.o_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
        self.terminal_shared = theano.shared(
            np.zeros((batch_size), dtype='int32'))
        self.r_shared = theano.shared(np.zeros((batch_size), dtype='float32'))

        state_network = model_network[:-1]
        termination_network = copy.deepcopy([model_network[-1]])
        termination_network[0]["activation"] = "sigmoid"
        print "NUM OPTIONS --->", num_options
        termination_network[0]["out_size"] = num_options
        option_network = copy.deepcopy([model_network[-1]])
        option_network[0]["activation"] = "softmax"
        Q_network = copy.deepcopy([model_network[-1]])
        Q_network[0]["out_size"] = num_options

        self.state_model = Model(state_network,
                                 input_size=input_size,
                                 dnn_type=dnn_type)
        self.state_model_prime = Model(state_network,
                                       input_size=input_size,
                                       dnn_type=dnn_type)
        output_size = [None, model_network[-2]["out_size"]]
        self.Q_model = Model(Q_network,
                             input_size=output_size,
                             dnn_type=dnn_type)
        self.Q_model_prime = Model(Q_network,
                                   input_size=output_size,
                                   dnn_type=dnn_type)
        self.termination_model = Model(termination_network,
                                       input_size=output_size,
                                       dnn_type=dnn_type)
        self.options_model = MLP3D(num_options, model_network, temp=temp)

        s = self.state_model.apply(x / scale)
        next_s = self.state_model.apply(next_x / scale)
        next_s_prime = self.state_model_prime.apply(next_x / scale)

        termination_probs = self.termination_model.apply(
            theano.gradient.disconnected_grad(s))
        option_term_prob = termination_probs[T.arange(o.shape[0]), o]
        next_termination_probs = self.termination_model.apply(
            theano.gradient.disconnected_grad(next_s))
        next_option_term_prob = next_termination_probs[T.arange(o.shape[0]), o]
        termination_sample = T.gt(option_term_prob,
                                  self.theano_rng.uniform(size=o.shape))

        Q = self.Q_model.apply(s)
        next_Q = self.Q_model.apply(next_s)
        next_Q_prime = theano.gradient.disconnected_grad(
            self.Q_model_prime.apply(next_s_prime))

        disc_option_term_prob = theano.gradient.disconnected_grad(
            next_option_term_prob)

        action_probs = self.options_model.apply(s, o)
        sampled_actions = T.argmax(self.theano_rng.multinomial(
            pvals=action_probs, n=1),
                                   axis=1).astype("int32")

        if double_q:
            print "TRAINING DOUBLE_Q"
            y = r + (1 - terminal) * gamma * (
                (1 - disc_option_term_prob) *
                next_Q_prime[T.arange(o.shape[0]), o] +
                disc_option_term_prob * next_Q_prime[T.arange(next_Q.shape[0]),
                                                     T.argmax(next_Q, axis=1)])
        else:
            y = r + (1 - terminal) * gamma * (
                (1 - disc_option_term_prob) *
                next_Q_prime[T.arange(o.shape[0]), o] +
                disc_option_term_prob * T.max(next_Q_prime, axis=1))

        y = theano.gradient.disconnected_grad(y)
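        # Descriptive note: y is the one-step option-critic target
        #   y = r + gamma * (1 - terminal) * [(1 - beta(s', o)) * Q'(s', o)
        #                                     + beta(s', o) * max_o' Q'(s', o')]
        # where beta is the (gradient-blocked) termination probability and Q' the
        # frozen target network; with double_q the max is replaced by Q' evaluated
        # at argmax_o' Q(s', o').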

        option_Q = Q[T.arange(o.shape[0]), o]
        td_errors = y - option_Q

        if clip_delta > 0:
            quadratic_part = T.minimum(abs(td_errors), clip_delta)
            linear_part = abs(td_errors) - quadratic_part
            td_cost = 0.5 * quadratic_part**2 + clip_delta * linear_part
        else:
            td_cost = 0.5 * td_errors**2

        # critic updates
        critic_cost = T.sum(td_cost)
        critic_params = self.Q_model.params + self.state_model.params
        learning_algo = self.Q_model.get_learning_method(
            learning_method, **learning_params)
        grads = T.grad(critic_cost, critic_params)
        critic_updates = learning_algo.apply(critic_params,
                                             grads,
                                             grad_clip=grad_clip)

        # actor updates
        actor_params = self.termination_model.params + self.options_model.params
        learning_algo = self.termination_model.get_learning_method("sgd",
                                                                   lr=actor_lr)
        disc_Q = theano.gradient.disconnected_grad(option_Q)
        disc_V = theano.gradient.disconnected_grad(T.max(Q, axis=1))
        term_grad = T.sum(option_term_prob *
                          (disc_Q - disc_V + termination_reg))
        entropy = -T.sum(action_probs * T.log(action_probs))
        if not BASELINE:
            policy_grad = - \
                T.sum(
                    T.log(action_probs[T.arange(a.shape[0]), a]) * y) - entropy_reg*entropy
        else:
            policy_grad = - \
                T.sum(T.log(action_probs[T.arange(a.shape[0]), a])
                      * (y-disc_Q)) - entropy_reg*entropy
        grads = T.grad(term_grad + policy_grad, actor_params)
        actor_updates = learning_algo.apply(actor_params,
                                            grads,
                                            grad_clip=grad_clip)

        if self.freeze_interval > 1:
            target_updates = OrderedDict()
            for t, b in zip(
                    self.Q_model_prime.params + self.state_model_prime.params,
                    self.Q_model.params + self.state_model.params):
                target_updates[t] = b
            self._update_target_params = theano.function(
                [], [], updates=target_updates)
            self.update_target_params()
            print "freeze interval:", self.freeze_interval
        else:
            print "freeze interval: None"

        critic_givens = {
            x: self.x_shared,
            o: self.o_shared,
            r: self.r_shared,
            terminal: self.terminal_shared,
            next_x: self.next_x_shared
        }

        actor_givens = {
            a: self.a_shared,
            r: self.r_shared,
            terminal: self.terminal_shared,
            o: self.o_shared,
            next_x: self.next_x_shared
        }

        print "compiling...",
        self.train_critic = theano.function([], [critic_cost],
                                            updates=critic_updates,
                                            givens=critic_givens)
        self.train_actor = theano.function([s], [],
                                           updates=actor_updates,
                                           givens=actor_givens)
        self.pred_score = theano.function([],
                                          T.max(Q, axis=1),
                                          givens={x: self.x_shared})
        self.sample_termination = theano.function(
            [s], [termination_sample, T.argmax(Q, axis=1)],
            givens={o: self.o_shared})
        self.sample_options = theano.function([s], T.argmax(Q, axis=1))
        self.sample_actions = theano.function([s],
                                              sampled_actions,
                                              givens={o: self.o_shared})
        self.get_action_dist = theano.function([s, o], action_probs)
        self.get_s = theano.function([], s, givens={x: self.x_shared})
        print "complete"

    def update_target_params(self):
        if self.freeze_interval > 1:
            self._update_target_params()
        return

    def predict_move(self, s):
        return self.sample_options(s)

    def predict_termination(self, s, a):
        self.a_shared.set_value(a)
        return tuple(self.sample_termination(s))

    def get_q_vals(self, x):
        self.x_shared.set_value(x)
        return self.pred_score()[:, np.newaxis]

    def get_state(self, x):
        self.x_shared.set_value(x)
        return self.get_s()

    def get_action(self, s, o):
        self.o_shared.set_value(o)
        return self.sample_actions(s)

    def train_conv_net(self,
                       train_set_x,
                       next_x,
                       options,
                       r,
                       terminal,
                       actions=None,
                       model=""):
        self.next_x_shared.set_value(next_x)
        self.o_shared.set_value(options)
        self.r_shared.set_value(r)
        self.terminal_shared.set_value(terminal)
        if model == "critic":
            self.x_shared.set_value(train_set_x)
            return self.train_critic()
        elif model == "actor":
            self.a_shared.set_value(actions)
            return self.train_actor(train_set_x)
        else:
            print "WRONG MODEL NAME"
            raise NotImplementedError

    def save_params(self):
        return [
            self.state_model.save_params(),
            self.Q_model.save_params(),
            self.termination_model.save_params(),
            self.options_model.save_params()
        ]

    def load_params(self, values):
        self.state_model.load_params(values[0])
        self.Q_model.load_params(values[1])
        self.termination_model.load_params(values[2])
        self.options_model.load_params(values[3])
Exemplo n.º 32
class ParticleFilter():
    ''' Implements particle filtering and smoothing for Markov Chains
	 with arbitrary proposal/true distributions '''
    def __init__(self,
                 transition_model,
                 observation_model,
                 n_particles,
                 observation_input=None,
                 n_history=1):

        self.transition_model = transition_model
        self.observation_model = observation_model
        self.data_dims = observation_model.output_dims
        self.state_dims = transition_model.output_dims
        self.n_particles = n_particles
        self.n_history = n_history

        #this is used to keep track of what set of particles corresponds
        #to the previous point in time
        self.time_counter = theano.shared(0)

        self.theano_rng = RandomStreams()

        #init_particles=np.zeros((n_history+1, n_particles, self.state_dims)).astype(np.float32)
        init_particles = np.random.randn(n_history + 1, n_particles,
                                         self.state_dims).astype(np.float32)
        init_weights = (np.ones((n_history + 1, n_particles)) /
                        float(n_particles)).astype(np.float32)

        self.particles = theano.shared(init_particles)
        self.weights = theano.shared(init_weights)

        self.next_state = self.particles[(self.time_counter + 1) %
                                         (self.n_history + 1)]
        self.current_state = self.particles[self.time_counter %
                                            (self.n_history + 1)]
        self.previous_state = self.particles[(self.time_counter - 1) %
                                             (self.n_history + 1)]

        self.next_weights = self.weights[(self.time_counter + 1) %
                                         (self.n_history + 1)]
        self.current_weights = self.weights[self.time_counter %
                                            (self.n_history + 1)]
        self.previous_weights = self.weights[(self.time_counter - 1) %
                                             (self.n_history + 1)]

        self.proposal_distrib = None

        self.true_log_transition_probs = self.transition_model.rel_log_prob
        self.true_log_observation_probs = self.observation_model.rel_log_prob

        self.perform_inference = None
        self.resample = None
        self.sample_joint = None

        self.observation_input = observation_input

        ess = self.compute_ESS()
        self.get_ESS = theano.function([], ess)

        n_samps = T.lscalar()
        n_T = T.lscalar()
        data_samples, state_samples, init_state_samples, data_sample_updates = self.sample_future(
            n_samps, n_T)
        self.sample_from_future = theano.function(
            [n_samps, n_T], [data_samples, state_samples, init_state_samples],
            updates=data_sample_updates)

        self.get_current_particles = theano.function([], self.current_state)
        self.get_current_weights = theano.function([], self.current_weights)

    def recompile(self):
        '''This function compiles each of the theano functions that might
		change following a change of the model. '''

        samp_updates = self.sample_update(self.observation_input)
        self.perform_inference = theano.function([], updates=samp_updates)

        res_updates = self.resample_update()
        self.resample = theano.function([], updates=res_updates)

        nsamps = T.lscalar()
        joint_samples, joint_updates = self.sample_from_joint(nsamps)
        self.sample_joint = theano.function([nsamps],
                                            joint_samples,
                                            updates=joint_updates)

        new_ess, stddevhist, esshist, sr_updates = self.sequential_resample()
        self.perform_sequential_resampling = theano.function(
            [], [new_ess, stddevhist, esshist], updates=sr_updates)

        csamps = self.sample_current(nsamps)
        self.sample_current_state = theano.function([nsamps], csamps)

        psamps = self.sample_prev(nsamps)
        self.sample_previous_state = theano.function([nsamps], psamps)

        return

    def set_proposal(self, proposal_distrib):

        self.proposal_distrib = proposal_distrib

        return

    def set_true_log_transition_probs(self, true_log_transition_probs):

        self.true_log_transition_probs = true_log_transition_probs
        return

    def set_true_log_observation_probs(self, true_log_observation_probs):

        self.true_log_observation_probs = true_log_observation_probs
        return

    def sample_update(self, data):

        proposal_samples, log_proposal_probs = self.proposal_distrib

        printing = False

        if printing:
            log_transition_probs = theano.printing.Print(
                '1 log transition probs update')(
                    self.true_log_transition_probs(self.current_state,
                                                   proposal_samples))
            log_observation_probs = theano.printing.Print(
                '2 log observation probs update')(
                    self.true_log_observation_probs(proposal_samples,
                                                    data.dimshuffle('x', 0)))
            log_unnorm_weights = theano.printing.Print(
                '3 log unnorm weights update')(log_transition_probs +
                                               log_observation_probs -
                                               log_proposal_probs)
            log_unnorm_weights_center = theano.printing.Print(
                '4 log unnorm weights center update')(
                    log_unnorm_weights - T.max(log_unnorm_weights))
            unnorm_weights = theano.printing.Print('5 unnorm weights update')(
                T.exp(log_unnorm_weights_center) * self.current_weights)
            normalizer = theano.printing.Print('6 normalizer update')(
                T.sum(unnorm_weights))
        else:
            log_transition_probs = self.true_log_transition_probs(
                self.current_state, proposal_samples)
            log_observation_probs = self.true_log_observation_probs(
                proposal_samples, data.dimshuffle('x', 0))
            log_unnorm_weights = log_transition_probs + log_observation_probs - log_proposal_probs
            log_unnorm_weights_center = log_unnorm_weights - T.max(
                log_unnorm_weights)
            unnorm_weights = T.exp(
                log_unnorm_weights_center) * self.current_weights
            normalizer = T.sum(unnorm_weights)

        weights = unnorm_weights / normalizer

        updates = OrderedDict()

        updates[self.weights] = T.set_subtensor(self.next_weights, weights)

        updates[self.particles] = T.set_subtensor(self.next_state,
                                                  proposal_samples)

        updates[self.time_counter] = self.time_counter + 1

        return updates

    def compute_ESS(self):
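        # Effective sample size of the current weights, 1 / sum_i w_i^2: it ranges
        # from 1 (all mass on one particle) up to n_particles (uniform weights).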

        return 1.0 / T.sum(self.current_weights**2)

    def resample_update(self):

        #shape: n_particles by n_particles
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), self.n_particles, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
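        # Each row of samps is a one-hot draw from the weight distribution, so the
        # dot with arange(n_particles) recovers a sampled particle index; this is
        # standard multinomial resampling, after which the weights become uniform.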
        updates = OrderedDict()
        updates[self.particles] = T.set_subtensor(self.current_state,
                                                  self.current_state[idxs])
        updates[self.weights] = T.set_subtensor(
            self.current_weights,
            T.cast(
                T.ones_like(self.current_weights) / float(self.n_particles),
                'float32'))
        return updates

    def sample_step(self, future_samps, t, n_samples):

        particles_now = self.particles[(self.time_counter - t) %
                                       (self.n_history + 1)]
        weights_now = self.weights[(self.time_counter - t) %
                                   (self.n_history + 1)]

        #n_particles by n_samples
        rel_log_probs = self.true_log_transition_probs(particles_now,
                                                       future_samps,
                                                       all_pairs=True)

        unnorm_probs = T.exp(rel_log_probs) * weights_now.dimshuffle(0, 'x')
        probs = unnorm_probs / T.sum(unnorm_probs, axis=0).dimshuffle('x', 0)

        samps = self.theano_rng.multinomial(pvals=probs.T)
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        output_samples = particles_now[idxs]

        return [output_samples, t + 1]

    def sample_from_joint(self, n_samples, output_2D=False):
        '''Samples from the joint posterior P(s_t-n_history:s_t | observations)
		n_samples: the number of samples to draw
		
		Returns an array with shape (n_history+1, n_samples, state_dims),
		where array[-1] corresponds to the current time.
		'''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        t0 = T.as_tensor_variable(1)

        [samples, ts], updates = theano.scan(fn=self.sample_step,
                                             outputs_info=[samps_t0, t0],
                                             non_sequences=[n_samples],
                                             n_steps=self.n_history)

        #the variable "samples" that results from the scan is time-flipped
        #in the sense that samples[0] corresponds to the most recent point
        #in time, and higher indices correspond to points in the past.
        #I will stick to the convention that for any collection of points in
        #time, [-1] will index the most recent time, and [0] will index
        #the point farthest in the past. So, the first axis of "samples"
        #needs to be flipped.
        flip_idxs = T.cast(-T.arange(self.n_history) + self.n_history - 1,
                           'int64')
        samples = T.concatenate(
            [samples[flip_idxs],
             samps_t0.dimshuffle('x', 0, 1)], axis=0)

        if output_2D:
            samples = T.reshape(
                samples, ((self.n_history + 1) * n_samples, self.state_dims))

        return samples, updates

    def sample_future(self, n_samples, n_T):
        '''Samples from the "future" data distribution: 
				P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)
		
		n_samples: number of samples to draw
		n_T: the number of (future) time points to sample from
		
		Returns three arrays. The first two have shapes 
		(n_T, n_samples, data_dims) and
		(n_T, n_samples, state_dims),
		corresponding to samples of future observations and states,
		and the third having size (n_samples,state_dims),
		corresponding to the "initial" samples taken from the current
		state distribution.
		'''

        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        def fstep(states):
            next_states = self.transition_model.get_samples_noprobs(states)
            next_data = self.observation_model.get_samples_noprobs(next_states)
            return next_states, next_data

        [state_samples,
         data_samples], updates = theano.scan(fn=fstep,
                                              outputs_info=[samps_t0, None],
                                              n_steps=n_T)

        #data_samples=self.observation_model.get_samples_noprobs(state_samples)

        return data_samples, state_samples, samps_t0, updates

    def sample_model(self, n_samples, n_T):
        '''Samples from the "future" data distribution: 
				P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)
		
		n_samples: number of samples to draw
		n_T: the number of (future) time points to sample from
		
		Returns three arrays. The first two have shapes 
		(n_T, n_samples, data_dims) and
		(n_T, n_samples, state_dims),
		corresponding to samples of future observations and states,
		and the third having size (n_samples,state_dims),
		corresponding to the "initial" samples taken from the current
		state distribution.
		'''

        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        state_samples, updates = theano.scan(
            fn=self.transition_model.get_samples_noprobs,
            outputs_info=[samps_t0],
            n_steps=n_T)

        data_sample = self.observation_model.get_samples_noprobs(
            state_samples[-1])

        return data_sample, state_samples[-1], state_samples[-2], updates

    def sr_step(self, means, weights, stddev, ess, decay):

        #Sampling from a mixture of gaussians
        msamps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            weights.dimshuffle('x', 0), means.shape[0], axis=0))
        idxs = T.cast(T.dot(msamps, T.arange(means.shape[0])), 'int64')
        sample_means = T.cast(means[idxs], 'float32')

        proposal_samples = self.theano_rng.normal(
            size=means.shape) * stddev.dimshuffle('x', 0) + sample_means
        diffs = proposal_samples.dimshuffle(
            0, 'x', 1) - sample_means.dimshuffle('x', 0, 1)

        printing = False
        if printing:
            log_proposal_probs = theano.printing.Print('1 log_proposal_probs')(
                T.log(
                    T.dot(
                        T.exp(-T.sum(
                            (1.0 / (2.0 * stddev**2)).dimshuffle('x', 'x', 0) *
                            diffs**2,
                            axis=2)), weights)))
            log_transition_probs = theano.printing.Print(
                '2 log transition probs')(self.true_log_transition_probs(
                    self.previous_state, proposal_samples, all_pairs=True))
            log_transition_probs_2 = theano.printing.Print(
                '3 log transition probs 2')(T.log(
                    T.dot(
                        T.exp(log_transition_probs).T, self.previous_weights)))
            log_observation_probs = theano.printing.Print(
                '4 log observation probs')(self.true_log_observation_probs(
                    proposal_samples,
                    self.observation_input.dimshuffle('x', 0)))
            log_unnorm_weights = theano.printing.Print(
                '5 log unnorm weights nomax')(log_transition_probs_2 +
                                              log_observation_probs -
                                              log_proposal_probs)
            log_unnorm_weights = theano.printing.Print('6 log unnorm weights')(
                log_unnorm_weights - T.max(log_unnorm_weights))
            unnorm_weights = theano.printing.Print('7 unnorm weights')(
                T.exp(log_unnorm_weights))
            normalizer = theano.printing.Print('8 normalizer')(
                T.sum(unnorm_weights))
        else:
            log_proposal_probs = T.log(
                T.dot(
                    T.exp(-T.sum((1.0 /
                                  (2.0 * stddev**2)).dimshuffle('x', 'x', 0) *
                                 diffs**2,
                                 axis=2)), weights))
            log_transition_probs = self.true_log_transition_probs(
                self.previous_state, proposal_samples, all_pairs=True)
            log_transition_probs = T.log(
                T.dot(T.exp(log_transition_probs).T, self.previous_weights))
            log_observation_probs = self.true_log_observation_probs(
                proposal_samples, self.observation_input.dimshuffle('x', 0))
            log_unnorm_weights = log_transition_probs + log_observation_probs - log_proposal_probs
            log_unnorm_weights = log_unnorm_weights - T.max(log_unnorm_weights)
            unnorm_weights = T.exp(log_unnorm_weights)
            normalizer = T.sum(unnorm_weights)

        new_weights = unnorm_weights / normalizer

        new_ess = 1.0 / T.sum(new_weights**2)

        sampmean = T.dot(proposal_samples.T, new_weights)
        sampvar = T.dot(
            ((proposal_samples - sampmean.dimshuffle('x', 0))**2).T,
            new_weights)
        #propmean=T.mean(proposal_samples, axis=0)
        #propvar=T.mean((proposal_samples-propmean.dimshuffle('x',0))**2,axis=0)
        #new_stddev=stddev*T.clip(T.exp(decay*(1.0-propvar/sampvar)),0.5,2.0)
        #new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,2.0),0.0,4.0)
        new_stddev = T.clip(
            stddev * T.clip(T.exp(decay *
                                  (1.0 - stddev**2 / sampvar)), 0.5, 1.5), 0.0,
            4.0)
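        # Descriptive note: the proposal bandwidth adapts multiplicatively, growing
        # where the weighted sample variance exceeds stddev^2 and shrinking where it
        # falls below; each step's factor is clipped to [0.5, 1.5] and the bandwidth
        # itself to [0, 4].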
        return [
            proposal_samples, new_weights, new_stddev,
            T.cast(new_ess, 'float32')
        ]  #, theano.scan_module.until(new_ess>100)

    def sequential_resample(self,
                            init_stddev=4.0,
                            max_steps=20,
                            stddev_decay=0.1):
        '''Repeatedly resamples and then samples from a proposal distribution
		constructed from the current samples. Should be used when the main
		proposal distribution is poor or whenever the ESS is poor.
		'''

        essT = T.as_tensor_variable(np.asarray(0.0, dtype='float32'))
        stddevT = T.as_tensor_variable(
            np.asarray(init_stddev * np.ones(self.state_dims),
                       dtype='float32'))
        decayT = T.as_tensor_variable(np.asarray(stddev_decay,
                                                 dtype='float32'))

        [samphist, weighthist, stddevhist,
         esshist], updates = theano.scan(fn=self.sr_step,
                                         outputs_info=[
                                             self.current_state,
                                             self.current_weights, stddevT,
                                             essT
                                         ],
                                         non_sequences=decayT,
                                         n_steps=max_steps)

        end_samples = samphist[-1]
        end_weights = weighthist[-1]

        updates[self.particles] = T.set_subtensor(self.current_state,
                                                  end_samples)
        updates[self.weights] = T.set_subtensor(self.current_weights,
                                                end_weights)
        return 1.0 / T.sum(end_weights**2), stddevhist, esshist, updates

    def sample_current(self, nsamps):
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), nsamps, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samples = self.current_state[idxs]
        return samples

    def sample_prev(self, nsamps):
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.previous_weights.dimshuffle('x', 0), nsamps, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samples = self.previous_state[idxs]
        return samples

    def get_history(self):
        '''This function returns a 3-D array containing all the particles
		and a 2-D array of weights for the entire memory. The first dimension indexes
		time, with the zeroth entry corresponding to the earliest point in 
		memory.'''
        idxs = (T.arange(self.n_history + 1) - self.n_history +
                self.time_counter) % (self.n_history + 1)
        return self.particles[idxs], self.weights[idxs]
Exemplo n.º 33
def test_undefined_grad():
    srng = MRG_RandomStreams(seed=1234)

    # checking uniform distribution
    low = tensor.scalar()
    out = srng.uniform((), low=low)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low)

    high = tensor.scalar()
    out = srng.uniform((), low=0, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high)

    out = srng.uniform((), low=low, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    out = srng.binomial((), p=prob)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), prob1)

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), (prob1, prob2))

    # checking choice
    p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  prob1)

    # checking normal distribution
    avg = tensor.scalar()
    out = srng.normal((), avg=avg)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)

    out = srng.normal((), avg=avg, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))
Exemplo n.º 34
class MaskGenerator(object):

    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        self._random_seed = random_seed
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        self.ordering = theano.shared(np.arange(input_size, 
                                                dtype=theano.config.floatX), 
                                      'ordering', 
                                      borrow=False)

        # Initial layer connectivity
        self.layers_connectivity = [theano.shared((self.ordering + 1).eval(), 
                                                  'layer_connectivity_input', 
                                                  borrow=False)]
        for i in range(len(self._hidden_sizes)):
            lc = theano.shared(np.zeros((self._hidden_sizes[i]),dtype=floatX), 
                               'layer_connectivity_hidden{0}'.format(i),
                               borrow=False)
            self.layers_connectivity += [lc]
        self.layers_connectivity += [self.ordering]

        ## Theano functions
        new_ordering = self._rng.shuffle_row_elements(self.ordering)
        updates = [(self.ordering, new_ordering), 
                   (self.layers_connectivity[0], new_ordering + 1)]
        self.shuffle_ordering = theano.function(name='shuffle_ordering',
                                                inputs=[],
                                                updates=updates)

        self.layers_connectivity_updates = []
        for i in range(len(self._hidden_sizes)):
            lcu = self._get_hidden_layer_connectivity(i)
            self.layers_connectivity_updates += [lcu]
        
        hsizes = range(len(self._hidden_sizes))
        updates = [(self.layers_connectivity[i+1], 
                    self.layers_connectivity_updates[i]) for i in hsizes]
        self.sample_connectivity = theano.function(name='sample_connectivity',
                                                   inputs=[],
                                                   updates=updates)

        # Save random initial state
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [sup[0].get_value() for sup in 
                                            self._mrng.state_updates]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        # Set Original ordering
        self.ordering.set_value(np.arange(self._input_size, 
                                          dtype=theano.config.floatX))

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Initial layer connectivity
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        for i in range(1, len(self.layers_connectivity)-1):
            value = np.zeros((self._hidden_sizes[i-1]), 
                             dtype=theano.config.floatX)
            self.layers_connectivity[i].set_value(value)
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        states_values = zip(self._mrng.state_updates, 
                            self._initial_mrng_state_updates)
        for state, value in states_values:
            state[0].set_value(value)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        start_choice_idx = (start_choice-1).astype('int32')
        prob = T.nnet.nnet.softmax(self._l * T.arange(start_choice, 
                                                      self._input_size, 
                                                      dtype=floatX))[0]
        p_vals = T.concatenate([T.zeros((start_choice_idx,)),prob])
        p_vals = T.inc_subtensor(p_vals[start_choice_idx], 1.)  
        return p_vals

    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            lc = self.layers_connectivity[layerIdx]
            p_vals = self._get_p(T.min(lc))
        else:
            lc = self.layers_connectivity_updates[layerIdx-1]
            p_vals = self._get_p(T.min(lc))

        # For every hidden unit, draw a one-hot multinomial sample over the
        # reversed p_vals; the cumulative sum of a one-hot vector at position
        # j is 1 from j onwards, so summing it recovers the sampled
        # connectivity value.
        samples = self._mrng.multinomial(
            pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)),
            dtype=floatX)
        return T.sum(T.cumsum(samples, axis=1), axis=1)

    def _get_mask(self, layerIdxIn, layerIdxOut):
        return (self.layers_connectivity[layerIdxIn][:, None] <= 
                self.layers_connectivity[layerIdxOut][None, :]).astype(floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, -1)
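A hedged usage sketch for the mask generator above. It assumes the snippet's missing module-level names (numpy as np, theano, theano.tensor as T, floatX, copy, RandomStreams and MRG_RandomStreams) are available, and the sizes below are purely illustrative.

mask_gen = MaskGenerator(input_size=784, hidden_sizes=[500, 500], l=1.0)

mask_gen.shuffle_ordering()      # draw a fresh input ordering
mask_gen.sample_connectivity()   # resample the hidden-unit connectivity

# Symbolic 0/1 masks obeying the autoregressive constraint
# (input connectivity <= output connectivity):
w1_mask = mask_gen.get_mask_layer_UPDATE(0)                     # input -> hidden 1
w2_mask = mask_gen.get_mask_layer_UPDATE(1)                     # hidden 1 -> hidden 2
direct_mask = mask_gen.get_direct_output_mask_layer_UPDATE(0)   # input -> output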
Exemplo n.º 35
0
class RNNtsg(model):
    '''
		The attention-based NMT model for TSG
	'''
    def __init__(self, config, name=''):
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def translate(self, x, T, beam_size=10, return_array=False):
        '''
			Decode with beam search.

			:type x: numpy array
			:param x: the indexed source sentence

			:type beam_size: int
			:param beam_size: beam size

			:returns: a numpy array, the indexed translation result
		'''
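        # Beam bookkeeping, one entry per live hypothesis:
        #   result         - rule indices generated so far
        #   loss           - accumulated negative log-probability
        #   nonterms       - stack of nonterminals still to be expanded
        #   par_state_time - decoder time step of the parent state of each
        #                    pending nonterminal
        #   state_hist     - decoder hidden states of all previous time steps
        # Finished hypotheses are moved to result_eos / loss_eos.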
        # initialize variables
        result = [[]]
        loss = [0.]
        result_eos = []
        loss_eos = []
        beam = beam_size
        nonterms = [['S']]  # per-hypothesis stack of nonterminals left to expand
        par_state_time = [[0]]  # (n_hyps, len(nonterm) for each hyp)
        # get encoder states
        c, state = self.get_context_and_init(x)
        emb_y = numpy.zeros((1, self.config['dim_emb_trg']), dtype='float32')
        state_hist = [[
            numpy.zeros((1, self.config['dim_rec_enc']), dtype='float32')
        ]]  # (n_hyps, l)

        for l in range(x.shape[0] * 3):
            # flat list of rule indices (offset by hypothesis) whose LHS can
            # expand the next pending nonterminal of each hypothesis
            cur_nonterm_idx = []
            #print result
            for i in range(len(nonterms)):
                if len(nonterms[i]) > 0:
                    potent_rules = T.rule_idx_with_root(
                        nonterms[i][-1]
                    )  # list of potential rules with the given lhs as root
                    #print potent_rules + i * self.config['dim_emb_trg']
                    cur_nonterm_idx += [
                        r + i * self.config['num_vocab_trg']
                        for r in potent_rules
                    ]

                    nonterms[i].pop()
            # only take the first k results if we have k < beam_size potential nonterms
            if len(cur_nonterm_idx) < beam_size:
                beam = len(cur_nonterm_idx)
            else:
                beam = beam_size
            # get word probability
            energy, ctx = self.get_probs(numpy.repeat(c, len(result), axis=1),
                                         state, emb_y)
            # multiply energy by cur_nonterm_idx mask
            energy_mask = numpy.zeros((energy.shape[0] * energy.shape[1]),
                                      dtype='float32')
            energy_mask[cur_nonterm_idx] = 1.
            energy_mask = energy_mask.reshape(
                (energy.shape[0], energy.shape[1]))
            energy = energy * energy_mask

            probs = tools.softmax(energy)
            losses = -numpy.log(probs)

            # prevent translation to be too short.
            if l < x.shape[0] / 2:
                losses[:, self.config['index_eos_trg']] = numpy.inf
            # prevent rules that do not have required lhs
            #losses[:, not_cur_nonterm_idx] = numpy.inf
            for i in range(len(loss)):
                losses[i] += loss[i]

            # get the n-best partial translations
            best_index_flatten = numpy.argpartition(losses.flatten(),
                                                    beam)[:beam]
            # split each flat index back into (hypothesis index, rule index)
            best_index = [(index // self.config['num_vocab_trg'],
                           index % self.config['num_vocab_trg'])
                          for index in best_index_flatten]

            # save the partial translations in the beam
            new_ctx = numpy.zeros((beam, 2 * self.config['dim_rec_enc']),
                                  dtype='float32')
            new_y = []
            new_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                    dtype='float32')
            new_result = []
            new_loss = []
            new_nonterms = []
            new_par_state_time = []
            new_state_hist = []
            new_par_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                        dtype='float32')
            #print best_index
            #print len(result), len(state_hist), len(par_state_time)
            for i in range(beam):
                index = best_index[i]
                new_result.append(result[index[0]] + [index[1]])
                new_loss.append(losses[index[0], index[1]])
                new_ctx[i] = ctx[index[0]]
                new_y.append(index[1])
                new_state[i] = state[index[0]]
                par_state_t = par_state_time[index[0]][-1]

                new_par_state[i] = state_hist[index[0]][par_state_t]

                r = T.get_rule_from_idx(index[1])
                if r:
                    add_nonterms = r.get_expand_tags()[::-1]
                else:
                    add_nonterms = []
                new_nonterms.append(nonterms[index[0]] + add_nonterms)
                # set the parent of expanded tags to be current
                # do not include last par_state_time[] for current hyp
                new_par_state_time.append(par_state_time[index[0]][:-1] +
                                          [l + 1] * len(add_nonterms))
                new_state_hist.append(state_hist[index[0]] + [state[index[0]]])
            # get the next decoder hidden state
            new_emby = self.get_trg_embedding(
                numpy.asarray(new_y, dtype='int64'))[0]
            new_state = self.get_next(new_ctx, new_state, new_par_state,
                                      new_emby)

            # remove finished translation from the beam
            state = []
            emb_y = []
            result = []
            loss = []
            nonterms = []
            state_hist = []
            par_state_time = []
            for i in range(beam):
                if len(new_nonterms[i]) == 0:
                    # par_state_time and nonterms should have same length for each hyp
                    # par_state_time records parent state timestep for each nonterms that needs to be expanded
                    assert len(new_par_state_time[i]) == 0
                    result_eos.append(new_result[i])
                    #print new_result[i]
                    loss_eos.append(new_loss[i])
                    beam -= 1
                else:
                    result.append(new_result[i])
                    loss.append(new_loss[i])
                    state.append(new_state[i])
                    emb_y.append(new_emby[i])
                    nonterms.append(new_nonterms[i])
                    state_hist.append(new_state_hist[i])
                    par_state_time.append(new_par_state_time[i])
            #print len(result), len(state_hist), len(par_state_time)
            if beam <= 0:
                break

            state = numpy.asarray(state, dtype='float32')
            emb_y = numpy.asarray(emb_y, dtype='float32')

        # only used in semi-supervised training
        if return_array:
            if len(result_eos) > 0:
                return result_eos
            else:
                return [result[-1][:1]]

        if len(result_eos) > 0:
            # return the best translation
            return result_eos[numpy.argmin(loss_eos)]
        elif beam_size > 100:
            # give up once the beam has already grown very large
            logging.warning('cannot find translation in beam size %d' %
                            beam_size)
            return []
        else:
            # double the beam size on failure and retry
            logging.info('cannot find translation in beam size %d, try %d' %
                         (beam_size, beam_size * 2))
            return self.translate(x, T, beam_size=beam_size * 2)

    def sampling_step(self, state, prev, context, par_state):
        '''
			Build the computational graph which samples the next word.

			:type state: theano variables
			:param state: the previous hidden state

			:type prev: theano variables
			:param prev: the last generated word

			:type context: theano variables
			:param context: the context vectors.
		'''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs,
                                       dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb, par_state)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
			Build the decoder graph for sampling.

			:type state_init: theano variables
			:param state_init: the initial state of decoder

			:type c: theano variables
			:param c: the context vectors

			:type length: int
			:param length: the limitation of sample length

			:type n_samples: int
			:param n_samples: the number of samples
		'''

        state = tensor.repeat(state_init, n_samples, axis=0)
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        # gather the probability of each sampled word via flat indexing
        y_idx = tensor.arange(samples.flatten().shape[0]) \
            * self.config['num_vocab_trg'] + samples.flatten()
        probs = probs.flatten()[y_idx]
        probs = probs.reshape(samples.shape)
        return samples, probs, updates

    def build(self, verbose=False):
        '''
			Build the computational graph.

			:type verbose: bool
			:param verbose: only set to True on visualization
		'''
        config = self.config

        #create layers
        logging.info('initializing layers...')
        self.emb_src = self.creater.createLookupTable(self.name + 'emb_src',
                                                      config['num_vocab_src'],
                                                      config['dim_emb_src'],
                                                      offset=True)
        self.emb_trg = self.creater.createLookupTable(self.name + 'emb_trg',
                                                      config['num_vocab_trg'],
                                                      config['dim_emb_trg'],
                                                      offset=True)
        self.encoderGRU = self.creater.createGRU(self.name + 'GRU_enc',
                                                 config['dim_emb_src'],
                                                 config['dim_rec_enc'],
                                                 verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(self.name +
                                                      'GRU_enc_back',
                                                      config['dim_emb_src'],
                                                      config['dim_rec_enc'],
                                                      verbose=verbose)

        self.decoderGRU = self.creater.createGRU_tsg(self.name + 'GRU_dec',
                                                     config['dim_emb_trg'],
                                                     2 * config['dim_rec_enc'],
                                                     config['dim_rec_dec'],
                                                     config['num_vocab_trg'],
                                                     verbose=verbose)

        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer',
            config['dim_rec_enc'],
            config['dim_rec_dec'],
            offset=True)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix(
            'x_mask', dtype='float32')  # size: (length, batchsize)
        self.y_idx = tensor.matrix('y_idx',
                                   dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix(
            'y_mask', dtype='float32')  # size: (length, batchsize)
        #self.y_parent_idx = tensor.matrix('y_parent_idx', dtype='int64') # size: (length, batchsize)
        self.y_parent_t = tensor.matrix(
            'y_parent_t', dtype='int64')  # size: (length, batchsize)

        if 'MRT' in config and config['MRT'] is True:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask,
                self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask
            ]

        # create computational graph for training
        logging.info('building computational graph...')
        # ----encoder-----
        emb = self.emb_src.forward(
            self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb,
            self.x.shape[0],
            batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate(
            (context_forward, context_backward),
            axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(
            self.y_idx.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb,
            self.y_idx.shape[0],
            self.context,
            self.state_init,
            self.y_parent_t,
            batch_size=self.y_idx.shape[1],
            mask=self.ymask,
            cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)
        # compute costs and grads
        y_idx = tensor.arange(self.y_idx.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + self.y_idx.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y_idx.shape[0], self.y_idx.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)
        if 'MRT' in config and config['MRT'] is True:
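            # Minimum risk training: the per-sample costs (scaled by MRT_alpha
            # and shifted by their minimum for numerical stability) are turned
            # into a normalized distribution over the sampled candidates via
            # exp(-.)/sum; the training cost is the negated expectation of the
            # externally supplied per-sample scores in MRTLoss under that
            # distribution.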
            self.cost_per_sample = self.cost.sum(axis=0)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(
            self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        #self.samples, self.probs_sample, self.updates_sample = self.decode_sample(state_init_sample, context_sample,
        #											self.length_sample, self.n_samples)

        # parameter for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.par_state_decode = tensor.matrix('par_state_decode',
                                              dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
			Encode source sentence to context vector.
		'''
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
			Get the embedding of target sentence.
		'''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
			Get the initial decoder hidden state with context vector.
		'''
        if not hasattr(self, "get_initer"):
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(self.init_c)])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
			Encode source sentence to context vectors and get the initial decoder hidden state.
		'''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
			Get the probability of the next target word.
		'''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[
                    self.context_decode, self.state_decode, self.emb_decode
                ],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, par_state, emb):
        '''
			Get the next hidden state.
		'''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[
                    self.c_decode, self.state_decode, self.par_state_decode,
                    self.emb_decode
                ],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.par_state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, par_state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
			Get the negative log-likelihood of parallel sentences.
		'''
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
			Get sampling results.
		'''
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
			Get the attention weight of parallel sentences.
		'''
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
			Get the hidden states essential for visualization
		'''
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = [
            'h', 'gate', 'reset', 'state', 'reseted', 'state_in', 'gate_in',
            'reset_in'
        ]
        dec_names = [
            'h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate', 'reset_cin',
            'reset_preactive', 'reset', 'state_cin', 'reseted',
            'state_preactive', 'state'
        ]
        dec_names += [
            'outenergy', 'state_in', 'gate_in', 'reset_in', 'state_in_prev',
            'readout', 'maxout', 'outenergy_1', 'outenergy_2'
        ]
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i in range(len(layers)):
            print(layers[i].shape)
            if value_name[i] != '':
                result[value_name[i]] = layers[i]
        return result
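A small self-contained numpy sketch (with hypothetical sizes) of the n-best selection used in RNNtsg.translate above: per-hypothesis losses over the target vocabulary are flattened, the beam smallest entries are found with argpartition, and each flat index is split back into a (hypothesis, rule) pair.

import numpy

num_vocab_trg = 8                               # hypothetical vocabulary size
losses = numpy.random.rand(3, num_vocab_trg)    # 3 live hypotheses
beam = 4

flat = losses.flatten()
best_flat = numpy.argpartition(flat, beam)[:beam]
best_index = [(idx // num_vocab_trg, idx % num_vocab_trg) for idx in best_flat]

# the selected pairs are exactly the beam lowest-loss expansions
threshold = numpy.sort(flat)[beam - 1]
assert all(losses[h, r] <= threshold for h, r in best_index)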
Exemplo n.º 36
0
class CRBM:
    """CRBM class.

    The class :class:`CRBM` implements functionality for
    a *convolutional restricted Boltzmann machine* (cRBM) that
    extracts redundant DNA sequence features from a provided set
    of sequences.
    The model can subsequently be used to study the sequence content
    of (e.g. regulatory) sequences, by visualizing the features in terms
    of sequence logos or in order to cluster the sequences based
    on sequence content.

    Parameters
    -----------
    num_motifs : int
        Number of motifs.
    motif_length : int
        Motif length.

    epochs : int
        Number of epochs to train (Default: 100).
    input_dims : int
        Input dimensions aka alphabet size (Default: 4 for DNA).
    doublestranded : bool
        Single strand or both strands. If set to True,
        both strands are scanned. (Default: True).
    batchsize : int
        Batch size (Default: 20).
    learning_rate : float
        Learning rate (Default: 0.1).
    momentum : float
        Momentum term (Default: 0.95).
    pooling : int
        Pooling factor (not relevant for
        cRBM, but for future work) (Default: 1).
    cd_k : int
        Number of Gibbs sampling iterations in
        each persistent contrastive divergence step (Default: 5).
    rho : float
        Target frequency of motif occurrences (Default: 0.01).
    lambda_rate : float
        Sparsity enforcement aka penalty term (Default: 0.1).
    """
    def __init__(self, num_motifs, motif_length, epochs = 100, input_dims=4, \
            doublestranded = True, batchsize = 20, learning_rate = 0.1, \
            momentum = 0.95, pooling = 1, cd_k = 5,
            rho = 0.01, lambda_rate = 0.1):
        # sanity checks:
        if num_motifs <= 0:
            raise Exception("Number of motifs must be positive.")

        if motif_length <= 0:
            raise Exception("Motif length must be positive.")

        if epochs < 0:
            raise Exception("Epochs must be non-negative.")

        if input_dims <= 0:
            raise Exception("input_dims must be positive.")
        elif input_dims != 4:
            warnings.warn(
                "input_dims != 4 was not comprehensively \
                tested yet. Be careful when interpreting the results.",
                UserWarning)

        if batchsize <= 0:
            raise Exception("batchsize must be positive.")

        if learning_rate <= 0.0:
            raise Exception("learning_rate must be positive.")

        if not (momentum >= 0.0 and momentum < 1.):
            raise Exception("momentum must be between zero and one.")

        if pooling <= 0:
            raise Exception("pooling must be positive.")

        if cd_k <= 0:
            raise Exception("cd_k must be positive.")

        if not (rho >= 0.0 and rho < 1.):
            raise Exception("rho must be between zero and one.")

        if lambda_rate < 0.:
            raise Exception("lambda_rate must be non-negative.")

        # parameters for the motifs
        self.num_motifs = num_motifs
        self.motif_length = motif_length
        self.input_dims = input_dims
        self.doublestranded = doublestranded
        self.batchsize = batchsize
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.lambda_rate = lambda_rate
        self.pooling = pooling
        self.cd_k = cd_k
        self.epochs = epochs
        self.spmethod = 'entropy'
        self._gradientSparsityConstraint = \
            self._gradientSparsityConstraintEntropy

        x = np.random.randn(self.num_motifs, 1, self.input_dims,
                            self.motif_length).astype(theano.config.floatX)

        self.motifs = theano.shared(value=x, name='W', borrow=True)

        # determine the parameter rho for the model if not given
        if not rho:
            rho = 1. / (self.num_motifs * self.motif_length)
            if self.doublestranded:
                rho = rho / 2.
            self.rho = rho

        # cRBM parameters (2*x to respect both strands of the DNA)
        b = np.zeros((1, self.num_motifs)).astype(theano.config.floatX)

        # adapt the bias such that it will initially have rho motif hits in H
        # That is, we want to have rho percent of the samples positive
        # the motif weights are drawn from a standard normal, so a motif's
        # summed input is approximately N(0, motif_length); the bias is set to
        # the rho-quantile of that distribution
        b = b + scipy.stats.norm.ppf(self.rho, 0, np.sqrt(self.motif_length))
        self.bias = theano.shared(value=b, name='bias', borrow=True)

        c = np.zeros((1, self.input_dims)).astype(theano.config.floatX)
        self.c = theano.shared(value=c, name='c', borrow=True)

        # infrastructural parameters
        self.theano_rng = RS(seed=int(time.time()))
        self.rng_data_permut = theano.tensor.shared_randomstreams.RandomStreams(
        )

        self.motif_velocity = theano.shared(value=np.zeros(
            self.motifs.get_value().shape).astype(theano.config.floatX),
                                            name='velocity_of_W',
                                            borrow=True)
        self.bias_velocity = theano.shared(value=np.zeros(b.shape).astype(
            theano.config.floatX),
                                           name='velocity_of_bias',
                                           borrow=True)
        self.c_velocity = theano.shared(value=np.zeros(c.shape).astype(
            theano.config.floatX),
                                        name='velocity_of_c',
                                        borrow=True)

        val = np.zeros((self.batchsize, self.num_motifs, 1,
                        200)).astype(theano.config.floatX)
        self.fantasy_h = theano.shared(value=val,
                                       name='fantasy_h',
                                       borrow=True)
        if self.doublestranded:
            self.fantasy_h_prime = theano.shared(value=\
                      np.zeros((self.batchsize, self.num_motifs, 1, 200)).astype(theano.config.floatX), \
                      name='fantasy_h_prime', borrow=True)

        self._compileTheanoFunctions()

    def saveModel(self, filename):
        """Save the model parameters and additional hyper-parameters.

        Parameters
        -----------
        filename : str
            Pickle filename where the model parameters are stored.
        """

        numpyParams = (self.motifs.get_value(), self.bias.get_value(),
                       self.c.get_value())

        hyperparams = (self.num_motifs, self.motif_length, self.input_dims,
                       self.doublestranded, self.batchsize, self.learning_rate,
                       self.momentum, self.rho, self.lambda_rate, self.pooling,
                       self.cd_k, self.epochs, self.spmethod)

        pickleObject = (numpyParams, hyperparams)
        joblib.dump(pickleObject, filename, protocol=2)

    @classmethod
    def loadModel(cls, filename):
        """Load a model from a given pickle file.

        Parameters
        -----------
        filename : str
            Pickle file containing the model parameters.
        returns : :class:`CRBM` object
            An instance of CRBM with reloaded parameters.
        """

        numpyParams, hyperparams = joblib.load(filename)

        (num_motifs, motif_length, input_dims, \
            doublestranded, batchsize, learning_rate, \
            momentum, rho, lambda_rate,
            pooling, cd_k,
            epochs, spmethod) = hyperparams

        obj = cls(num_motifs,
                  motif_length,
                  epochs=epochs,
                  input_dims=input_dims,
                  doublestranded=doublestranded,
                  batchsize=batchsize,
                  learning_rate=learning_rate,
                  momentum=momentum,
                  pooling=pooling,
                  cd_k=cd_k,
                  rho=rho,
                  lambda_rate=lambda_rate)
        motifs, bias, c = numpyParams
        obj.motifs.set_value(motifs)
        obj.bias.set_value(bias)
        obj.c.set_value(c)
        return obj

    def _bottomUpActivity(self, data, flip_motif=False):
        """Theano function for computing bottom up activity."""

        out = conv(data, self.motifs, filter_flip=flip_motif)
        out = out + self.bias.dimshuffle('x', 1, 0, 'x')
        return out

    def _bottomUpProbability(self, activities):
        """Theano function for computing bottom up Probability."""

        pool = self.pooling
        x = activities.reshape((activities.shape[0], \
                activities.shape[1], activities.shape[2], \
                activities.shape[3]//pool, pool))
        norm = T.sum(1. + T.exp(x), axis=4, keepdims=True)
        x = T.exp(x) / norm
        x=x.reshape((activities.shape[0], \
                activities.shape[1], activities.shape[2], \
                activities.shape[3]))
        return x

    def _bottomUpSample(self, probs):
        """Theano function for bottom up sampling."""

        pool = self.pooling
        _probs = probs.reshape((probs.shape[0], probs.shape[1], probs.shape[2],
                                probs.shape[3] // pool, pool))
        _probs_reshape = _probs.reshape(
            (_probs.shape[0] * _probs.shape[1] * _probs.shape[2] *
             _probs.shape[3], pool))
        samples = self.theano_rng.multinomial(pvals=_probs_reshape)
        samples = samples.reshape(
            (probs.shape[0], probs.shape[1], probs.shape[2], probs.shape[3]))
        return T.cast(samples, theano.config.floatX)

    def _computeHgivenV(self, data, flip_motif=False):
        """Theano function for complete bottom up pass."""

        activity = self._bottomUpActivity(data, flip_motif)
        probability = self._bottomUpProbability(activity)
        sample = self._bottomUpSample(probability)
        return [probability, sample]

    def _topDownActivity(self, h, hprime):
        """Theano function for top down activity."""
        W = self.motifs.dimshuffle(1, 0, 2, 3)
        C = conv(h, W, border_mode='full', filter_flip=True)

        out = T.sum(C, axis=1, keepdims=True)  # sum over all K

        if hprime:
            C = conv(hprime, W[:,:,::-1,::-1], \
                    border_mode='full', filter_flip=True)
            out = out + T.sum(C, axis=1, keepdims=True)  # sum over all K

        c_bc = self.c
        c_bc = c_bc.dimshuffle('x', 0, 1, 'x')
        activity = out + c_bc
        return activity

    def _topDownProbability(self, activity, softmaxdown=True):
        """Theano function for top down probability."""
        if softmaxdown:
            return self._softmax(activity)
        else:
            return 1. / (1. + T.exp(-activity))

    def _topDownSample(self, probability, softmaxdown=True):
        """Theano function for top down sample."""
        if softmaxdown:
            pV_ = probability.dimshuffle(0, 1, 3, 2).reshape( \
                (probability.shape[0]*probability.shape[3],
                    probability.shape[2]))
            V_ = self.theano_rng.multinomial(n=1, pvals=pV_).astype(
                theano.config.floatX)
            V = V_.reshape((probability.shape[0], 1, probability.shape[3],
                            probability.shape[2])).dimshuffle(0, 1, 3, 2)

        else:
            V=self.theano_rng.multinomial(n=1,\
                pvals=probability).astype(theano.config.floatX)
        return V

    def _computeVgivenH(self, H_sample, H_sample_prime, softmaxdown=True):
        """Theano function for complete top down pass."""

        activity = self._topDownActivity(H_sample, H_sample_prime)

        prob = self._topDownProbability(activity, softmaxdown)
        sample = self._topDownSample(prob, softmaxdown)

        return [prob, sample]

    def _collectVHStatistics(self, prob_of_H, data):
        """Theano function for collecting V*H statistics."""

        # reshape input
        data = data.dimshuffle(1, 0, 2, 3)
        prob_of_H = prob_of_H.dimshuffle(1, 0, 2, 3)
        avh = conv(data, prob_of_H, border_mode="valid", filter_flip=False)
        avh = avh / T.prod(prob_of_H.shape[1:])
        avh = avh.dimshuffle(1, 0, 2, 3).astype(theano.config.floatX)

        return avh

    def _collectVStatistics(self, data):
        """Theano function for collecting V statistics."""

        # reshape input
        a = T.mean(data, axis=(0, 1, 3)).astype(theano.config.floatX)
        a = a.dimshuffle('x', 0)
        a = T.inc_subtensor(a[:, :],
                            a[:, ::-1])  # match a-t and c-g occurances

        return a

    def _collectHStatistics(self, data):
        """Theano function for collecting H statistics."""

        # reshape input
        a = T.mean(data, axis=(0, 2, 3)).astype(theano.config.floatX)
        a = a.dimshuffle('x', 0)

        return a

    def _collectUpdateStatistics(self, prob_of_H, prob_of_H_prime, data):
        """Theano function for collecting the complete update statistics."""

        average_VH = self._collectVHStatistics(prob_of_H, data)
        average_H = self._collectHStatistics(prob_of_H)

        if prob_of_H_prime:
            average_VH_prime = self._collectVHStatistics(prob_of_H_prime, data)
            average_H_prime = self._collectHStatistics(prob_of_H_prime)
            average_VH = (average_VH + average_VH_prime[:, :, ::-1, ::-1]) / 2.
            average_H = (average_H + average_H_prime) / 2.

        average_V = self._collectVStatistics(data)
        return average_VH, average_H, average_V

    def _updateWeightsOnMinibatch(self, D, gibbs_chain_length):
        """Theano function that defines an SGD update step with momentum."""

        # calculate the data gradient for weights (motifs), bias and c
        [prob_of_H_given_data, H_given_data] = self._computeHgivenV(D)

        if self.doublestranded:
            [prob_of_H_given_data_prime,H_given_data_prime] = \
                    self._computeHgivenV(D, True)
        else:
            [prob_of_H_given_data_prime, H_given_data_prime] = [None, None]

        # calculate data gradients
        G_motif_data, G_bias_data, G_c_data = \
                  self._collectUpdateStatistics(prob_of_H_given_data, \
                  prob_of_H_given_data_prime, D)

        # fetch the persistent (fantasy) hidden samples for the model term
        H_given_model = self.fantasy_h
        if self.doublestranded:
            H_given_model_prime = self.fantasy_h_prime
        else:
            H_given_model_prime = None

        for i in range(gibbs_chain_length):
            prob_of_V_given_model, V_given_model = \
                    self._computeVgivenH(H_given_model, H_given_model_prime)
            #sample up
            prob_of_H_given_model, H_given_model = \
                    self._computeHgivenV(V_given_model)

            if self.doublestranded:
                prob_of_H_given_model_prime, H_given_model_prime = \
                        self._computeHgivenV(V_given_model,  True)
            else:
                prob_of_H_given_model_prime, H_given_model_prime = None, None

        # compute the model gradients
        G_motif_model, G_bias_model, G_c_model = \
                  self._collectUpdateStatistics(prob_of_H_given_model, \
                  prob_of_H_given_model_prime, V_given_model)

        mu = self.momentum
        alpha = self.learning_rate
        sp = self.lambda_rate
        reg_motif, reg_bias = self._gradientSparsityConstraint(D)

        vmotifs = mu * self.motif_velocity + \
                alpha * (G_motif_data - G_motif_model - sp*reg_motif)
        vbias = mu * self.bias_velocity + \
                alpha * (G_bias_data - G_bias_model - sp*reg_bias)
        vc = mu*self.c_velocity + \
                alpha * (G_c_data - G_c_model)

        new_motifs = self.motifs + vmotifs
        new_bias = self.bias + vbias
        new_c = self.c + vc

        updates = [(self.motifs, new_motifs), (self.bias, new_bias),
                   (self.c, new_c), (self.motif_velocity, vmotifs),
                   (self.bias_velocity, vbias), (self.c_velocity, vc),
                   (self.fantasy_h, H_given_model)]
        if self.doublestranded:
            updates.append((self.fantasy_h_prime, H_given_model_prime))

        return updates

    def _gradientSparsityConstraintEntropy(self, data):
        """Theano function that defines the entropy-based sparsity constraint."""
        # get expected[H|V]
        [prob_of_H, _] = self._computeHgivenV(data)
        q = self.rho
        p = T.mean(prob_of_H, axis=(0, 2, 3))

        gradKernels = -T.grad(T.mean(q * T.log(p) +
                                     (1 - q) * T.log(1 - p)), self.motifs)
        gradBias = -T.grad(T.mean(q * T.log(p) +
                                  (1 - q) * T.log(1 - p)), self.bias)
        return gradKernels, gradBias

    def _compileTheanoFunctions(self):
        """This methods compiles all theano functions."""

        print("Start compiling Theano training function...")
        D = T.tensor4('data')
        updates = self._updateWeightsOnMinibatch(D, self.cd_k)
        self.theano_trainingFct = theano.function([D],
                                                  None,
                                                  updates=updates,
                                                  name='train_CRBM')

        #compute mean free energy
        mfe_ = self._meanFreeEnergy(D)
        #compute number  of motif hits
        [_, H] = self._computeHgivenV(D)

        #H = self.bottomUpProbability(self.bottomUpActivity(D))
        nmh_ = T.mean(H)  # mean over samples (K x 1 x N_h)

        #compute norm of the motif parameters
        twn_ = T.sqrt(T.mean(self.motifs**2))

        #compute information content
        pwm = self._softmax(self.motifs)
        entropy = -pwm * T.log2(pwm)
        entropy = T.sum(entropy, axis=2)  # sum over letters
        ic_ = T.log2(self.motifs.shape[2]) - \
            T.mean(entropy)  # log2(alphabet size) is the maximum information per position
        medic_= T.log2(self.motifs.shape[2]) - \
            T.mean(T.sort(entropy, axis=2)[:, :, entropy.shape[2] // 2])

        self.theano_evaluateData = theano.function([D], [mfe_, nmh_],
                                                   name='evaluationData')

        W = T.tensor4("W")
        self.theano_evaluateParams = theano.function([], [twn_, ic_, medic_],
                                                     givens={W: self.motifs},
                                                     name='evaluationParams')
        fed = self._freeEnergyForData(D)
        self.theano_freeEnergy = theano.function([D],
                                                 fed,
                                                 name='fe_per_datapoint')

        fed = self._freeEnergyPerMotif(D)
        self.theano_fePerMotif = theano.function([D], fed, name='fe_per_motif')

        if self.doublestranded:
            self.theano_getHitProbs = theano.function([D], \
                self._bottomUpProbability(self._bottomUpActivity(D)))
        else:
            self.theano_getHitProbs = theano.function([D], \
                self._bottomUpProbability(self._bottomUpActivity(D) +
                                          self._bottomUpActivity(D, True)))
        print("Compilation of Theano training function finished")

    def _evaluateData(self, data):
        """Evaluate performance on given numpy array.

        This is used to monitor training progress.
        """
        return self.theano_evaluateData(data)

    def _trainingFct(self, data):
        """Train on mini-batch given numpy array."""
        return self.theano_trainingFct(data)

    def _evaluateParams(self):
        """Evaluate parameters.

        This is used to monitor training progress.
        """
        return self.theano_evaluateParams()

    def motifHitProbs(self, data):
        """Motif match probabilities.

        Parameters
        -----------
        data : numpy-array
            4D numpy array representing a DNA sequence in one-hot encoding.
            See :meth:`crbm.sequences.seqToOneHot`.

        returns : numpy-array
            Per-position motif match probabilities of all motifs as numpy array.
        """
        return self.theano_getHitProbs(data)

    def freeEnergy(self, data, permotif=False):
        """Free energy determined on the given dataset.

        Parameters
        -----------
        data : numpy-array
            4D numpy array representing a DNA sequence in one-hot encoding.
            See :meth:`crbm.sequences.seqToOneHot`.
        permotif : boolean
            Indicates whether the free energy should be computed per motif.
            Default: The free energy is computed per sequence, by summing over
                the individual motif contributions.

        returns : numpy-array
            Free energy per sequence.
        """
        if permotif:
            return self.theano_fePerMotif(data)
        else:
            return self.theano_freeEnergy(data)

    def fit(self, training_data, test_data=None):
        """Fits the cRBM to the provided training sequences.

        Parameters
        -----------
        training_data : numpy-array
            4D-Numpy array representing the training sequence in one-hot encoding.
            See :meth:`crbm.sequences.seqToOneHot`.

        test_data : numpy-array
            4D-Numpy array representing the validation sequence in one-hot encoding.
            If no test_data is provided, the training progress will be reported
            on the training set itself. See :meth:`crbm.sequences.seqToOneHot`.
        """
        # trim the sequences so that the number of hidden units per sequence
        # (length - motif_length + 1) is divisible by the pooling factor
        nseq = int((training_data.shape[3] - self.motif_length + 1)
                   / self.pooling) * self.pooling + self.motif_length - 1
        training_data = training_data[:, :, :, :nseq]

        if test_data is not None:
            nseq = int((test_data.shape[3] - self.motif_length + 1)
                       / self.pooling) * self.pooling + self.motif_length - 1
            test_data = test_data[:, :, :, :nseq]
        else:
            test_data = training_data

        print(("BatchSize: " + str(self.batchsize)))
        start = time.time()

        # compile training function

        # now perform training
        print("Start training the model...")
        starttime = time.time()

        for epoch in range(self.epochs):
            for [start,end] in self._iterateBatchIndices(\
                            training_data.shape[0],self.batchsize):
                self._trainingFct(training_data[start:end, :, :, :])
            meanfe = 0.0
            meannmh = 0.0
            nb = 0
            for [start,end] in self._iterateBatchIndices(\
                            test_data.shape[0],self.batchsize):
                [mfe_,
                 nmh_] = self._evaluateData(test_data[start:end, :, :, :])
                meanfe = meanfe + mfe_
                meannmh = meannmh + nmh_
                nb = nb + 1
            [twn_, ic_, medic_] = self._evaluateParams()
            print(("Epoch {:d}: ".format(epoch) + \
                    "FE={:1.3f} ".format(meanfe/nb) + \
                    "NumH={:1.4f} ".format(meannmh/nb) + \
                    "WNorm={:2.2f} ".format(float(twn_)) + \
                    "IC={:1.3f} medIC={:1.3f}".format(float(ic_), float(medic_))))

        # done with training
        print(("Training finished after: {:5.2f} seconds!".format(\
                time.time()-starttime)))

    def _meanFreeEnergy(self, D):
        """Theano function for computing the mean free energy."""
        return T.sum(self._freeEnergyForData(D)) / D.shape[0]

    def getPFMs(self):
        """Returns the weight matrices converted to *position frequency matrices*.

        Parameters
        -----------
        returns: numpy-array
            List of position frequency matrices as numpy arrays.
        """
        def softmax_(x):
            x_exp = np.exp(x)
            y = np.zeros(x.shape)
            for i in range(x.shape[1]):
                y[:, i] = x_exp[:, i] / np.sum(x_exp[:, i])
            return y

        return [softmax_(m[0, :, :]) for m in self.motifs.get_value()]

    def _freeEnergyForData(self, D):
        """Theano function for computing the free energy (per position)."""

        pool = self.pooling

        x = self._bottomUpActivity(D)

        x = x.reshape(
            (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool))
        free_energy = -T.sum(T.log(1. + T.sum(T.exp(x), axis=4)),
                             axis=(1, 2, 3))
        if self.doublestranded:
            x = self._bottomUpActivity(D, True)

            x = x.reshape(
                (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool))
            free_energy = free_energy - T.sum(
                T.log(1. + T.sum(T.exp(x), axis=4)), axis=(1, 2, 3))

        cMod = self.c
        cMod = cMod.dimshuffle('x', 0, 1,
                               'x')  # make it 4D and broadcastable there
        free_energy = free_energy - T.sum(D * cMod, axis=(1, 2, 3))

        return free_energy / D.shape[3]

    def _freeEnergyPerMotif(self, D):
        """Theano function for computing the free energy (per motif)."""

        pool = self.pooling

        x = self._bottomUpActivity(D)

        x = x.reshape(
            (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool))
        free_energy = -T.sum(T.log(1. + T.sum(T.exp(x), axis=4)), axis=(2, 3))

        if self.doublestranded:
            x = self._bottomUpActivity(D, True)
            x = x.reshape(
                (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool))
            free_energy = free_energy - T.sum(
                T.log(1. + T.sum(T.exp(x), axis=4)), axis=(2, 3))

        cMod = self.c
        cMod = cMod.dimshuffle('x', 0, 1,
                               'x')  # make it 4D and broadcastable there
        free_energy = free_energy - T.sum(D * cMod, axis=(1, 2, 3)).dimshuffle(
            0, 'x')

        return free_energy

    def _softmax(self, x):
        """Softmax operation."""

        return T.exp(x) / T.exp(x).sum(axis=2, keepdims=True)

    def __repr__(self):
        st = "Parameters:\n\n"
        st += "Number of motifs: {}\n".format(self.num_motifs)
        st += "Motif length: {}\n".format(self.motif_length)
        st += "\n"
        st += "Hyper-parameters:\n\n"
        st += "input dims: {:d}".format(self.input_dims)
        st += "doublestranded: {}".format(self.doublestranded)
        st += "batchsize: {:d}".format(self.batchsize)
        st += "learning rate: {:1.3f}".format(self.learning_rate)
        st += "momentum: {:1.3f}".format(self.momentum)
        st += "rho: {:1.4f}".format(self.rho)
        st += "lambda: {:1.3f}".format(self.lambda_rate)
        st += "pooling: {:d}".format(self.pooling)
        st += "cd_k: {:d}".format(self.cd_k)
        st += "epochs: {:d}".format(self.epochs)
        return st

    def _iterateBatchIndices(self, totalsize, nbatchsize):
        """Returns indices in batches."""

        return [ [i,i+nbatchsize] if i+nbatchsize<=totalsize \
                    else [i,totalsize] for i in range(totalsize)[0::nbatchsize] ]
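A hedged end-to-end usage sketch for the CRBM above. The data below is synthetic and the shapes are hypothetical; the class itself additionally relies on module-level names not shown in the snippet (numpy as np, theano, theano.tensor as T, scipy.stats, joblib, time and a conv alias), and floatX is assumed to be 'float32'.

import numpy as np

# synthetic one-hot DNA: (n_sequences, 1, alphabet=4, sequence_length)
letters = np.random.randint(0, 4, size=(100, 200))
train_seqs = np.eye(4, dtype='float32')[letters]             # (100, 200, 4)
train_seqs = train_seqs.transpose(0, 2, 1)[:, None, :, :]    # (100, 1, 4, 200)

model = CRBM(num_motifs=10, motif_length=15, epochs=5, batchsize=20)
model.fit(train_seqs)                               # persistent CD training
pfms = model.getPFMs()                              # list of 4 x motif_length matrices
hit_probs = model.motifHitProbs(train_seqs[:5])     # per-position match probabilities
model.saveModel('crbm_params.pkl')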
Exemplo n.º 37
0
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])


if __name__ == "__main__":
    rng = MRG_RandomStreams(numpy.random.randint(2147462579))
    import time
    print(theano.__file__)
    pvals = theano.tensor.fmatrix()
    for i in range(10):
        t0 = time.time()
        multinomial = rng.multinomial(pvals=pvals)
        # only the symbolic graph construction is timed here
        print(time.time() - t0)
def theano_multinomial(n, pvals, seed):
    rng = RandomStreams(seed)
    return rng.multinomial(n=n, pvals=pvals, dtype='float32')
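A minimal usage sketch for the helper just above (values are hypothetical, and RandomStreams is assumed to be theano.tensor.shared_randomstreams.RandomStreams); the generator's state updates are picked up automatically by theano.function.

import numpy
import theano
import theano.tensor as tensor

pvals = tensor.fmatrix('pvals')
samples = theano_multinomial(n=1, pvals=pvals, seed=1234)
draw = theano.function([pvals], samples)
print(draw(numpy.asarray([[0.2, 0.3, 0.5]], dtype='float32')))  # one one-hot row, e.g. [[0. 0. 1.]]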
Exemplo n.º 39
0
class Decoder(EncoderDecoderBase):
    NCE = 0
    EVALUATION = 1
    SAMPLING = 2
    BEAM_SEARCH = 3

    def __init__(self, state, rng, parent, encoder):
        EncoderDecoderBase.__init__(self, state, rng, parent)
        # Take as input the encoder instance for the embeddings..
        # To modify in the future
        self.encoder = encoder
        self.trng = MRG_RandomStreams(self.seed)
        self.init_params()

    def init_params(self):
        """ Decoder weights """
        self.bd_out = add_to_params(
            self.params,
            theano.shared(value=np.zeros((self.idim, ), dtype='float32'),
                          name='bd_out'))
        self.Wd_emb = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.idim, self.rankdim),
                          name='Wd_emb'))

        self.Wd_hh = add_to_params(
            self.params,
            theano.shared(value=OrthogonalInit(self.rng, self.qdim, self.qdim),
                          name='Wd_hh'))
        self.bd_hh = add_to_params(
            self.params,
            theano.shared(value=np.zeros((self.qdim, ), dtype='float32'),
                          name='bd_hh'))
        self.Wd_in = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.rankdim, self.qdim),
                          name='Wd_in'))
        self.Wd_s_0 = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.sdim, self.qdim),
                          name='Wd_s_0'))
        self.bd_s_0 = add_to_params(
            self.params,
            theano.shared(value=np.zeros((self.qdim, ), dtype='float32'),
                          name='bd_s_0'))

        if self.decoder_bias_type == 'all':
            self.Wd_s_q = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.sdim, self.qdim),
                              name='Wd_s_q'))

        if self.query_step_type == "gated":
            self.Wd_in_r = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.rankdim,
                                               self.qdim),
                              name='Wd_in_r'))
            self.Wd_in_z = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.rankdim,
                                               self.qdim),
                              name='Wd_in_z'))
            self.Wd_hh_r = add_to_params(
                self.params,
                theano.shared(value=OrthogonalInit(self.rng, self.qdim,
                                                   self.qdim),
                              name='Wd_hh_r'))
            self.Wd_hh_z = add_to_params(
                self.params,
                theano.shared(value=OrthogonalInit(self.rng, self.qdim,
                                                   self.qdim),
                              name='Wd_hh_z'))
            self.bd_r = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.qdim, ), dtype='float32'),
                              name='bd_r'))
            self.bd_z = add_to_params(
                self.params,
                theano.shared(value=np.zeros((self.qdim, ), dtype='float32'),
                              name='bd_z'))

            if self.decoder_bias_type == 'all':
                self.Wd_s_z = add_to_params(
                    self.params,
                    theano.shared(value=NormalInit(self.rng, self.sdim,
                                                   self.qdim),
                                  name='Wd_s_z'))
                self.Wd_s_r = add_to_params(
                    self.params,
                    theano.shared(value=NormalInit(self.rng, self.sdim,
                                                   self.qdim),
                                  name='Wd_s_r'))

        ######################
        # Output layer weights
        ######################
        out_target_dim = self.qdim
        if not self.maxout_out:
            out_target_dim = self.rankdim

        self.Wd_out = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.qdim,
                                           out_target_dim),
                          name='Wd_out'))

        # Set up deep output
        if self.deep_out:
            self.Wd_e_out = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.rankdim,
                                               out_target_dim),
                              name='Wd_e_out'))
            self.bd_e_out = add_to_params(
                self.params,
                theano.shared(value=np.zeros((out_target_dim, ),
                                             dtype='float32'),
                              name='bd_e_out'))

            if self.decoder_bias_type != 'first':
                self.Wd_s_out = add_to_params(
                    self.params,
                    theano.shared(value=NormalInit(self.rng, self.sdim,
                                                   out_target_dim),
                                  name='Wd_s_out'))
        """ Rank """
        if hasattr(self, 'train_rank'):
            self.Wr_out = add_to_params(
                self.params,
                theano.shared(value=NormalInit(self.rng, self.sdim, 1),
                              name='Wr_out'))
            self.br_out = add_to_params(
                self.params,
                theano.shared(value=np.zeros((1, ), dtype='float32'),
                              name='br_out'))

    def build_rank_layer(self, hs):
        return T.dot(hs, self.Wr_out) + self.br_out

    def build_output_layer(self, hs, xd, hd):
        pre_activ = T.dot(hd, self.Wd_out)

        if self.deep_out:
            pre_activ += T.dot(xd, self.Wd_e_out) + self.bd_e_out

            if self.decoder_bias_type != 'first':
                pre_activ += T.dot(hs, self.Wd_s_out)
                # ^ if bias all, bias the deep output

        if self.maxout_out:
            pre_activ = Maxout(2)(pre_activ)

        return pre_activ

    def build_next_probs_predictor(self, hs, x, prev_hd):
        """
        Return output probabilities given prev_words x, hierarchical pass hs, and previous hd
        hs should always be the same (and should not be updated).
        """
        return self.build_decoder(hs,
                                  x,
                                  mode=Decoder.BEAM_SEARCH,
                                  prev_hd=prev_hd)

    def approx_embedder(self, x):
        # Reuse the same embeddings learnt by the encoder.
        return self.encoder.approx_embedder(x)

    def output_softmax(self, pre_activ):
        # returns a (timestep, bs, idim) matrix (huge)
        return SoftMax(T.dot(pre_activ, self.Wd_emb.T) + self.bd_out)

    def output_nce(self, pre_activ, y, y_hat):
        # returns a (timestep, bs, pos + neg) matrix (very small)
        target_embedding = self.Wd_emb[y]
        # ^ target embedding is (timestep x bs, rankdim)
        noise_embedding = self.Wd_emb[y_hat]
        # ^ noise embedding is (10, timestep x bs, rankdim)

        # pre_activ is (timestep x bs x rankdim)
        pos_scores = (target_embedding * pre_activ).sum(2)
        neg_scores = (noise_embedding * pre_activ).sum(3)

        pos_scores += self.bd_out[y]
        neg_scores += self.bd_out[y_hat]

        pos_noise = self.parent.t_noise_probs[y] * 10
        neg_noise = self.parent.t_noise_probs[y_hat] * 10

        pos_scores = -T.log(T.nnet.sigmoid(pos_scores - T.log(pos_noise)))
        neg_scores = -T.log(1 - T.nnet.sigmoid(neg_scores -
                                               T.log(neg_noise))).sum(0)
        return pos_scores + neg_scores

    def build_decoder(self,
                      hs,
                      x,
                      xmask=None,
                      y=None,
                      y_neg=None,
                      mode=EVALUATION,
                      prev_hd=None,
                      step_num=None):
        # Check parameter consistency
        if mode == Decoder.EVALUATION or mode == Decoder.NCE:
            assert not prev_hd
            assert y
        else:
            assert not y
            assert prev_hd

        # if mode == EVALUATION
        #   xd = (timesteps, batch_size, qdim)
        #
        # if mode != EVALUATION
        #   xd = (n_samples, dim)
        xd = self.approx_embedder(x)
        if not xmask:
            xmask = T.neq(x, self.eoq_sym)

        # we must zero out the </s> embedding
        # i.e. the embedding x_{-1} is the 0 vector
        # as well as hd_{-1}, which will be reset in the scan functions
        if xd.ndim != 3:
            assert mode != Decoder.EVALUATION
            xd = (xd.dimshuffle((1, 0)) * xmask).dimshuffle((1, 0))
        else:
            assert mode == Decoder.EVALUATION or mode == Decoder.NCE
            xd = (xd.dimshuffle((2, 0, 1)) * xmask).dimshuffle((1, 2, 0))

        # Run the decoder
        if mode == Decoder.EVALUATION or mode == Decoder.NCE:
            hd_init = T.alloc(np.float32(0), x.shape[1], self.qdim)
        else:
            hd_init = prev_hd

        if self.query_step_type == "gated":
            f_dec = self.gated_step
            o_dec_info = [hd_init, None, None, None]
        else:
            f_dec = self.plain_step
            o_dec_info = [hd_init]

        # If the mode of the decoder is EVALUATION
        # then we evaluate by default all the sentence
        # xd - i.e. xd.ndim == 3, xd = (timesteps, batch_size, qdim)
        if mode == Decoder.EVALUATION or mode == Decoder.NCE:
            _res, _ = theano.scan(f_dec,
                              sequences=[xd, xmask, hs],\
                              outputs_info=o_dec_info)
        # else we evaluate only one step of the recurrence using the
        # previous hidden states and the previous computed hierarchical
        # states.
        else:
            _res = f_dec(xd, xmask, hs, prev_hd)

        if isinstance(_res, list) or isinstance(_res, tuple):
            hd = _res[0]
        else:
            hd = _res

        # if we are using selective bias, we should update our hs
        # to the step-selective hs
        pre_activ = self.build_output_layer(hs, xd, hd)

        # EVALUATION  : Return target_probs + all the predicted ranks
        # target_probs.ndim == 3
        if mode == Decoder.EVALUATION:
            target_probs = GrabProbs(self.output_softmax(pre_activ), y)
            return target_probs, hd, _res
        elif mode == Decoder.NCE:
            return self.output_nce(pre_activ, y, y_neg), hd
        # BEAM_SEARCH : Return output (the softmax layer) + the new hidden states
        elif mode == Decoder.BEAM_SEARCH:
            return self.output_softmax(pre_activ), hd
        # SAMPLING    : Return a vector of n_sample from the output layer
        #                 + log probabilities + the new hidden states
        elif mode == Decoder.SAMPLING:
            outputs = self.output_softmax(pre_activ)
            if outputs.ndim == 1:
                outputs = outputs.dimshuffle('x', 0)
            sample = self.trng.multinomial(pvals=outputs,
                                           dtype='int64').argmax(axis=-1)
            if outputs.ndim == 1:
                sample = sample[0]
            log_prob = -T.log(T.diag(outputs.T[sample]))
            return sample, log_prob, hd

    def sampling_step(self, *args):
        args = iter(args)

        # Arguments that correspond to scan's "sequences" parameteter:
        step_num = next(args)
        assert step_num.ndim == 0

        # Arguments that correspond to scan's "outputs" parameteter:
        prev_word = next(args)
        assert prev_word.ndim == 1

        # skip the previous word log probability
        log_prob = next(args)
        assert log_prob.ndim == 1

        prev_h = next(args)
        assert prev_h.ndim == 2

        prev_hs = next(args)
        assert prev_hs.ndim == 2

        prev_hd = next(args)
        assert prev_hd.ndim == 2

        # When we sample we shall recompute the encoder for one step...
        encoder_args = dict(prev_hs=prev_hs, prev_h=prev_h)
        h, hs = self.parent.encoder.build_encoder(prev_word, **encoder_args)

        assert h.ndim == 2
        assert hs.ndim == 2

        # ...and decode one step.
        sample, log_prob, hd = self.build_decoder(hs,
                                                  prev_word,
                                                  prev_hd=prev_hd,
                                                  step_num=step_num,
                                                  mode=Decoder.SAMPLING)

        assert sample.ndim == 1
        assert log_prob.ndim == 1
        assert hd.ndim == 2

        return [sample, log_prob, h, hs, hd]

    def build_sampler(self, n_samples, n_steps):
        # For the naive sampler, the states are:
        # 1) a vector [</q>] * n_samples to seed the sampling
        # 2) a vector of [ 0. ] * n_samples for the log_probs
        # 3) prev_h hidden layers
        # 4) prev_hs hidden layers
        # 5) prev_hd hidden layers
        states = [
            T.alloc(np.int64(self.eoq_sym), n_samples),
            T.alloc(np.float32(0.), n_samples),
            T.alloc(np.float32(0.), n_samples, self.qdim),
            T.alloc(np.float32(0.), n_samples, self.sdim),
            T.alloc(np.float32(0.), n_samples, self.qdim)
        ]
        outputs, updates = theano.scan(
            self.sampling_step,
            outputs_info=states,
            sequences=[T.arange(n_steps, dtype='int64')],
            n_steps=n_steps,
            name="sampler_scan")
        # Return sample, log_probs and updates (for the trng multinomial)
        return (outputs[0], outputs[1]), updates

    def gated_step(self, xd_t, m_t, hs_t, hd_tm1):
        if m_t.ndim >= 1:
            m_t = m_t.dimshuffle(0, 'x')

        hd_tm1 = (m_t) * hd_tm1 + (
            1 - m_t) * T.tanh(T.dot(hs_t, self.Wd_s_0) + self.bd_s_0)
        # ^ iff x_{t - 1} = </s> (m_t = 0) then x_{t - 1} = 0
        # and hd_{t - 1} = tanh(W_s_0 hs_t + bd_s_0) else hd_{t - 1} is left unchanged (m_t = 1)

        # In the 'all' decoder bias type each hidden state of the decoder
        # RNN receives the hs_t vector as bias without modification
        if self.decoder_bias_type == 'all':

            rd_t = T.nnet.sigmoid(
                T.dot(xd_t, self.Wd_in_r) + T.dot(hd_tm1, self.Wd_hh_r) +
                T.dot(hs_t, self.Wd_s_r) + self.bd_r)
            zd_t = T.nnet.sigmoid(
                T.dot(xd_t, self.Wd_in_z) + T.dot(hd_tm1, self.Wd_hh_z) +
                T.dot(hs_t, self.Wd_s_z) + self.bd_z)
            hd_tilde = self.query_rec_activation(T.dot(xd_t, self.Wd_in) \
                                        + T.dot(rd_t * hd_tm1, self.Wd_hh) \
                                        + T.dot(hs_t, self.Wd_s_q) \
                                        + self.bd_hh)
            hd_t = (np.float32(1.) - zd_t) * hd_tm1 + zd_t * hd_tilde
            output = (hd_t, rd_t, zd_t, hd_tilde)

        else:
            # Do not bias the whole decoder (forces the useful information to be stored in the first state)
            rd_t = T.nnet.sigmoid(
                T.dot(xd_t, self.Wd_in_r) + T.dot(hd_tm1, self.Wd_hh_r) +
                self.bd_r)
            zd_t = T.nnet.sigmoid(
                T.dot(xd_t, self.Wd_in_z) + T.dot(hd_tm1, self.Wd_hh_z) +
                self.bd_z)
            hd_tilde = self.query_rec_activation(T.dot(xd_t, self.Wd_in) \
                                        + T.dot(rd_t * hd_tm1, self.Wd_hh) \
                                        + self.bd_hh)
            hd_t = (np.float32(1.) - zd_t) * hd_tm1 + zd_t * hd_tilde
            output = (hd_t, rd_t, zd_t, hd_tilde)
        return output

    def plain_step(self, xd_t, m_t, hs_t, hd_tm1):
        if m_t.ndim >= 1:
            m_t = m_t.dimshuffle(0, 'x')

        # We already assume that xd are zeroed out
        hd_tm1 = (m_t) * hd_tm1 + (
            1 - m_t) * T.tanh(T.dot(hs_t, self.Wd_s_0) + self.bd_s_0)
        # ^ iff x_{t - 1} = </s> (m_t = 0) then x_{t - 1} = 0
        # and hd_{t - 1} = tanh(W_s_0 hs_t + bd_s_0), else hd_{t - 1} is left unchanged (m_t = 1)

        if self.decoder_bias_type == 'first':
            # Do not bias the whole decoder (forces the useful information to be stored in the first state)
            hd_t = self.query_rec_activation( T.dot(xd_t, self.Wd_in) \
                                             + T.dot(hd_tm1, self.Wd_hh) \
                                             + self.bd_hh )
            output = (hd_t, )
        elif self.decoder_bias_type == 'all':
            hd_t = self.query_rec_activation( T.dot(xd_t, self.Wd_in) \
                                             + T.dot(hd_tm1, self.Wd_hh) \
                                             + T.dot(hs_t, self.Wd_s_q) \
                                             + self.bd_hh )
            output = (hd_t, )
        return output
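# A minimal sketch that isolates the sampling idiom used in build_decoder's
# SAMPLING branch: draw a one-hot row per example with multinomial(pvals=...)
# and turn it into a token index with argmax(axis=-1). The log-probability
# lookup here uses arange indexing, which is equivalent to the
# T.diag(outputs.T[sample]) trick above; names such as `probs_sym` and
# `draw_fn` are illustrative only.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

trng = MRG_RandomStreams(1234)
probs_sym = T.fmatrix('probs')  # (n_samples, vocab), rows sum to 1
token = trng.multinomial(pvals=probs_sym, dtype='int64').argmax(axis=-1)
log_prob = -T.log(probs_sym[T.arange(probs_sym.shape[0]), token])
draw_fn = theano.function([probs_sym], [token, log_prob])

rows = np.asarray([[0.05, 0.90, 0.05],
                   [0.30, 0.30, 0.40]], dtype='float32')
print(draw_fn(rows))  # e.g. [array([1, 2]), array([0.105..., 0.916...])]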
Exemplo n.º 40
0
class RNNsearch(model):
    '''
		The attention-based NMT model
	'''
    def __init__(self, config, name='', fls=None):
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.fls = fls
        #print(self.fls)
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def sampling_step(self, state, prev, context):
        '''
			Build the computational graph which samples the next word.

			:type state: theano variables
			:param state: the previous hidden state

			:type prev: theano variables
			:param prev: the last generated word

			:type context: theano variables
			:param context: the context vectors.
		'''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs,
                                       dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
			Build the decoder graph for sampling.

			:type state_init: theano variables
			:param state_init: the initial state of decoder

			:type c: theano variables
			:param c: the context vectors

			:type length: int
			:param length: the limitation of sample length

			:type n_samples: int
			:param n_samples: the number of samples
		'''

        state = tensor.repeat(state_init, n_samples,
                              axis=0)  # copy state n times
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        y_idx = tensor.arange(samples.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + samples.flatten()
        #probs = probs.flatten()[y_idx]
        #probs = probs.reshape(samples.shape)
        return samples, probs, updates

    def build(self, verbose=False):
        '''
			Build the computational graph.

			:type verbose: bool
			:param verbose: only set to True on visualization
		'''
        config = self.config

        # create layers
        logging.info('Initializing layers')
        self.emb_src = self.creater.createLookupTable(
            self.name + 'emb_src',
            config['num_vocab_src'],
            config['dim_emb_src'],
            offset=True)  #(input,output)-->[30000,620]
        self.emb_trg = self.creater.createLookupTable(
            self.name + 'emb_trg',
            config['num_vocab_trg'],
            config['dim_emb_trg'],
            offset=True)  #(input,output)-->[30000,620]
        self.encoderGRU = self.creater.createGRU(self.name + 'GRU_enc',
                                                 config['dim_emb_src'],
                                                 config['dim_rec_enc'],
                                                 verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(self.name +
                                                      'GRU_enc_back',
                                                      config['dim_emb_src'],
                                                      config['dim_rec_enc'],
                                                      verbose=verbose)
        self.decoderGRU = self.creater.createGRU_attention(
            self.name + 'GRU_dec',
            config['dim_emb_trg'],
            2 * config['dim_rec_enc'],
            config['dim_rec_dec'],
            config['num_vocab_trg'],
            verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer',
            config['dim_rec_enc'],
            config['dim_rec_dec'],
            offset=True)

        if self.fls:
            #print("loaded feature")
            fl_weight = []
            for fl in self.fls:
                fl_weight.append(fl.feature_weight)
                #logging.info("sen weight")
                #print(fl.feature_weight)
            fl_weight = numpy.concatenate(fl_weight)
            self.feature_weight = theano.shared(fl_weight.astype('float32'),
                                                name="feature_weight")
            self.creater.params += [self.feature_weight]
            self.feature_weight_dim = self.feature_weight.dimshuffle(
                'x', 0)  # adds a broadcastable leading axis: (n,) --> (1, n)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix(
            'x_mask', dtype='float32')  # size: (length, batchsize)
        self.y = tensor.matrix('y', dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix(
            'y_mask', dtype='float32')  # size: (length, batchsize)

        if 'MRT' in config and config['MRT'] is True:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y, self.ymask, self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [self.x, self.xmask, self.y, self.ymask]

        if config['PR']:
            self.ans = tensor.scalar('ans', dtype='int64')
            self.features = tensor.matrix('features', dtype='float32')
            self.inputs += [self.features, self.ans]

        # create computational graph for training
        logging.info('Building computational graph')
        # ----encoder-----
        emb = self.emb_src.forward(
            self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb,
            self.x.shape[0],
            batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]  # only hiddens
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate(
            (context_forward, context_backward),
            axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(
            self.y.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb,
            self.y.shape[0],
            self.context,
            state_init=self.state_init,
            batch_size=self.y.shape[1],
            mask=self.ymask,
            cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)
        # compute costs and grads
        y_idx = tensor.arange(self.y.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + self.y.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y.shape[0], self.y.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)
        if 'MRT' in config and config['MRT'] is True:
            self.cost_per_sample = self.cost.sum(axis=0)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        elif config['PR'] and self.fls:
            # calculate p
            self.cost_per_sample = self.cost.sum(axis=0)
            self.cost_per_sample *= config['alpha_PR']
            cost_min = self.cost_per_sample - self.cost_per_sample.min()
            probs = tensor.exp(-cost_min)
            log_probs = -cost_min - tensor.log(probs.sum())
            probs /= probs.sum()
            self.probs = log_probs
            # calculate q
            energy_q = self.features * self.feature_weight_dim
            energy_q = energy_q.sum(axis=1)
            self.energy_q = energy_q
            energy_q_min = energy_q - energy_q.max()
            probs_q = tensor.exp(energy_q_min)
            log_probs_q = energy_q_min - tensor.log(probs_q.sum())

            probs_q /= probs_q.sum()
            self.probs_q = log_probs_q
            # calculate KL divergence
            cost_KL = tensor.exp(log_probs_q) * (log_probs_q - log_probs)
            self.cost_KLs = cost_KL
            self.cost_KL = cost_KL.sum()
            self.cost_NMT = self.cost_per_sample[self.ans]
            self.cost = config['lambda_PR'] * self.cost_KL + config[
                'lambda_MLE'] * self.cost_NMT
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(
            self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        self.samples, self.probs_sample, self.updates_sample = self.decode_sample(
            state_init_sample, context_sample, self.length_sample,
            self.n_samples)

        # parameter for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
			Encode source sentence to context vector.
		'''
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
			Get the embedding of target sentence.
		'''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
			Get the initial decoder hidden state with context vector.
		'''
        if not hasattr(self, "get_initer"):
            # `context_backward` is undefined in this scope in the original;
            # it presumably meant the backward encoder state, i.e. the second
            # half of the first context timestep, as in build()
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[
                    self.initer.forward(
                        self.context[0, :, self.config['dim_rec_enc']:])
                ])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
			Encode source sentence to context vectors and get the initial decoder hidden state.
		'''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
			Get the probability of the next target word.
		'''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[self.context_decode, self.state_decode,
                        self.emb_decode],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, emb):
        '''
			Get the next hidden state.
		'''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[self.c_decode, self.state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
			Get the negative log-likelihood of parallel sentences.
		'''
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
			Get sampling results.
		'''
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
			Get the attention weight of parallel sentences.
		'''
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
			Get the hidden states essential for visualization
		'''
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                        tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = [
            'h', 'gate', 'reset', 'state', 'reseted', 'state_in', 'gate_in',
            'reset_in'
        ]
        dec_names = [
            'h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate', 'reset_cin',
            'reset_preactive', 'reset', 'state_cin', 'reseted',
            'state_preactive', 'state'
        ]
        dec_names += [
            'outenergy', 'state_in', 'gate_in', 'reset_in', 'state_in_prev',
            'readout', 'maxout', 'outenergy_1', 'outenergy_2'
        ]
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i in range(len(layers)):
            if value_name[i] != '':
                result[value_name[i]] = layers[i]
        return result
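# A small plain-numpy sketch of the flattened-index trick used in build() and
# decode_sample() above: y_idx = arange(N) * num_vocab + y picks, for each
# position, the probability assigned to the reference word out of the flattened
# softmax matrix. The array values below are illustrative only.
import numpy as np

num_vocab = 4
softmax = np.asarray([[0.1, 0.2, 0.3, 0.4],
                      [0.7, 0.1, 0.1, 0.1]], dtype='float32')  # (N, num_vocab)
y = np.asarray([3, 0], dtype='int64')                          # reference words
y_idx = np.arange(y.shape[0]) * num_vocab + y
picked = softmax.flatten()[y_idx]     # -> [0.4, 0.7]
cost = -np.log(picked).sum()          # negative log-likelihood, as in build()
print(picked, cost)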
Exemplo n.º 41
0
class Categorical(Distribution):
    def __init__(self, dim):
        self._dim = dim
        self._srng = RandomStreams()

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * A
        return TT.sum(
            old_prob_var *
            (TT.log(old_prob_var + TINY) - TT.log(new_prob_var + TINY)),
            axis=-1)

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(old_prob *
                      (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
                      axis=-1)

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars,
                             new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        x_var = TT.cast(x_var, 'float32')
        # Assume layout is N * A
        return (TT.sum(new_prob_var * x_var, axis=-1) +
                TINY) / (TT.sum(old_prob_var * x_var, axis=-1) + TINY)

    def entropy(self, info):
        probs = info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=1)

    def entropy_sym(self, dist_info_vars):
        prob_var = dist_info_vars["prob"]
        return -TT.sum(prob_var * TT.log(prob_var + TINY), axis=1)

    def log_likelihood_sym(self, x_var, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * A
        return TT.log(
            TT.sum(probs * TT.cast(x_var, 'float32'), axis=-1) + TINY)

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * A
        n = probs.shape[0]
        return np.log(probs[np.arange(n), from_onehot(np.asarray(xs))] + TINY)

    def sample_sym(self, dist_info):
        probs = dist_info["prob"]
        return self._srng.multinomial(pvals=probs, dtype='uint8')

    @property
    def dist_info_keys(self):
        return ["prob"]
Exemplo n.º 42
0
class ParticleFilter():
	''' Implements particle filtering and smoothing for Markov Chains
	 with arbitrary proposal/true distributions '''
	
	def __init__(self, transition_model, observation_model, n_particles, observation_input=None, n_history=1):
		
		self.transition_model=transition_model
		self.observation_model=observation_model
		self.data_dims=observation_model.output_dims
		self.state_dims=transition_model.output_dims
		self.n_particles=n_particles
		self.n_history=n_history
		
		#this is used to keep track of what set of particles corresponds
		#to the previous point in time
		self.time_counter=theano.shared(0)
		
		self.theano_rng=RandomStreams()
		
		#init_particles=np.zeros((n_history+1, n_particles, self.state_dims)).astype(np.float32)
		init_particles=np.random.randn(n_history+1, n_particles, self.state_dims).astype(np.float32)
		init_weights=(np.ones((n_history+1, n_particles))/float(n_particles)).astype(np.float32)
		
		self.particles=theano.shared(init_particles)
		self.weights=theano.shared(init_weights)
		
		self.next_state=self.particles[(self.time_counter+1)%(self.n_history+1)]
		self.current_state=self.particles[self.time_counter%(self.n_history+1)]
		self.previous_state=self.particles[(self.time_counter-1)%(self.n_history+1)]
		
		self.next_weights=self.weights[(self.time_counter+1)%(self.n_history+1)]
		self.current_weights=self.weights[self.time_counter%(self.n_history+1)]
		self.previous_weights=self.weights[(self.time_counter-1)%(self.n_history+1)]
		
		self.proposal_distrib=None
		
		self.true_log_transition_probs=self.transition_model.rel_log_prob
		self.true_log_observation_probs=self.observation_model.rel_log_prob
		
		self.perform_inference=None
		self.resample=None
		self.sample_joint=None
		
		self.observation_input=observation_input
		
		ess=self.compute_ESS()
		self.get_ESS=theano.function([],ess)
		
		n_samps=T.lscalar()
		n_T=T.lscalar()
		data_samples, state_samples, init_state_samples, data_sample_updates=self.sample_future(n_samps,n_T)
		self.sample_from_future=theano.function([n_samps, n_T],[data_samples,state_samples,init_state_samples],updates=data_sample_updates)
		
		self.get_current_particles=theano.function([],self.current_state)
		self.get_current_weights=theano.function([],self.current_weights)
		
	
	def recompile(self):
		'''This function compiles each of the theano functions that might
		change following a change of the model. '''
		
		samp_updates=self.sample_update(self.observation_input)
		self.perform_inference=theano.function([],updates=samp_updates)
		
		res_updates=self.resample_update()
		self.resample=theano.function([],updates=res_updates)
		
		nsamps=T.lscalar()
		joint_samples, joint_updates=self.sample_from_joint(nsamps)
		self.sample_joint=theano.function([nsamps],joint_samples,updates=joint_updates)
		
		new_ess, stddevhist, esshist, sr_updates=self.sequential_resample()
		self.perform_sequential_resampling=theano.function([],[new_ess,stddevhist,esshist],updates=sr_updates)
		
		csamps=self.sample_current(nsamps)
		self.sample_current_state=theano.function([nsamps],csamps)
		
		psamps=self.sample_prev(nsamps)
		self.sample_previous_state=theano.function([nsamps],psamps)
		
		return
	
	
	def set_proposal(self, proposal_distrib):
		
		self.proposal_distrib=proposal_distrib
		
		return
	
	
	def set_true_log_transition_probs(self, true_log_transition_probs):
		
		self.true_log_transition_probs=true_log_transition_probs
		return
	
	
	def set_true_log_observation_probs(self, true_log_observation_probs):
		
		self.true_log_observation_probs=true_log_observation_probs
		return
	
	
	def sample_update(self, data):
		
		proposal_samples, log_proposal_probs=self.proposal_distrib
		
		printing=False
		
		if printing:
			log_transition_probs=theano.printing.Print('1 log transition probs update')(self.true_log_transition_probs(self.current_state, proposal_samples))
			log_observation_probs=theano.printing.Print('2 log observation probs update')(self.true_log_observation_probs(proposal_samples, data.dimshuffle('x',0)))
			log_unnorm_weights=theano.printing.Print('3 log unnorm weights update')(log_transition_probs + log_observation_probs - log_proposal_probs)
			log_unnorm_weights_center=theano.printing.Print('4 log unnorm weights center update')(log_unnorm_weights-T.max(log_unnorm_weights))
			unnorm_weights=theano.printing.Print('5 unnorm weights update')(T.exp(log_unnorm_weights_center)*self.current_weights)
			normalizer=theano.printing.Print('6 normalizer update')(T.sum(unnorm_weights))
		else:
			log_transition_probs=self.true_log_transition_probs(self.current_state, proposal_samples)
			log_observation_probs=self.true_log_observation_probs(proposal_samples, data.dimshuffle('x',0))
			log_unnorm_weights=log_transition_probs + log_observation_probs - log_proposal_probs
			log_unnorm_weights_center=log_unnorm_weights-T.max(log_unnorm_weights)
			unnorm_weights=T.exp(log_unnorm_weights_center)*self.current_weights
			normalizer=T.sum(unnorm_weights)

		
		weights=unnorm_weights/normalizer
		
		updates=OrderedDict()
		
		updates[self.weights]=T.set_subtensor(self.next_weights, weights)
		
		updates[self.particles]=T.set_subtensor(self.next_state, proposal_samples)
		
		updates[self.time_counter]=self.time_counter+1
		
		return updates
	
	
	def compute_ESS(self):
		
		return 1.0/T.sum(self.current_weights**2)
	
	
	def resample_update(self):
		
		#shape: n_particles by n_particles
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),self.n_particles,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		updates=OrderedDict()
		updates[self.particles]=T.set_subtensor(self.current_state, self.current_state[idxs])
		updates[self.weights]=T.set_subtensor(self.current_weights, T.cast(T.ones_like(self.current_weights)/float(self.n_particles),'float32'))
		return updates
	
	
	def sample_step(self, future_samps, t, n_samples):
		
		particles_now=self.particles[(self.time_counter-t)%(self.n_history+1)]
		weights_now=self.weights[(self.time_counter-t)%(self.n_history+1)]
		
		#n_particles by n_samples
		rel_log_probs=self.true_log_transition_probs(particles_now, future_samps, all_pairs=True)
		
		unnorm_probs=T.exp(rel_log_probs)*weights_now.dimshuffle(0,'x')
		probs=unnorm_probs/T.sum(unnorm_probs, axis=0).dimshuffle('x',0)
		
		samps=self.theano_rng.multinomial(pvals=probs.T)
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		output_samples=particles_now[idxs]
		
		return [output_samples, t+1]
	
	
	def sample_from_joint(self, n_samples, output_2D=False):
		'''Samples from the joint posterior P(s_t-n_history:s_t | observations)
		n_samples: the number of samples to draw
		
		Returns an array with shape (n_history+1, n_samples, state_dims),
		where array[-1] corresponds to the current time.
		'''
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samps_t0=self.current_state[idxs]
		
		t0=T.as_tensor_variable(1)
		
		[samples, ts], updates = theano.scan(fn=self.sample_step,
											outputs_info=[samps_t0, t0],
											non_sequences=[n_samples],
											n_steps=self.n_history)
		
		#the variable "samples" that results from the scan is time-flipped
		#in the sense that samples[0] corresponds to the most recent point
		#in time, and higher indices correspond to points in the past.
		#I will stick to the convention that for any collection of points in 
		#time, [-1] will index the most recent time, and [0] will index
		#the point farthest in the past. So, the first axis of "samples" 
		#needs to be flipped.
		flip_idxs=T.cast(-T.arange(self.n_history)+self.n_history-1,'int64')
		samples=T.concatenate([samples[flip_idxs], samps_t0.dimshuffle('x',0,1)], axis=0)
		
		if output_2D:
			samples=T.reshape(samples, ((self.n_history+1)*n_samples, self.state_dims))
		
		return samples, updates
	
	
	def sample_future(self, n_samples, n_T):
		'''Samples from the "future" data distribution: 
				P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)
		
		n_samples: number of samples to draw
		n_T: the number of (future) time points to sample from
		
		Returns three arrays. The first two have shapes 
		(n_T, n_samples, data_dims) and
		(n_T, n_samples, state_dims),
		corresponding to samples of future observations and states,
		and the third having size (n_samples,state_dims),
		corresponding to the "initial" samples taken from the current
		state distribution.
		'''
		
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samps_t0=self.current_state[idxs]
		
		def fstep(states):
			next_states=self.transition_model.get_samples_noprobs(states)
			next_data=self.observation_model.get_samples_noprobs(next_states)
			return next_states, next_data
		
		[state_samples, data_samples], updates = theano.scan(fn=fstep,
											outputs_info=[samps_t0, None],
											n_steps=n_T)
		
		#data_samples=self.observation_model.get_samples_noprobs(state_samples)
		
		return data_samples, state_samples, samps_t0, updates
	
	
	def sample_model(self, n_samples, n_T):
		'''Samples from the "future" data distribution: 
				P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)
		
		n_samples: number of samples to draw
		n_T: the number of (future) time points to sample from
		
		Returns three arrays: the observation sampled at the final step,
		with shape (n_samples, data_dims), and the state samples at the
		final and next-to-final steps, each with shape
		(n_samples, state_dims).
		'''
		
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samps_t0=self.current_state[idxs]
		
		state_samples, updates = theano.scan(fn=self.transition_model.get_samples_noprobs,
											outputs_info=[samps_t0],
											n_steps=n_T)
		
		data_sample=self.observation_model.get_samples_noprobs(state_samples[-1])
		
		return data_sample, state_samples[-1], state_samples[-2], updates
	
	
	def sr_step(self, means, weights, stddev, ess, decay):
		
		#Sampling from a mixture of gaussians
		msamps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(weights.dimshuffle('x',0),means.shape[0],axis=0))
		idxs=T.cast(T.dot(msamps, T.arange(means.shape[0])),'int64')
		sample_means=T.cast(means[idxs],'float32')
		
		proposal_samples=self.theano_rng.normal(size=means.shape)*stddev.dimshuffle('x',0)+sample_means
		diffs=proposal_samples.dimshuffle(0,'x',1)-sample_means.dimshuffle('x',0,1)
		
		printing=False
		if printing:
			log_proposal_probs=theano.printing.Print('1 log_proposal_probs')(T.log(T.dot(T.exp(-T.sum((1.0/(2.0*stddev**2)).dimshuffle('x','x',0)*diffs**2,axis=2)),weights)))
			log_transition_probs=theano.printing.Print('2 log transition probs')(self.true_log_transition_probs(self.previous_state, proposal_samples, all_pairs=True))
			log_transition_probs_2=theano.printing.Print('3 log transition probs 2')(T.log(T.dot(T.exp(log_transition_probs).T,self.previous_weights)))
			log_observation_probs=theano.printing.Print('4 log observation probs')(self.true_log_observation_probs(proposal_samples, self.observation_input.dimshuffle('x',0)))
			log_unnorm_weights=theano.printing.Print('5 log unnorm weights nomax')(log_transition_probs_2 + log_observation_probs - log_proposal_probs)
			log_unnorm_weights=theano.printing.Print('6 log unnorm weights')(log_unnorm_weights-T.max(log_unnorm_weights))
			unnorm_weights=theano.printing.Print('7 unnorm weights')(T.exp(log_unnorm_weights))
			normalizer=theano.printing.Print('8 normalizer')(T.sum(unnorm_weights))
		else:
			log_proposal_probs=T.log(T.dot(T.exp(-T.sum((1.0/(2.0*stddev**2)).dimshuffle('x','x',0)*diffs**2,axis=2)),weights))
			log_transition_probs=self.true_log_transition_probs(self.previous_state, proposal_samples, all_pairs=True)
			log_transition_probs=T.log(T.dot(T.exp(log_transition_probs).T,self.previous_weights))
			log_observation_probs=self.true_log_observation_probs(proposal_samples, self.observation_input.dimshuffle('x',0))
			log_unnorm_weights=log_transition_probs + log_observation_probs - log_proposal_probs
			log_unnorm_weights=log_unnorm_weights-T.max(log_unnorm_weights)
			unnorm_weights=T.exp(log_unnorm_weights)
			normalizer=T.sum(unnorm_weights)

		
		new_weights=unnorm_weights/normalizer
		
		new_ess=1.0/T.sum(new_weights**2)
		
		sampmean=T.dot(proposal_samples.T, new_weights)
		sampvar=T.dot(((proposal_samples-sampmean.dimshuffle('x',0))**2).T,new_weights)
		#propmean=T.mean(proposal_samples, axis=0)
		#propvar=T.mean((proposal_samples-propmean.dimshuffle('x',0))**2,axis=0)
		#new_stddev=stddev*T.clip(T.exp(decay*(1.0-propvar/sampvar)),0.5,2.0)
		#new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,2.0),0.0,4.0)
		new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,1.5),0.0,4.0)
		return [proposal_samples, new_weights, new_stddev, T.cast(new_ess,'float32')]#, theano.scan_module.until(new_ess>100)
	
	
	def sequential_resample(self, init_stddev=4.0, max_steps=20, stddev_decay=0.1):
		'''Repeatedly resamples and then samples from a proposal distribution
		constructed from the current samples. Should be used when the main
		proposal distribution is poor or whenever the ESS is poor.
		'''
		
		essT=T.as_tensor_variable(np.asarray(0.0,dtype='float32'))
		stddevT=T.as_tensor_variable(np.asarray(init_stddev*np.ones(self.state_dims),dtype='float32'))
		decayT=T.as_tensor_variable(np.asarray(stddev_decay,dtype='float32'))
		
		[samphist, weighthist, stddevhist, esshist], updates = theano.scan(fn=self.sr_step,
				outputs_info=[self.current_state, self.current_weights, stddevT, essT],
				non_sequences=decayT,
				n_steps=max_steps)
		
		end_samples=samphist[-1]
		end_weights=weighthist[-1]
		
		updates[self.particles]=T.set_subtensor(self.current_state, end_samples)
		updates[self.weights]=T.set_subtensor(self.current_weights, end_weights)
		return 1.0/T.sum(end_weights**2), stddevhist, esshist, updates
	
	
	def sample_current(self, nsamps):
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),nsamps,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samples=self.current_state[idxs]
		return samples
	
	
	def sample_prev(self, nsamps):
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.previous_weights.dimshuffle('x',0),nsamps,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samples=self.previous_state[idxs]
		return samples
	
	
	def get_history(self):
		'''This function returns a 3-D array containing all the particles
		and a 2-D array of weights for the entire memory. The first dimension indexes
		time, with the zeroth entry corresponding to the earliest point in 
		memory.'''
		idxs=(T.arange(self.n_history+1)-self.n_history+self.time_counter)%(self.n_history+1)
		return self.particles[idxs], self.weights[idxs]
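# A minimal sketch that isolates the resampling idiom used throughout the
# ParticleFilter class: tile the weight vector into an (n_draws, n_particles)
# pvals matrix, draw one-hot rows with multinomial, and convert them to
# particle indices with a dot against arange(n_particles). The names `w_sym`,
# `n_draws` and `resample_fn` are illustrative only.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(2014)
w_sym = T.fvector('weights')      # normalized particle weights
n_draws = T.lscalar('n_draws')    # how many indices to draw
pvals = T.extra_ops.repeat(w_sym.dimshuffle('x', 0), n_draws, axis=0)
onehot = rng.multinomial(pvals=pvals)  # (n_draws, n_particles) one-hot rows
idxs = T.cast(T.dot(onehot, T.arange(w_sym.shape[0])), 'int64')
resample_fn = theano.function([w_sym, n_draws], idxs)

weights = np.asarray([0.7, 0.2, 0.1], dtype='float32')
print(resample_fn(weights, 5))    # e.g. [0 0 1 0 2]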
Exemplo n.º 43
0
class SLmodel():

    #This version adapts the proposal distribution by keeping a running
    #estimate of the exact posterior covariance, parametrized as the
    #matrix CC'

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):

        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')

        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)

        #square root of covariance matrix of proposal distribution
        #initialized to the true root covariance
        init_cov_inv = np.dot(
            init_W.T, init_W) / (xvar**2) + np.eye(ns) * np.exp(-init_b)
        init_cov = spla.inv(init_cov_inv)
        init_C = spla.sqrtm(init_cov)
        init_C = np.asarray(np.real(init_C), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.C = theano.shared(init_C)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in the particle filter

        #for ease of use and efficient computation (these are used a lot)
        self.CCT = T.dot(self.C, self.C.T)
        self.cov_inv = T.dot(
            self.W.T, self.W) / (self.xvar**2) + T.eye(ns) * T.exp(-self.b)

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        self.rel_lrates = np.asarray([0.1, 1.0, 1.0, 10.0, 1.0, 1.0],
                                     dtype='float32')

        self.meta_params = [self.C]
        self.meta_rel_lrates = [1.0]

    def sample_proposal_s(self, s, h, xp):

        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        mean_term = T.dot(
            (xp - self.c), self.W) / (self.xvar**2) + s_pred * T.exp(-self.b)
        prop_mean = T.dot(mean_term, self.CCT)

        s_prop = prop_mean + T.dot(n, self.C)

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):

        #this function takes an np by ns matrix of s samples
        #and returns an np by nh matrix of h probabilities

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))

        #re-centering for numerical stability
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def forward_filter_step(self, xp):

        #need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 *
                                                                  self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights

        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def proposal_loss(self, C):

        #calculates how far off self.CCT is from the true posterior covariance
        CCT = T.dot(C, C.T)
        prod = T.dot(CCT, self.cov_inv)
        diff = prod - T.eye(self.ns)
        tot = T.sum(T.sum(diff**2))  #frobenius norm

        return tot

    def prop_update_step(self, C_now, lr):

        loss = self.proposal_loss(C_now)
        gr = T.grad(loss, C_now)
        return [C_now - lr * gr]

    def update_proposal_distrib(self, n_steps, lr):

        #does some gradient descent on self.C, so that self.CCT becomes
        #closer to the true posterior covariance
        C0 = self.C
        Cs, updates = theano.scan(fn=self.prop_update_step,
                                  outputs_info=[C0],
                                  non_sequences=[lr],
                                  n_steps=n_steps)

        updates[self.C] = Cs[-1]

        loss = self.proposal_loss(Cs[-1])

        #updates={}
        #updates[self.C]=self.prop_update_step(self.C,lr)
        #loss=self.proposal_loss(self.C)

        return loss, updates

    def get_prediction(self, s, h):

        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns
        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):

        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def calc_mean_h_energy(self, s, h):

        #you give this function a set of samples of s and h,
        #it gives you the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  #np by nh

        energies = T.sum(h * exp_terms, axis=1) - T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1
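        #for one-hot h this is exactly log softmax(exp_terms)[h], i.e. the
        #log-probability of the sampled h given s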

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):

        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred = self.get_prediction(s1_samps, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm2 + sterm

        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')

        return energy, updates

    def get_ESS(self):
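        #Kish effective sample size: equals npcl when all weights are equal
        #and approaches 1 as the weight distribution degenerates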

        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):

        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):

        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):

        s = T.reshape(s, (1, self.ns))
        #get h probabilities
        h_probs = self.calc_h_probs(s)

        #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):

        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
Exemplo n.º 44
0
class LatentPolicy(BaseNNModule):

    # The policy network takes three inputs and produces a single
    # system action embedding. Its use is heavily coupled with the decoder.

    def __init__(self, latent_size, learn_mode, belief_size, degree_size,
                 ihidden_size, ohidden_size, tfEncoder, tbEncoder, sfEncoder,
                 sbEncoder):

        # latent variable dimension
        self.dl = latent_size
        hidden_size = 100
        # set default sampling mode: posterior, from all actions
        if learn_mode == 'rl':
            self.setSampleMode('prior', 5)
        else:
            self.setSampleMode('posterior', latent_size)

        # random seed
        self.srng = RandomStreams(seed=234)

        # decoder input embedding
        self.Wd1 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (latent_size,hidden_size)).astype(theano.config.floatX))
        self.Wd2 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (latent_size,hidden_size)).astype(theano.config.floatX))
        self.bd1 = theano.shared(2. * np.ones(
            (hidden_size)).astype(theano.config.floatX))
        self.Wd3 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (hidden_size*2,ohidden_size)).astype(theano.config.floatX))
        # for state construction
        # belief to state
        self.Ws1 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (belief_size,hidden_size)).astype(theano.config.floatX))
        # matching degree to state
        self.Ws2 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (degree_size,hidden_size)).astype(theano.config.floatX))
        # intent to state
        self.Ws3 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (ihidden_size*2,hidden_size)).astype(theano.config.floatX))
        # latent policy parameterisation, state -> action
        self.Wp1 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (hidden_size,hidden_size)).astype(theano.config.floatX))
        self.Wp2 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (hidden_size,latent_size)).astype(theano.config.floatX))
        self.bp1 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (hidden_size)).astype(theano.config.floatX))
        # prior parameters P(z_t|S_t) and P(R_t|z_t)
        self.params = [
            self.Wd1, self.Wd2, self.bd1, self.Wd3, self.Ws1, self.Ws2,
            self.Ws3, self.Wp1, self.Wp2, self.bp1
        ]

        # approximated posterior parameters Q(z_t|S_t,R_t)
        # sentence encoders
        self.sfEncoder, self.sbEncoder = sfEncoder, sbEncoder
        self.tfEncoder, self.tbEncoder = tfEncoder, tbEncoder
        # belief to posterior
        self.Wq1 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (belief_size,hidden_size)).astype(theano.config.floatX))
        # matching degree to posterior
        self.Wq2 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (degree_size,hidden_size)).astype(theano.config.floatX))
        # intent to posterior
        self.Wq3 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (ihidden_size*2,hidden_size)).astype(theano.config.floatX))
        # response to posterior
        self.Wq4 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (ihidden_size*2,hidden_size)).astype(theano.config.floatX))
        # MLP 2nd layer
        self.Wq5 = theano.shared(0.3 * np.random.uniform(-1.0,1.0,\
                (hidden_size,latent_size)).astype(theano.config.floatX))
        #posterior parameters Q(z_t|S_t,R_t)
        self.Qparams = [self.Wq1, self.Wq2, self.Wq3, self.Wq4, self.Wq5]
        self.Qparams.extend(self.tfEncoder.params + self.tbEncoder.params +
                            self.sfEncoder.params + self.sbEncoder.params)
        # add posterior also into parameter set
        self.params.extend(self.Qparams)

        # Reinforce baseline
        self.baseline = ReinforceBaseline(belief_size, degree_size,
                                          ihidden_size)

    def setSampleMode(self, sample_mode, topN):
        self.sample_mode = sample_mode
        self.topN = topN

    def encode(self,
               belief_t,
               degree_t,
               intent_t,
               masked_source_t,
               masked_source_len_t,
               masked_target_t,
               masked_target_len_t,
               utt_group_t,
               sample_t=None):

        # prepare belief state vector
        belief_t = G.disconnected_grad(T.concatenate(belief_t, axis=0))
        ##########################
        # prior parameterisation #
        ##########################
        hidden_t = T.tanh(
            T.dot(belief_t, self.Ws1) + T.dot(degree_t, self.Ws2) +
            T.dot(intent_t, self.Ws3))
        prior_t = T.nnet.softmax(
            T.dot(T.tanh(T.dot(hidden_t, self.Wp1) + self.bp1), self.Wp2))

        ##############################
        # posterior parameterisation #
        ##############################
        # response encoding
        target_intent_t = bidirectional_encode(self.tfEncoder, self.tbEncoder,
                                               masked_target_t,
                                               masked_target_len_t)
        source_intent_t = bidirectional_encode(self.sfEncoder, self.sbEncoder,
                                               masked_source_t,
                                               masked_source_len_t)
        # scores before softmax layer
        q_logit_t = T.dot(
            T.tanh(
                T.dot(belief_t, self.Wq1) + T.dot(degree_t, self.Wq2) +
                T.dot(source_intent_t, self.Wq3) +
                T.dot(target_intent_t, self.Wq4)), self.Wq5)

        # sampling from a scaled posterior
        if self.sample_mode == 'posterior':
            print '\t\tSampling from posterior ...'
            posterior_t = T.nnet.softmax(q_logit_t)
            z_t = T.switch(
                T.lt(utt_group_t, self.dl - 1), utt_group_t,
                G.disconnected_grad(
                    T.argmax(
                        self.srng.multinomial(pvals=posterior_t,
                                              dtype='float32')[0])))
        else:
            # choose to use the current sample or ground truth
            print '\t\tSampling from prior ...'
            z_t = T.switch(T.lt(utt_group_t, self.dl - 1), utt_group_t,
                           sample_t)

        # put sample into decoder to decode
        hidden_t = T.nnet.sigmoid(self.Wd2[z_t, :] + self.bd1) * hidden_t
        actEmb_t = T.tanh(
            T.dot(T.concatenate([T.tanh(self.Wd1[z_t, :]), hidden_t], axis=0),
                  self.Wd3)).dimshuffle('x', 0)

        # return the true posterior
        posterior_t = T.nnet.softmax(q_logit_t)

        # compute baseline estimate
        b_t = self.baseline.encode(belief_t, degree_t, source_intent_t,
                                   target_intent_t)

        return actEmb_t, prior_t[0], posterior_t[0], z_t, b_t, posterior_t

    def decide(self,
               belief_t,
               degree_t,
               intent_t,
               masked_source_t,
               masked_target_t,
               forced_sample=None):
        # prepare belief state vector
        belief_t = np.concatenate(belief_t, axis=0)
        # sample how many actions
        n = 1
        # forced sampling
        if forced_sample is not None:
            z_t = [forced_sample]
            prob_t = None
        # different sampling mode
        elif self.sample_mode == 'posterior' and masked_target_t is not None:
            # training time, sample from posterior
            z_t, prob_t = self._sample_from_posterior(belief_t, degree_t,
                                                      intent_t,
                                                      masked_source_t,
                                                      masked_target_t)
        elif self.sample_mode == 'prior':
            # testing time, sample from prior
            z_t, prob_t = self._sample_from_prior(belief_t, degree_t, intent_t)

        # state representation
        hidden_t = tanh(
            np.dot(belief_t, self.Ws1_backup) +
            np.dot(degree_t, self.Ws2_backup) +
            np.dot(intent_t, self.Ws3_backup))

        # put sample into decoder to decode
        hidden_t = np.multiply(
            sigmoid(self.Wd2_backup[z_t, :] + self.bd1_backup), hidden_t)
        hidden_t = np.repeat(hidden_t, n, axis=0)
        actEmb_t = tanh(
            np.dot(
                np.concatenate([tanh(self.Wd1_backup[z_t, :]), hidden_t],
                               axis=1), self.Wd3_backup))

        return actEmb_t, z_t, prob_t

    def _sample_from_prior(self, belief_t, degree_t, intent_t):

        # prior parameterisation
        hidden_t = tanh(
            np.dot(belief_t, self.Ws1_backup) +
            np.dot(degree_t, self.Ws2_backup) +
            np.dot(intent_t, self.Ws3_backup))
        p_logit_t = np.dot(
            tanh(np.dot(hidden_t, self.Wp1_backup) + self.bp1_backup),
            self.Wp2_backup)

        # sampling from prior
        sortedIndex = np.argsort(p_logit_t)[::-1][:self.topN]
        topN_prior_t = softmax(p_logit_t[sortedIndex])
        z_t = sortedIndex[np.argmax(
            np.random.multinomial(n=1, pvals=topN_prior_t))]
        z_t = np.expand_dims(z_t, axis=0)
        # choose the top N samples
        print 'Sample     : %s' % z_t
        print 'Prior dist.: %s' % sortedIndex
        print 'probability: %s' % topN_prior_t
        print
        return z_t, softmax(p_logit_t)

    def _sample_from_posterior(self, belief_t, degree_t, intent_t,
                               masked_source_t, masked_target_t):

        # Posterior
        # response encoding
        target_intent_t = bidirectional_read(self.tfEncoder, self.tbEncoder,
                                             masked_target_t)
        source_intent_t = bidirectional_read(self.sfEncoder, self.sbEncoder,
                                             masked_source_t)
        # posterior parameterisation
        q_logit_t = np.dot(
            tanh(
                np.dot(belief_t, self.Wq1_backup) +
                np.dot(degree_t, self.Wq2_backup) +
                np.dot(source_intent_t, self.Wq3_backup) +
                np.dot(target_intent_t, self.Wq4_backup)), self.Wq5_backup)

        # sampling from a scaled posterior
        sortedIndex = np.argsort(q_logit_t)[::-1][:self.topN]
        topN_posterior_t = softmax(q_logit_t[sortedIndex])
        z_t = sortedIndex[np.argmax(
            np.random.multinomial(n=1, pvals=topN_posterior_t))]
        #z_t = sortedIndex[0]
        z_t = np.expand_dims(z_t, axis=0)
        print sortedIndex[:3]
        print softmax(q_logit_t)[sortedIndex][:3]
        print 'Posterior  : %s' % sortedIndex
        print 'probability: %s' % topN_posterior_t

        return z_t, softmax(q_logit_t)

    def loadConverseParams(self):
        # decoder
        self.Wd1_backup = self.params[0].get_value()
        self.Wd2_backup = self.params[1].get_value()
        self.bd1_backup = self.params[2].get_value()
        self.Wd3_backup = self.params[3].get_value()
        # state
        self.Ws1_backup = self.params[4].get_value()
        self.Ws2_backup = self.params[5].get_value()
        self.Ws3_backup = self.params[6].get_value()
        # latent policy (conditional prior)
        self.Wp1_backup = self.params[7].get_value()
        self.Wp2_backup = self.params[8].get_value()
        self.bp1_backup = self.params[9].get_value()
        # posterior
        self.Wq1_backup = self.params[10].get_value()
        self.Wq2_backup = self.params[11].get_value()
        self.Wq3_backup = self.params[12].get_value()
        self.Wq4_backup = self.params[13].get_value()
        self.Wq5_backup = self.params[14].get_value()
        # posterior sentence encoder
        self.tfEncoder.loadConverseParams()
        self.tbEncoder.loadConverseParams()
        self.sfEncoder.loadConverseParams()
        self.sbEncoder.loadConverseParams()
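
# A hedged usage sketch (comments only, since the encoder/baseline classes
# needed to construct a LatentPolicy are not shown in this listing): at
# conversation time one would typically call
#   policy.loadConverseParams()          # snapshot the *_backup numpy weights
#   policy.setSampleMode('prior', 5)     # sample z_t from the top-5 prior
#   actEmb_t, z_t, prob_t = policy.decide(belief_t, degree_t, intent_t,
#                                         masked_source_t, masked_target_t)
# where `policy` is a hypothetical LatentPolicy instance and the remaining
# arguments are the numpy inputs described in decide() above.
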
class SCLmodel():
	
	#This class defines the switched constrained linear model, which was
	#designed to eliminate state-space 'explosions' that can occur when
	#doing prediction - a serious issue in the basic SL model
	
	def __init__(self, nx, ns, nh, npcl, xvar=1.0):
		
		#for this model I assume one linear generative model and a 
		#combination of nh linear dynamical models
		
		#generative matrix
		init_W=np.asarray(np.random.randn(nx,ns)/10.0,dtype='float32')
		#init_W=np.asarray(np.eye(2),dtype='float32')
		
		#always normalize the columns of W to be unit length
		init_W=init_W/np.sqrt(np.sum(init_W**2,axis=0))
		
		
		#observed variable means
		init_c=np.asarray(np.zeros(nx),dtype='float32')
		
		#dynamical matrices
		init_M=np.asarray(np.random.randn(nh,ns**2)/2.0,dtype='float32')
		
		#state-variable variances
		#(covariance matrix of state variable noise assumed to be diagonal)
		init_b=np.asarray(np.ones(ns)*10.0,dtype='float32')
		
		#means for switching variable
		init_mu=np.asarray(np.random.randn(nh,ns)/1.0,dtype='float32')
		
		#(natural log of) covariance matrices for switching variable
		#I assume the covariance matrices to be diagonal, so I 
		#store all the diagonal elements in an nh-by-ns matrix
		init_A=np.asarray(np.zeros((nh,ns)),dtype='float32')
		
		init_s_now=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_h_now=np.asarray(np.zeros((npcl,nh)),dtype='float32')
		init_h_now[:,0]=1.0
		init_weights_now=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		init_s_past=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_h_past=np.asarray(np.zeros((npcl,nh)),dtype='float32')
		init_h_past[:,0]=1.0
		init_weights_past=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		
		
		self.W=theano.shared(init_W)
		self.c=theano.shared(init_c)
		self.M=theano.shared(init_M)
		self.b=theano.shared(init_b)
		self.A=theano.shared(init_A)
		self.mu=theano.shared(init_mu)
		
		#I define these to avoid repeated computations of the exponential
		#of the elements of A and of the normalizing constants for each h
		self.exp_A=T.exp(self.A)
		self.ln_Z_h=T.reshape(0.5*T.sum(self.A, axis=1), (nh,1))
		
		
		self.s_now=theano.shared(init_s_now)
		self.h_now=theano.shared(init_h_now)
		self.weights_now=theano.shared(init_weights_now)
		
		self.s_past=theano.shared(init_s_past)
		self.h_past=theano.shared(init_h_past)
		self.weights_past=theano.shared(init_weights_past)
		
		self.xvar=np.asarray(xvar,dtype='float32')
		
		self.nx=nx		#dimensionality of observed variables
		self.ns=ns		#dimensionality of latent variables
		self.nh=nh		#number of (linear) dynamical modes
		self.npcl=npcl	#number of particles in particle filter
		
		self.theano_rng = RandomStreams()
		
		self.params=				[self.W, self.M, self.b, self.A, self.c, self.mu]
		self.rel_lrates=np.asarray([  1.0,    1.0,    0.01,   1.0,   1.0,    10.0]   ,dtype='float32')
	
	
	def sample_proposal_s(self, s, h, xpred, sig):
		
		s_pred=self.get_prediction(s, h)
		
		n=self.theano_rng.normal(size=T.shape(s))
		
		#This is the proposal distribution that arises when one assumes that W'W=I
		
		mean=2.0*(xpred+s_pred*(self.b**2))*sig
		
		s_prop=mean+n*T.sqrt(sig)
		
		#I compute the term inside the exponent for the pdf of the proposal distrib
		prop_term=-T.sum(n**2)/2.0
		
		return T.cast(s_prop,'float32'), T.cast(s_pred,'float32'), T.cast(prop_term,'float32')
	
	
	#This function is required if we allow multiple generative models
	
	#def get_recon(self, s, h):
		
		#W_vec=T.sum(self.W*h, axis=0)
		#W=W.reshape((self.nx, self.ns))
		
		#xr=T.dot(W, s)
		
		#return xr
	
	
	def one_h_prob(self, exp_A_i, mu_i, s):
		
		#scan function for self.calc_h_probs
		smi=s-mu_i   #should be np by ns
		smia=smi*T.reshape(exp_A_i,(1,self.ns))
		gaussian_term=-T.sum(smia*smi,axis=1)
		return gaussian_term
	
	
	def calc_h_probs(self, s):
		
		#gterms, updates = theano.scan(fn=self.one_h_prob,
									#outputs_info=[None],
									#sequences=[self.exp_A, self.mu],
									#non_sequences=[s],
									#n_steps=self.nh)
		#vectorized version
		t1=T.dot(s*s,self.exp_A.T)
		t2=-2.0*T.dot(s, (self.exp_A*self.mu).T)
		t3=T.sum((self.mu*self.mu)*self.exp_A,axis=1)
		gterms=(t1+t2+t3).T
		
		#gterms should be nh by np
		
		#need to multiply by relative partition functions
		exp_terms=gterms+self.ln_Z_h
		
		#re-centering for numerical stability
		exp_terms_recentered=exp_terms-T.max(exp_terms)
		
		#exponentiation and normalization
		rel_probs=T.exp(exp_terms_recentered)
		probs=rel_probs/T.sum(rel_probs, axis=0)
		
		return probs
	
		
	
	def forward_filter_step(self, xp):
		
		#need to sample from the proposal distribution first
		
		#these terms are the same for every particle
		xpred=T.dot(self.W.T,(xp-self.c))/(2.0*self.xvar**2)
		sig=(1.0/(self.b**2+1.0/(2.0*self.xvar**2)))/2.0
		
		[s_samps, s_pred, prop_terms], updates = theano.scan(fn=self.sample_proposal_s,
										outputs_info=[None, None, None],
										sequences=[self.s_now, self.h_now],
										non_sequences=[xpred, sig],
										n_steps=self.npcl)
		
		#now that we have samples from the proposal distribution, we need to reweight them
		
		#would use this if we have multiple generative models
		#recons, updates = theano.scan(fn=get_recon,
										#outputs_info=[None],
										#sequences=[s_samps, h_samps],
										#n_steps=self.npcl)
		
		#this loops over every row of A and mu to calculate relative h probabilities
		#for each particle
		
		h_probs = self.calc_h_probs(s_samps)
		
		h_samps=self.theano_rng.multinomial(pvals=h_probs.T)
		
		recons=T.dot(self.W, s_samps.T) + T.reshape(self.c,(self.nx,1))
		
		x_terms=-T.sum((recons-T.reshape(xp,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)
		s_terms=-T.sum(((s_samps-s_pred)*self.b)**2,axis=1)
		
		energies=x_terms+s_terms-prop_terms
		
		#to avoid exponentiating large or very small numbers, I 
		#"re-center" the reweighting factors by adding a constant, 
		#as this has no impact on the resulting new weights
		
		energies_recentered=energies-T.max(energies)
		
		alpha=T.exp(energies_recentered) #these are the reweighting factors
		
		new_weights_unnorm=self.weights_now*alpha
		normalizer=T.sum(new_weights_unnorm)
		new_weights=new_weights_unnorm/normalizer  #need to normalize new weights
		
		updates[self.h_past]=T.cast(self.h_now,'float32')
		updates[self.s_past]=T.cast(self.s_now,'float32')
		
		updates[self.h_now]=T.cast(h_samps,'float32')
		updates[self.s_now]=T.cast(s_samps,'float32')
		
		updates[self.weights_past]=T.cast(self.weights_now,'float32')
		updates[self.weights_now]=T.cast(new_weights,'float32')
		
		#return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
		#return normalizer, energies_recentered, updates
		return h_samps, updates
		
	
	def get_prediction(self, s, h):
		
		M_vec=T.sum(self.M*T.reshape(h,(self.nh,1)),axis=0)
		M=M_vec.reshape((self.ns,self.ns))
		
		sp=T.dot(M, s)
		
		return T.cast(sp,'float32')
	
	
	def sample_joint(self, sp):
		
		t2_samp=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s2_samp=T.cast(T.sum(self.s_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		h2_samp=T.cast(T.sum(self.h_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		
		diffs=self.b*(s2_samp-sp)
		sqr_term=T.sum(diffs**2,axis=1)
		alpha=T.exp(-sqr_term)
		probs_unnorm=self.weights_past*alpha
		probs=probs_unnorm/T.sum(probs_unnorm)
		
		t1_samp=self.theano_rng.multinomial(pvals=T.reshape(probs,(1,self.npcl))).T
		s1_samp=T.cast(T.sum(self.s_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		h1_samp=T.cast(T.sum(self.h_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		
		return [s1_samp, h1_samp, s2_samp, h2_samp]
	
	
	#def sample_posterior(self, n_samps):
		
		
		#sp, updates = theano.scan(fn=self.get_prediction,
									#outputs_info=[None],
									#sequences=[self.s_past, self.h_past],
									#n_steps=self.npcl)
		
		##sp should be np by ns
		
		
		#[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
									#outputs_info=[None, None, None, None],
									#non_sequences=[sp],
									#n_steps=n_samps)
		
		#return [s1_samps, h1_samps, s2_samps, h2_samps]
	
	
	def h_energy_step(self, s, h):
		
		#helper function for self.calc_s_energy
		
		exp_A_i=T.reshape(T.sum(self.exp_A*T.reshape(h,(self.nh,1)),axis=0),(self.ns,1))
		mu_i=T.reshape(T.sum(self.mu*T.reshape(h,(self.nh,1)),axis=0), (self.ns,1))
		ln_Z_h_i=T.sum(self.ln_Z_h*T.reshape(h,(self.nh,1)))
		diff=T.reshape(T.reshape(s,(self.ns,1))-mu_i,(self.ns,1))
		diff_dot_exp_A_i=diff*exp_A_i
		gterm=-T.sum(T.sum(diff_dot_exp_A_i*diff))
		energy=gterm+ln_Z_h_i
		
		
		return energy
	
	
	def calc_mean_h_energy(self, s, h, nsamps):
		
		#you give this function a set of samples of s and h,
		#it gives you the average energy of those samples
		
		energies, updates = theano.scan(fn=self.h_energy_step,
									outputs_info=[None],
									sequences=[s, h],
									n_steps=nsamps)
		
		
		energy=T.mean(energies)
		
		return energy
	
	
	def update_params(self, x1, x2, n_samps, lrate):
		
		#this function samples from the joint posterior and performs
		# a step of gradient ascent on the log-likelihood
		
		sp, updates = theano.scan(fn=self.get_prediction,
									outputs_info=[None],
									sequences=[self.s_past, self.h_past],
									n_steps=self.npcl)
									
		#sp should be np by ns
		
		
		[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
									outputs_info=[None, None, None, None],
									non_sequences=[sp],
									n_steps=n_samps)
		
		
		
		x1_recons=T.dot(self.W, s1_samps.T) + T.reshape(self.c,(self.nx,1))
		x2_recons=T.dot(self.W, s2_samps.T) + T.reshape(self.c,(self.nx,1))
		
		s_pred, updates = theano.scan(fn=self.get_prediction,
									outputs_info=[None],
									sequences=[s1_samps, h1_samps],
									n_steps=n_samps)
		
		
		hterm1=self.calc_mean_h_energy(s1_samps, h1_samps, n_samps)
		hterm2=self.calc_mean_h_energy(s2_samps, h2_samps, n_samps)
		
		sterm=-T.mean(T.sum((self.b*(s2_samps-s_pred))**2,axis=1))
		
		xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		xterm2=-T.mean(T.sum((x2_recons-T.reshape(x2,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		
		energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm
		
		gparams=T.grad(energy, self.params, consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])
		
		# constructs the update dictionary
		for gparam, param, rel_lr in zip(gparams, self.params, self.rel_lrates):
			#gnat=T.dot(param, T.dot(param.T,param))
			updates[param] = T.cast(param + gparam*lrate*rel_lr,'float32')
		
		
		#make sure W has unit-length columns
		#new_W=updates[self.W]
		#updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')
		
		#MIGHT NEED TO NORMALIZE A
		
		
		return energy, updates
		
	
	def get_ESS(self):
		
		return 1.0/T.sum(self.weights_now**2)
	
	
	def resample_step(self):
		
		idx=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s_samp=T.sum(self.s_now*T.addbroadcast(idx,1),axis=0)
		h_samp=T.sum(self.h_now*T.addbroadcast(idx,1),axis=0)
		
		return T.cast(s_samp,'float32'), T.cast(h_samp,'float32')
	
	
	def resample(self):
		
		[s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
												outputs_info=[None, None],
												n_steps=self.npcl)
		
		updates[self.s_now]=T.cast(s_samps,'float32')
		updates[self.h_now]=T.cast(h_samps,'float32')
		updates[self.weights_now]=T.cast(T.ones_like(self.weights_now)/T.cast(self.npcl,'float32'),'float32') #dtype paranoia
		
		return updates
	
	
	def simulate_step(self, s):
		
		#get h probabilities
		h_probs = self.calc_h_probs(s)
		
		h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(1,self.nh)))
		
		M_vec=T.sum(self.M*T.reshape(h_samp,(self.nh,1)),axis=0)
		
		#here I use the 'mean M' by combining the M's according to their probabilities
		#M_vec=T.sum(self.M*T.reshape(hprobs,(self.nh,1)),axis=0)
		M=M_vec.reshape((self.ns,self.ns))
		
		sp=T.dot(M, s)
		
		xp=T.dot(self.W, sp) + self.c
		
		return T.cast(sp,'float32'), T.cast(xp,'float32'), h_samp
		
	
	def simulate_forward(self, n_steps):
		
		s0=T.sum(self.s_now*T.reshape(self.weights_now,(self.npcl,1)),axis=0)
		[sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
										outputs_info=[s0, None, None],
										n_steps=n_steps)
		
		return sp, xp, hs, updates
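
# A minimal, hedged usage sketch for the SCLmodel particle filter above.
# Everything in it is illustrative only: the sizes are arbitrary and the
# observations are synthetic; the class itself does not prescribe either.
import numpy as np
import theano
import theano.tensor as T

nx, ns, nh, npcl = 8, 4, 3, 100
model = SCLmodel(nx, ns, nh, npcl, xvar=1.0)

# one filtering step: feed an observation and apply the particle/weight
# updates returned by forward_filter_step
x_in = T.fvector('x_in')
_, filter_updates = model.forward_filter_step(x_in)
filter_step = theano.function([x_in], [], updates=filter_updates)

# effective sample size and resampling
get_ess = theano.function([], model.get_ESS())
do_resample = theano.function([], [], updates=model.resample())

for x in np.random.randn(50, nx).astype('float32'):
    filter_step(x)
    if get_ess() < npcl / 2.0:  # resample when the ESS degenerates
        do_resample()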
Exemplo n.º 46
0
class StochasticPoolLayer(layers.Layer):
    def __init__(self,
                 incoming,
                 ds,
                 strides=None,
                 ignore_border=False,
                 pad=(0, 0),
                 random_state=42,
                 **kwargs):
        super(StochasticPoolLayer, self).__init__(incoming, **kwargs)
        self.ds = ds
        self.ignore_border = ignore_border
        self.pad = pad
        self.st = ds if strides is None else strides
        if hasattr(random_state, 'multinomial'):
            self.rng = random_state
        else:
            self.rng = RandomStreams(seed=random_state)

    def get_output_shape_for(self, input_shape):
        output_shape = list(input_shape)  # copy / convert to mutable list
        output_shape[2] = pool_output_length(
            input_shape[2],
            ds=self.ds[0],
            st=self.st[0],
            ignore_border=self.ignore_border,
            pad=self.pad[0],
        )

        output_shape[3] = pool_output_length(
            input_shape[3],
            ds=self.ds[1],
            st=self.st[1],
            ignore_border=self.ignore_border,
            pad=self.pad[1],
        )

        return tuple(output_shape)

    def get_output_for(self, input, deterministic=False, **kwargs):
        # inspired by:
        # https://github.com/lisa-lab/pylearn2/blob/14b2f8bebce7cc938cfa93e640008128e05945c1/pylearn2/expr/stochastic_pool.py#L23
        batch, channels, nr, nc = self.input_shape
        pr, pc = self.ds
        sr, sc = self.st
        output_shape = self.get_output_shape_for(self.input_shape)
        out_r, out_c = output_shape[2:]
        # calculate shape needed for padding
        pad_shape = list(output_shape)
        pad_shape[2] = (pad_shape[2] - 1) * sr + pr
        pad_shape[3] = (pad_shape[3] - 1) * sc + pc
        # allocate a new input tensor
        padded = T.alloc(0.0, *pad_shape)
        # get padding offset
        offset_x = (pad_shape[2] - nr) // 2
        offset_y = (pad_shape[3] - nc) // 2

        padded = T.set_subtensor(
            padded[:, :, offset_x:(offset_x + nr), offset_y:(offset_y + nc)],
            input)
        window = T.alloc(0.0, batch, channels, out_r, out_c, pr, pc)
        for row_within_pool in xrange(pr):
            row_stop = (output_shape[2] - 1) * sr + row_within_pool + 1
            for col_within_pool in xrange(pc):
                col_stop = (output_shape[3] - 1) * sc + col_within_pool + 1
                # theano dark magic
                win_cell = padded[:, :, row_within_pool:row_stop:sr,
                                  col_within_pool:col_stop:sc]
                window = T.set_subtensor(
                    window[:, :, :, :, row_within_pool, col_within_pool],
                    win_cell)
        # sum across pooling regions
        norm = window.sum(axis=[4, 5])
        norm = T.switch(T.eq(norm, 0.0), 1.0, norm)
        probs = window / norm.dimshuffle(0, 1, 2, 3, 'x', 'x')

        if deterministic:
            res = (window * probs).sum(axis=[4, 5])
        else:
            prob = self.rng.multinomial(pvals=probs.reshape(
                (batch * channels * out_r * out_c, pr * pc)),
                                        dtype=theano.config.floatX)
            # double max because of grad problems
            res = (window * prob.reshape(
                (batch, channels, out_r, out_c, pr, pc))).max(axis=5).max(
                    axis=4)

        return T.cast(res, theano.config.floatX)
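
# A hedged usage sketch for StochasticPoolLayer, assuming `layers` and
# `pool_output_length` above come from Lasagne (the layer follows the
# Lasagne Layer API); the shapes and data below are purely illustrative.
import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.tensor4('x')
l_in = lasagne.layers.InputLayer(shape=(4, 3, 32, 32), input_var=x)
l_pool = StochasticPoolLayer(l_in, ds=(2, 2), random_state=42)

# stochastic pooling at training time, weighted pooling at test time
train_out = lasagne.layers.get_output(l_pool, deterministic=False)
test_out = lasagne.layers.get_output(l_pool, deterministic=True)
f_train = theano.function([x], train_out)
f_test = theano.function([x], test_out)

batch = np.random.rand(4, 3, 32, 32).astype(theano.config.floatX)
print f_train(batch).shape, f_test(batch).shape  # both (4, 3, 16, 16)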
Exemplo n.º 47
0
class MultiRTRBM(DS_MRTRBM):
    """This Class Implement the Multi-Category Recurrent Temporal RBM """
    def __init__(self,
                 input,
                 n_visible,
                 n_hidden,
                 time,
                 n_cate,
                 W=None,
                 Wt=None,
                 vbias=None,
                 hbias=None,
                 h0=None):
        # the input dimension should be (n_cate, N_sample * time * n_vis)
        self.input = input
        self.n_vis = n_visible
        self.n_hid = n_hidden
        self.time = time
        self.n_cate = n_cate
        # Define the parameter of the Machine
        if W is None:
            W = theano.shared(
                np.random.normal(size=(self.n_cate, self.n_vis,
                                       self.n_hid)).astype(
                                           theano.config.floatX))

        if vbias is None:
            vbias = theano.shared(
                np.zeros(shape=(self.n_cate, self.time,
                                self.n_vis)).astype(theano.config.floatX))

        if hbias is None:
            hbias = theano.shared(
                np.zeros(shape=(self.time,
                                self.n_hid)).astype(theano.config.floatX))

        if Wt is None:
            Wt = theano.shared(
                np.random.normal(size=(self.n_hid, self.n_hid)).astype(
                    theano.config.floatX))

        if h0 is None:
            h0 = theano.shared(
                np.zeros(shape=(1, 1,
                                self.n_hid)).astype(theano.config.floatX))
        # set parameters
        self.W = W
        self.Wt = Wt
        self.h0 = h0
        self.hbias = hbias
        self.vbias = vbias
        self.params = [self.W, self.Wt, self.h0, self.hbias, self.vbias]
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = MRG_RandomStreams(self.numpy_rng.randint(2**30))

    def h_given_h_lag_vt(self, vt, h_lag, hbias):
        if h_lag == self.h0:
            x = T.batched_dot(vt, self.W) + T.addbroadcast(
                T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0), 0, 1)
        else:
            x = T.batched_dot(vt, self.W) + \
                T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0)
        return [x, T.nnet.sigmoid(x)]

    def H_given_h_lag_vt(self, V):
        H = [self.h0]

        # [x, out], _ = theano.scan(fn=self.h_given_h_lag_vt, sequence=V,
        #                           outputs_info=[None, self.h0],
        #                          n_steps=V.shape[0])
        for t in range(self.time):
            H += [self.h_given_h_lag_vt(V[t], H[-1], self.hbias[t])[1]]
        return T.concatenate(H[1:], axis=2)

    def free_energy_given_hid_lag(self, vt, h_lag, hbias, vbias):
        if h_lag == self.h0:
            wx_b = T.batched_dot(vt, self.W) +\
                T.addbroadcast(T.dot(h_lag, self.Wt) + hbias, 0, 1)
            vbias_term = T.batched_dot(vt, vbias)
            hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
        else:
            wx_b = T.batched_dot(vt, self.W) + T.dot(h_lag, self.Wt) + \
                hbias.dimshuffle('x', 0)
            vbias_term = T.batched_dot(vt, vbias)
            hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
        return -hidden_term - vbias_term

    def free_energy_RTRBM(self, V):
        H = self.H_given_h_lag_vt(V)
        for t in range(self.time):
            if t == 0:
                Et = T.sum(self.free_energy_given_hid_lag(
                    V[t], self.h0, self.hbias[t], self.vbias[:, t, :]),
                           axis=0)
            else:
                Et += T.sum(self.free_energy_given_hid_lag(
                    V[t], H[:, :, t * (self.n_hid):(t + 1) * self.n_hid],
                    self.hbias[t], self.vbias[:, t, :]),
                            axis=0)
        return Et

    def propup_given_h_lag(self, vt, h_lag, hbias):
        if h_lag == self.h0:
            x = T.batched_dot(vt, self.W) + T.addbroadcast(
                T.dot(h_lag, self.Wt) + hbias, 0, 1)
        else:
            x = T.batched_dot(vt, self.W) + hbias + T.dot(h_lag, self.Wt)
        return [x, T.nnet.sigmoid(x)]

    def propdown_given_h_lag(self, ht, vbias):
        x = T.batched_dot(ht, self.W.dimshuffle(0, 2, 1)) + \
            vbias.dimshuffle((0, 'x', 1))
        e_x = T.exp(x - x.max(axis=0, keepdims=True))
        out = e_x / e_x.sum(axis=0, keepdims=True)
        return [x, out]

    def sample_vt_given_ht_h_lag(self, ht, vbias):
        x, out = self.propdown_given_h_lag(ht, vbias)
        v_sample = []
        for v in range(self.n_vis):
            v_sample += [
                self.theano_rng.multinomial(
                    n=1, pvals=out[:, :, v].T,
                    dtype=theano.config.floatX).dimshuffle(1, 0, 'x')
            ]
        v_sample = T.concatenate(v_sample, axis=2)
        return [x, out, v_sample]

    def sample_ht_given_vt_hid_lag(self, vt, h_lag, hbias):
        x, out = self.propup_given_h_lag(vt, h_lag, hbias)
        h_sample = self.theano_rng.binomial(n=1,
                                            p=out,
                                            size=out.shape,
                                            dtype=theano.config.floatX)
        return [x, out, h_sample]

    def gibbs_vhv_given_h_lag(self, v0, h_lag, hbias, vbias):
        xh, ph, h0 = self.sample_ht_given_vt_hid_lag(v0, h_lag, hbias)
        xv, pv, v1 = self.sample_vt_given_ht_h_lag(h0, vbias)
        return [xh, ph, h0, xv, pv, v1]

    def gibbs_VhV(self, V0):
        V = []
        H = self.H_given_h_lag_vt(V0)
        for t in range(self.time):
            if t == 0:
                V += [
                    self.gibbs_vhv_given_h_lag(
                        V0[t], self.h0, self.hbias[t],
                        self.vbias[:, t, :])[-1].dimshuffle('x', 0, 1, 2)
                ]
            else:
                V += [
                    self.gibbs_vhv_given_h_lag(
                        V0[t], H[:, :, t * self.n_hid:(t + 1) * self.n_hid],
                        self.hbias[t],
                        self.vbias[:, t, :])[-1].dimshuffle('x', 0, 1, 2)
                ]
        return T.concatenate(V, axis=0)

    def get_cost_updates(self, persistant, k=2, lr=0.01, l1=0., l2=0.01):
        chain_start = persistant
        V_burn_in, updates = theano.scan(fn=self.gibbs_VhV,
                                         outputs_info=[chain_start],
                                         n_steps=k,
                                         name='MultiRTRBM Gibbs Sampler')

        chain_end = V_burn_in[-1]
        # Contrastive Divergence (variational method cost) / approximated
        # likelihood
        L1 = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.Wt))
        L2 = T.sum(self.W**2) + T.sum(self.Wt**2)
        KL_diff = T.mean(self.free_energy_RTRBM(self.input) -
                         self.free_energy_RTRBM(chain_end)) +\
            T.cast(l1, theano.config.floatX) * L1 + \
            T.cast(l2, theano.config.floatX) * L2
        self.gparams = T.grad(KL_diff,
                              self.params,
                              consider_constant=[chain_end])
        for param, gparam in zip(self.params, self.gparams):
            if param in [self.W, self.Wt]:
                updates[param] = param - 0.0001 * gparam
            else:
                updates[param] = param - lr * gparam
        cost, updates = self.get_pseudo_likelihood_cost(updates)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')
        xi = T.round(self.input)
        fe_xi = self.free_energy_RTRBM(xi)
        # flip bit_i_idx in every category, accumulating the flips in xi_flip
        xi_flip = xi
        for k in range(self.n_cate):
            xi_flip = T.set_subtensor(xi_flip[:, k, :, bit_i_idx],
                                      1 - xi_flip[:, k, :, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy_RTRBM(xi_flip)

        # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
        cost = T.mean(self.n_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        # increment bit_i_idx % number as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_vis
        return cost, updates
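
# A hedged usage note for get_cost_updates above (comments only, since the
# exact 4-D layout of self.input is only partially documented): the
# `persistant` argument is the start of the Gibbs chain, typically a shared
# variable, e.g.
#   persistent_chain = theano.shared(
#       np.zeros(v_shape, dtype=theano.config.floatX))
#   cost, updates = rtrbm.get_cost_updates(persistent_chain, k=2, lr=0.01)
#   train_fn = theano.function([], cost, updates=updates,
#                              givens={rtrbm.input: data_batch})
# Note that the returned updates do not themselves advance persistent_chain;
# for fully persistent CD one would also map it to the end of the chain.
# `rtrbm`, `v_shape` and `data_batch` are hypothetical names.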
Exemplo n.º 48
0
class Network:
    def __init__(self, options):
        ctx_dim = options['ctx_dim']
        dim = options['dim']
        dim_word = options['dim_word']
        n_words = options['n_words']

        self.scale = 0.01
        self.Wemb = theano.shared(
            (self.scale *
             numpy.random.randn(n_words, dim_word)).astype('float32'),
            name='Wemb')
        self.trng = RandomStreams(1234)
        self.use_noise = theano.shared(numpy.float32(0.))

        self.FFInit = FFLayer(shape=[ctx_dim, ctx_dim], name='ff_init')
        self.FFState = FFLayer(shape=[ctx_dim, dim], name='ff_state')
        self.FFMemory = FFLayer(shape=[ctx_dim, dim], name='ff_memory')

        self.LSTMLayer = LSTMLayer(shape=[dim_word, dim, ctx_dim],
                                   name='decoder')

        self.FFLSTM = FFLayer(shape=[dim, dim_word], name='ff_logit_lstm')
        self.FFCtx = FFLayer(shape=[ctx_dim, dim_word], name='ff_logit_ctx')
        self.FFLogit = FFLayer(shape=[dim_word, n_words], name='ff_logit')

        self.Layers = [
            self.FFInit, self.FFState, self.FFMemory, self.LSTMLayer,
            self.FFLSTM, self.FFCtx, self.FFLogit
        ]

        self._params = sum([layer.params() for layer in self.Layers],
                           [self.Wemb])

        self.dropOutInit = DropOutLayer(self.use_noise, self.trng)
        self.dropOutLSTM = DropOutLayer(self.use_noise, self.trng)
        self.dropOutLogit = DropOutLayer(self.use_noise, self.trng)

    def params(self):
        return self._params

    def infer_init(self, ctx_mean):
        ctx_mean = self.FFInit(ctx_mean, activation='relu')
        ctx_mean = self.dropOutInit(ctx_mean)

        init_state = self.FFState(ctx_mean, activation='tanh')
        init_memory = self.FFMemory(ctx_mean, activation='tanh')

        return init_state, init_memory

    def infer_main(self,
                   ctx,
                   emb=None,
                   mask=None,
                   init_state=None,
                   init_memory=None,
                   one_step=False):

        output_state = self.LSTMLayer(emb, ctx, init_memory, init_state,
                                      one_step, mask)
        output_state_h = self.dropOutLSTM(output_state[0])
        logit = self.FFLSTM(output_state_h, activation='linear')
        # prev2out
        logit += emb
        # ctx2out
        logit += self.FFCtx(output_state[3], activation='linear')
        logit = tensor.tanh(logit)
        logit = self.dropOutLogit(logit)
        logit = self.FFLogit(logit, activation='linear')

        return output_state, logit

    def build_training_graph(self, options):
        # description string: #words x #samples,
        x = tensor.matrix('x', dtype='int64')
        mask = tensor.matrix('mask', dtype='float32')
        # context: #samples x #annotations x dim
        ctx = tensor.tensor3('ctx', dtype='float32')

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # index into the word embedding matrix, shift it forward in time
        #n_timesteps == caption length. n_samples = number of captions.
        emb = self.Wemb[x.flatten()].reshape(
            [n_timesteps, n_samples, options['dim_word']])
        emb_shifted = tensor.zeros_like(emb)
        emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
        emb = emb_shifted

        # initial state/cell [top right on page 4]
        ctx_mean = ctx.mean(1)
        init_state, init_memory = self.infer_init(ctx_mean)

        output_state, logit = self.infer_main(ctx=ctx,
                                              emb=emb,
                                              mask=mask,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              one_step=False)

        logit_shp = logit.shape
        probs = tensor.nnet.softmax(
            logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

        # Index into the computed probability to give the log likelihood
        x_flat = x.flatten()
        p_flat = probs.flatten()
        cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) *
                                  probs.shape[1] + x_flat] + 1e-8)
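        # i.e. cost[i] = -log probs[i, x_flat[i]]: row i of the flattened
        # softmax output is picked out by i * probs.shape[1] + x_flat[i]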
        cost = cost.reshape([x.shape[0], x.shape[1]])
        masked_cost = cost * mask
        cost = (masked_cost).sum(0)

        alphas = output_state[2]

        return self.use_noise, [x, mask, ctx], alphas, cost

    def infer(self):

        # context: #annotations x dim
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        x = tensor.vector('x_sampler', dtype='int64')

        # initial state/cell
        ctx_mean = ctx.mean(0)
        init_state, init_memory = self.infer_init(ctx_mean)

        f_init = TFW([ctx], {
            'context': ctx,
            'state': init_state,
            'memory': init_memory
        },
                     name='f_init',
                     profile=False)

        init_state = tensor.matrix('init_state', dtype='float32')
        init_memory = tensor.matrix('init_memory', dtype='float32')

        # for the first word (which is coded with -1), emb should be all zero
        emb = tensor.switch(x[:, None] < 0,
                            tensor.alloc(0., 1, self.Wemb.shape[1]),
                            self.Wemb[x])

        output_state, logit = self.infer_main(ctx=ctx,
                                              emb=emb,
                                              mask=None,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              one_step=True)

        next_probs = tensor.nnet.softmax(logit)
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        next_state, next_memory = output_state[0], output_state[1]

        f_next = TFW(
            [x, ctx, init_state, init_memory], {
                'probs': next_probs,
                'sample': next_sample,
                'state': next_state,
                'memory': next_memory
            },
            name='f_next',
            profile=False)

        return f_init, f_next
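
# A hedged sketch of how f_init / f_next would drive sampling (comments
# only, since the TFW wrapper and the FFLayer/LSTMLayer/DropOutLayer classes
# used above are not shown in this listing):
#   rval = f_init(ctx0)                      # ctx0: #annotations x dim
#   state, memory = rval['state'], rval['memory']
#   word = numpy.array([-1], dtype='int64')  # -1 marks the start token
#   for _ in range(max_len):
#       rval = f_next(word, ctx0, state, memory)
#       word = rval['sample']
#       state, memory = rval['state'], rval['memory']
# assuming TFW compiles a theano function whose outputs are returned as a
# dict keyed by the names given above.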
Exemplo n.º 49
0
class EncoderDecoder(object):
    def __init__(self, rng, **kwargs):
        self.n_in_src = kwargs.get('nembed_src')
        self.n_in_trg = kwargs.get('nembed_trg')
        self.n_hids_src = kwargs.get('nhids_src')
        self.n_hids_trg = kwargs.get('nhids_trg')
        self.src_vocab_size = kwargs.get('src_vocab_size')
        self.trg_vocab_size = kwargs.get('trg_vocab_size')
        self.method = kwargs.get('method')
        self.dropout = kwargs.get('dropout')
        self.maxout_part = kwargs.get('maxout_part')
        self.path = kwargs.get('saveto')
        self.clip_c = kwargs.get('clip_c')
        self.rng = rng
        self.trng = RandomStreams(rng.randint(1e5))

        # added by Zhaopeng Tu, 2016-04-29
        self.with_coverage = kwargs.get('with_coverage')
        self.coverage_dim = kwargs.get('coverage_dim')
        self.coverage_type = kwargs.get('coverage_type')
        self.max_fertility = kwargs.get('max_fertility')
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.get('with_context_gate')

        # added by Zhaopeng Tu, 2017-11-29
        self.with_layernorm = kwargs.get('with_layernorm', False)

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng,
                                     self.src_vocab_size,
                                     self.n_in_src,
                                     name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng,
                                            self.n_in_src,
                                            self.n_hids_src,
                                            self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        self.table_trg = LookupTable(self.rng,
                                     self.trg_vocab_size,
                                     self.n_in_trg,
                                     name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg, 2*self.n_hids_src, \
                               maxout_part=self.maxout_part, name='rnn_decoder', \
                               # added by Zhaopeng Tu, 2016-04-29

                               with_coverage=self.with_coverage, coverage_dim=self.coverage_dim, coverage_type=self.coverage_type, max_fertility=self.max_fertility, \
                               # added by Zhaopeng Tu, 2016-05-30

                               with_context_gate=self.with_context_gate, \
                               with_layernorm=self.with_layernorm)
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.get('with_reconstruction')
        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.get('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = Decoder(self.rng, self.n_in_src, 2*self.n_hids_src, self.n_hids_trg, \
                                   maxout_part=self.maxout_part, name='rnn_inverse_decoder', \
                                   with_layernorm=self.with_layernorm)
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng,
                self.n_in_src,
                self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, trg, trg_mask):
        annotations = self.encoder.apply(src, src_mask)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]
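        # i.e. the masked mean of the annotation vectors over source positions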

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        results = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                            mask_below=trg_mask,
                                            init_context=init_context,
                                            c=annotations,
                                            c_mask=src_mask)

        hiddens, ctxs, readout, alignment = results[:4]

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_results = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            inverse_hiddens, inverse_ctxs, inverse_readout, inverse_alignment = \
                inverse_results[:4]

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # train function
        inps = [src, src_mask, trg, trg_mask]
        outs = [train_cost]

        if self.with_layernorm:
            inps = [src, src_mask, trg, trg_mask]
            lr = T.scalar(name='lr')
            print 'Building optimizers...',
            self.train_fn, self.update_fn = adam(lr, self.params, grads, inps,
                                                 outs)
        else:
            # updates
            updates = adadelta(self.params, grads)

            # mode=theano.Mode(linker='vm') for ifelse
            # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
            self.train_fn = theano.function(inps,
                                            outs,
                                            updates=updates,
                                            name='train_function',
                                            mode=theano.Mode(linker='vm'))
            # self.train_fn = theano.function(inps, outs, updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))

    def build_sampler(self):

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None)
        #init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        init_state = self.decoder.create_init_state(init_context)

        # compile function
        print 'Building compile_init_state_and_context function ...'
        self.compile_init_and_context = theano.function(
            [x], [init_state, c], name='compile_init_and_context')
        print 'Done'

        y = T.lvector()
        cur_state = T.matrix()

        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        if self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print 'Building compile_fertility ...'
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print 'Done'
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            c=c,
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state, ctxs, alignment = results[:3]
        idx = 3
        if self.with_coverage:
            cov = results[idx]
            idx += 1

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print 'Building compile_next_state_and_probs function ...'
        inps = [y, cur_state, c]
        outs = [next_probs, next_state, next_sample, alignment]

        # added by Zhaopeng Tu, 2016-04-29
        if self.with_coverage:
            inps.append(cov_before)
            if self.coverage_type == 'linguistic':
                inps.append(fertility)
            outs.append(cov)

        # mode=theano.Mode(linker='vm') for ifelse
        # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
        self.compile_next_state_and_probs = theano.function(
            inps,
            outs,
            name='compile_next_state_and_probs',
            mode=theano.Mode(linker='vm'))
        print 'Done'

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]

            # compile function
            print 'Building compile_inverse_init_state_and_context function ...'
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print 'Done'

            src = T.lvector()
            inverse_cur_state = T.matrix()

            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                c=inverse_c,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state, inverse_ctxs, inverse_alignment = inverse_results[:3]

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs, inverse_next_energy = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print 'Building compile_inverse_next_state_and_probs function ...'
            inps = [src, trg_mask, inverse_cur_state, inverse_c]
            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample,
                inverse_alignment
            ]

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print 'Done'

    def save(self, path=None):
        if path is None:
            path = self.path
        filenpz = open(path, "w")
        val = dict([(value.name, value.get_value())
                    for index, value in enumerate(self.params)])
        logger.info("save the model {}".format(path))
        numpy.savez(path, **val)
        filenpz.close()

    def load(self, path=None):
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            for index, param in enumerate(self.params):
                logger.info('Loading {} with shape {}'.format(
                    param.name,
                    param.get_value(borrow=True).shape))
                if param.name not in val.keys():
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name,
                        param.get_value(borrow=True).shape))
                    continue
                if param.get_value().shape != val[param.name].shape:
                    logger.info("Error: model param != load param shape {} != {}".format(\
                                        param.get_value().shape, val[param.name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    param.set_value(val[param.name], borrow=True)
        else:
            logger.error("file {} does not exist".format(path))
            self.save()
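
# Standalone sketch (an addition, not taken from the class above) of the
# sampling step used in build_sampler: one multinomial draw per row of a
# softmax distribution, followed by argmax to turn the one-hot draw into a
# token index.  nstreams is passed only to silence the MRG warning.
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

trng = MRG_RandomStreams(seed=1234)
logits = T.matrix('logits')                      # (batch, vocab)
probs = T.nnet.softmax(logits)                   # rows sum to one
next_sample = trng.multinomial(pvals=probs, nstreams=30 * 256).argmax(axis=1)

f = theano.function([logits], next_sample)
print f(numpy.random.randn(2, 5).astype(theano.config.floatX))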
Exemplo n.º 50
0
def sample(p, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.multinomial(n=1, pvals=p, dtype=theano.config.floatX)
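
# Hedged usage sketch for the helper above (an addition; the snippet's own
# imports are not shown, so they are assumed here to be numpy, theano, and the
# shared_randomstreams RandomStreams backend).
import numpy as np
import theano
from theano.tensor.shared_randomstreams import RandomStreams

p = np.asarray([[0.1, 0.2, 0.7]], dtype=theano.config.floatX)
draw = theano.function([], sample(p, seed=0))
print draw()   # a (1, 3) one-hot row, e.g. [[ 0.  0.  1.]]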
Exemplo n.º 51
0
class RVal(Elem, TensorWrapped, Masked):  # random value
    def __init__(self, seed=None, **kw):
        super(RVal, self).__init__(**kw)
        if seed is None:
            seed = np.random.randint(0, 1e6)
        self.rng = RandomStreams(seed=seed)
        self.value = None

    def binomial(self, shape, n=1, p=0.5, ndim=None, dtype="int32"):
        if isinstance(shape, Elem):
            shape = shape.d
        self.value = self.rng.binomial(shape, n, p, ndim, dtype)
        return self

    def normal(self, shape, avg=0.0, std=1.0, ndim=None, dtype=None):
        if isinstance(shape, Elem):
            shape = shape.d
        self.value = self.rng.normal(shape, avg, std, ndim, dtype)
        return self

    def multinomial(self,
                    shape,
                    n=1,
                    pvals=None,
                    without_replacement=False,
                    ndim=None,
                    dtype="int32"):
        if isinstance(shape, Elem):
            shape = shape.d
        if without_replacement:
            self.value = self.rng.multinomial_wo_replacement(
                shape, n, pvals, ndim, dtype)
        else:
            self.value = self.rng.multinomial(shape, n, pvals, ndim, dtype)
        return self

    def gumbel(self, shape, eps=1e-10):
        if isinstance(shape, Elem):
            shape = shape.d
        x = self.rng.uniform(shape, 0.0, 1.0)
        self.value = -theano.tensor.log(-theano.tensor.log(x + eps) + eps)
        return self

    @property
    def d(self):
        return self.value

    @property
    def v(self):
        return self.value.eval()

    @property
    def allparams(self):
        return set()

    @property
    def allupdates(self):
        return {}

    @property
    def all_extra_outs(self):
        return {}
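
# Standalone sketch (an addition) of the Gumbel-max trick that the gumbel()
# method above is typically used for: adding -log(-log(U)) noise to logits and
# taking an argmax yields a categorical sample without calling multinomial.
# RVal's base classes (Elem, TensorWrapped, Masked) are not shown, so a plain
# RandomStreams is used here instead.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(seed=123)
k = 3                                             # number of categories
eps = 1e-10
logits = T.vector('logits')
u = rng.uniform(size=(k,), low=0.0, high=1.0)
gumbel_noise = -T.log(-T.log(u + eps) + eps)      # same expression as gumbel()
cat_sample = T.argmax(logits + gumbel_noise)

f = theano.function([logits], cat_sample)
print f(np.log(np.asarray([0.1, 0.2, 0.7], dtype=theano.config.floatX)))
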
class SCLmodel():

    #This class defines the switched constrained linear model, which was
    #designed to eliminate state-space 'explosions' that can occur when
    #doing prediction - a serious issue in the basic SL model

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):

        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')

        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        init_M = np.asarray(np.random.randn(nh, ns**2) / 2.0, dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #means for switching variable
        init_mu = np.asarray(np.random.randn(nh, ns) / 1.0, dtype='float32')

        #(natural log of) covariance matrices for switching variable
        #I assume the covariance matrices to be diagonal, so I
        #store all the diagonal elements in a ns-by-nh matrix
        init_A = np.asarray(np.zeros((nh, ns)), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.mu = theano.shared(init_mu)

        #I define these to avoid repeated computations of the exponential
        #of the elements of A and of the normalizing constants for each h
        self.exp_A = T.exp(self.A)
        self.ln_Z_h = T.reshape(0.5 * T.sum(self.A, axis=1), (nh, 1))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.mu]
        self.rel_lrates = np.asarray([1.0, 1.0, 0.01, 1.0, 1.0, 10.0],
                                     dtype='float32')

    def sample_proposal_s(self, s, h, xpred, sig):

        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        #This is the proposal distribution that arises when one assumes that W'W=I

        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig

        s_prop = mean + n * T.sqrt(sig)

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32')

    #This function is required if we allow multiple generative models

    #def get_recon(self, s, h):

    #W_vec=T.sum(self.W*h, axis=0)
    #W=W.reshape((self.nx, self.ns))

    #xr=T.dot(W, s)

    #return xr

    def one_h_prob(self, exp_A_i, mu_i, s):

        #scan function for self.calc_h_probs
        smi = s - mu_i  #should be np by ns
        smia = smi * T.reshape(exp_A_i, (1, self.ns))
        gaussian_term = -T.sum(smia * smi, axis=1)
        return gaussian_term

    def calc_h_probs(self, s):

        #gterms, updates = theano.scan(fn=self.one_h_prob,
        #outputs_info=[None],
        #sequences=[self.exp_A, self.mu],
        #non_sequences=[s],
        #n_steps=self.nh)
        #vectorized version
        t1 = T.dot(s * s, self.exp_A.T)
        t2 = -2.0 * T.dot(s, (self.exp_A * self.mu).T)
        t3 = T.sum((self.mu * self.mu) * self.exp_A, axis=1)
        gterms = -(t1 + t2 + t3).T  #negated quadratic term, matching one_h_prob above

        #gterms should be nh by np

        #need to multiply by relative partition functions
        exp_terms = gterms + self.ln_Z_h

        #re-centering for numerical stability
        exp_terms_recentered = exp_terms - T.max(exp_terms)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs / T.sum(rel_probs, axis=0)

        return probs

    def forward_filter_step(self, xp):

        #need to sample from the proposal distribution first

        #these terms are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar**2)
        sig = (1.0 / (self.b**2 + 1.0 / (2.0 * self.xvar**2))) / 2.0

        [s_samps, s_pred,
         prop_terms], updates = theano.scan(fn=self.sample_proposal_s,
                                            outputs_info=[None, None, None],
                                            sequences=[self.s_now, self.h_now],
                                            non_sequences=[xpred, sig],
                                            n_steps=self.npcl)

        #now that we have samples from the proposal distribution, we need to reweight them

        #would use this if we have multiple generative models
        #recons, updates = theano.scan(fn=get_recon,
        #outputs_info=[None],
        #sequences=[s_samps, h_samps],
        #n_steps=self.npcl)

        #this loops over every row of A and mu to calculate relative h probabilities
        #for each particle

        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs.T)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 *
                                                                  self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1)

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights

        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        return h_samps, updates

    def get_prediction(self, s, h):

        M_vec = T.sum(self.M * T.reshape(h, (self.nh, 1)), axis=0)
        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        return T.cast(sp, 'float32')

    def sample_joint(self, sp):

        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):

    #sp, updates = theano.scan(fn=self.get_prediction,
    #outputs_info=[None],
    #sequences=[self.s_past, self.h_past],
    #n_steps=self.npcl)

    ##sp should be np by ns

    #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
    #outputs_info=[None, None, None, None],
    #non_sequences=[sp],
    #n_steps=n_samps)

    #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):

        #helper function for self.calc_mean_h_energy

        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
                         (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i

        return energy

    def calc_mean_h_energy(self, s, h, nsamps):

        #you give this function a set of samples of s and h,
        #it gives you the average energy of those samples

        energies, updates = theano.scan(fn=self.h_energy_step,
                                        outputs_info=[None],
                                        sequences=[s, h],
                                        n_steps=nsamps)

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):

        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp, updates = theano.scan(fn=self.get_prediction,
                                  outputs_info=[None],
                                  sequences=[self.s_past, self.h_past],
                                  n_steps=self.npcl)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred, updates = theano.scan(fn=self.get_prediction,
                                      outputs_info=[None],
                                      sequences=[s1_samps, h1_samps],
                                      n_steps=n_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps, n_samps)
        hterm2 = self.calc_mean_h_energy(s2_samps, h2_samps, n_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1))

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm

        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):

        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):

        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):

        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):

        #get h probabilities
        h_probs = self.calc_h_probs(s)

        h_samp = self.theano_rng.multinomial(
            pvals=T.reshape(h_probs, (1, self.nh)))

        M_vec = T.sum(self.M * T.reshape(h_samp, (self.nh, 1)), axis=0)

        #an alternative would be a 'mean M', combining the M's according to their probabilities:
        #M_vec=T.sum(self.M*T.reshape(hprobs,(self.nh,1)),axis=0)
        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        xp = T.dot(self.W, sp) + self.c

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):

        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
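
# Small numerical check (an addition, not part of the class above) of the
# re-centering trick used in forward_filter_step: subtracting max(energies)
# before exponentiating avoids overflow and leaves the normalized particle
# weights unchanged.
import numpy as np

energies = np.array([-1000.0, -1001.0, -999.0])
weights_now = np.ones(3) / 3.0

alpha = np.exp(energies - energies.max())        # safe to exponentiate
new_weights = weights_now * alpha
new_weights /= new_weights.sum()
print new_weights                                # [ 0.2447...  0.0900...  0.6652...]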
Exemplo n.º 53
0
class SLmodel():

    #This is the switched conditional linear model for integrating
    #action with sensation

    def __init__(self, nx, ns, nh, na, npcl, xvar=1.0):

        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))),
                            dtype='float32')  #for state-based predictions
        init_C = np.asarray((np.tile(np.zeros((na, ns)), (1, nh))),
                            dtype='float32')  #for action-based predictions

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrices
        init_A = np.asarray(np.zeros((ns, nh)),
                            dtype='float32')  #associated with the state
        init_B = np.asarray(np.zeros((na, nh)),
                            dtype='float32')  #associated with actions

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        init_a_past = np.asarray(np.zeros((1, na)), dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.C = theano.shared(init_C)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.B = theano.shared(init_B)
        self.ph = theano.shared(init_ph)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.a_past = theano.shared(init_a_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.na = na  #dimensionality of action variables
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [
            self.W, self.M, self.C, self.b, self.A, self.B, self.c, self.ph
        ]
        self.rel_lrates = np.asarray(
            [0.1, 1.0, 1.0, 0.01, 10.0, 10.0, 0.1, 1.0], dtype='float32')

    def sample_proposal_s(self, s, a, h, xpred, sig):

        s_pred = self.get_prediction(s, a, h)

        n = self.theano_rng.normal(size=T.shape(s))

        #This is the proposal distribution that arises when one assumes that W'W=I

        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig

        s_prop = mean + n * T.sqrt(sig)

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32')

    #This function is required if we allow multiple generative models

    #def get_recon(self, s, h):

    #W_vec=T.sum(self.W*h, axis=0)
    #W=W.reshape((self.nx, self.ns))

    #xr=T.dot(W, s)

    #return xr

    def calc_h_probs(self, s, a):

        #this function takes an np by ns matrix of s samples plus
        #an action vector a
        #and returns an np by nh matrix of h probabilities (one row per particle)

        exp_terms = T.dot(s, self.A) + T.reshape(T.dot(a, self.B),
                                                 (1, self.nh)) + T.reshape(
                                                     self.ph, (1, self.nh))

        #re-centering for numerical stability
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def forward_filter_step(self, a, xp):

        #first sample from h given s and a

        h_probs = self.calc_h_probs(self.s_now, a)
        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        #need to sample from the proposal distribution
        #these terms are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar**2)
        sig = (1.0 / (self.b**2 + 1.0 / (2.0 * self.xvar**2))) / 2.0

        #sig=1.0/(self.b**2)

        #vectorized version
        s_pred = self.get_prediction(self.s_now, a, h_samps)

        n = self.theano_rng.normal(size=T.shape(self.s_now))

        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig

        #mean=s_pred  #trying out using solely predictive proposal distrib

        s_samps = mean + n * T.sqrt(sig)

        prop_terms = -T.sum(n**2, axis=1) / 2.0

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 *
                                                                  self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights

        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        updates[self.h_past] = T.cast(h_samps, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')
        updates[self.a_past] = T.cast(a, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, a, h):

        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns
        a_dot_C = T.dot(a, self.C)  #this is 1 by nh*ns
        tot = s_dot_M + a_dot_C  #should be np by nh*ns
        s_pred = T.dot(tot * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):

        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def calc_mean_h_energy(self, s, a, h):

        #you give this function a set of samples of s, a, and h,
        #it gives you the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(T.dot(a, self.B),
                                                 (1, self.nh)) + T.reshape(
                                                     self.ph,
                                                     (1, self.nh))  #np by nh

        energies = T.sum(h * exp_terms, axis=1) - T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):

        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.a_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        #assuming the stored past action applies to these joint samples
        s_pred = self.get_prediction(s1_samps, self.a_past, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, self.a_past, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm1 + xterm2 + sterm

        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):

        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):

        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):

        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s, a):

        s = T.reshape(s, (1, self.ns))
        a = T.reshape(a, (1, self.na))
        #get h probabilities
        h_probs = self.calc_h_probs(s, a)
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, a, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, a, n_steps):

        #a should be n_steps by na

        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            sequences=[a],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
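
# Standalone sketch (an addition) of the resampling pattern used in
# resample_step above: one multinomial draw over the particle weights gives a
# one-hot column which, broadcast against the particle matrix, selects a
# single particle.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(seed=7)
npcl, ns = 4, 2
s_now = theano.shared(np.random.randn(npcl, ns).astype('float32'))
weights_now = theano.shared((np.ones(npcl) / npcl).astype('float32'))

idx = rng.multinomial(pvals=T.reshape(weights_now, (1, npcl))).T   # (npcl, 1)
s_samp = T.sum(s_now * T.addbroadcast(idx, 1), axis=0)             # one particle

print theano.function([], s_samp)()
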
class SLmodel():
	
	#This is a test of my idea to adapt the proposal distribution by 
	#maximizing the entropy of the weights
	
	def __init__(self, nx, ns, nh, npcl, xvar=1.0):
		
		#for this model I assume one linear generative model and a 
		#combination of nh linear dynamical models
		
		#generative matrix
		init_W=np.asarray(np.random.randn(nx,ns)/10.0,dtype='float32')
		#init_W=np.asarray(np.eye(2),dtype='float32')
		
		#always normalize the columns of W to be unit length
		init_W=init_W/np.sqrt(np.sum(init_W**2,axis=0))
		
		#observed variable means
		init_c=np.asarray(np.zeros(nx),dtype='float32')
		
		#dynamical matrices
		#init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
		init_M=np.asarray((np.tile(np.eye(ns),(1,nh))),dtype='float32')
		
		#state-variable variances
		#(covariance matrix of state variable noise assumed to be diagonal)
		init_b=np.asarray(np.ones(ns)*10.0,dtype='float32')
		
		#Switching parameter matrix
		init_A=np.asarray(np.zeros((ns,nh)),dtype='float32')
		
		#priors for switching variable
		init_ph=np.asarray(np.zeros(nh),dtype='float32')
		
		
		#parameters for proposal distribution
		init_D=np.asarray(np.eye(ns),dtype='float32')
		init_E=np.asarray(np.random.randn(nx,ns)/100.0,dtype='float32')
		init_k=np.asarray(np.zeros(ns),dtype='float32')
		init_sig=np.asarray(np.ones(ns),dtype='float32')
		
		
		init_s_now=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_h_now=np.asarray(np.zeros((npcl,nh)),dtype='float32')
		init_h_now[:,0]=1.0
		init_weights_now=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		init_s_past=np.asarray(np.zeros((npcl,ns)),dtype='float32')
		init_h_past=np.asarray(np.zeros((npcl,nh)),dtype='float32')
		init_h_past[:,0]=1.0
		init_weights_past=np.asarray(np.ones(npcl)/float(npcl),dtype='float32')
		
		self.W=theano.shared(init_W)
		self.c=theano.shared(init_c)
		self.M=theano.shared(init_M)
		self.b=theano.shared(init_b)
		self.A=theano.shared(init_A)
		self.ph=theano.shared(init_ph)
		
		self.D=theano.shared(init_D)
		self.E=theano.shared(init_E)
		self.k=theano.shared(init_k)
		self.sig=theano.shared(init_sig)
		
		#this is to help vectorize operations
		self.sum_mat=T.as_tensor_variable(np.asarray((np.tile(np.eye(ns),nh)).T,dtype='float32'))
		
		self.s_now=theano.shared(init_s_now)
		self.h_now=theano.shared(init_h_now)
		self.weights_now=theano.shared(init_weights_now)
		
		self.s_past=theano.shared(init_s_past)
		self.h_past=theano.shared(init_h_past)
		self.weights_past=theano.shared(init_weights_past)
		
		self.xvar=np.asarray(xvar,dtype='float32')
		
		self.nx=nx		#dimensionality of observed variables
		self.ns=ns		#dimensionality of latent variables
		self.nh=nh		#number of (linear) dynamical modes
		self.npcl=npcl	#number of particles in particle filter
		
		self.theano_rng = RandomStreams()
		
		self.params=				[self.W, self.M, self.b, self.A, self.c, self.ph]
		self.rel_lrates=np.asarray([  0.1,    1.0,    0.01,   10.0,    0.1,     1.0]   ,dtype='float32')
		
		self.meta_params=     [self.D, self.E, self.k, self.sig]
		self.meta_rel_lrates=[   1.0,   1.0,     1.0,     1.0  ]
	
	
	def sample_proposal_s(self, s, h, xp):
		
		s_pred=self.get_prediction(s, h)
		
		n=self.theano_rng.normal(size=T.shape(s))
		
		prop_mean=T.dot(s_pred, self.D) + T.reshape(T.dot(xp, self.E),(1,self.ns)) + self.k
		
		s_prop=prop_mean + n*T.reshape(T.exp(self.sig/2.0),(1,self.ns))
		
		#I compute the term inside the exponent for the pdf of the proposal distrib
		prop_term=-T.sum(n**2)/2.0
		
		return T.cast(s_prop,'float32'), T.cast(s_pred,'float32'), T.cast(prop_term,'float32'), prop_mean
	
	
	def calc_h_probs(self, s):
		
		#this function takes an np by ns matrix of s samples
		#and returns an np by nh matrix of h probabilities (one row per particle)
		
		exp_terms=T.dot(s, self.A) + T.reshape(self.ph,(1,self.nh))
		
		#re-centering for numerical stability
		exp_terms_recentered=exp_terms-T.max(exp_terms,axis=1,keepdims=True)
		
		#exponentiation and normalization
		rel_probs=T.exp(exp_terms_recentered)
		probs=rel_probs.T/T.sum(rel_probs, axis=1)
		
		return probs.T
	
	
	def proposal_loss(self, s_pred, s_samps, xp, weights):
		
		#estimates the KL divergence between the proposal distribution
		#and the true posterior (minus one term, which we assume does not
		#depend on the proposal distribution).
		
		#prop_means should be symbolic variables since we need to
		#compute the derivatives of D and E through this function
		
		prop_means=T.dot(s_pred, self.D) + T.reshape(T.dot(xp, self.E),(1,self.ns)) + self.k  #np by ns
		
		diffs=(prop_means-s_samps)
		scl_diffs=diffs*T.reshape(T.exp(-self.sig),(1,self.ns))
		energies=0.5*T.sum(diffs*scl_diffs,axis=1)
		tot=T.sum(energies*weights)+0.5*T.sum(self.sig)
		return tot
	
	
	def forward_filter_step(self, xp):
		
		#need to sample from the proposal distribution first
		s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(self.s_now,self.h_now,xp)
		
		updates={}
		
		#now that we have samples from the proposal distribution, we need to reweight them
		
		h_probs = self.calc_h_probs(s_samps)
		
		h_samps=self.theano_rng.multinomial(pvals=h_probs)
		
		recons=T.dot(self.W, s_samps.T) + T.reshape(self.c,(self.nx,1))
		
		x_terms=-T.sum((recons-T.reshape(xp,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)
		s_terms=-T.sum(((s_samps-s_pred)*self.b)**2,axis=1)/2.0
		
		energies=x_terms+s_terms-prop_terms
		
		#to avoid exponentiating large or very small numbers, I 
		#"re-center" the reweighting factors by adding a constant, 
		#as this has no impact on the resulting new weights
		
		energies_recentered=energies-T.max(energies)
		
		alpha=T.exp(energies_recentered) #these are the reweighting factors
		
		new_weights_unnorm=self.weights_now*alpha
		normalizer=T.sum(new_weights_unnorm)
		new_weights=new_weights_unnorm/normalizer  #need to normalize new weights
		
		
		#gradient updates for the proposal distribution parameters
		lrate=1e-2
		
		loss=self.proposal_loss(s_pred, s_samps, xp, new_weights)
		
		gparams=T.grad(loss, self.meta_params, consider_constant=[s_pred, s_samps, xp, new_weights])
		# constructs the update dictionary
		for gparam, param, rel_lr in zip(gparams, self.meta_params, self.meta_rel_lrates):
			updates[param] = T.cast(param - gparam*lrate*rel_lr,'float32')
		
		
		updates[self.h_past]=T.cast(self.h_now,'float32')
		updates[self.s_past]=T.cast(self.s_now,'float32')
		
		updates[self.h_now]=T.cast(h_samps,'float32')
		updates[self.s_now]=T.cast(s_samps,'float32')
		
		updates[self.weights_past]=T.cast(self.weights_now,'float32')
		updates[self.weights_now]=T.cast(new_weights,'float32')
		
		#return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
		#return normalizer, energies_recentered, updates
		#return h_samps, updates
		return updates
		
	
	def get_prediction(self, s, h):
		
		s_dot_M=T.dot(s, self.M)  #this is np by nh*ns
		s_pred=T.dot(s_dot_M*T.extra_ops.repeat(h,self.ns,axis=1),self.sum_mat) #should be np by ns
		
		return T.cast(s_pred,'float32')
	
	
	def sample_joint(self, sp):
		
		t2_samp=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s2_samp=T.cast(T.sum(self.s_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		h2_samp=T.cast(T.sum(self.h_now*T.addbroadcast(t2_samp,1),axis=0),'float32')
		
		diffs=self.b*(s2_samp-sp)
		sqr_term=T.sum(diffs**2,axis=1)
		alpha=T.exp(-sqr_term)
		probs_unnorm=self.weights_past*alpha
		probs=probs_unnorm/T.sum(probs_unnorm)
		
		t1_samp=self.theano_rng.multinomial(pvals=T.reshape(probs,(1,self.npcl))).T
		s1_samp=T.cast(T.sum(self.s_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		h1_samp=T.cast(T.sum(self.h_past*T.addbroadcast(t1_samp,1),axis=0),'float32')
		
		return [s1_samp, h1_samp, s2_samp, h2_samp]
	
	
	#def sample_posterior(self, n_samps):
		
		
		#sp, updates = theano.scan(fn=self.get_prediction,
									#outputs_info=[None],
									#sequences=[self.s_past, self.h_past],
									#n_steps=self.npcl)
		
		##sp should be np by ns
		
		
		#[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
									#outputs_info=[None, None, None, None],
									#non_sequences=[sp],
									#n_steps=n_samps)
		
		#return [s1_samps, h1_samps, s2_samps, h2_samps]
	
	
	def h_energy_step(self, s, h):
		
		#helper function for self.calc_mean_h_energy
		
		exp_A_i=T.reshape(T.sum(self.exp_A*T.reshape(h,(self.nh,1)),axis=0),(self.ns,1))
		mu_i=T.reshape(T.sum(self.mu*T.reshape(h,(self.nh,1)),axis=0), (self.ns,1))
		ln_Z_h_i=T.sum(self.ln_Z_h*T.reshape(h,(self.nh,1)))
		ph_i=T.sum(self.ph*T.reshape(h,(self.nh,1)))
		diff=T.reshape(T.reshape(s,(self.ns,1))-mu_i,(self.ns,1))
		diff_dot_exp_A_i=diff*exp_A_i
		gterm=-0.5*T.sum(T.sum(diff_dot_exp_A_i*diff))
		energy=gterm+ln_Z_h_i+ph_i
		
		
		return energy
	
	
	def calc_mean_h_energy(self, s, h):
		
		#you give this function a set of samples of s and h,
		#it gives you the average energy of those samples
		
		
		exp_terms=T.dot(s, self.A) + T.reshape(self.ph,(1,self.nh))  #np by nh
		
		energies=T.sum(h*exp_terms,axis=1) - T.log(T.sum(T.exp(exp_terms),axis=1)) #should be np by 1
		
		energy=T.mean(energies)
		
		return energy
	
	
	def update_params(self, x1, x2, n_samps, lrate):
		
		#this function samples from the joint posterior and performs
		# a step of gradient ascent on the log-likelihood
		
		sp=self.get_prediction(self.s_past, self.h_past)
									
		#sp should be np by ns
		
		
		[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
									outputs_info=[None, None, None, None],
									non_sequences=[sp],
									n_steps=n_samps)
		
		
		
		x1_recons=T.dot(self.W, s1_samps.T) + T.reshape(self.c,(self.nx,1))
		x2_recons=T.dot(self.W, s2_samps.T) + T.reshape(self.c,(self.nx,1))
		
		s_pred = self.get_prediction(s1_samps, h1_samps)
		
		
		hterm1=self.calc_mean_h_energy(s1_samps, h1_samps)
		#hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)
		
		sterm=-T.mean(T.sum((self.b*(s2_samps-s_pred))**2,axis=1))/2.0
		
		#xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		xterm2=-T.mean(T.sum((x2_recons-T.reshape(x2,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
		
		#energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
		energy = hterm1 + xterm2 + sterm 
		
		gparams=T.grad(energy, self.params, consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])
		
		# constructs the update dictionary
		for gparam, param, rel_lr in zip(gparams, self.params, self.rel_lrates):
			#gnat=T.dot(param, T.dot(param.T,param))
			updates[param] = T.cast(param + gparam*lrate*rel_lr,'float32')
		
		
		#make sure W has unit-length columns
		#new_W=updates[self.W]
		#updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')
		
		#MIGHT NEED TO NORMALIZE A
		
		
		return energy, updates
		
	
	def get_ESS(self):
		
		return 1.0/T.sum(self.weights_now**2)
	
	
	def resample_step(self):
		
		idx=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s_samp=T.sum(self.s_now*T.addbroadcast(idx,1),axis=0)
		h_samp=T.sum(self.h_now*T.addbroadcast(idx,1),axis=0)
		
		return T.cast(s_samp,'float32'), T.cast(h_samp,'float32')
	
	
	def resample(self):
		
		[s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
												outputs_info=[None, None],
												n_steps=self.npcl)
		
		updates[self.s_now]=T.cast(s_samps,'float32')
		updates[self.h_now]=T.cast(h_samps,'float32')
		updates[self.weights_now]=T.cast(T.ones_like(self.weights_now)/T.cast(self.npcl,'float32'),'float32') #dtype paranoia
		
		return updates
	
	
	def simulate_step(self, s):
		
		s=T.reshape(s,(1,self.ns))
		#get h probabilities
		h_probs = self.calc_h_probs(s)
		
		#h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
		h_samp=self.theano_rng.multinomial(pvals=h_probs)
		
		sp=self.get_prediction(s,h_samp)
		
		xp=T.dot(self.W, sp.T) + T.reshape(self.c,(self.nx,1))
		
		return T.cast(sp,'float32'), T.cast(xp,'float32'), h_samp
		
	
	def simulate_forward(self, n_steps):
		
		
		s0=T.sum(self.s_now*T.reshape(self.weights_now,(self.npcl,1)),axis=0)
		s0=T.reshape(s0,(1,self.ns))
		[sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
										outputs_info=[s0, None, None],
										n_steps=n_steps)
		
		return sp, xp, hs, updates
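
# Small numerical check (an addition, not part of the class above) that the
# row-wise softmax computed in calc_h_probs is unchanged when the per-row
# maximum is subtracted first, which is why the re-centering step is safe.
import numpy as np

def row_softmax(x):
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)

exp_terms = np.array([[2.0, 5.0, 3.0],
                      [0.5, 0.1, 0.4]])
shifted = exp_terms - exp_terms.max(axis=1, keepdims=True)
print np.allclose(row_softmax(exp_terms), row_softmax(shifted))   # True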
Exemplo n.º 55
0
class SLmodel():

    #This is a test of my idea to adapt the proposal distribution by
    #maximizing the entropy of the weights

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):

        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')

        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        #parameters for proposal distribution
        init_D = np.asarray(np.eye(ns), dtype='float32')
        init_E = np.asarray(np.random.randn(nx, ns) / 100.0, dtype='float32')
        init_k = np.asarray(np.zeros(ns), dtype='float32')
        init_sig = np.asarray(np.ones(ns), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)

        self.D = theano.shared(init_D)
        self.E = theano.shared(init_E)
        self.k = theano.shared(init_k)
        self.sig = theano.shared(init_sig)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        self.rel_lrates = np.asarray([0.1, 1.0, 0.01, 10.0, 0.1, 1.0],
                                     dtype='float32')

        self.meta_params = [self.D, self.E, self.k, self.sig]
        self.meta_rel_lrates = [1.0, 1.0, 1.0, 1.0]

    def sample_proposal_s(self, s, h, xp):

        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        prop_mean = T.dot(s_pred, self.D) + T.reshape(T.dot(xp, self.E),
                                                      (1, self.ns)) + self.k

        s_prop = prop_mean + n * T.reshape(T.exp(self.sig / 2.0), (1, self.ns))

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):

        #this function takes an np by ns matrix of s samples
        #and returns an np by nh matrix of h probabilities (one row per particle)

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))

        #re-centering for numerical stability
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def proposal_loss(self, s_pred, s_samps, xp, weights):

        #estimates the KL divergence between the proposal distribution
        #and the true posterior (minus one term, which we assume does not
        #depend on the proposal distribution).

        #prop_means should be symbolic variables since we need to
        #compute the derivatives of D and E through this function

        prop_means = T.dot(s_pred, self.D) + T.reshape(T.dot(
            xp, self.E), (1, self.ns)) + self.k  #np by ns

        diffs = (prop_means - s_samps)
        scl_diffs = diffs * T.reshape(T.exp(-self.sig), (1, self.ns))
        energies = 0.5 * T.sum(diffs * scl_diffs, axis=1)
        tot = T.sum(energies * weights) + 0.5 * T.sum(self.sig)
        return tot

    def forward_filter_step(self, xp):

        #need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2, axis=0) / (2.0 *
                                                                  self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights

        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        #gradient updates for the proposal distribution parameters
        lrate = 1e-2

        loss = self.proposal_loss(s_pred, s_samps, xp, new_weights)

        gparams = T.grad(loss,
                         self.meta_params,
                         consider_constant=[s_pred, s_samps, xp, new_weights])
        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.meta_params,
                                         self.meta_rel_lrates):
            updates[param] = T.cast(param - gparam * lrate * rel_lr, 'float32')

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, h):

        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns
        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):

        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):

    #sp, updates = theano.scan(fn=self.get_prediction,
    #outputs_info=[None],
    #sequences=[self.s_past, self.h_past],
    #n_steps=self.npcl)

    ##sp should be np by ns

    #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
    #outputs_info=[None, None, None, None],
    #non_sequences=[sp],
    #n_steps=n_samps)

    #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):

        #helper function for self.calc_mean_h_energy

        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
                         (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        ph_i = T.sum(self.ph * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -0.5 * T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i + ph_i

        return energy

    def calc_mean_h_energy(self, s, h):

        #given a set of samples of s and h, this function
        #returns the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  #np by nh

        energies = T.sum(h * exp_terms, axis=1) + T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):

        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred = self.get_prediction(s1_samps, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm2 + sterm

        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):

        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):

        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):

        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):

        s = T.reshape(s, (1, self.ns))
        #get h probabilities
        h_probs = self.calc_h_probs(s)

        #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):

        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
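
# A minimal, self-contained sketch (separate from the class above) of the
# weighted-resampling idiom used in sample_joint/resample_step: draw a
# one-hot vector over particles with multinomial on the current weights,
# then use it to select a particle.  The particle count, dimensionality,
# seed and RandomStreams flavour below are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

npcl = 10  # number of particles (example value)
weights = theano.shared((np.ones(npcl) / npcl).astype('float32'))
particles = theano.shared(np.random.randn(npcl, 3).astype('float32'))

rng = RandomStreams(1234)
# pvals must be a (rows, npcl) matrix whose rows sum to one; one row -> one draw
one_hot = rng.multinomial(pvals=T.reshape(weights, (1, npcl))).T  # (npcl, 1)
picked = T.sum(particles * T.addbroadcast(T.cast(one_hot, 'float32'), 1), axis=0)

sample_particle = theano.function([], picked)
print sample_particle()  # one resampled particle, shape (3,)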
Exemplo n.º 56
0
class EncoderDecoder(object):
    def __init__(self, rng, **kwargs):
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.rng = rng
        self.trng = RandomStreams(rng.randint(1e5))

        # added by  Zhaopeng  Tu, 2016-06-09
        self.with_attention = kwargs.pop('with_attention')

        # added by Zhaopeng Tu, 2016-04-29
        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng,
                                     self.src_vocab_size,
                                     self.n_in_src,
                                     name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng,
                                            self.n_in_src,
                                            self.n_hids_src,
                                            self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        # added by Longyue
        self.encoder_hist_1 = Encoder(self.rng,
                                      self.n_in_src,
                                      self.n_hids_src,
                                      self.table_src,
                                      name='rnn_encoder_hist_1')
        self.layers.append(self.encoder_hist_1)
        self.encoder_hist_2 = Encoder(self.rng,
                                      self.n_hids_src,
                                      self.n_hids_src,
                                      self.table_src,
                                      name='rnn_encoder_hist_2')
        self.layers.append(self.encoder_hist_2)

        self.table_trg = LookupTable(self.rng,
                                     self.trg_vocab_size,
                                     self.n_in_trg,
                                     name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg,
                               2 * self.n_hids_src, self.n_hids_src,
                               # added by Zhaopeng Tu, 2016-06-09
                               with_attention=self.with_attention,
                               # added by Zhaopeng Tu, 2016-04-29
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               # added by Zhaopeng Tu, 2016-05-30
                               with_context_gate=self.with_context_gate,
                               maxout_part=self.maxout_part, name='rnn_decoder')
        self.layers.append(self.decoder)
        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')
        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.pop('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(
                self.rng, self.n_in_src, 2 * self.n_hids_src, self.n_hids_trg,
                # added by Zhaopeng Tu, 2016-06-09
                with_attention=self.with_attention,
                maxout_part=self.maxout_part, name='rnn_inverse_decoder')
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng,
                self.n_in_src,
                self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg,
                      trg_mask, ite):

        # added by Longyue
        # checked by Zhaopeng: sentence dim = n_steps, hist_len, batch_size (4, 3, 25)
        # hist = (batch_size, sent_num, sent_len) --.T-->
        # hist = (sent_len, sent_num, batch_size) --lookup table-->
        # (sent_len, sent_num, batch_size, word_emb) --reshape-->
        # (sent_len, sent_num*batch_size, word_emb) --word-level rnn-->
        # (sent_len, sent_num*batch_size, hidden_size) --reshape-->
        # (sent_len, sent_num, batch_size, hidden_size) --[-1]-->
        # (sent_num, batch_size, hidden_size) --sent-level rnn-->
        # (sent_num, batch_size, hidden_size) --[-1]-->
        # (batch_size, hidden_size) = cross-sent context vector

        annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
        annotations_1 = annotations_1[-1]  # get last hidden states
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]  # get last hidden states

        #modified by Longyue
        annotations = self.encoder.apply(src, src_mask, annotations_3)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        #added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        # modified by Longyue
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask,
            hist=annotations_3)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likelihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        # modified by Longyue
        inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]

        self.train_fn = theano.function(inps, [train_cost],
                                        updates=updates,
                                        name='train_function')
        # self.train_fn = theano.function(inps, [train_cost], updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))

    def build_sampler(self):

        # added by Longyue
        x_hist = T.ltensor3()
        x_hist_mask = T.tensor3()
        annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask)
        annotations_1 = annotations_1[-1]
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None, annotations_3)
        #init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        # added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c, annotations_3]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        print 'Building compile_init_state_and_context function ...'
        self.compile_init_and_context = theano.function(
            [x, x_hist, x_hist_mask], outs, name='compile_init_and_context')
        print 'Done'

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        # for with_attention=False
        if self.with_attention and self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print 'Building compile_fertility ...'
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print 'Done'
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        # [next_state, ctxs] = self.decoder.apply(state_below=trg_emb,
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            # added by Zhaopeng Tu, 2016-06-09
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            hist=annotations_3,  # added by Longyue
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print 'Building compile_next_state_and_probs function ...'
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)

        # added by Longyue
        inps.append(annotations_3)

        outs = [next_probs, next_state, next_sample]
        # added by Zhaopeng Tu, 2016-06-09
        if self.with_attention:
            outs.append(alignment)
            # added by Zhaopeng Tu, 2016-04-29
            if self.with_coverage:
                inps.append(cov_before)
                if self.coverage_type == 'linguistic':
                    inps.append(fertility)
                outs.append(cov)

        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs')
        print 'Done'

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            print 'Building compile_inverse_init_state_and_context function ...'
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print 'Done'

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                # added by Zhaopeng Tu, 2016-06-09
                init_context=None
                if self.with_attention else inverse_init_context,
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs, inverse_alignment = inverse_results[
                    1], inverse_results[2]
            else:
                # if with_attention=False, we always use init_context as the source representation
                inverse_ctxs = init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print 'Building compile_inverse_next_state_and_probs function ...'
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample
            ]
            # added by Zhaopeng Tu, 2016-06-09
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print 'Done'

    def save(self, path=None):
        if path is None:
            path = self.path
        filenpz = open(path, "w")
        val = dict([(value.name, value.get_value())
                    for index, value in enumerate(self.params)])
        logger.info("save the model {}".format(path))
        numpy.savez(path, **val)
        filenpz.close()

    def load(self, path=None):
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            for index, param in enumerate(self.params):
                logger.info('Loading {} with shape {}'.format(
                    param.name,
                    param.get_value(borrow=True).shape))
                if param.name not in val.keys():
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name,
                        param.get_value(borrow=True).shape))
                    continue
                if param.get_value().shape != val[param.name].shape:
                    logger.info("Error: model param != load param shape {} != {}".format(\
                                        param.get_value().shape, val[param.name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    param.set_value(val[param.name], borrow=True)
        else:
            logger.error("file {} does not exist".format(path))
            self.save()
class RNNUnidirectionalEncDec(Model):
    def __init__(self, hyperparams, encoder_vocab, decoder_vocab):
        self.hyperparams = hyperparams
        self.encoder_vocab = encoder_vocab
        self.decoder_vocab = decoder_vocab

        # hyperparams.encoder_vocab_size and hyperparams.decoder_vocab_size setting to max
        # TODO: Uncomment this and throw error
        #hyperparams.encoder_vocab_size = min(hyperparams.encoder_vocab_size, encoder_vocab.vocab_size)
        #hyperparams.decoder_vocab_size = min(hyperparams.decoder_vocab_size, decoder_vocab.vocab_size)

        # Preparing and Initializing Network Weights & Biases
        self.setup()

    # TODO: Loading and storing params
    def setup(self):
        """
        Setup the shared variables and model components
        """
        self._params = OrderedDict()

        # Encoder embeddings
        self.encoder_embeddings = Embeddings(
            'encoder_emb', self.encoder_vocab.vocab_size,
            self.hyperparams.encoder_emb_dim)
        self._params.update(self.encoder_embeddings.params())

        # Decoder embeddings
        self.decoder_embeddings = Embeddings(
            'decoder_emb',
            self.decoder_vocab.vocab_size,
            self.hyperparams.decoder_emb_dim,
            add_bos=True)
        self._params.update(self.decoder_embeddings.params())

        ################
        # Encoder Layer
        ################
        # TODO: make a different class
        if self.hyperparams.rnn_cell == 'gru':
            from ..nn.layers.gru import GRU as RNN
        elif self.hyperparams.rnn_cell == 'lstm':
            raise NotImplementedError
        else:
            logger.error("Invalid RNN Cell Type:" + self.hyperparams.rnn_cell)

        self.encoder_rnn_layer_l2r = RNN(
            name='encoder' + self.hyperparams.rnn_cell + '0_l2r',
            in_dim=self.hyperparams.encoder_emb_dim,
            num_units=self.hyperparams.encoder_units)
        self._params.update(self.encoder_rnn_layer_l2r.params())

        # Transform to prepare init state of decoder
        self.decoder_init_transform = Dense(
            name='decoder_init_transform',
            in_dim=self.hyperparams.encoder_units,
            num_units=self.hyperparams.decoder_units,
            activation=Activation.tanh)
        self._params.update(self.decoder_init_transform.params())

        ################
        # Decoder Layer
        ###############
        # TODO: make a different class
        if self.hyperparams.rnn_cell == 'gru':
            from ..nn.layers.gru import ConditionalGRU as ConditionalRNN
        elif self.hyperparams.rnn_cell == 'lstm':
            raise NotImplementedError
        else:
            logger.error("Invalid RNN Cell Type:" + self.hyperparams.rnn_cell)

        self.decoder_rnn_layer = ConditionalRNN(
            name='decoder_' + self.hyperparams.rnn_cell + '0',
            in_dim=self.hyperparams.decoder_emb_dim,
            num_units=self.hyperparams.decoder_units,
            context_dim=self.hyperparams.encoder_units)
        self._params.update(self.decoder_rnn_layer.params())

        # Read out words

        self.decoder_state_transform = Dense(
            name='decoder_state_transform',
            in_dim=self.hyperparams.decoder_units,
            num_units=self.hyperparams.decoder_emb_dim,
            activation=Activation.linear)
        self._params.update(self.decoder_state_transform.params())

        self.prev_emb_transform = Dense(
            name='prev_emb_transform',
            in_dim=self.hyperparams.decoder_emb_dim,
            num_units=self.hyperparams.decoder_emb_dim,
            activation=Activation.linear)
        self._params.update(self.prev_emb_transform.params())

        self.encoder_context_transform = Dense(
            name='encoder_context_transform',
            in_dim=self.hyperparams.encoder_units,
            num_units=self.hyperparams.decoder_emb_dim,
            activation=Activation.linear)
        self._params.update(self.encoder_context_transform.params())

        self.word_probs_transform = Dense(
            name='word_probs_transform',
            in_dim=self.hyperparams.decoder_emb_dim,
            num_units=self.decoder_vocab.vocab_size,
            activation=Activation.linear)
        self._params.update(self.word_probs_transform.params())

        # DEBUG
        #for k, v in self._params.iteritems():
        #    print k, v.get_value(), v.get_value().shape

    def build(self):
        self.trng = RandomStreams(1234)

        # dim(x) = (input_time_steps, num_samples)
        self.x = T.matrix('x', dtype='int64')
        # dim(x_mask) = (input_time_steps, num_samples)
        self.x_mask = T.matrix('x_mask', dtype='float32')
        # dim(y) = (output_time_steps, num_samples)
        self.y = T.matrix('y', dtype='int64')
        # dim(y_mask) = (output_time_steps, num_samples)
        self.y_mask = T.matrix('y_mask', dtype='float32')

        # get source word embeddings
        enc_emb = self.encoder_embeddings.Emb[self.x.flatten()]
        # dim(x) = timesteps x samples
        enc_emb = enc_emb.reshape([
            self.x.shape[0], self.x.shape[1], self.hyperparams.encoder_emb_dim
        ])

        # get decoder init state
        self.encoder_outputs = self.encoder_rnn_layer_l2r.build(
            enc_emb, self.x_mask)[0]
        last_encoder_output = self.encoder_outputs[
            -1]  # This will be the context at every input step

        # transform encoder output to get decoder init
        self.decoder_init = self.decoder_init_transform.build(
            last_encoder_output)

        # input
        # TODO: remove embedding shifting?
        dec_emb = self.decoder_embeddings.Emb[self.y.flatten()]
        dec_emb = dec_emb.reshape([
            self.y.shape[0], self.y.shape[1], self.hyperparams.decoder_emb_dim
        ])

        # Building the RNN layer
        self.decoder_outputs = self.decoder_rnn_layer.build(
            x=dec_emb,
            x_mask=self.y_mask,
            c=last_encoder_output,
            h_init=self.decoder_init)[
                0]  # Only one output, hidden states at every time step

        # context with new axis. condition the output on encoder context as well.
        # TODO: remove axis adding?
        context = last_encoder_output[None, :, :]

        # dim(proj_h) = #timesteps x #samples x #num_units

        # Computing word probabilities
        logit_decoder_rnn = self.decoder_state_transform.build(
            self.decoder_outputs
        )  # dim(logit_rnn) = #timesteps x #samples x #emb_dim
        logit_prev_emb = self.prev_emb_transform.build(
            dec_emb)  # dim(logit_prev) = #timesteps x #samples x #emb_dim
        logit_enc_context = self.encoder_context_transform.build(context)
        logit = self.word_probs_transform.build(
            Activation.tanh(logit_decoder_rnn + logit_prev_emb +
                            logit_enc_context)
        )  # dim(logit) = #timesteps x #samples x #vocab_size

        # reshaping logit as (#timesteps*#samples) x vocab_size and performing softmax across vocabulary
        self.probs = T.nnet.softmax(
            logit.reshape([
                logit.shape[0] * logit.shape[1], logit.shape[2]
            ]))  #dim(probs) = (#timesteps*#samples) x vocab_size
        self.debug = [self.probs.shape, self.y.shape, self.y_mask.shape]
        #Building loss function
        self.build_loss()

        self._outputs = [self.probs]

    def build_loss(self):
        # TODO: Make it better?
        # y[0]  is bos, remove it to calculate loss
        y_flat = self.y[1:].flatten()  # y_flat: a linear array with size #timesteps*#samples
        y_flat_idx = T.arange(
            y_flat.shape[0]) * self.decoder_vocab.vocab_size + y_flat

        self._loss = -T.log(self.probs.flatten()[y_flat_idx])
        self._loss = self._loss.reshape([self.y.shape[0] - 1, self.y.shape[1]])
        self._loss = (self._loss * self.y_mask[1:]).sum(0)

    def build_sampler(self, sampling=True):
        initializer_input = [self.x, self.x_mask]
        initializer_output = [self.encoder_outputs, self.decoder_init]
        self.initializer = theano.function(initializer_input,
                                           initializer_output)

        sampler_input = [self.y, self.y_mask] + initializer_output

        if sampling == True:
            # sample a word from the output softmax, instead of selecting argmax
            next_token_index = self.trng.multinomial(pvals=self.probs).argmax(
                1
            )  # multinomial will represent 1 hot representation of the selected sample
        else:
            next_token_index = T.argmax(self.probs, axis=1)

        sampler_output = [self.probs, next_token_index, self.decoder_outputs]
        self.sampler = theano.function(sampler_input, sampler_output)

    def sample(self, batch, num_samples=5):
        source, source_mask, target, target_mask = self.prepare_train_input(
            batch)
        num_samples = np.minimum(num_samples, source.shape[1])
        # TODO: replace by random sampling:
        source = source[:, 0:num_samples]
        source_mask = source_mask[:, 0:num_samples]
        target = target[:, 0:num_samples]
        target_mask = target_mask[:, 0:num_samples]

        samples = []
        for sample_index in xrange(num_samples):
            hypothesis = self.encode_decode([
                source[:, sample_index:sample_index + 1],
                source_mask[:, sample_index:sample_index + 1]
            ])
            hypothesis_sent = ' '.join(hypothesis)

            source_sent = ' '.join([
                self.encoder_vocab.get_token(index)
                for index in source[:, sample_index] if
                index != self.encoder_vocab.get_index(self.encoder_vocab.eos)
            ])

            #hypothesis_sent = ' '.join([self.decoder_vocab.get_token(index) for index in target[:, sample_index] if index != self.decoder_vocab.get_index(self.decoder_vocab.eos)])

            target_sent = ' '.join([
                self.decoder_vocab.get_token(index)
                for index in target[:, sample_index] if index != -1 and
                index != self.decoder_vocab.get_index(self.decoder_vocab.eos)
            ])
            # TODO: change sample to dictionary, and in trainer display all keys and values
            samples.append(
                OrderedDict({
                    'SRC': source_sent,
                    'HYP': hypothesis_sent,
                    'REF': target_sent
                }))

        return samples

    def encode_decode(self, test_input, max_length=50):
        encoding = self.initializer(*test_input)
        init_input = [np.array([[-1]]),
                      np.array([[1.]], dtype='float32')] + encoding

        probs, next_token_index, decoder_outputs = self.sampler(*init_input)

        hypothesis = []
        hyp_length = 0
        while (next_token_index[0] != self.decoder_vocab.get_index(
                self.decoder_vocab.eos)):
            hypothesis.append(self.decoder_vocab.get_token(
                next_token_index[0]))
            hyp_length += 1
            # This is if next_token_index is a scalar, i.e. when only one column is passed as test_input
            # [None] adds a new axis
            next_input = [
                next_token_index[None],
                np.array([[1.]], dtype='float32')
            ] + encoding
            probs, next_token_index, decoder_outputs = self.sampler(
                *next_input)
            if hyp_length == max_length:
                break

        return hypothesis

    def loss(self):
        return self._loss.mean()

    def log_probs(self):
        # TODO: Make it better?
        return self._loss

    def outputs(self):
        return self._outputs

    def inputs(self):
        return [self.x, self.x_mask, self.y, self.y_mask]

    def params(self):
        return self._params

    def prepare_train_input(self, batch, max_length=None):
        # setting maxlen to length of longest sample
        max_length_input = max([len(sample[0]) for sample in batch])
        max_length_target = max([len(sample[1]) for sample in batch])

        # adding end of sentence marker
        inp = [[self.encoder_vocab.get_index(token) for token in sample[0]] +
               [self.encoder_vocab.get_index(self.encoder_vocab.eos)]
               for sample in batch]
        target = [[self.decoder_vocab.get_index(token)
                   for token in sample[1]] +
                  [self.decoder_vocab.get_index(self.decoder_vocab.eos)]
                  for sample in batch]
        max_length_input += 1
        max_length_target += 1

        # preparing mask and input
        source_mask = np.array(
            [[1.] * len(inp_instance[:max_length_input]) + [0.] *
             (max_length_input - len(inp_instance)) for inp_instance in inp],
            dtype='float32').transpose()
        source = np.array([
            inp_instance[:max_length_input] + [0.] *
            (max_length_input - len(inp_instance)) for inp_instance in inp
        ],
                          dtype='int64').transpose()

        # target preparation with a -1 (beginning of sentence) row upfront
        target_mask = np.array(
            [[1.] + [1.] * len(target_instance[:max_length_target]) + [0.] *
             (max_length_target - len(target_instance))
             for target_instance in target],
            dtype='float32').transpose()
        target = np.array([[-1] + target_instance[:max_length_target] + [0.] *
                           (max_length_target - len(target_instance))
                           for target_instance in target],
                          dtype='int64').transpose()
        return source, source_mask, target, target_mask
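
# A stand-alone sketch of the decoding step used by the samplers above:
# rather than taking the argmax of the softmax, draw a one-hot sample with
# multinomial(pvals=probs) and recover the token index with argmax(1).
# The RandomStreams flavour, vocabulary size and logits are assumptions
# made for illustration; they are not taken from the classes above.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

trng = RandomStreams(1234)
logits = T.matrix('logits')  # (#samples, vocab_size)
probs = T.nnet.softmax(logits)
# one one-hot row per sample; argmax turns it back into a token index
next_token = trng.multinomial(pvals=probs).argmax(1)

sample_fn = theano.function([logits], [probs, next_token])
p, tok = sample_fn(np.random.randn(2, 5).astype(theano.config.floatX))
print tok  # e.g. array([3, 0])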
Exemplo n.º 58
0
def stochastic_max_pool_bc01(bc01,
                             pool_shape,
                             pool_stride,
                             image_shape,
                             rng=None):
    """
    .. todo::

        WRITEME properly

    Stochastic max pooling for training as defined in:

    Stochastic Pooling for Regularization of Deep Convolutional Neural Networks
    Matthew D. Zeiler, Rob Fergus

    bc01: minibatch in format (batch size, channels, rows, cols),
        IMPORTANT: All values should be positive
    pool_shape: shape of the pool region (rows, cols)
    pool_stride: strides between pooling regions (row stride, col stride)
    image_shape: avoid doing some of the arithmetic in theano
    rng: theano random stream
    """
    r, c = image_shape
    pr, pc = pool_shape
    rs, cs = pool_stride

    batch = bc01.shape[0]
    channel = bc01.shape[1]

    if rng is None:
        rng = RandomStreams(2022)

    # Compute index in pooled space of last needed pool
    # (needed = each input pixel must appear in at least one pool)
    def last_pool(im_shp, p_shp, p_strd):
        rval = int(numpy.ceil(float(im_shp - p_shp) / p_strd))
        assert p_strd * rval + p_shp >= im_shp
        assert p_strd * (rval - 1) + p_shp < im_shp
        return rval

    # Compute starting row of the last pool
    last_pool_r = last_pool(image_shape[0], pool_shape[0],
                            pool_stride[0]) * pool_stride[0]
    # Compute number of rows needed in image for all indexes to work out
    required_r = last_pool_r + pr

    last_pool_c = last_pool(image_shape[1], pool_shape[1],
                            pool_stride[1]) * pool_stride[1]
    required_c = last_pool_c + pc

    # final result shape
    res_r = int(numpy.floor(last_pool_r / rs)) + 1
    res_c = int(numpy.floor(last_pool_c / cs)) + 1

    for bc01v in get_debug_values(bc01):
        assert not numpy.any(numpy.isinf(bc01v))
        assert bc01v.shape[2] == image_shape[0]
        assert bc01v.shape[3] == image_shape[1]

    # padding
    padded = tensor.alloc(0.0, batch, channel, required_r, required_c)
    name = bc01.name
    if name is None:
        name = 'anon_bc01'
    bc01 = tensor.set_subtensor(padded[:, :, 0:r, 0:c], bc01)
    bc01.name = 'zero_padded_' + name

    # unraveling
    window = tensor.alloc(0.0, batch, channel, res_r, res_c, pr, pc)
    window.name = 'unravelled_windows_' + name

    for row_within_pool in xrange(pool_shape[0]):
        row_stop = last_pool_r + row_within_pool + 1
        for col_within_pool in xrange(pool_shape[1]):
            col_stop = last_pool_c + col_within_pool + 1
            win_cell = bc01[:, :, row_within_pool:row_stop:rs,
                            col_within_pool:col_stop:cs]
            window = tensor.set_subtensor(
                window[:, :, :, :, row_within_pool, col_within_pool], win_cell)

    # find the norm
    norm = window.sum(axis=[4, 5])
    norm = tensor.switch(tensor.eq(norm, 0.0), 1.0, norm)
    norm = window / norm.dimshuffle(0, 1, 2, 3, 'x', 'x')
    # get prob
    prob = rng.multinomial(pvals=norm.reshape(
        (batch * channel * res_r * res_c, pr * pc)),
                           dtype='float32')
    # select
    res = (window * prob.reshape(
        (batch, channel, res_r, res_c, pr, pc))).max(axis=5).max(axis=4)
    res.name = 'pooled_' + name

    return tensor.cast(res, theano.config.floatX)
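
# A hedged usage sketch for stochastic_max_pool_bc01 above: pool a small
# non-negative minibatch with 2x2 pooling regions and stride 2.  The input
# shape, values and explicit rng are made up for illustration; the function
# itself (and its module-level imports) is assumed to be in scope.
import numpy
import theano
import theano.tensor as tensor
from theano.tensor.shared_randomstreams import RandomStreams

x_sym = tensor.tensor4('bc01')
pooled = stochastic_max_pool_bc01(x_sym,
                                  pool_shape=(2, 2),
                                  pool_stride=(2, 2),
                                  image_shape=(4, 4),
                                  rng=RandomStreams(2022))
pool_fn = theano.function([x_sym], pooled)

x_val = numpy.abs(numpy.random.randn(1, 1, 4, 4)).astype(theano.config.floatX)
print pool_fn(x_val).shape  # (1, 1, 2, 2)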
Exemplo n.º 59
0
        ),
        dtype = theano.config.floatX
    ),
    name = 'w_01',
    borrow = True,
)
b_01 = theano.shared(
    value = np.zeros((4,), dtype=theano.config.floatX),
    name = 'b_01',
    borrow = True,
)
h_0 = T.tanh(T.dot(x, w_00) + b_00)
y_0 = T.dot(h_0, w_01) + b_01
distn_0 = T.nnet.softmax(y_0[:-1])
s_1 = T.nnet.sigmoid(y_0[-1:])
cate_chosen_0 = T.flatten(srng.multinomial(n = 1, pvals = distn_0))
ind_chosen_0 = T.argmax(cate_chosen_0)

w_10 = theano.shared(
    value = np.asarray(
        rng.uniform(
            low = -np.sqrt(6. / 8),
            high = np.sqrt(6. / 8),
            size = (4, 4)
        ),
        dtype = theano.config.floatX
    ),
    name = 'w_10',
    borrow = True,
)
b_10 = theano.shared(
Exemplo n.º 60
0
class ImportanceSampler():
	'''Implements importance sampling/resampling'''
	
	def __init__(self, ndims, n_particles, true_log_probs, proposal_func=None):
		'''
		ndims: the dimensionality of each particle
		n_particles: the number of particles to use
		true_log_probs: a function that returns the true relative log probabilities
		proposal_func: a function that returns (samples, relative_log_probabilities)
		'''
		self.true_log_probs=true_log_probs
		self.proposal_func=proposal_func
		self.n_particles=n_particles
		self.ndims=ndims
		
		init_particles=np.zeros((n_particles, self.ndims))
		init_weights=np.ones(n_particles)/float(n_particles)
		
		self.particles=theano.shared(init_particles.astype(np.float32))
		self.weights=theano.shared(init_weights.astype(np.float32))
		
		self.theano_rng=RandomStreams()
		
		self.get_ESS=None
		self.perform_resampling=None
		self.perform_sampling=None
	
	
	def set_proposal_func(self, proposal_func):
		'''You might need to use this if you want to make the proposal
		function depend on the current particles'''
		self.proposal_func=proposal_func
		return
	
	
	def sample_reweight(self):
		'''Samples new particles and reweights them'''
		samples, prop_log_probs = self.proposal_func()
		true_log_probs=self.true_log_probs(samples)
		diffs=true_log_probs-prop_log_probs
		weights_unnorm=T.exp(diffs)
		weights=weights_unnorm/T.sum(weights_unnorm)
		updates=OrderedDict()
		updates[self.weights]=T.cast(weights,'float32')
		updates[self.particles]=T.cast(samples,'float32')
		return updates
	
	
	def compute_ESS(self):
		'''Returns the effective sample size'''
		return 1.0/T.sum(self.weights**2)
	
	
	def resample(self):
		'''Resamples using the current weights'''
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.weights.dimshuffle('x',0),self.n_particles,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		updates=OrderedDict()
		updates[self.particles]=self.particles[idxs]
		updates[self.weights]=T.cast(T.ones_like(self.weights)/float(self.n_particles),'float32')
		return updates
	
	
	def compile(self):
		'''Compiles the ESS, resampling, and sampling functions'''
		ess=self.compute_ESS()
		self.get_ESS=theano.function([],ess)
		resample_updates=self.resample()
		self.perform_resampling=theano.function([],updates=resample_updates)
		sample_updates=self.sample_reweight()
		self.perform_sampling=theano.function([],updates=sample_updates)
		return
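
# A hedged usage sketch for ImportanceSampler above: a standard-normal
# target with an over-dispersed Gaussian proposal.  The target/proposal
# functions, seed and sizes are illustrative assumptions, not part of the
# original code.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

ndims, n_particles = 2, 100
prop_rng = RandomStreams(2718)

def true_log_probs(samples):
	# unnormalised log-density of a standard normal target
	return -0.5 * T.sum(samples ** 2, axis=1)

def proposal_func():
	# propose from N(0, 2^2) and return (samples, proposal log-probs)
	samples = 2.0 * prop_rng.normal(size=(n_particles, ndims))
	log_probs = -0.5 * T.sum((samples / 2.0) ** 2, axis=1)
	return samples, log_probs

sampler = ImportanceSampler(ndims, n_particles, true_log_probs, proposal_func)
sampler.compile()
sampler.perform_sampling()    # draw particles and compute importance weights
print sampler.get_ESS()       # effective sample size, between 1 and n_particles
sampler.perform_resampling()  # resample particles according to the weights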