def test_multinomial_n_samples():
    """Check MRG multinomial draws for several ``n`` values on CPU and GPU.

    For each ``(n_samples, steps)`` pair a CPU sampler is compiled and handed
    to ``basic_multinomialtest``; when CUDA is available (and the configured
    mode is not FAST_COMPILE) the same check is repeated with a GPU stream.

    The GPU stream and its float32 ``pvals`` are kept in their own names.
    Previously the GPU branch rebound ``R`` and ``pvals``, so every CPU
    iteration after the first silently ran with the CUDA-enabled stream and
    float32 probabilities.
    """
    # FAST_COMPILE is replaced by FAST_RUN for compilation.
    mode_ = 'FAST_RUN' if mode == 'FAST_COMPILE' else mode

    # Smaller problem for the slow configurations.
    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            (mode == 'Mode' and config.linker in ['py'])):
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)
    mode_ = theano.compile.mode.get_mode(mode_)

    # Random rows normalised so that each one sums to 1.
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)

    for n_samples, steps in zip([5, 10, 100, 1000], [20, 10, 1, 1]):
        m = R.multinomial(pvals=pvals, n=n_samples,
                          dtype=config.floatX, nstreams=30 * 256)
        f = theano.function([], m, mode=mode_)
        basic_multinomialtest(f, steps, sample_size, pvals,
                              n_samples, prefix='mrg ')
        sys.stdout.flush()

        if mode != 'FAST_COMPILE' and cuda_available:
            # Separate names: the CPU stream and pvals must survive the loop.
            gpu_R = MRG_RandomStreams(234, use_cuda=True)
            gpu_pvals = numpy.asarray(pvals, dtype='float32')
            n = gpu_R.multinomial(pvals=gpu_pvals, n=n_samples,
                                  dtype='float32', nstreams=30 * 256)
            assert n.dtype == 'float32'
            gpu_f = theano.function(
                [],
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                mode=mode_.including('gpu'))

            sys.stdout.flush()
            basic_multinomialtest(gpu_f, steps, sample_size, gpu_pvals,
                                  n_samples, prefix='gpu mrg ')
def test_multinomial():
    """Check single-draw MRG multinomial sampling on CPU and, if present, GPU.

    The CPU sample and the GPU sample (same seed) are also compared to six
    decimals.
    """
    steps = 100
    compile_mode = 'FAST_RUN' if mode == 'FAST_COMPILE' else mode

    slow_config = (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
                   (mode == 'Mode' and config.linker in ['py']))
    sample_size = (49, 5) if slow_config else (450, 6)
    compile_mode = theano.compile.mode.get_mode(compile_mode)

    # --- CPU ---
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    cpu_rng = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    cpu_sample = cpu_rng.multinomial(pvals=pvals, dtype=config.floatX,
                                     nstreams=30 * 256)
    cpu_fn = theano.function([], cpu_sample, mode=compile_mode)
    out = cpu_fn()
    basic_multinomialtest(cpu_fn, steps, sample_size, pvals,
                          n_samples=1, prefix='mrg ')
    sys.stdout.flush()

    if not (mode != 'FAST_COMPILE' and cuda_available):
        return

    # --- GPU ---
    gpu_rng = MRG_RandomStreams(234, use_cuda=True)
    pvals = numpy.asarray(pvals, dtype='float32')
    # We give the number of streams to avoid a warning.
    gpu_sample = gpu_rng.multinomial(pvals=pvals, dtype='float32',
                                     nstreams=30 * 256)
    # The GPU test only makes sense for float32 output.
    assert gpu_sample.dtype == 'float32'
    gpu_fn = theano.function(
        [],
        theano.sandbox.cuda.basic_ops.gpu_from_host(gpu_sample),
        mode=compile_mode.including('gpu'))
    gpu_out = gpu_fn()
    sys.stdout.flush()
    basic_multinomialtest(gpu_fn, steps, sample_size, pvals,
                          n_samples=1, prefix='gpu mrg ')
    numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
def prediction(self, h, bias):
    """Draw one ``[x, y, pin]`` row per input row from the computed parameters.

    A one-hot multinomial sample over ``prop`` picks one column per row; the
    matching mean/std/rho entries are used to build a correlated pair of
    normal draws, and ``pin`` is ``bernoulli > uniform`` cast to ``floatX``.
    Returns the three columns concatenated along axis 1.
    """
    rng = RandomStreams(seed=42)
    (prop, mean_x, mean_y,
     std_x, std_y, rho, bernoulli) = self.compute_parameters(h, bias)

    # Index of the sampled column for every row.
    chosen = T.argmax(rng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)
    rows = T.arange(0, mean_x.shape[0])

    m_x, m_y = mean_x[rows, chosen], mean_y[rows, chosen]
    s_x, s_y = std_x[rows, chosen], std_y[rows, chosen]
    r = rho[rows, chosen]

    noise = rng.normal((h.shape[0], 2))
    x, y = noise[:, 0], noise[:, 1]

    # y mixes both normal draws so that it is correlated with x through r.
    x_n = T.shape_padright(m_x + s_x * x)
    y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2)))

    uniform = rng.uniform((h.shape[0],))
    pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

    return T.concatenate([x_n, y_n, pin], axis=1)
def dropout(X, p_use=1.):
    """Return ``X`` untouched when ``p_use`` is not below 1.

    Otherwise a multinomial sample with one ``[p_use, 1 - p_use]`` row per
    element of ``X`` is built and printed.
    """
    if not p_use < 1.:
        return X

    # NOTE(review): this branch only prints the sample and then falls through,
    # so the function returns None here - looks unfinished; confirm intent.
    stream = RandomStreams()
    keep_rows = [[p_use, 1.-p_use]]*len(X)
    sample = stream.multinomial(pvals=keep_rows)
    print(sample)
def dropout(X, p_use=1.):
    """Return ``X`` untouched when ``p_use`` is not below 1.

    Otherwise a single-row multinomial sample over ``[p_use, 1 - p_use]`` is
    built; its flattened form and the attribute listing of its transpose are
    printed.
    """
    if not p_use < 1.:
        return X

    # NOTE(review): debugging output only - nothing is applied to X and the
    # function returns None on this path; confirm intent.
    stream = RandomStreams()
    sample = stream.multinomial(pvals=[[p_use, 1.-p_use]])
    print(sample.flatten())
    print(dir(sample.T))
def test_multinomial():
    """Check single-draw MRG multinomial sampling on CPU and, if present, GPU.

    Prints the compiled graphs and compares the CPU and GPU samples (same
    seed) to six decimals.
    """
    steps = 100
    compile_mode = "FAST_RUN" if mode == "FAST_COMPILE" else mode

    if mode in ["DEBUG_MODE", "DebugMode", "FAST_COMPILE"]:
        sample_size = (49, 5)
    else:
        sample_size = (450, 6)
    compile_mode = theano.compile.mode.get_mode(compile_mode)

    print("")
    print("ON CPU:")

    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    cpu_rng = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    cpu_sample = cpu_rng.multinomial(pvals=pvals, dtype=config.floatX,
                                     nstreams=30 * 256)
    cpu_fn = theano.function([], cpu_sample, mode=compile_mode)
    theano.printing.debugprint(cpu_fn)
    out = cpu_fn()
    basic_multinomialtest(cpu_fn, steps, sample_size, pvals, prefix="mrg ")
    sys.stdout.flush()

    if not (mode != "FAST_COMPILE" and cuda_available):
        return

    print("")
    print("ON GPU:")
    gpu_rng = MRG_RandomStreams(234, use_cuda=True)
    pvals = numpy.asarray(pvals, dtype="float32")
    # We give the number of streams to avoid a warning.
    gpu_sample = gpu_rng.multinomial(pvals=pvals, dtype="float32",
                                     nstreams=30 * 256)
    # The GPU test only makes sense for float32 output.
    assert gpu_sample.dtype == "float32"
    gpu_fn = theano.function(
        [],
        theano.sandbox.cuda.basic_ops.gpu_from_host(gpu_sample),
        mode=compile_mode.including("gpu"))
    theano.printing.debugprint(gpu_fn)
    gpu_out = gpu_fn()
    sys.stdout.flush()
    basic_multinomialtest(gpu_fn, steps, sample_size, pvals,
                          prefix="gpu mrg ")
    numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
def __init__(self, seq_len, emb_size, n_hidden, size_dict, batch_size, lr):
    """Build the unrolled recurrent graph and compile its two functions.

    ``fun_cost(x, y)`` returns the mean categorical cross-entropy and applies
    one gradient step with learning rate ``lr`` (it also stores the final
    hidden state in ``init_state``); ``fun_predict(x)`` returns the argmax of
    a multinomial sample drawn from the softmax output.
    """
    self.seq_len = seq_len
    self.batch_size = batch_size

    def small_normal(*shape):
        # Shared variable filled with N(0, 0.01) noise, cast to floatX.
        values = np.random.normal(0, 0.01, size=shape)
        return shared(values.astype(dtype=floatX))

    w_emb = small_normal(size_dict, emb_size)
    w_in = small_normal(emb_size, n_hidden)
    b_in = small_normal(n_hidden)
    w_hidden = small_normal(n_hidden, n_hidden)
    b_hidden = small_normal(n_hidden)
    w_out = small_normal(n_hidden, size_dict)
    b_out = small_normal(size_dict)

    # NOTE(review): b_in is listed here but never enters the graph below and
    # is not trained - confirm whether it should be added to the
    # pre-activation.
    self.params = [w_emb, w_in, b_in, w_hidden, b_hidden, w_out, b_out]

    x = t.imatrix('x')
    y = t.ivector('y')
    self.init_state = shared(np.zeros((batch_size, n_hidden), dtype=floatX))

    # Unroll the recurrence over the sequence positions.
    state = self.init_state
    for step in xrange(seq_len):
        emb = w_emb[x[:, step]].reshape((x.shape[0], -1))
        state = relu(t.dot(emb, w_in) + t.dot(state, w_hidden) + b_hidden)

    y_hat = t.nnet.softmax((t.dot(state, w_out)) + b_out)
    cost = t.nnet.categorical_crossentropy(y_hat, y).mean()

    trained = [w_emb, w_in, w_hidden, b_hidden, w_out, b_out]
    grads = t.grad(cost, trained)
    updates = [(self.init_state, state)]
    updates += [(w, w - lr * g) for w, g in zip(trained, grads)]
    self.fun_cost = function([x, y], cost, updates=updates)

    rng = MRG_RandomStreams(42)
    next_char = t.argmax(rng.multinomial(pvals=y_hat), axis=1)
    self.fun_predict = function([x], next_char)
def test_target_parameter():
    """Every sampler called with ``target='cpu'`` must yield a numpy array."""
    srng = MRG_RandomStreams()
    pvals = np.array([[.98, .01, .01], [.01, .49, .50]])
    pvals32 = pvals.astype('float32')

    def basic_target_parameter_test(x):
        compiled = theano.function([], x)
        assert isinstance(compiled(), np.ndarray)

    # Thunks keep graph construction and compilation interleaved per sampler.
    samplers = [
        lambda: srng.uniform((3, 2), target='cpu'),
        lambda: srng.binomial((3, 2), target='cpu'),
        lambda: srng.multinomial(pvals=pvals32, target='cpu'),
        lambda: srng.choice(p=pvals32, replace=False, target='cpu'),
        lambda: srng.multinomial_wo_replacement(pvals=pvals32, target='cpu'),
    ]
    for make_variable in samplers:
        basic_target_parameter_test(make_variable())
def test_undefined_grad_opt():
    """Undefined-gradient nodes must not survive in the optimized graph."""
    seed = np.random.randint(1, 2147462579)
    random = RandomStreams(seed)

    pvals = theano.shared(np.random.rand(10, 20).astype(theano.config.floatX))
    pvals = gradient.zero_grad(pvals / pvals.sum(axis=1))

    samples = random.multinomial(pvals=pvals, n=1)
    samples = gradient.zero_grad(theano.tensor.cast(samples, pvals.dtype))

    cost = theano.tensor.sum(samples + pvals)
    grad = theano.tensor.grad(cost, samples)
    f = theano.function([], grad)
    theano.printing.debugprint(f)

    leftover = [node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, gradient.UndefinedGrad)]
    assert not leftover
def get_decide_func(self): """ Returns a theano function that takes a minibatch (num_examples, num_features) of contexts and returns a minibatch (num_examples, num_classes) of one-hot codes for actions. """ X = T.matrix() y_hat = self.mlp.fprop(X) theano_rng = MRG_RandomStreams(2013 + 11 + 20) if self.stochastic: a = theano_rng.multinomial(pvals=y_hat, dtype='float32') else: mx = T.max(y_hat, axis=1).dimshuffle(0, 'x') a = T.eq(y_hat, mx) if self.epsilon is not None: a = theano_rng.multinomial(pvals = (1. - self.epsilon) * a + self.epsilon * T.ones_like(y_hat) / y_hat.shape[1], dtype = 'float32') if self.epsilon_stochastic is not None: a = theano_rng.multinomial(pvals = (1. - self.epsilon_stochastic) * a + self.epsilon_stochastic * y_hat, dtype = 'float32') print "Compiling classifier agent learning function" t1 = time.time() f = function([X], a) t2 = time.time() print "...done, took", t2 - t1 return f
def stochastic_pool(neibs, axis, deterministic):
    """
    Pool ``neibs`` along ``axis`` with weights proportional to the values.

    In deterministic mode the normalised values themselves are the weights;
    otherwise a one-hot multinomial sample drawn from them is used.

    NOTE: assumes that inputs are >= 0
    """
    assert axis == 1
    # TODO parameterize
    epsilon = 1e-6
    totals = neibs.sum(axis=axis, keepdims=True)
    as_p = neibs / (totals + epsilon)

    if not deterministic:
        # FIXME save state in network
        weights = MRG_RandomStreams().multinomial(pvals=as_p).astype(fX)
    else:
        weights = as_p

    return (neibs * weights).sum(axis=axis)
def softmax_sample_layer(list_of_multinomial_inputs, name, random_state=None):
    """Sample from the averaged concatenation of several multinomial inputs.

    Parameters
    ----------
    list_of_multinomial_inputs : list
        Expressions concatenated along axis 1 and divided by their count.
    name : str
        Name used for the concatenation and for tagging the expressions.
    random_state : object with a ``randint(low, high)`` method
        Source of the seed for the Theano random stream. Despite the ``None``
        default it must be supplied, since ``randint`` is called on it.

    Returns
    -------
    The int32 multinomial sample.
    """
    seed_bound = 2147462579
    theano_seed = random_state.randint(-seed_bound, seed_bound)
    # Super edge case: a zero seed is not wanted. Redraw from the *same*
    # range until it is non-zero; the previous single redraw from
    # [-2**32, 2**32) could return 0 again or a value outside the range used
    # for the first draw.
    while theano_seed == 0:
        print("WARNING: prior layer got 0 seed. Reseeding...")
        theano_seed = random_state.randint(-seed_bound, seed_bound)
    theano_rng = MRG_RandomStreams(seed=theano_seed)

    conc_multinomial = concatenate(list_of_multinomial_inputs, name, axis=1)
    # Shape is taken before the division below rebinds the expression.
    shape = expression_shape(conc_multinomial)
    conc_multinomial /= len(list_of_multinomial_inputs)
    tag_expression(conc_multinomial, name, shape)

    samp = theano_rng.multinomial(pvals=conc_multinomial, dtype="int32")
    tag_expression(samp, name, (shape[0], shape[1]))
    return samp
def test_multinomial():
    """Check single-draw MRG multinomial sampling against its probabilities."""
    steps = 100
    slow_config = (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
                   (config.mode == 'Mode' and config.linker in ['py']))
    sample_size = (49, 5) if slow_config else (450, 6)

    def normalise(row):
        return row / np.sum(row)

    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(normalise, 1, pvals)

    rng = MRG_RandomStreams(234)
    # Note: we specify `nstreams` to avoid a warning.
    sample = rng.multinomial(pvals=pvals, dtype=config.floatX,
                             nstreams=30 * 256)
    sampler = theano.function([], sample)
    sampler()
    basic_multinomialtest(sampler, steps, sample_size, pvals,
                          n_samples=1, prefix='mrg ')
def test_multinomial_n_samples():
    """Check MRG multinomial sampling for several numbers of draws ``n``."""
    slow_config = (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
                   (config.mode == 'Mode' and config.linker in ['py']))
    sample_size = (49, 5) if slow_config else (450, 6)

    def normalise(row):
        return row / np.sum(row)

    pvals = np.asarray(np.random.uniform(size=sample_size))
    pvals = np.apply_along_axis(normalise, 1, pvals)
    rng = MRG_RandomStreams(234)

    # (n_samples, steps) pairs.
    schedule = [(5, 20), (10, 10), (100, 1), (1000, 1)]
    for n_samples, steps in schedule:
        sample = rng.multinomial(pvals=pvals, n=n_samples,
                                 dtype=config.floatX, nstreams=30 * 256)
        sampler = theano.function([], sample)
        basic_multinomialtest(sampler, steps, sample_size, pvals,
                              n_samples, prefix='mrg ')
        sys.stdout.flush()
def get_cost(self, X, Y, **kwargs):
    """Return the negated dream + reality log-probability plus weight decay.

    A "dream" is sampled top-down (y, h2, h1, v) with gradients blocked and
    then explained bottom-up; the "reality" pass explains ``X`` bottom-up and
    scores it top-down together with ``Y``. A ``.0001``-weighted sum of
    squared generative weights is added.
    """
    theano_rng = MRG_RandomStreams(2012 + 12 + 18)

    def blocked_binomial(p, size):
        # Binary float32 sample with the gradient blocked.
        draw = theano_rng.binomial(p=p, size=size, dtype='float32')
        return block_gradient(draw)

    class_logits = T.alloc(0., self.batch_size, self.n_classes) + self.gyb

    # Dream
    exp_y = T.nnet.softmax(class_logits)
    dy = block_gradient(theano_rng.multinomial(pvals=exp_y, dtype='float32'))
    exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b)
    dh2 = blocked_binomial(exp_h2, exp_h2.shape)
    exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b)
    dh1 = blocked_binomial(exp_h1, exp_h1.shape)
    exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb)
    dv = blocked_binomial(exp_v, exp_v.shape)

    # Explanation of dream
    zh1, rh1 = self.infer_h1(dv)
    zh2 = T.dot(rh1, self.rh2w) + self.rh2b
    rh2 = T.nnet.sigmoid(zh2)
    zy = T.dot(rh2, self.ryw) + self.ryb

    # Probability of dream
    dream_prob = (sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) +
                  softmax_prob(zy, dy))

    # Explanation of reality
    zh1, rh1 = self.infer_h1(X)
    rh1 = block_gradient(rh1)
    zh2 = T.dot(rh1, self.rh2w) + self.rh2b
    rh2 = blocked_binomial(T.nnet.sigmoid(zh2), zh2.shape)

    # Probability of reality
    real_prob = (
        softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb,
                     Y) +
        sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) +
        sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) +
        sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X))

    weight_decay = (T.sqr(self.gvw).sum() + T.sqr(self.gh1w).sum() +
                    T.sqr(self.gh2w).sum())
    return - dream_prob - real_prob + .0001 * weight_decay
def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. """ t1 = time.time() empty_input = self.output_space.get_origin_batch(num_examples) h_state = sharedX(empty_input) default_z = T.zeros_like(h_state) + self.b theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16)) h_exp = T.nnet.softmax(default_z) h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) p_state = sharedX( self.output_space.get_origin_batch( num_examples)) t2 = time.time() f = function([], updates = { h_state : h_sample }) t3 = time.time() f() t4 = time.time() print str(self)+'.make_state took',t4-t1 print '\tcompose time:',t2-t1 print '\tcompile time:',t3-t2 print '\texecute time:',t4-t3 h_state.name = 'softmax_sample_shared' return h_state
def softmax_sample_layer(list_of_multinomial_inputs, graph, name,
                         random_state=None):
    """Sample from the averaged concatenation of several multinomial inputs.

    Parameters
    ----------
    list_of_multinomial_inputs : list
        Expressions concatenated along axis 1 and divided by their count.
    graph : object
        Passed through to ``concatenate``, ``calc_expected_dims`` and
        ``add_random_to_graph``; the sample is registered on it under
        ``name + "_random"``.
    name : str
        Base name for the concatenation and the registered sample.
    random_state : object with a ``randint(low, high)`` method
        Source of the seed for the Theano random stream. Despite the ``None``
        default it must be supplied, since ``randint`` is called on it.

    Returns
    -------
    The int32 multinomial sample.
    """
    seed_bound = 2147462579
    theano_seed = random_state.randint(-seed_bound, seed_bound)
    # Super edge case: a zero seed is not wanted. Redraw from the *same*
    # range until it is non-zero; the previous single redraw from
    # [-2**32, 2**32) could return 0 again or a value outside the range used
    # for the first draw.
    while theano_seed == 0:
        print("WARNING: prior layer got 0 seed. Reseeding...")
        theano_seed = random_state.randint(-seed_bound, seed_bound)
    theano_rng = MRG_RandomStreams(seed=theano_seed)

    conc_multinomial = concatenate(list_of_multinomial_inputs, graph, name,
                                   axis=1)
    conc_multinomial /= len(list_of_multinomial_inputs)
    samp = theano_rng.multinomial(pvals=conc_multinomial, dtype="int32")

    # We know shape of conc_multinomial == shape of random sample
    shape = calc_expected_dims(graph, conc_multinomial)
    add_random_to_graph([samp], [shape], [name + "_random"], graph)
    return samp
def __init__(self, seq_len, emb_size, n_hidden, size_dict, lr):
    """Build the one-hidden-layer graph and compile its two functions.

    ``fun_cost(x, target)`` returns the mean categorical cross-entropy and
    applies one gradient step with learning rate ``lr``; ``fun_predict(x)``
    returns the argmax of a multinomial sample drawn from the softmax output.
    """
    self.seq_len = seq_len

    def small_normal(*shape):
        # Shared variable filled with N(0, 0.01) noise, cast to floatX.
        values = np.random.normal(0, 0.01, size=shape)
        return shared(values.astype(dtype=floatX))

    # Parameters
    w_emb = small_normal(size_dict, emb_size)
    w_hidden = small_normal(seq_len * emb_size, n_hidden)
    b_hidden = small_normal(n_hidden)
    w_out = small_normal(n_hidden, size_dict)
    b_out = small_normal(size_dict)

    # Graph
    x = t.imatrix('x')
    target = t.ivector('y')

    # All embeddings of a row are flattened into one input vector.
    flat_emb = w_emb[x].reshape((x.shape[0], -1))
    hidden = relu(t.dot(flat_emb, w_hidden) + b_hidden)
    y_hat = t.nnet.softmax((t.dot(hidden, w_out)) + b_out)
    cost = t.nnet.categorical_crossentropy(y_hat, target).mean()

    params = [w_emb, w_hidden, b_hidden, w_out, b_out]
    grads = t.grad(cost, params)
    updates = [(w, w - lr * g) for w, g in zip(params, grads)]

    self.fun_cost = theano.function([x, target], cost, updates=updates)

    # Sampling function
    rng = MRG_RandomStreams(42)
    next_char = t.argmax(rng.multinomial(pvals=y_hat), axis=1)
    self.fun_predict = theano.function([x], next_char)
def lwta(p, block_size):
    """
    Hard local winner-take-all non-linearity from "Compete to Compute"
    (Rupesh Srivastava et al).

    Within every block of ``block_size`` consecutive filters only one maximal
    entry is kept. Unlike the paper, which keeps the earliest index on ties,
    ties are broken randomly here because that is easier to express in
    theano.
    """
    n_rows, n_filters = p.shape[0], p.shape[1]
    n_blocks = n_filters // block_size
    block_shape = (n_rows, n_blocks, block_size)

    blocks = p.reshape(block_shape)
    block_max = blocks.max(axis=2).dimshuffle(0, 1, 'x') * T.ones_like(blocks)
    is_max = T.cast(blocks >= block_max, 'float32')

    theano_rng = MRG_RandomStreams(20131206 % (2 ** 16))

    # Uniform distribution over the maximal entries of each block.
    n_winners = is_max.sum(axis=2).dimshuffle(0, 1, 'x')
    probs = (is_max / n_winners).reshape((n_rows * n_blocks, block_size))

    # One-hot draw per block selects the single surviving entry.
    winner = theano_rng.multinomial(pvals=probs, dtype='float32')
    winner = winner.reshape(block_shape)

    return (blocks * winner).reshape((p.shape[0], p.shape[1]))
class SLmodel():
    """Switched conditional linear model for integrating action with sensation.

    One linear generative model (``W``, ``c``) maps latent states to
    observations; the latent dynamics are a combination of ``nh`` linear
    modes (``M`` for the state, ``C`` for the action) chosen by a switching
    variable whose probabilities depend on the state (``A``), the action
    (``B``) and a prior (``ph``). Inference uses a particle filter with
    ``npcl`` particles.

    NOTE(review): ``sample_joint``, ``resample_step`` and ``resample`` use
    ``self.h_now``, which ``__init__`` never creates - confirm how it should
    be initialised and updated.
    """

    def __init__(self, nx, ns, nh, na, npcl, xvar=1.0):
        """Create the shared parameters and particle-filter state.

        nx: dimensionality of observed variables
        ns: dimensionality of latent variables
        nh: number of (linear) dynamical modes
        na: dimensionality of action variables
        npcl: number of particles in the particle filter
        xvar: observation noise scale
        """
        # generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        # observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        # dynamical matrices
        # for state-based predictions
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')
        # for action-based predictions
        init_C = np.asarray((np.tile(np.zeros((na, ns)), (1, nh))),
                            dtype='float32')

        # state-variable variances
        # (covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        # switching parameter matrices
        # associated with the state
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')
        # associated with actions
        init_B = np.asarray(np.zeros((na, nh)), dtype='float32')

        # priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        init_a_past = np.asarray(np.zeros((1, na)), dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.C = theano.shared(init_C)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.B = theano.shared(init_B)
        self.ph = theano.shared(init_ph)

        # this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.a_past = theano.shared(init_a_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx
        self.ns = ns
        self.nh = nh
        self.na = na
        self.npcl = npcl

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.C, self.b,
                       self.A, self.B, self.c, self.ph]
        # per-parameter learning-rate multipliers, same order as self.params
        self.rel_lrates = np.asarray(
            [0.1, 1.0, 1.0, 0.01, 10.0, 10.0, 0.1, 1.0], dtype='float32')

    def sample_proposal_s(self, s, a, h, xpred, sig):
        """Sample states from the proposal distribution.

        Returns the proposed states, the predicted states and the exponent
        term of the proposal density, all cast to float32.
        """
        s_pred = self.get_prediction(s, a, h)

        n = self.theano_rng.normal(size=T.shape(s))

        # This is the proposal distribution that arises when one assumes
        # that W'W = I
        mean = 2.0 * (xpred + s_pred * (self.b ** 2)) * sig

        s_prop = mean + n * T.sqrt(sig)

        # term inside the exponent of the proposal pdf
        prop_term = -T.sum(n ** 2) / 2.0

        return (T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'),
                T.cast(prop_term, 'float32'))

    def calc_h_probs(self, s, a):
        """Return the switching probabilities for each state sample.

        ``s`` is an np by ns matrix of state samples and ``a`` an action
        vector; the result is np by nh with rows summing to one.
        """
        exp_terms = (T.dot(s, self.A) +
                     T.reshape(T.dot(a, self.B), (1, self.nh)) +
                     T.reshape(self.ph, (1, self.nh)))

        # Re-centering for numerical stability. The row maximum is kept as a
        # column (keepdims) so that it is subtracted row-wise, and the
        # recentred values are the ones exponentiated - previously they were
        # computed and then ignored.
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1,
                                                 keepdims=True)

        # exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def forward_filter_step(self, a, xp):
        """Return the updates for one particle-filter step.

        ``a`` is the action taken and ``xp`` the new observation. Particles
        are proposed, reweighted against the observation and the previous
        state/weights are shifted into the ``*_past`` variables.
        """
        # first sample from h given s and a
        h_probs = self.calc_h_probs(self.s_now, a)
        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        # proposal-distribution terms; these are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar ** 2)
        sig = (1.0 / (self.b ** 2 + 1.0 / (2.0 * self.xvar ** 2))) / 2.0

        # vectorized sampling from the proposal
        s_pred = self.get_prediction(self.s_now, a, h_samps)
        n = self.theano_rng.normal(size=T.shape(self.s_now))
        mean = 2.0 * (xpred + s_pred * (self.b ** 2)) * sig
        s_samps = mean + n * T.sqrt(sig)
        prop_terms = -T.sum(n ** 2, axis=1) / 2.0

        updates = {}

        # now that we have samples from the proposal distribution, we need
        # to reweight them
        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = (-T.sum((recons - T.reshape(xp, (self.nx, 1))) ** 2, axis=0)
                   / (2.0 * self.xvar ** 2))
        s_terms = -T.sum(((s_samps - s_pred) * self.b) ** 2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        # to avoid exponentiating large or very small numbers, the
        # reweighting factors are "re-centered" by a constant, which has no
        # impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  # these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer

        updates[self.h_past] = T.cast(h_samps, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')
        updates[self.a_past] = T.cast(a, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        return updates

    def get_prediction(self, s, a, h):
        """Predict the next states from states ``s``, action ``a``, modes ``h``.

        The per-mode linear predictions are masked by ``h`` (repeated ``ns``
        times along axis 1) and summed over modes via ``sum_mat``.
        """
        s_dot_M = T.dot(s, self.M)  # this is np by nh*ns
        a_dot_C = T.dot(a, self.C)  # this is 1 by nh*ns

        tot = s_dot_M + a_dot_C  # should be np by nh*ns

        # should be np by ns
        s_pred = T.dot(tot * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):
        """Sample one (past, current) particle pair from the joint posterior.

        ``sp`` holds the predictions made from the past particles. Returns
        ``[s1_samp, h1_samp, s2_samp, h2_samp]``.
        """
        # pick a current particle according to the current weights
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        # weight the past particles by how well they predict that particle
        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs ** 2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def calc_mean_h_energy(self, s, a, h):
        """Return the average switching energy of samples ``s``, ``a``, ``h``."""
        # np by nh
        exp_terms = (T.dot(s, self.A) +
                     T.reshape(T.dot(a, self.B), (1, self.nh)) +
                     T.reshape(self.ph, (1, self.nh)))

        # one energy per sample
        energies = (T.sum(h * exp_terms, axis=1) -
                    T.log(T.sum(T.exp(exp_terms), axis=1)))

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Return ``(energy, updates)`` for one gradient-ascent step.

        ``n_samps`` pairs are sampled from the joint posterior; ``x1`` and
        ``x2`` are the past and current observations and ``lrate`` the base
        learning rate (scaled per parameter by ``rel_lrates``).
        """
        # sp should be np by ns
        sp = self.get_prediction(self.s_past, self.a_past, self.h_past)

        [s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        # Both helpers take (s, a, h); the action argument was missing from
        # these calls. The past samples are paired with the stored past
        # action, as in the `sp` computation above.
        s_pred = self.get_prediction(s1_samps, self.a_past, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, self.a_past, h1_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred)) ** 2,
                              axis=1)) / 2.0

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1))) ** 2, axis=0)
            / (2.0 * self.xvar ** 2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1))) ** 2, axis=0)
            / (2.0 * self.xvar ** 2))

        energy = hterm1 + xterm1 + xterm2 + sterm

        gparams = T.grad(
            energy, self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        # NOTE: W is not renormalised to unit-length columns here, although
        # the proposal distribution assumes W'W = I. A might need to be
        # normalised as well.

        return energy, updates

    def get_ESS(self):
        """Return the effective sample size of the current weights."""
        return 1.0 / T.sum(self.weights_now ** 2)

    def resample_step(self):
        """Draw one particle (state and mode) according to the weights."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Return updates that resample all particles and reset the weights."""
        [s_samps, h_samps], updates = theano.scan(
            fn=self.resample_step,
            outputs_info=[None, None],
            n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        # explicit casts to keep everything float32
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')
        return updates

    def simulate_step(self, s, a):
        """Simulate one step from state ``s`` under action ``a``.

        Returns the predicted state, the predicted observation and the
        sampled mode.
        """
        s = T.reshape(s, (1, self.ns))
        a = T.reshape(a, (1, self.na))
        # get h probabilities
        h_probs = self.calc_h_probs(s, a)

        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, a, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, a, n_steps):
        """Simulate ``n_steps`` steps from the weighted mean current state.

        ``a`` should be n_steps by na. Returns the predicted states,
        predicted observations, sampled modes and the scan updates.
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))

        # scan passes the sequence element first and the previous output
        # second, while simulate_step expects (s, a); passing the method
        # directly handed it the action as the state and vice versa.
        def step(a_t, s_prev):
            return self.simulate_step(s_prev, a_t)

        [sp, xp, hs], updates = theano.scan(
            fn=step,
            outputs_info=[s0, None, None],
            sequences=[a],
            n_steps=n_steps)

        return sp, xp, hs, updates
class MultiRBM(RBM):
    """RBM whose visible units are categorical with ``n_cate`` categories.

    Visible data is laid out as (n_cate, N_sample, n_vis); hidden units are
    binary.
    """

    def __init__(self,
                 input,
                 n_vis,
                 n_hid,
                 n_cate,
                 W=None,
                 vbias=None,
                 hbias=None):
        '''
        The input should be a 3D tensor with (n_cat, N_sample, n_vis).

        W, vbias and hbias default to freshly created shared variables of
        shape (n_cate, n_vis, n_hid), (n_cate, n_vis) and (n_hid,).
        '''
        self.input = input
        self.n_vis = n_vis
        self.n_hid = n_hid
        self.n_cate = n_cate

        if W is None:
            W = theano.shared(
                np.random.normal(
                    size=(self.n_cate, self.n_vis, self.n_hid)).astype(
                        theano.config.floatX),
                borrow=True)
        if vbias is None:
            vbias = theano.shared(
                np.zeros(shape=(self.n_cate, self.n_vis)).astype(
                    theano.config.floatX),
                borrow=True)
        if hbias is None:
            hbias = theano.shared(
                np.zeros(shape=(self.n_hid, )).astype(theano.config.floatX),
                borrow=True)

        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = MRG_RandomStreams(self.numpy_rng.randint(2**30))

        self.W = W
        self.vbias = vbias
        self.hbias = hbias

    def free_energy(self, vis):
        """Return the free energy of ``vis``, summed over axis 0."""
        # The hidden bias is added exactly as in `propup`. The previous
        # `T.addbroadcast(self.hbias, 1)` addressed axis 1 of the bias, which
        # does not exist for the default 1-D (n_hid,) bias.
        vW_b = T.batched_dot(vis, self.W) + self.hbias
        visible_term = T.batched_dot(vis, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(vW_b)), axis=2)
        return T.sum(-hidden_term - visible_term, axis=0)

    def propup(self, vis):
        """Return ``[pre-activation, sigmoid activation]`` of the hiddens."""
        x = T.batched_dot(vis, self.W) + self.hbias
        return [x, T.nnet.sigmoid(x)]

    def propdown(self, hid):
        """Return ``[pre-activation, softmax over axis 0]`` of the visibles."""
        x = T.batched_dot(hid, self.W.dimshuffle(0, 2, 1)) + \
            self.vbias.dimshuffle((0, 'x', 1))
        # Subtract the per-unit maximum before exponentiating.
        e_x = T.exp(x - x.max(axis=0, keepdims=True))
        out = e_x / e_x.sum(axis=0, keepdims=True)
        return [x, out]

    def sample_v_given_h(self, hid):
        """Return ``[pre-activation, probabilities, sample]`` of the visibles.

        One multinomial draw is made per visible unit; the draws are joined
        along axis 2.
        """
        x, out = self.propdown(hid)
        per_unit = [
            self.theano_rng.multinomial(
                n=1, pvals=out[:, :, v].T).dimshuffle(1, 0, 'x')
            for v in range(self.n_vis)
        ]
        # `T.concate` does not exist; the tensor op is `T.concatenate`.
        v_sample = T.concatenate(per_unit, axis=2)
        return [x, out, v_sample]

    def sample_h_given_v(self, vis):
        """Return ``[pre-activation, probabilities, sample]`` of the hiddens."""
        x, out = self.propup(vis)
        h_sample = self.theano_rng.binomial(n=1, p=out, size=out.shape)
        return [x, out, h_sample]
def random_multinomial(shape=None, pvals=None, dtype=_FLOATX, seed=None):
    """Return a symbolic multinomial sample.

    Parameters
    ----------
    shape : passed to the stream's ``multinomial`` as ``size``.
    pvals : event probabilities passed to ``multinomial``.
    dtype : dtype of the sample.
    seed : int, optional
        Seed of the random stream; a random non-zero seed is drawn from
        numpy's global generator when omitted.
    """
    if seed is None:
        # Lower bound 1: a zero seed is rejected by MRG random streams.
        # Integer upper bound: `randint` is defined for integer bounds.
        seed = np.random.randint(1, int(10e6))
    rng = RandomStreams(seed=seed)
    return rng.multinomial(size=shape, pvals=pvals, dtype=dtype)
def sample(p, seed=None):
    """Return a symbolic single-draw multinomial sample from ``p``.

    Parameters
    ----------
    p : event probabilities passed to the stream's ``multinomial``.
    seed : int, optional
        Seed of the random stream; a random non-zero seed is drawn from
        numpy's global generator when omitted.

    The sample has dtype ``theano.config.floatX``.
    """
    if seed is None:
        # Lower bound 1: a zero seed is rejected by MRG random streams.
        # Integer upper bound: `randint` is defined for integer bounds.
        seed = np.random.randint(1, int(10e6))
    rng = RandomStreams(seed=seed)
    return rng.multinomial(n=1, pvals=p, dtype=theano.config.floatX)
class ImportanceSampler():
    '''Implements importance sampling/resampling'''

    def __init__(self, ndims, n_particles, true_log_probs,
                 proposal_func=None):
        '''
        ndims: the dimensionality of each particle
        n_particles: the number of particles to use
        true_log_probs: a function that returns the true relative log
            probabilities
        proposal_func: a function that returns
            (samples, relative_log_probabilities)
        '''
        self.true_log_probs = true_log_probs
        self.proposal_func = proposal_func
        self.n_particles = n_particles
        self.ndims = ndims

        init_particles = np.zeros((n_particles, self.ndims))
        init_weights = np.ones(n_particles) / float(n_particles)

        self.particles = theano.shared(init_particles.astype(np.float32))
        self.weights = theano.shared(init_weights.astype(np.float32))

        self.theano_rng = RandomStreams()

        # Compiled functions; filled in by `compile`.
        self.get_ESS = None
        self.perform_resampling = None
        self.perform_sampling = None

    def set_proposal_func(self, proposal_func):
        '''You might need to use this if you want to make the proposal
        function depend on the current particles'''
        self.proposal_func = proposal_func
        return

    def sample_reweight(self):
        '''Samples new particles and reweights them'''
        samples, prop_log_probs = self.proposal_func()
        true_log_probs = self.true_log_probs(samples)

        diffs = true_log_probs - prop_log_probs
        # Subtract the largest difference before exponentiating: the
        # normalised weights are unchanged, but the exponentials can no
        # longer overflow (and at least one of them equals 1, so the
        # normaliser cannot underflow to zero).
        weights_unnorm = T.exp(diffs - T.max(diffs))
        weights = weights_unnorm / T.sum(weights_unnorm)

        updates = OrderedDict()
        updates[self.weights] = T.cast(weights, 'float32')
        updates[self.particles] = T.cast(samples, 'float32')
        return updates

    def compute_ESS(self):
        '''Returns the effective sample size'''
        return 1.0 / T.sum(self.weights**2)

    def resample(self):
        '''Resamples using the current weights'''
        # One one-hot row per new particle, all drawn from the same weights.
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.weights.dimshuffle('x', 0), self.n_particles, axis=0))
        # Convert each one-hot row into the index it selects.
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')

        updates = OrderedDict()
        updates[self.particles] = self.particles[idxs]
        updates[self.weights] = T.cast(
            T.ones_like(self.weights) / float(self.n_particles), 'float32')
        return updates

    def compile(self):
        '''Compiles the ESS, resampling, and sampling functions'''
        ess = self.compute_ESS()
        self.get_ESS = theano.function([], ess)

        resample_updates = self.resample()
        self.perform_resampling = theano.function([],
                                                  updates=resample_updates)

        sample_updates = self.sample_reweight()
        self.perform_sampling = theano.function([], updates=sample_updates)

        return
m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32") A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64') s2 = numpy.random.randint(0, i32max, 3).astype('int32') m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32") f0.input_storage[0].storage[0] = A1 f0.input_storage[1].storage[0] = s1 f0.input_storage[2].storage[0] = m1 f0.input_storage[3].storage[0] = A2 f0.input_storage[4].storage[0] = s2 f0.input_storage[5].storage[0] = m2 r_a1 = rng_mrg.matVecModM(A1, s1, m1) r_a2 = rng_mrg.matVecModM(A2, s2, m2) f0.fn() r_b = f0.output_storage[0].value assert numpy.allclose(r_a1, r_b[:3]) assert numpy.allclose(r_a2, r_b[3:]) if __name__ == "__main__": rng = MRG_RandomStreams(numpy.random.randint(2147462579)) import time print theano.__file__ pvals = theano.tensor.fmatrix() for i in range(10): t0 = time.time() multinomial = rng.multinomial(pvals=pvals) print time.time() - t0
def test_undefined_grad():
    """Check that gradients through MRG random draws are reported as undefined.

    For every sampler, asking ``theano.grad`` for the gradient of a draw with
    respect to one of its distribution parameters must raise
    ``NullTypeGradError``.
    """
    srng = MRG_RandomStreams(seed=1234)

    def assert_null_grad(cost, wrt):
        # The gradient request itself is the only statement expected to raise.
        with pytest.raises(theano.gradient.NullTypeGradError):
            theano.grad(cost, wrt)

    def as_pvals(entries):
        # Wrap one row of probabilities the way the samplers are called here.
        return [theano.tensor.as_tensor_variable(entries)]

    # checking uniform distribution
    low = tensor.scalar()
    assert_null_grad(srng.uniform((), low=low), low)

    high = tensor.scalar()
    assert_null_grad(srng.uniform((), low=0, high=high), high)

    assert_null_grad(srng.uniform((), low=low, high=high), (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    assert_null_grad(srng.binomial((), p=prob), prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    multinomial_cases = (
        ([prob1, 0.5, 0.25], prob1),
        ([prob1, prob2], (prob1, prob2)),
    )
    for entries, wrt in multinomial_cases:
        counts = srng.multinomial(size=None, pvals=as_pvals(entries), n=4)[0]
        assert_null_grad(theano.tensor.sum(counts), wrt)

    # checking choice
    choice_cases = (
        ([prob1, prob2, 0.1, 0.2], (prob1, prob2)),
        ([prob1, prob2], (prob1, prob2)),
        ([prob1, 0.2, 0.3], prob1),
    )
    for entries, wrt in choice_cases:
        picked = srng.choice(a=None, size=1, p=as_pvals(entries),
                             replace=False)[0]
        assert_null_grad(picked[0], wrt)

    # checking normal distribution, then truncated normal distribution
    for sampler in (srng.normal, srng.truncated_normal):
        avg = tensor.scalar()
        assert_null_grad(sampler((), avg=avg), avg)

        std = tensor.scalar()
        assert_null_grad(sampler((), avg=0, std=std), std)

        assert_null_grad(sampler((), avg=avg, std=std), (avg, std))
class VisibleLayer(object):
    """Visible layer of an RBM-style model.

    Holds the weight matrix ``W`` (v_dim x h_dim) and the visible bias
    ``b_v``, and provides the energy terms, conditional distribution,
    sampler, gradients and reconstruction cost for the input type given by
    ``v_type`` (binary, gaussian, categorical or poisson members of
    ``InputType``).
    """

    def __init__(self, v_dim, h_dim, v_type, mrng=None, rng=None, name=''):
        """
        v_dim: number of visible units
        h_dim: number of hidden units this layer connects to
        v_type: an ``InputType`` member selecting the visible distribution
        mrng: random stream used for binomial/normal/multinomial draws; a
            freshly seeded ``MRG_RandomStreams`` is created when None
        rng: random stream used for Poisson draws; a freshly seeded
            ``RandomStreams`` is created when None
        name: prefix for parameter names, defaults to 'v_layer'
        """
        self.name = name if name != '' else 'v_layer'
        self.v_dim = v_dim
        self.h_dim = h_dim
        self.v_type = v_type
        seed = np.random.randint(1, 2**30)
        self._rng = RandomStreams(seed) if rng is None else rng
        self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng
        self._build_params()

    def set_total_count(self, total_count):
        """Set the total count used to constrain the Poisson rates.

        Raises ValueError when the layer is not of Poisson type.
        """
        if not (self.v_type == InputType.poisson):
            raise ValueError(
                "The input type should be Poisson to set total count")
        self.total_count = total_count

    def _build_params(self):
        """Create W and b_v (and a constant sigma_v for gaussian input)."""
        # W to connect with hidden layer
        self.params = []
        if self.v_type == InputType.poisson:
            # Float division on purpose: with an integer h_dim, Python 2's
            # `-1 / h_dim` and `1 / h_dim` floor to -1 and 0, which silently
            # turned the intended (-1/h_dim, 1/h_dim) range into (-1, 0).
            bound = 1.0 / self.h_dim
            init_W = np.random.uniform(low=-bound,
                                       high=bound,
                                       size=(self.v_dim, self.h_dim))
            self.W = init_weight(self.v_dim,
                                 self.h_dim,
                                 value=init_W,
                                 name=self.name + '-W')
        else:
            self.W = init_weight(self.v_dim,
                                 self.h_dim,
                                 name=self.name + '-W')
        self.b_v = init_bias(self.v_dim, name=self.name + '-b_v')
        # W and b_v are parameters for every input type
        self.params.extend([self.W, self.b_v])
        # The gaussian case additionally has sigma (kept out of self.params)
        if self.v_type == InputType.gaussian:
            self.sigma_v = T.ones(shape=(self.v_dim, ),
                                  dtype=theano.config.floatX)
            self.sigma_v.name = self.name + "-sigma_v"

    # Summed over axis 1, i.e. one value per row of v
    def v_free_term(self, v):
        """Energy term that depends on v only (non-zero for Poisson)."""
        if self.v_type == InputType.poisson:
            return -T.sum(T.gammaln(1 + v), axis=1)
        else:
            return 0

    # One value per row of v
    def v_bias_term(self, v):
        """Bias contribution of v to the (negated) energy."""
        # Note that for gaussian case, the v_bias should be negative
        if self.v_type == InputType.gaussian:
            return -T.sum((v - self.b_v)**2 / (2 * self.sigma_v**2), axis=1)
        else:
            return T.dot(v, self.b_v)

    # Result has one column per hidden unit
    def v_weight_term(self, v):
        """Input that v sends to the hidden units through W."""
        if self.v_type == InputType.gaussian:
            return T.dot((v / (self.sigma_v**2)), self.W)
        else:
            return T.dot(v, self.W)

    # Supports binary, gaussian, categorical and (constrained) poisson
    def v_given_h(self, h):
        """Return the statistic of p(v | h) for the layer's input type.

        binary: sigmoid probabilities; gaussian: means; categorical:
        softmax probabilities; poisson: rates rescaled so that every row
        sums to ``total_count``.  Any other type falls through and returns
        None.

        Raises ValueError for Poisson input when no total count was set.
        """
        if self.v_type == InputType.binary:
            p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T))
            return p_v_h
        elif self.v_type == InputType.gaussian:
            mu_v = self.b_v + T.dot(h, self.W.T)
            return mu_v
        elif self.v_type == InputType.categorical:
            p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T))
            return p_v_h
        elif self.v_type == InputType.poisson:
            if getattr(self, 'total_count', None) is None:
                raise ValueError(
                    'Total count should be set for constrained Poisson')
            unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T))
            lmbd_v = unconstrained_lmbd_v * 1.0 \
                / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \
                * self.total_count
            return lmbd_v

    # Supports binary, gaussian, categorical and poisson
    def sample_v_given_h(self, h0_sample):
        """Sample v from p(v | h0_sample).

        Returns [statistic, sample], where the statistic is what
        ``v_given_h`` returns for the input type.
        """
        if self.v_type == InputType.binary:
            v1_mean = self.v_given_h(h0_sample)
            v1_sample = self._mrng.binomial(size=v1_mean.shape,
                                            n=1,
                                            p=v1_mean,
                                            dtype=theano.config.floatX)
            return [v1_mean, v1_sample]
        elif self.v_type == InputType.gaussian:
            mu_v1 = self.v_given_h(h0_sample)
            # Note that mu_v1 is returned
            v1_sample = self._mrng.normal(size=mu_v1.shape,
                                          avg=mu_v1,
                                          std=self.sigma_v,
                                          dtype=theano.config.floatX)
            return [mu_v1, v1_sample]
        # Note that there is constraint in the case of Multinomial
        elif self.v_type == InputType.categorical:
            prob_v1 = self.v_given_h(h0_sample)
            v1_sample = self._mrng.multinomial(pvals=prob_v1,
                                               n=1,
                                               dtype=theano.config.floatX)
            return [prob_v1, v1_sample]
        elif self.v_type == InputType.poisson:
            lmbd_v1 = self.v_given_h(h0_sample)
            # We have to use RandomStreams, not MRG_RandomStreams
            v1_sample = self._rng.poisson(size=lmbd_v1.shape,
                                          lam=lmbd_v1,
                                          dtype=theano.config.floatX)
            return [lmbd_v1, v1_sample]

    def l1_grad(self, l1):
        """Return [L1 gradient for W, 0 for b_v]."""
        gW = l1_grad(self.W, l1)
        return [gW, 0]

    def l2_grad(self, l2):
        """Return [L2 gradient for W, 0 for b_v]."""
        gW = l2_grad(self.W, l2)
        return [gW, 0]

    def nll_grad_formula(self, v0, vk, h0, hk):
        """Return [gW, gb_v] computed from the data pair (v0, h0) and the
        model pair (vk, hk), averaged over the rows of v0."""
        n_instances = v0.shape[0]
        gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances
        if self.v_type == InputType.gaussian:
            gb_v = T.mean((vk - v0) / (self.sigma_v**2), axis=0)
        else:
            gb_v = T.mean(vk - v0, axis=0)
        return [gW, gb_v]

    def get_viewed_cost(self, v0, vk_stat):
        """Return the per-row reconstruction cost of vk_stat against v0.

        The cost stays at the constant 0 for an input type that matches
        none of the branches below.
        """
        cost = 0
        # Binary cross-entropy
        if self.v_type == InputType.binary:
            # Clip to avoid log(0)
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat) +
                          (1 - v0) * T.log(1 - clip_vk_stat),
                          axis=1)
        # Sum square error
        elif self.v_type == InputType.gaussian:
            cost = T.sum((v0 - vk_stat)**2, axis=1)
        # Categorical cross-entropy
        elif self.v_type == InputType.categorical:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1)
        # Negative Poisson log-likelihood; only the log argument is clipped
        elif self.v_type == InputType.poisson:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum(
                -vk_stat + v0 * T.log(clip_vk_stat) - T.gammaln(1 + v0),
                axis=1)
        return cost

    def get_params(self):
        """Return the list of trainable parameters [W, b_v]."""
        return self.params
class RBM(Model):
    """Restricted Boltzmann machine with binary hidden units.

    The visible distribution is selected by ``input_type`` (binary,
    gaussian, categorical, poisson or replicated_softmax members of
    ``InputType``).  Parameters are ``W`` (v_dim x h_dim), ``b_h`` and
    ``b_v``; ``sigma_v`` is kept for gaussian input but is not trained.

    Progress messages are printed with the single-argument, parenthesised
    form of ``print`` so the output is the same whether ``print`` is a
    statement or a function.
    """

    def __init__(self, v_dim=784, h_dim=500, input_type=InputType.binary,
                 W=None, b_h=None, b_v=None, sigma=None, input_var=None,
                 mrng=None, rng=None, name='', **kwargs):
        """
        v_dim, h_dim: number of visible / hidden units
        input_type: ``InputType`` member selecting the visible distribution
        W, b_h, b_v: existing parameters to reuse; created when None
        sigma: scalar or length-v_dim array of standard deviations, only
            used for gaussian input (defaults to ones)
        input_var: symbolic input matrix; a new matrix is created when None
        mrng, rng: random streams; freshly seeded ones are created when None
        name: model name, defaults to 'rbm'
        kwargs: ``model_file`` -- when given, the model is loaded from that
            file instead of being built from the other arguments
        """
        name = 'rbm' if name == '' else name
        super(RBM, self).__init__(name=name)

        model_file = kwargs.get('model_file')
        if model_file is not None:
            self.load(model_file)
            self._load_params()
        else:
            # v_dim is the dimensions of visible variable v. v_dim = D
            self.v_dim = v_dim
            # h_dim is the dimensions of hidden variable h. h_dim = H
            self.h_dim = h_dim
            self.input_type = input_type

            seed = np.random.randint(1, 2**30)
            self._rng = RandomStreams(seed) if rng is None else rng
            self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng

            self._build_params(W, b_h, b_v, sigma)

        self.input = input_var if input_var is not None else T.matrix('input')

        # Count-based inputs need the per-row total of the input.
        if self.input_type == InputType.poisson or \
                self.input_type == InputType.replicated_softmax:
            self.total_count = T.sum(self.input, axis=1, keepdims=True)

    def _load_params(self):
        """Bind W, b_h and b_v to the entries of the loaded params list."""
        [self.W, self.b_h, self.b_v] = self.params

    def _build_params(self, W, b_h, b_v, sigma):
        """Create (or adopt) W, b_h, b_v and, for gaussian input, sigma_v."""
        self.params = []

        self.W = W if W is not None \
            else init_weight(self.v_dim, self.h_dim, name=self.name+'-W')
        self.b_h = b_h if b_h is not None \
            else init_bias(self.h_dim, name=self.name+'-b_h')
        self.b_v = b_v if b_v is not None \
            else init_bias(self.v_dim, name=self.name+'-b_v')

        # sigma_v is not considered to be a param
        self.params.extend([self.W, self.b_h, self.b_v])

        # The gaussian case additionally has sigma
        self.sigma_v = None
        if self.input_type == InputType.gaussian:
            print("Your input must be whitened to achieve the desire result.")
            if sigma is not None:
                sigma = np.asarray(sigma)
                if sigma.ndim == 0:
                    # A scalar sigma is shared by every input dimension.
                    print("Sigma is set to {} for all input dimensions.".format(sigma))
                    sigma_value = sigma * np.ones((self.v_dim, ),
                                                  dtype=theano.config.floatX)
                else:
                    assert sigma.ndim == 1 and sigma.shape[0] == self.v_dim, \
                        "Sigma must be 1D array with the length of {}".format(self.v_dim)
                    sigma_value = sigma
            else:
                print("Default value of sigma is 1.0 for all input dimensions.")
                sigma_value = np.ones(self.v_dim, dtype=theano.config.floatX)
            self.sigma_v = theano.shared(sigma_value)
            self.sigma_v.name = self.name + "-sigma_v"

    def print_model_info(self):
        """Print the model name, layer sizes and input type."""
        print("\nInfo of model {}".format(self.name))
        print("v_dims: {} | h_dim: {} | input_type: {}".format(
            self.v_dim, self.h_dim, self.input_type))

    def get_save(self):
        """Return the list of attributes to persist (see set_load)."""
        return [self.name, self.v_dim, self.h_dim, self.input_type,
                self._mrng, self._rng, self.params, self.sigma_v]

    def set_load(self, saved_data):
        """Restore the attributes from a list produced by get_save."""
        [self.name, self.v_dim, self.h_dim, self.input_type,
         self._mrng, self._rng, self.params, self.sigma_v] = saved_data

    def score(self, v_data):
        """Return the free energy of every row of v_data."""
        free_fn = theano.function([self.input], self.free_energy(self.input))
        return free_fn(v_data)

    def reconstruct(self, v_data):
        """Return the visible statistic after one deterministic v->h->v pass."""
        h = self.h_given_v(self.input)
        rv = self.v_given_h(h)
        rec_fn = theano.function([self.input], rv)
        return rec_fn(v_data)

    def reconstruct_from_hidden(self, h_data):
        """Return the visible statistic given hidden activations h_data."""
        # New symbolic variable of the same type as the model input.
        h = self.input.type('hidden')
        rv = self.v_given_h(h)
        rec_fn = theano.function([h], rv)
        return rec_fn(h_data)

    def encode(self, v_data):
        """Return the hidden activation probabilities for v_data."""
        h_code = self.h_given_v(self.input)
        fn = theano.function([self.input], h_code)
        return fn(v_data)

    # Energy of many v paired with h (one h per row, or one h for all rows)
    def energy(self, v, h):
        """Return the energy E(v, h), one value per row of v."""
        v_free = self.v_free_term(v)
        v_bias = self.v_bias_term(v)
        v_weight = self.v_weight_term(v)
        # The interaction term must be reduced over the hidden units: the
        # other terms are per-row values, whereas `v_weight * h` on its own
        # is a matrix with one column per hidden unit and cannot be added
        # to them.
        interaction = T.sum(v_weight * h, axis=1)
        return -(v_free + v_bias + interaction + T.dot(h, self.b_h))

    def free_energy(self, v):
        """Return the free energy F(v), one value per row of v."""
        v_free = self.v_free_term(v)
        v_bias = self.v_bias_term(v)
        v_weight = self.v_weight_term(v)
        h_term = T.sum(T.log(1 + T.exp(v_weight + self.b_h)), axis=1)
        return -(v_bias + v_free + h_term)

    def v_weight_term(self, v):
        """Input that v sends to the hidden units through W."""
        if self.input_type == InputType.gaussian:
            return T.dot(v/(self.sigma_v ** 2), self.W)
        else:
            return T.dot(v, self.W)

    def v_bias_term(self, v):
        """Bias contribution of v to the (negated) energy."""
        # Note that for gaussian case, the v_bias should be negative
        if self.input_type == InputType.gaussian:
            return -T.sum((v - self.b_v) ** 2 / (2 * self.sigma_v ** 2), axis=1)
        else:
            return T.dot(v, self.b_v)

    def v_free_term(self, v):
        """Energy term that depends on v only (non-zero for Poisson)."""
        if self.input_type == InputType.poisson:
            return -T.sum(T.gammaln(1 + v), axis=1)
        else:
            return 0

    def rv(self, v):
        """Return the visible statistic after one deterministic v->h->v pass."""
        h = self.h_given_v(v)
        rv = self.v_given_h(h)
        return rv

    def h_given_v(self, v):
        """Return the hidden activation probabilities p(h = 1 | v)."""
        v_weight = self.v_weight_term(v)
        p_h_v = T.nnet.sigmoid(v_weight + self.b_h)
        return p_h_v

    def v_given_h(self, h):
        """Return the statistic of p(v | h) for the model's input type.

        binary: sigmoid probabilities; gaussian: means; categorical and
        replicated_softmax: softmax probabilities; poisson: rates rescaled
        so that every row sums to ``total_count``.  Any other type falls
        through and returns None.

        Raises ValueError for Poisson input when no total count is set.
        """
        if self.input_type == InputType.binary:
            p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T))
            return p_v_h
        elif self.input_type == InputType.gaussian:
            mu_v = self.b_v + T.dot(h, self.W.T)
            return mu_v
        elif self.input_type == InputType.categorical or \
                self.input_type == InputType.replicated_softmax:
            p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T))
            return p_v_h
        elif self.input_type == InputType.poisson:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError('Total count should be set for constrained Poisson')
            unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T))
            lmbd_v = unconstrained_lmbd_v * 1.0 \
                / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \
                * self.total_count
            return lmbd_v

    def sample_h_given_v(self, v0_sample):
        """Sample binary hidden units; returns [h1_mean, h1_sample]."""
        h1_mean = self.h_given_v(v0_sample)
        h1_sample = self._mrng.binomial(size=h1_mean.shape, n=1, p=h1_mean,
                                        dtype=theano.config.floatX)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        """Sample v from p(v | h0_sample).

        Returns [statistic, sample], where the statistic is what
        ``v_given_h`` returns for the input type.

        Raises ValueError for replicated softmax input when no total count
        is set.
        """
        if self.input_type == InputType.binary:
            v1_mean = self.v_given_h(h0_sample)
            v1_sample = self._mrng.binomial(size=v1_mean.shape, n=1, p=v1_mean,
                                            dtype=theano.config.floatX)
            return [v1_mean, v1_sample]
        elif self.input_type == InputType.gaussian:
            mu_v1 = self.v_given_h(h0_sample)
            # Note that mu_v1 is returned
            v1_sample = self._mrng.normal(size=mu_v1.shape, avg=mu_v1,
                                          std=self.sigma_v,
                                          dtype=theano.config.floatX)
            return [mu_v1, v1_sample]
        # Note that there is constraint in the case of Multinomial
        elif self.input_type == InputType.categorical:
            prob_v1 = self.v_given_h(h0_sample)
            # Multinomial with n=1 (It is equal to categorical)
            v1_sample = self._mrng.multinomial(pvals=prob_v1, n=1,
                                               dtype=theano.config.floatX)
            return [prob_v1, v1_sample]
        elif self.input_type == InputType.poisson:
            lmbd_v1 = self.v_given_h(h0_sample)
            # We have to use RandomStreams, not MRG_RandomStreams
            v1_sample = self._rng.poisson(size=lmbd_v1.shape, lam=lmbd_v1,
                                          dtype=theano.config.floatX)
            return [lmbd_v1, v1_sample]
        elif self.input_type == InputType.replicated_softmax:
            if not hasattr(self, 'total_count') or self.total_count is None:
                raise ValueError('Total count should be set for replicated Softmax')
            prob_v1 = self.v_given_h(h0_sample)
            # We have to sample the vocabulary distribution given topic D
            # times and sum over D samples
            # NOTE(review): `ndim` receives a symbolic shape entry here and
            # `n` is a symbolic column; confirm that the multinomial of the
            # Theano version in use accepts both.
            v1_sample = self._mrng.multinomial(pvals=prob_v1,
                                               n=self.total_count,
                                               ndim=prob_v1.shape[1])
            return [prob_v1, v1_sample]

    # One step of gibbs sampling
    def gibbs_hvh(self, h0_sample):
        """One Gibbs step h -> v -> h.

        Returns [v1_stat, v1_sample, h1_mean, h1_sample].
        """
        # Here we use v1_stat to show that it is sufficient statistics of v1
        [v1_stat, v1_sample] = self.sample_v_given_h(h0_sample)
        [h1_mean, h1_sample] = self.sample_h_given_v(v1_sample)
        return [v1_stat, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        """One Gibbs step v -> h -> v.

        Returns [h1_mean, h1_sample, v1_stat, v1_sample].
        """
        [h1_mean, h1_sample] = self.sample_h_given_v(v0_sample)
        [v1_stat, v1_sample] = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_stat, v1_sample]

    def run_CD_from_h(self, k, data_h):
        """Run k Gibbs steps starting from hidden data; return the h samples."""
        start_h = T.matrix("start_h")
        # outputs = [v_stats, v_samples, h_means, h_samples]
        outputs, updates \
            = theano.scan(fn=self.gibbs_hvh,
                          outputs_info=[None, None, None, start_h],
                          n_steps=k,
                          name="gibbs_hvh")
        # NOTE(review): the original comment says "return the last h_sample
        # after k steps", but outputs[-1] is the whole h_samples output of
        # scan, not its last step (cf. `h_samples[-1]` in
        # _gibbs_hvh_to_v_fn) -- confirm what callers expect.
        CD_fn = theano.function([start_h], outputs=outputs[-1], updates=updates)
        return CD_fn(data_h)

    def run_CD_from_v(self, k, data_v):
        """Run k Gibbs steps starting from visible data; return the v samples."""
        start_v = T.matrix("start_v")
        # outputs = [h_means, h_samples, v_stats, v_samples]
        outputs, updates \
            = theano.scan(fn=self.gibbs_vhv,
                          outputs_info=[None, None, None, start_v],
                          n_steps=k,
                          name="gibbs_vhv")
        # NOTE(review): same remark as in run_CD_from_h -- outputs[-1] is
        # the whole v_samples output of scan, not its last step.
        CD_fn = theano.function([start_v], outputs=outputs[-1], updates=updates)
        return CD_fn(data_v)

    # Return visible variables
    def _gibbs_vhv_to_v_fn(self, steps, persis_v, is_sample=True, name=''):
        """Compile a function that advances the persistent visible chain.

        Each call runs `steps` v->h->v Gibbs steps from the shared variable
        `persis_v`, stores the last visible sample back into it, and
        returns that sample (is_sample=True) or the last visible statistic.
        """
        [h_means, h_samples, v_stats, v_samples], updates \
            = theano.scan(self.gibbs_vhv,
                          outputs_info=[None, None, None, persis_v],
                          n_steps=steps,
                          name='gibbs_vhv')
        updates[persis_v] = v_samples[-1]
        result = v_samples[-1] if is_sample else v_stats[-1]
        return theano.function([], result, updates=updates, name=name)

    # Also return visible variables
    def _gibbs_hvh_to_v_fn(self, steps, persis_h, is_sample=True, name=''):
        """Compile a function that advances the persistent hidden chain.

        Each call runs `steps` h->v->h Gibbs steps from the shared variable
        `persis_h`, stores the last hidden sample back into it, and returns
        the last visible sample (is_sample=True) or visible statistic.
        """
        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(self.gibbs_hvh,
                          outputs_info=[None, None, None, persis_h],
                          n_steps=steps,
                          name='gibbs_hvh')
        updates[persis_h] = h_samples[-1]
        result = v_samples[-1] if is_sample else v_stats[-1]
        return theano.function([], result, updates=updates, name=name)

    def sample_given_data(self, v_data, init_gibbs=1000, betw_gibbs=100,
                          loops=10, is_sample=False):
        """Draw visible configurations from chains started at v_data.

        Runs `init_gibbs` initial steps once (skipped when it is 0), then
        `loops` rounds of `betw_gibbs` steps, collecting one result per
        round.  Returns the results stacked in a numpy array.
        """
        print("\nSample data from input using model {}".format(self.name))
        # A 1-D input has to be turned into a single-row matrix
        if len(v_data.shape) == 1:
            persis_v = theano.shared(
                np.asarray(v_data.reshape(1, v_data.shape[0]),
                           dtype=theano.config.floatX))
        else:
            persis_v = theano.shared(
                np.asarray(v_data, dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_vhv_to_v_fn(
                init_gibbs, persis_v, is_sample=True, name='init_sampling_fn')
        else:
            init_sampling_fn = None
        sample_fn = self._gibbs_vhv_to_v_fn(
            betw_gibbs, persis_v, is_sample=is_sample, name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print("Running sampling loop %d" % idx)
            rv_data = sample_fn()
            rvs_data.append(rv_data)
        return np.asarray(rvs_data)

    # Sample randomly
    # We start from h and run gibbs chain until it reaches equilibrium
    def sample(self, init_gibbs=1000, betw_gibbs=100, n_samples=20, loops=10,
               is_sample=False):
        """Draw visible configurations from chains started at all-zero h.

        Runs `init_gibbs` initial steps once (skipped when it is 0), then
        `loops` rounds of `betw_gibbs` steps on `n_samples` parallel chains,
        collecting one result per round.  Returns the results stacked in a
        numpy array.
        """
        print("\nSample random data using model {}".format(self.name))
        persis_h = theano.shared(
            np.zeros((n_samples, self.h_dim), dtype=theano.config.floatX))

        if init_gibbs > 0:
            init_sampling_fn = self._gibbs_hvh_to_v_fn(
                init_gibbs, persis_h, is_sample=True, name='init_sampling_fn')
        else:
            init_sampling_fn = None
        sample_fn = self._gibbs_hvh_to_v_fn(
            betw_gibbs, persis_h, is_sample=is_sample, name='sample_fn')

        rvs_data = []
        if init_sampling_fn is not None:
            init_sampling_fn()
        for idx in range(loops):
            print("Running sampling loop %d" % idx)
            rv_data = sample_fn()
            rvs_data.append(rv_data)
        return np.asarray(rvs_data)

    def get_cost_udpates(self, lr, k, persis_h, l1, l2, stable_update, store_grad):
        """Build the monitoring cost and the parameter updates for CD-k.

        The chain starts from `persis_h` when it is given (and the shared
        variable is then advanced), otherwise from a hidden sample of the
        input.  Returns (cost, updates).  The spelling of the method name
        is kept as is because _build_train calls it by this name.
        """
        # Run one sample step to get h
        h_mean, h_sample = self.sample_h_given_v(self.input)

        # Run normal CD
        start_h = persis_h if persis_h is not None else h_sample
        [v_stats, v_samples, h_means, h_samples], updates \
            = theano.scan(fn=self.gibbs_hvh,
                          outputs_info=[None, None, None, start_h],
                          n_steps=k,
                          name="gibbs_hvh")

        vk = v_samples[-1]
        v_stat_k = v_stats[-1]

        if persis_h is not None:
            updates[persis_h] = h_samples[-1]

        cost = self.get_viewed_cost(self.input, v_stat_k)
        cost = T.mean(cost)

        # For stable update, use mean value instead of random sampled value
        if stable_update:
            print("\nStable update is set to be True")
            updates = self.params_updates(self.input, v_stat_k, lr, l1, l2,
                                          updates, store_grad)
        else:
            print("\nStable update is set to be False")
            updates = self.params_updates(self.input, vk, lr, l1, l2,
                                          updates, store_grad)

        return cost, updates

    def get_viewed_cost(self, v0, vk_stat):
        """Return the per-row reconstruction cost of vk_stat against v0."""
        cost = 0
        # Binary cross-entropy (clipped to avoid log(0))
        if self.input_type == InputType.binary:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat)
                          + (1 - v0) * T.log(1 - clip_vk_stat), axis=1)
        # Sum square error
        elif self.input_type == InputType.gaussian:
            cost = T.sum((v0 - vk_stat) ** 2, axis=1)
        # Categorical cross-entropy
        elif self.input_type == InputType.categorical:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001),
                                  np.float32(0.999999))
            cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1)
        # Negative Poisson log-likelihood
        elif self.input_type == InputType.poisson:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum(-vk_stat + v0 * T.log(clip_vk_stat)
                          - T.gammaln(1 + v0), axis=1)
        # Cross-entropy against the input normalised by its total count
        elif self.input_type == InputType.replicated_softmax:
            clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf)
            cost = -T.sum((v0 / self.total_count) * T.log(clip_vk_stat), axis=1)
        return cost

    def params_updates(self, v0, vk, lr, l1, l2, updates, store_grad):
        """Assemble the parameter updates from the CD gradients.

        Adds the optional L1/L2 gradients of W to the negative
        log-likelihood gradients, optionally stores every gradient group in
        ``self.stored_grads``, and applies either the configured learning
        algorithm or plain SGD.  Returns the updates dictionary.
        """
        if updates is None:
            updates = OrderedDict()

        if store_grad:
            self.stored_grads = OrderedDict()

        n_params = len(self.params)
        o_grads = self.nll_grad_formula(v0, vk)
        # Only the first n_params gradients are used for the update.
        grads = [0 + o_grads[i] for i in range(n_params)]

        if store_grad:
            print("\nGradients over negative log-likelihood are stored in original_grads")
            o_shared_grads, updates = store_grads_in_update(self.params, o_grads, updates)
            self.stored_grads['original_grads'] = o_shared_grads

        if l1 is not None:
            print("Add L1 regularization ({}) to parameter updates".format(l1))
            l1_gW = l1_grad(self.W, l1)
            grads[0] = grads[0] + l1_gW
            if store_grad:
                print("\nGradients over L1 regularization are stored in l1_grads")
                l1_shared_grads, updates = store_grads_in_update([self.W], [l1_gW], updates)
                self.stored_grads['l1_grads'] = l1_shared_grads

        if l2 is not None:
            print("Add L2 regularization ({}) to parameter updates".format(l2))
            l2_gW = l2_grad(self.W, l2)
            grads[0] = grads[0] + l2_gW
            if store_grad:
                print("\nGradients over L2 regularization are stored in l2_grads")
                l2_shared_grads, updates = store_grads_in_update([self.W], [l2_gW], updates)
                self.stored_grads['l2_grads'] = l2_shared_grads

        if store_grad:
            print("\nGradients over total cost are stored in total_grads")
            t_shared_grads, updates = store_grads_in_update(self.params, grads, updates)
            self.stored_grads['total_grads'] = t_shared_grads

        grads = [grad.astype(theano.config.floatX) for grad in grads]

        if self.check_learning_algor():
            params_updates = self.learning_algor(grads, self.params, lr,
                                                 **self.learning_config)
            updates.update(params_updates)
        else:
            print("\nSimple SGD is used as training algorithm")
            for grad, param in zip(grads, self.params):
                updates[param] = param - grad * lr

        return updates

    def nll_grad_formula(self, v0, vk):
        """Return the closed-form CD gradients [gW, gb_h, gb_v].

        For gaussian input a fourth entry ``gz_v`` is appended.
        """
        n_instances = v0.shape[0]
        h0 = self.h_given_v(v0)
        hk = self.h_given_v(vk)

        gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances
        gb_h = T.mean(hk - h0, axis=0)
        if self.input_type == InputType.gaussian:
            gb_v = T.mean((vk - v0) / (self.sigma_v ** 2), axis=0)
            ugz_v = (((vk - self.b_v) ** 2 - 2 * vk * T.dot(hk, self.W.T)) -
                     ((v0 - self.b_v) ** 2 - 2 * v0 * T.dot(h0, self.W.T))) \
                / (self.sigma_v ** 2)
            gz_v = T.mean(ugz_v, axis=0)
            # NOTE(review): gz_v has no matching entry in self.params;
            # params_updates only consumes the first len(self.params)
            # gradients, yet passes the full list to store_grads_in_update.
            grads = [gW, gb_h, gb_v, gz_v]
        else:
            gb_v = T.mean(vk - v0, axis=0)
            grads = [gW, gb_h, gb_v]
        return grads

    def nll_grad_theano(self, v0, vk):
        """Return the CD gradients obtained by differentiating free energies."""
        cost = T.mean(self.free_energy(v0)) - T.mean(self.free_energy(vk))
        # Note here we have to use consider_constant
        grads = T.grad(cost, self.params, consider_constant=[vk])
        return grads

    def grad_check(self, data_v0, data_vk):
        """Compare the formula gradients with the Theano gradients.

        data_v0 and data_vk are numpy arrays; data_vk is computed by calling
        CD-k.  Prints max/min/mean absolute differences per parameter and
        returns the list of difference arrays.
        """
        v0 = T.matrix('v0')
        vk = T.matrix('vk')
        theano_grads = self.nll_grad_theano(v0, vk)
        formula_grads = self.nll_grad_formula(v0, vk)

        grad_diffs = [abs(t_grad - f_grad)
                      for t_grad, f_grad in zip(theano_grads, formula_grads)]
        grad_test_fn = theano.function([v0, vk], grad_diffs)
        diffs_results = grad_test_fn(data_v0, data_vk)

        for i in range(len(self.params)):
            if self.params[i].name is not None:
                name = self.params[i].name
            else:
                name = ""
            print(("Max " + name + " diffs: {}").format(np.max(diffs_results[i])))
            print(("Min " + name + " diffs: {}").format(np.min(diffs_results[i])))
            print(("Average " + name + " diffs: {}").format(np.mean(diffs_results[i])))
            print("")

        return diffs_results

    def config_train(self, **kwargs):
        """Read the training options and build the training functions.

        Recognised keys: CD_k, persis_h, L1, L2, stable_update (default
        False) and store_grad (default False).
        """
        k = kwargs.get('CD_k')
        persis_h_data = kwargs.get('persis_h')
        l1 = kwargs.get('L1')
        l2 = kwargs.get('L2')
        if l1 is None:
            print("L1 should be set to enable sparse weight regularization")
        if l2 is None:
            print("L2 should be set to enable sparse weight regularization")

        stable_update = kwargs.get('stable_update')
        if stable_update is None:
            stable_update = False

        store_grad = kwargs.get('store_grad')
        if store_grad is None:
            store_grad = False

        self._build_train(k, persis_h_data, l1, l2, stable_update, store_grad)

    # persis_h_data is a numpy array (or None for non-persistent CD)
    def _build_train(self, k, persis_h_data, l1, l2, stable_update, store_grad):
        """Compile ``self.train_fn`` (input, lr -> cost) and ``self.valid_fn``
        (input -> reconstruction cost)."""
        print("\nBuild training function of model {}".format(self.name))
        if persis_h_data is not None:
            persis_h = theano.shared(persis_h_data, borrow=True)
        else:
            persis_h = None

        lr = T.scalar('lr')
        cost, updates = self.get_cost_udpates(lr, k, persis_h, l1, l2,
                                              stable_update, store_grad)
        print("\nBuild computation graph for training function of model {}".format(self.name))
        self.train_fn = theano.function([self.input, lr], cost, updates=updates)

        rv = self.v_given_h(self.h_given_v(self.input))
        test_cost = self.get_viewed_cost(self.input, rv)
        test_cost = T.mean(test_cost)
        print("\nBuild computation graph for validation function of model {}".format(self.name))
        self.valid_fn = theano.function([self.input], test_cost)
class TextDecoder(EncoderDecoderBase):
    """MLP decoder with three softmax output heads.

    A single hidden layer (``Wd_in``/``bd_in``) feeds three softmax heads
    named first, second and third, each over ``segmentation_token_count``
    classes.  Optionally every head is selected by the previous speaker
    class.  ``build_decoder`` can return probabilities (EVALUATION,
    BEAM_SEARCH) or samples (SAMPLING).
    """

    EVALUATION = 1
    SAMPLING = 2
    BEAM_SEARCH = 3

    def __init__(self, state, rng, parent):
        EncoderDecoderBase.__init__(self, state, rng, parent)
        self.trng = MRG_RandomStreams(self.seed)
        self.init_params()

    def init_params(self):
        """Compute the decoder input width and create all parameters."""
        # Width of the decoder input: qdim, doubled for a bidirectional
        # encoder and doubled again unless the encoder inputs are combined
        # multiplicatively.
        width = self.qdim * 2 if self.bidirectional_encoder else self.qdim
        if not self.multiplicative_input_from_encoders:
            width = width * 2
        if self.use_precomputed_features:
            width += self.precomputed_features_count
        self.input_dim = width

        # Decoder weights
        self.Wd_in = add_to_params(
            self.params,
            theano.shared(value=NormalInit(self.rng, self.input_dim,
                                           self.mlp_out_dim),
                          name='Wd_in'))
        self.bd_in = add_to_params(
            self.params,
            theano.shared(value=np.zeros((self.mlp_out_dim, ),
                                         dtype='float32'),
                          name='bd_in'))

        # The three heads only differ by name.  They are created in the
        # order first, second, third, weight before bias.
        n_classes = self.segmentation_token_count
        for suffix in ('first', 'second', 'third'):
            if self.condition_on_previous_speaker_class:
                # One weight matrix / bias row per previous speaker class.
                weight_value = NormalInit3D(self.rng, n_classes,
                                            self.mlp_out_dim, n_classes)
                bias_shape = (n_classes, n_classes)
            else:
                weight_value = NormalInit(self.rng, self.mlp_out_dim,
                                          n_classes)
                bias_shape = (n_classes, )

            weight = add_to_params(
                self.params,
                theano.shared(value=weight_value,
                              name='Wd_softmax_' + suffix))
            setattr(self, 'Wd_softmax_' + suffix, weight)

            # The bias names carry a double underscore.
            bias = add_to_params(
                self.params,
                theano.shared(value=np.zeros(bias_shape, dtype='float32'),
                              name='bd_softmax__' + suffix))
            setattr(self, 'bd_softmax_' + suffix, bias)

    def build_next_probs_predictor(self, inp, x, prev_state):
        """Return the output probabilities for beam search.

        Forwards to ``build_decoder`` in BEAM_SEARCH mode with ``x`` in the
        ``y`` position.

        NOTE(review): ``build_decoder`` as defined in this class has no
        ``prev_state`` parameter, so this call looks like it cannot succeed
        -- confirm the intended argument (``y_prev``?) before relying on it.
        """
        return self.build_decoder(inp,
                                  x,
                                  mode=TextDecoder.BEAM_SEARCH,
                                  prev_state=prev_state)

    def build_decoder(self, decoder_inp, y=None, y_prev=None, mode=EVALUATION):
        """Build the decoder graph.

        decoder_inp: input to the hidden layer
        y: targets, indexed as y[0] and y[1] in EVALUATION mode
        y_prev: previous speaker classes, indexed as y_prev[0] when the
            heads are conditioned on the previous speaker class
        mode: EVALUATION returns (outputs, target_outputs); BEAM_SEARCH
            returns outputs; SAMPLING returns the concatenated samples of
            the three heads; any other value returns None.

        Raises Exception for an unknown ``mlp_activation_function``.
        """
        # Run the decoder
        activation = self.mlp_activation_function
        if activation not in ('tanh', 'rectifier', 'linear'):
            raise Exception("Invalid activation function specified for MLP!")

        pre_activation = T.dot(decoder_inp, self.Wd_in) + self.bd_in
        if activation == 'tanh':
            hidden_activation = T.tanh(pre_activation)
        elif activation == 'rectifier':
            hidden_activation = relu(pre_activation)
        else:
            hidden_activation = pre_activation

        conditioned = self.condition_on_previous_speaker_class

        def head_probs(weight, bias):
            # Softmax of one head, optionally picking the weights and bias
            # that belong to the previous speaker class.
            if conditioned:
                logits = T.dot(hidden_activation,
                               weight[y_prev[0]][0, :, :]) + bias[y_prev[0]]
            else:
                logits = T.dot(hidden_activation, weight) + bias
            return T.nnet.softmax(logits)

        first_output = head_probs(self.Wd_softmax_first,
                                  self.bd_softmax_first)
        second_output = head_probs(self.Wd_softmax_second,
                                   self.bd_softmax_second)
        third_output = head_probs(self.Wd_softmax_third,
                                  self.bd_softmax_third)
        outputs = T.concatenate([first_output, second_output, third_output])

        # EVALUATION: return the outputs and the target probabilities
        if mode == TextDecoder.EVALUATION:
            # NOTE(review): the third head is scored against y[1], the same
            # targets as the second head -- confirm this is not meant to be
            # a separate target row.
            target_outputs = T.concatenate([
                GrabProbs(first_output, y[0]),
                GrabProbs(second_output, y[1]),
                GrabProbs(third_output, y[1]),
            ])
            return outputs, target_outputs

        # BEAM SEARCH: return the outputs only
        if mode == TextDecoder.BEAM_SEARCH:
            return outputs

        # SAMPLING: return a vector with one sample per head and row
        if mode == TextDecoder.SAMPLING:
            samples = [
                self.trng.multinomial(pvals=probs,
                                      dtype='int64').argmax(axis=-1)
                for probs in (first_output, second_output, third_output)
            ]
            return T.concatenate(samples)

        return None
def theano_multinomial(n, pvals, seed):
    """Return a float32 multinomial draw from a newly seeded random stream.

    n: forwarded as the ``n`` argument of the stream's multinomial
    pvals: forwarded as the ``pvals`` argument
    seed: seed of the ``RandomStreams`` object created for this call
    """
    stream = RandomStreams(seed)
    draw = stream.multinomial(n=n, pvals=pvals, dtype='float32')
    return draw
# Option-critic network built from Theano graphs. It wires together:
#   * a state encoder (state_model) and its target copy (state_model_prime),
#   * a Q-value head over options (Q_model) and its target copy,
#   * a termination head with one sigmoid output per option,
#   * a per-option action policy (options_model, an MLP3D),
# and compiles the functions used to train and query them. Data is fed
# through shared buffers (x_shared, next_x_shared, a_shared, o_shared,
# r_shared, terminal_shared) that the public methods fill via set_value.
class OptionCritic_Network():
    # Build every symbolic graph and compile the Theano functions.
    #
    # model_network:   list of layer-spec dicts. All but the last entry form
    #                  the state encoder; the last entry is deep-copied as the
    #                  template for the termination and Q heads, and
    #                  model_network[-2]["out_size"] is the head input width.
    #                  Required in practice: the None default fails on the
    #                  first slice.
    # gamma:           discount factor in the TD target.
    # learning_method, learning_params: forwarded to
    #                  Q_model.get_learning_method(method, **learning_params);
    #                  learning_params must be a mapping (None fails at `**`).
    # actor_lr:        learning rate of the "sgd" learner used for the actor.
    # batch_size:      leading dimension of all shared buffers.
    # input_size:      list; input_size[1:] gives the trailing buffer dims.
    #                  Required in practice (None cannot be sliced).
    # dnn_type:        forwarded to every Model(...).
    # clip_delta:      if > 0, TD cost is quadratic up to clip_delta and
    #                  linear beyond it; otherwise plain 0.5 * error**2.
    # scale:           inputs are divided by this before the state encoder.
    # freeze_interval: if > 1, a target-copy function is compiled and run once.
    # grad_clip:       forwarded to learning_algo.apply for critic and actor.
    # termination_reg: constant added to (Q - V) in the termination objective.
    # num_options:     output size of the termination and Q heads.
    # double_q:        choose the next option with next_Q but evaluate it
    #                  with next_Q_prime when forming the target.
    # temp:            forwarded to MLP3D.
    # entropy_reg:     weight of the entropy term in the policy objective.
    # BASELINE:        if true, (y - disc_Q) replaces y in the policy term.
    # **kwargs:        accepted and ignored.
    def __init__(self,
                 model_network=None,
                 gamma=0.99,
                 learning_method="rmsprop",
                 actor_lr=0.00025,
                 batch_size=32,
                 input_size=None,
                 learning_params=None,
                 dnn_type=True,
                 clip_delta=0,
                 scale=255.,
                 freeze_interval=100,
                 grad_clip=0,
                 termination_reg=0,
                 num_options=8,
                 double_q=False,
                 temp=1,
                 entropy_reg=0,
                 BASELINE=False,
                 **kwargs):
        # Symbolic placeholders; most are replaced by the shared buffers
        # through `givens` when the functions are compiled below.
        x = T.ftensor4()
        next_x = T.ftensor4()
        a = T.ivector()
        o = T.ivector()
        r = T.fvector()
        terminal = T.ivector()
        self.freeze_interval = freeze_interval

        self.theano_rng = MRG_RandomStreams(1000)

        # Shared buffers the public methods write into before calling the
        # compiled functions.
        self.x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.next_x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.a_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
        self.o_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
        self.terminal_shared = theano.shared(
            np.zeros((batch_size), dtype='int32'))
        self.r_shared = theano.shared(np.zeros((batch_size), dtype='float32'))

        # Derive the per-head layer specs from the last layer of the
        # user-supplied network description.
        state_network = model_network[:-1]
        termination_network = copy.deepcopy([model_network[-1]])
        termination_network[0]["activation"] = "sigmoid"
        print "NUM OPTIONS --->", num_options
        termination_network[0]["out_size"] = num_options
        # NOTE(review): option_network is prepared here but never referenced
        # again in this class (the policy is built by MLP3D below).
        option_network = copy.deepcopy([model_network[-1]])
        option_network[0]["activation"] = "softmax"
        Q_network = copy.deepcopy([model_network[-1]])
        Q_network[0]["out_size"] = num_options

        self.state_model = Model(state_network,
                                 input_size=input_size,
                                 dnn_type=dnn_type)
        self.state_model_prime = Model(state_network,
                                       input_size=input_size,
                                       dnn_type=dnn_type)
        output_size = [None, model_network[-2]["out_size"]]
        self.Q_model = Model(Q_network,
                             input_size=output_size,
                             dnn_type=dnn_type)
        self.Q_model_prime = Model(Q_network,
                                   input_size=output_size,
                                   dnn_type=dnn_type)
        self.termination_model = Model(termination_network,
                                       input_size=output_size,
                                       dnn_type=dnn_type)
        self.options_model = MLP3D(num_options, model_network, temp=temp)

        # State features for current and next observations (online and
        # target encoders).
        s = self.state_model.apply(x / scale)
        next_s = self.state_model.apply(next_x / scale)
        next_s_prime = self.state_model_prime.apply(next_x / scale)

        # Termination probabilities; gradients are blocked from flowing into
        # the state encoder.
        termination_probs = self.termination_model.apply(
            theano.gradient.disconnected_grad(s))
        option_term_prob = termination_probs[T.arange(o.shape[0]), o]
        next_termination_probs = self.termination_model.apply(
            theano.gradient.disconnected_grad(next_s))
        next_option_term_prob = next_termination_probs[T.arange(o.shape[0]), o]
        # Terminate where the option's termination probability exceeds a
        # uniform draw.
        termination_sample = T.gt(option_term_prob,
                                  self.theano_rng.uniform(size=o.shape))

        Q = self.Q_model.apply(s)
        next_Q = self.Q_model.apply(next_s)
        next_Q_prime = theano.gradient.disconnected_grad(
            self.Q_model_prime.apply(next_s_prime))
        disc_option_term_prob = theano.gradient.disconnected_grad(
            next_option_term_prob)

        # Action distribution of the active option and a sampled action
        # (argmax over the multinomial draw).
        action_probs = self.options_model.apply(s, o)
        sampled_actions = T.argmax(self.theano_rng.multinomial(
            pvals=action_probs, n=1), axis=1).astype("int32")

        # TD target: reward plus discounted mix of "continue with the same
        # option" and "terminate and pick the best option", weighted by the
        # next-state termination probability.
        if double_q:
            print "TRAINING DOUBLE_Q"
            y = r + (1 - terminal) * gamma * (
                (1 - disc_option_term_prob) *
                next_Q_prime[T.arange(o.shape[0]), o] + disc_option_term_prob *
                next_Q_prime[T.arange(next_Q.shape[0]),
                             T.argmax(next_Q, axis=1)])
        else:
            y = r + (1 - terminal) * gamma * (
                (1 - disc_option_term_prob) *
                next_Q_prime[T.arange(o.shape[0]), o] +
                disc_option_term_prob * T.max(next_Q_prime, axis=1))

        # The target is treated as a constant in all gradients.
        y = theano.gradient.disconnected_grad(y)

        option_Q = Q[T.arange(o.shape[0]), o]
        td_errors = y - option_Q

        if clip_delta > 0:
            quadratic_part = T.minimum(abs(td_errors), clip_delta)
            linear_part = abs(td_errors) - quadratic_part
            td_cost = 0.5 * quadratic_part**2 + clip_delta * linear_part
        else:
            td_cost = 0.5 * td_errors**2

        # critic updates
        critic_cost = T.sum(td_cost)
        critic_params = self.Q_model.params + self.state_model.params
        learning_algo = self.Q_model.get_learning_method(
            learning_method, **learning_params)
        grads = T.grad(critic_cost, critic_params)
        critic_updates = learning_algo.apply(critic_params,
                                             grads,
                                             grad_clip=grad_clip)

        # actor updates
        actor_params = self.termination_model.params + self.options_model.params
        learning_algo = self.termination_model.get_learning_method("sgd",
                                                                   lr=actor_lr)
        disc_Q = theano.gradient.disconnected_grad(option_Q)
        disc_V = theano.gradient.disconnected_grad(T.max(Q, axis=1))
        # Despite the *_grad names these are scalar objectives; their sum is
        # differentiated below.
        term_grad = T.sum(option_term_prob *
                          (disc_Q - disc_V + termination_reg))
        entropy = -T.sum(action_probs * T.log(action_probs))
        if not BASELINE:
            policy_grad = - \
                T.sum(
                    T.log(action_probs[T.arange(a.shape[0]), a]) * y) - entropy_reg*entropy
        else:
            policy_grad = - \
                T.sum(T.log(action_probs[T.arange(a.shape[0]), a])
                      * (y-disc_Q)) - entropy_reg*entropy
        grads = T.grad(term_grad + policy_grad, actor_params)
        actor_updates = learning_algo.apply(actor_params,
                                            grads,
                                            grad_clip=grad_clip)

        if self.freeze_interval > 1:
            # Target parameters are overwritten with the online parameters.
            target_updates = OrderedDict()
            for t, b in zip(
                    self.Q_model_prime.params + self.state_model_prime.params,
                    self.Q_model.params + self.state_model.params):
                target_updates[t] = b
            self._update_target_params = theano.function(
                [], [], updates=target_updates)
            self.update_target_params()
            print "freeze interval:", self.freeze_interval
        else:
            print "freeze interval: None"

        critic_givens = {
            x: self.x_shared,
            o: self.o_shared,
            r: self.r_shared,
            terminal: self.terminal_shared,
            next_x: self.next_x_shared
        }

        actor_givens = {
            a: self.a_shared,
            r: self.r_shared,
            terminal: self.terminal_shared,
            o: self.o_shared,
            next_x: self.next_x_shared
        }

        print "compiling...",
        # The critic reads everything from shared buffers; the actor and the
        # sampling functions take the state features `s` as an explicit input.
        self.train_critic = theano.function([], [critic_cost],
                                            updates=critic_updates,
                                            givens=critic_givens)
        self.train_actor = theano.function([s], [],
                                           updates=actor_updates,
                                           givens=actor_givens)
        self.pred_score = theano.function([],
                                          T.max(Q, axis=1),
                                          givens={x: self.x_shared})
        self.sample_termination = theano.function(
            [s], [termination_sample, T.argmax(Q, axis=1)],
            givens={o: self.o_shared})
        self.sample_options = theano.function([s], T.argmax(Q, axis=1))
        self.sample_actions = theano.function([s],
                                              sampled_actions,
                                              givens={o: self.o_shared})
        self.get_action_dist = theano.function([s, o], action_probs)
        self.get_s = theano.function([], s, givens={x: self.x_shared})
        print "complete"

    # Copy online Q/state parameters into the target networks; a no-op when
    # freeze_interval <= 1 (the copy function is not compiled in that case).
    def update_target_params(self):
        if self.freeze_interval > 1:
            self._update_target_params()
        return

    # Return the argmax-Q option for state features `s`.
    def predict_move(self, s):
        return self.sample_options(s)

    # Return (termination_sample, argmax-Q option) for state features `s`.
    # NOTE(review): `a` is written to a_shared, but sample_termination was
    # compiled with givens={o: self.o_shared} and does not read a_shared, so
    # the result depends on whatever o_shared currently holds (e.g. from a
    # previous get_action call). Confirm whether o_shared was intended here.
    def predict_termination(self, s, a):
        self.a_shared.set_value(a)
        return tuple(self.sample_termination(s))

    # Load observations `x` into x_shared and return max-over-options Q with
    # a trailing axis added.
    def get_q_vals(self, x):
        self.x_shared.set_value(x)
        return self.pred_score()[:, np.newaxis]

    # Load observations `x` into x_shared and return the state features.
    def get_state(self, x):
        self.x_shared.set_value(x)
        return self.get_s()

    # Load options `o` into o_shared and sample actions for features `s`.
    def get_action(self, s, o):
        self.o_shared.set_value(o)
        return self.sample_actions(s)

    # Run one training step of either the critic or the actor.
    # next_x, options, r and terminal are always written to their shared
    # buffers. For model == "critic", train_set_x holds raw observations
    # (written to x_shared); for model == "actor", train_set_x is passed as
    # the state-feature input `s` and `actions` is written to a_shared.
    # Any other model name prints a message and raises NotImplementedError.
    def train_conv_net(self,
                       train_set_x,
                       next_x,
                       options,
                       r,
                       terminal,
                       actions=None,
                       model=""):
        self.next_x_shared.set_value(next_x)
        self.o_shared.set_value(options)
        self.r_shared.set_value(r)
        self.terminal_shared.set_value(terminal)
        if model == "critic":
            self.x_shared.set_value(train_set_x)
            return self.train_critic()
        elif model == "actor":
            self.a_shared.set_value(actions)
            return self.train_actor(train_set_x)
        else:
            print "WRONG MODEL NAME"
            raise NotImplementedError

    # Return the saved parameters of the four online sub-models, in the
    # order expected by load_params (state, Q, termination, options).
    def save_params(self):
        return [
            self.state_model.save_params(),
            self.Q_model.save_params(),
            self.termination_model.save_params(),
            self.options_model.save_params()
        ]

    # Restore parameters from a list produced by save_params. The target
    # (prime) models are not touched here.
    def load_params(self, values):
        self.state_model.load_params(values[0])
        self.Q_model.load_params(values[1])
        self.termination_model.load_params(values[2])
        self.options_model.load_params(values[3])
class ParticleFilter():
    '''
    Implements particle filtering and smoothing for Markov Chains
    with arbitrary proposal/true distributions.

    Particles and weights for the last ``n_history + 1`` time points are kept
    in two shared variables used as circular buffers; ``time_counter``
    selects which slot is "current". Most methods only build symbolic
    graphs / update dictionaries; ``recompile`` turns them into callable
    Theano functions.
    '''
    def __init__(self,
                 transition_model,
                 observation_model,
                 n_particles,
                 observation_input=None,
                 n_history=1):
        '''Create the shared particle/weight buffers and compile the
        model-independent functions.

        transition_model / observation_model: objects providing
            ``output_dims``, ``rel_log_prob`` and ``get_samples_noprobs``
            (the attributes this class reads from them).
        n_particles: number of particles per time slot.
        observation_input: symbolic observation consumed by ``sample_update``
            and ``sr_step``; it must be set before ``recompile`` is called,
            since both call ``dimshuffle`` on it.
        n_history: number of past time points kept besides the current one.
        '''
        self.transition_model = transition_model
        self.observation_model = observation_model

        self.data_dims = observation_model.output_dims
        self.state_dims = transition_model.output_dims

        self.n_particles = n_particles
        self.n_history = n_history

        #this is used to keep track of what set of particles corresponds
        #to the previous point in time
        self.time_counter = theano.shared(0)

        self.theano_rng = RandomStreams()

        #init_particles=np.zeros((n_history+1, n_particles, self.state_dims)).astype(np.float32)
        # Particles start as standard-normal draws with uniform weights.
        init_particles = np.random.randn(n_history + 1, n_particles,
                                         self.state_dims).astype(np.float32)
        init_weights = (np.ones((n_history + 1, n_particles)) /
                        float(n_particles)).astype(np.float32)

        self.particles = theano.shared(init_particles)
        self.weights = theano.shared(init_weights)

        # Symbolic views into the circular buffers, relative to time_counter.
        self.next_state = self.particles[(self.time_counter + 1) %
                                         (self.n_history + 1)]
        self.current_state = self.particles[self.time_counter %
                                            (self.n_history + 1)]
        self.previous_state = self.particles[(self.time_counter - 1) %
                                             (self.n_history + 1)]

        self.next_weights = self.weights[(self.time_counter + 1) %
                                         (self.n_history + 1)]
        self.current_weights = self.weights[self.time_counter %
                                            (self.n_history + 1)]
        self.previous_weights = self.weights[(self.time_counter - 1) %
                                             (self.n_history + 1)]

        self.proposal_distrib = None

        self.true_log_transition_probs = self.transition_model.rel_log_prob
        self.true_log_observation_probs = self.observation_model.rel_log_prob

        # These are filled in by recompile().
        self.perform_inference = None
        self.resample = None
        self.sample_joint = None

        self.observation_input = observation_input

        ess = self.compute_ESS()
        self.get_ESS = theano.function([], ess)

        n_samps = T.lscalar()
        n_T = T.lscalar()
        data_samples, state_samples, init_state_samples, data_sample_updates = self.sample_future(
            n_samps, n_T)
        self.sample_from_future = theano.function(
            [n_samps, n_T], [data_samples, state_samples, init_state_samples],
            updates=data_sample_updates)

        self.get_current_particles = theano.function([], self.current_state)
        self.get_current_weights = theano.function([], self.current_weights)

    def recompile(self):
        '''This function compiles each of the theano functions that might
        change following a change of the model.

        Requires ``proposal_distrib`` (see ``set_proposal``) and
        ``observation_input`` to be set, because ``sample_update`` unpacks
        the former and calls ``dimshuffle`` on the latter.
        '''
        samp_updates = self.sample_update(self.observation_input)
        self.perform_inference = theano.function([], updates=samp_updates)

        res_updates = self.resample_update()
        self.resample = theano.function([], updates=res_updates)

        nsamps = T.lscalar()
        joint_samples, joint_updates = self.sample_from_joint(nsamps)
        self.sample_joint = theano.function([nsamps],
                                            joint_samples,
                                            updates=joint_updates)

        new_ess, stddevhist, esshist, sr_updates = self.sequential_resample()
        self.perform_sequential_resampling = theano.function(
            [], [new_ess, stddevhist, esshist], updates=sr_updates)

        csamps = self.sample_current(nsamps)
        self.sample_current_state = theano.function([nsamps], csamps)

        psamps = self.sample_prev(nsamps)
        self.sample_previous_state = theano.function([nsamps], psamps)

        return

    def set_proposal(self, proposal_distrib):
        '''Store the proposal; ``sample_update`` unpacks it as the pair
        ``(proposal_samples, log_proposal_probs)``.'''
        self.proposal_distrib = proposal_distrib
        return

    def set_true_log_transition_probs(self, true_log_transition_probs):
        '''Override the callable used for log transition probabilities.'''
        self.true_log_transition_probs = true_log_transition_probs
        return

    def set_true_log_observation_probs(self, true_log_observation_probs):
        '''Override the callable used for log observation probabilities.'''
        self.true_log_observation_probs = true_log_observation_probs
        return

    def sample_update(self, data):
        '''Build the updates for one filtering step.

        Importance weights are ``exp(log transition + log observation -
        log proposal)`` (shifted by the max for numerical stability) times
        the current weights, then normalised. The returned OrderedDict
        writes the proposal samples and new weights into the "next" slot and
        increments ``time_counter``.
        '''
        proposal_samples, log_proposal_probs = self.proposal_distrib

        # Debug switch: the first branch wraps each intermediate in a
        # theano Print op; it is disabled by this hard-coded flag.
        printing = False

        if printing:
            log_transition_probs = theano.printing.Print(
                '1 log transition probs update')(
                    self.true_log_transition_probs(self.current_state,
                                                   proposal_samples))
            log_observation_probs = theano.printing.Print(
                '2 log observation probs update')(
                    self.true_log_observation_probs(proposal_samples,
                                                    data.dimshuffle('x', 0)))
            log_unnorm_weights = theano.printing.Print(
                '3 log unnorm weights update')(log_transition_probs +
                                               log_observation_probs -
                                               log_proposal_probs)
            log_unnorm_weights_center = theano.printing.Print(
                '4 log unnorm weights center update')(
                    log_unnorm_weights - T.max(log_unnorm_weights))
            unnorm_weights = theano.printing.Print('5 unnorm weights update')(
                T.exp(log_unnorm_weights_center) * self.current_weights)
            normalizer = theano.printing.Print('6 normalizer update')(
                T.sum(unnorm_weights))
        else:
            log_transition_probs = self.true_log_transition_probs(
                self.current_state, proposal_samples)
            log_observation_probs = self.true_log_observation_probs(
                proposal_samples, data.dimshuffle('x', 0))
            log_unnorm_weights = log_transition_probs + log_observation_probs - log_proposal_probs
            log_unnorm_weights_center = log_unnorm_weights - T.max(
                log_unnorm_weights)
            unnorm_weights = T.exp(
                log_unnorm_weights_center) * self.current_weights
            normalizer = T.sum(unnorm_weights)

        weights = unnorm_weights / normalizer

        updates = OrderedDict()

        updates[self.weights] = T.set_subtensor(self.next_weights, weights)

        updates[self.particles] = T.set_subtensor(self.next_state,
                                                  proposal_samples)

        updates[self.time_counter] = self.time_counter + 1

        return updates

    def compute_ESS(self):
        '''Return the symbolic effective sample size of the current weights,
        ``1 / sum(w**2)``.'''
        return 1.0 / T.sum(self.current_weights**2)

    def resample_update(self):
        '''Build the updates that resample the current particles according
        to their weights and reset the current weights to uniform.'''
        #shape: n_particles by n_particles
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), self.n_particles, axis=0))
        # Dot with arange turns each drawn row into a particle index
        # (assumes each row of `samps` is one-hot -- TODO confirm).
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        updates = OrderedDict()
        updates[self.particles] = T.set_subtensor(self.current_state,
                                                  self.current_state[idxs])
        updates[self.weights] = T.set_subtensor(
            self.current_weights,
            T.cast(
                T.ones_like(self.current_weights) / float(self.n_particles),
                'float32'))
        return updates

    def sample_step(self, future_samps, t, n_samples):
        '''Scan step used by ``sample_from_joint``: given samples at a later
        time, draw matching particles from ``t`` steps in the past.

        Particles at ``time_counter - t`` are weighted by their stored weight
        times the transition probability to each future sample, and one
        particle is drawn per future sample. ``n_samples`` is accepted (it
        is the scan non-sequence) but not used in the body. Returns
        ``[output_samples, t + 1]``.
        '''
        particles_now = self.particles[(self.time_counter - t) %
                                       (self.n_history + 1)]
        weights_now = self.weights[(self.time_counter - t) %
                                   (self.n_history + 1)]

        #n_particles by n_samples
        rel_log_probs = self.true_log_transition_probs(particles_now,
                                                       future_samps,
                                                       all_pairs=True)

        unnorm_probs = T.exp(rel_log_probs) * weights_now.dimshuffle(0, 'x')
        probs = unnorm_probs / T.sum(unnorm_probs, axis=0).dimshuffle('x', 0)

        samps = self.theano_rng.multinomial(pvals=probs.T)
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        output_samples = particles_now[idxs]

        return [output_samples, t + 1]

    def sample_from_joint(self, n_samples, output_2D=False):
        '''Samples from the joint posterior P(s_t-n_history:s_t | observations)
        n_samples: the number of samples to draw
        output_2D: if true, the samples are reshaped to
        ((n_history+1)*n_samples, state_dims)

        Returns ``(samples, updates)``. Unless output_2D is set, ``samples``
        has shape (n_history+1, n_samples, state_dims), where samples[-1]
        corresponds to the current time; ``updates`` are the scan updates
        that must be passed to theano.function.
        '''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        t0 = T.as_tensor_variable(1)

        [samples, ts], updates = theano.scan(fn=self.sample_step,
                                             outputs_info=[samps_t0, t0],
                                             non_sequences=[n_samples],
                                             n_steps=self.n_history)

        #the variable "samples" that results from the scan is time-flipped
        #in the sense that samples[0] corresponds to the most recent point
        #in time, and higher indices correspond to points in the past.
        #I will stick to the convention that for any collection of points in
        #time, [-1] will index the most recent time, and [0] will index
        #the point farthest in the past. So, the first axis of "samples"
        #needs to be flipped.
        flip_idxs = T.cast(-T.arange(self.n_history) + self.n_history - 1,
                           'int64')
        samples = T.concatenate(
            [samples[flip_idxs],
             samps_t0.dimshuffle('x', 0, 1)], axis=0)

        if output_2D:
            samples = T.reshape(
                samples, ((self.n_history + 1) * n_samples, self.state_dims))

        return samples, updates

    def sample_future(self, n_samples, n_T):
        '''Samples from the "future" data distribution:
        P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)

        n_samples: number of samples to draw
        n_T: the number of (future) time points to sample from

        Returns ``(data_samples, state_samples, samps_t0, updates)``. The
        first two have shapes (n_T, n_samples, data_dims) and
        (n_T, n_samples, state_dims), corresponding to samples of future
        observations and states; the third has size (n_samples,state_dims),
        corresponding to the "initial" samples taken from the current state
        distribution; the fourth holds the scan updates.
        '''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        def fstep(states):
            # One step forward: propagate states, then emit observations.
            next_states = self.transition_model.get_samples_noprobs(states)
            next_data = self.observation_model.get_samples_noprobs(next_states)
            return next_states, next_data

        [state_samples,
         data_samples], updates = theano.scan(fn=fstep,
                                              outputs_info=[samps_t0, None],
                                              n_steps=n_T)

        #data_samples=self.observation_model.get_samples_noprobs(state_samples)

        return data_samples, state_samples, samps_t0, updates

    def sample_model(self, n_samples, n_T):
        '''Draws particles from the current state distribution, runs the
        transition model forward n_T steps, and samples an observation from
        the final state only.

        n_samples: number of samples to draw
        n_T: the number of (future) time points to step through

        Returns ``(data_sample, state_samples[-1], state_samples[-2],
        updates)``: the observation sampled from the last state, the last
        state, the state one step before it, and the scan updates.
        NOTE(review): ``state_samples[-2]`` needs at least two scan steps;
        behaviour for n_T == 1 should be confirmed.
        '''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), n_samples, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samps_t0 = self.current_state[idxs]

        state_samples, updates = theano.scan(
            fn=self.transition_model.get_samples_noprobs,
            outputs_info=[samps_t0],
            n_steps=n_T)

        data_sample = self.observation_model.get_samples_noprobs(
            state_samples[-1])

        return data_sample, state_samples[-1], state_samples[-2], updates

    def sr_step(self, means, weights, stddev, ess, decay):
        '''Scan step used by ``sequential_resample``.

        Draws new proposal samples from a Gaussian mixture built from
        ``means``/``weights`` with per-dimension ``stddev``, reweights them
        using the transition probabilities from the previous particles and
        the observation probability, and adapts ``stddev``. The incoming
        ``ess`` is not read; it only carries the scan recurrence. Returns
        ``[proposal_samples, new_weights, new_stddev, new_ess]``.
        '''
        #Sampling from a mixture of gaussians
        msamps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            weights.dimshuffle('x', 0), means.shape[0], axis=0))
        idxs = T.cast(T.dot(msamps, T.arange(means.shape[0])), 'int64')
        sample_means = T.cast(means[idxs], 'float32')
        proposal_samples = self.theano_rng.normal(
            size=means.shape) * stddev.dimshuffle('x', 0) + sample_means
        # NOTE(review): the mixture density below is evaluated against
        # `sample_means` (the resampled centres) combined with `weights`,
        # not against `means` -- confirm this is intended.
        diffs = proposal_samples.dimshuffle(
            0, 'x', 1) - sample_means.dimshuffle('x', 0, 1)

        # Debug switch: the first branch wraps each intermediate in a
        # theano Print op; it is disabled by this hard-coded flag.
        printing = False

        if printing:
            log_proposal_probs = theano.printing.Print('1 log_proposal_probs')(
                T.log(
                    T.dot(
                        T.exp(-T.sum(
                            (1.0 / (2.0 * stddev**2)).dimshuffle('x', 'x', 0) *
                            diffs**2,
                            axis=2)), weights)))
            log_transition_probs = theano.printing.Print(
                '2 log transition probs')(self.true_log_transition_probs(
                    self.previous_state, proposal_samples, all_pairs=True))
            log_transition_probs_2 = theano.printing.Print(
                '3 log transition probs 2')(T.log(
                    T.dot(
                        T.exp(log_transition_probs).T, self.previous_weights)))
            log_observation_probs = theano.printing.Print(
                '4 log observation probs')(self.true_log_observation_probs(
                    proposal_samples,
                    self.observation_input.dimshuffle('x', 0)))
            log_unnorm_weights = theano.printing.Print(
                '5 log unnorm weights nomax')(log_transition_probs_2 +
                                              log_observation_probs -
                                              log_proposal_probs)
            log_unnorm_weights = theano.printing.Print('6 log unnorm weights')(
                log_unnorm_weights - T.max(log_unnorm_weights))
            unnorm_weights = theano.printing.Print('7 unnorm weights')(
                T.exp(log_unnorm_weights))
            normalizer = theano.printing.Print('8 normalizer')(
                T.sum(unnorm_weights))
        else:
            log_proposal_probs = T.log(
                T.dot(
                    T.exp(-T.sum((1.0 /
                                  (2.0 * stddev**2)).dimshuffle('x', 'x', 0) *
                                 diffs**2,
                                 axis=2)), weights))
            log_transition_probs = self.true_log_transition_probs(
                self.previous_state, proposal_samples, all_pairs=True)
            # Marginalise the transition probability over the previous
            # particles, weighted by their stored weights.
            log_transition_probs = T.log(
                T.dot(T.exp(log_transition_probs).T, self.previous_weights))
            log_observation_probs = self.true_log_observation_probs(
                proposal_samples, self.observation_input.dimshuffle('x', 0))
            log_unnorm_weights = log_transition_probs + log_observation_probs - log_proposal_probs
            log_unnorm_weights = log_unnorm_weights - T.max(log_unnorm_weights)
            unnorm_weights = T.exp(log_unnorm_weights)
            normalizer = T.sum(unnorm_weights)

        new_weights = unnorm_weights / normalizer
        new_ess = 1.0 / T.sum(new_weights**2)

        # Weighted mean and variance of the new samples, per state dimension.
        sampmean = T.dot(proposal_samples.T, new_weights)
        sampvar = T.dot(
            ((proposal_samples - sampmean.dimshuffle('x', 0))**2).T,
            new_weights)
        #propmean=T.mean(proposal_samples, axis=0)
        #propvar=T.mean((proposal_samples-propmean.dimshuffle('x',0))**2,axis=0)
        #new_stddev=stddev*T.clip(T.exp(decay*(1.0-propvar/sampvar)),0.5,2.0)
        #new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,2.0),0.0,4.0)
        # Multiplicative stddev adaptation: the per-step factor is limited to
        # [0.5, 1.5] and the result to [0.0, 4.0].
        new_stddev = T.clip(
            stddev *
            T.clip(T.exp(decay * (1.0 - stddev**2 / sampvar)), 0.5, 1.5), 0.0,
            4.0)
        return [
            proposal_samples, new_weights, new_stddev,
            T.cast(new_ess, 'float32')
        ]
        #, theano.scan_module.until(new_ess>100)

    def sequential_resample(self,
                            init_stddev=4.0,
                            max_steps=20,
                            stddev_decay=0.1):
        '''Repeatedly resamples and then samples from a proposal
        distribution constructed from the current samples. Should be used
        when the main proposal distribution is poor or whenever the ESS
        is poor.

        init_stddev: initial proposal stddev, used for every state dimension
        max_steps: number of scan steps (always run in full)
        stddev_decay: the ``decay`` argument handed to ``sr_step``

        Returns ``(final_ess, stddevhist, esshist, updates)``; ``updates``
        also writes the final samples and weights into the current slot.
        '''
        essT = T.as_tensor_variable(np.asarray(0.0, dtype='float32'))
        stddevT = T.as_tensor_variable(
            np.asarray(init_stddev * np.ones(self.state_dims),
                       dtype='float32'))
        decayT = T.as_tensor_variable(np.asarray(stddev_decay,
                                                 dtype='float32'))

        [samphist, weighthist, stddevhist,
         esshist], updates = theano.scan(fn=self.sr_step,
                                         outputs_info=[
                                             self.current_state,
                                             self.current_weights, stddevT,
                                             essT
                                         ],
                                         non_sequences=decayT,
                                         n_steps=max_steps)

        end_samples = samphist[-1]
        end_weights = weighthist[-1]

        updates[self.particles] = T.set_subtensor(self.current_state,
                                                  end_samples)
        updates[self.weights] = T.set_subtensor(self.current_weights,
                                                end_weights)
        return 1.0 / T.sum(end_weights**2), stddevhist, esshist, updates

    def sample_current(self, nsamps):
        '''Return ``nsamps`` symbolic draws from the current particles,
        selected according to the current weights.'''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.current_weights.dimshuffle('x', 0), nsamps, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samples = self.current_state[idxs]
        return samples

    def sample_prev(self, nsamps):
        '''Return ``nsamps`` symbolic draws from the previous particles,
        selected according to the previous weights.'''
        samps = self.theano_rng.multinomial(pvals=T.extra_ops.repeat(
            self.previous_weights.dimshuffle('x', 0), nsamps, axis=0))
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        samples = self.previous_state[idxs]
        return samples

    def get_history(self):
        '''This function returns a 3-D array containing all the particles
        and a 2-D array of weights for the entire memory. The first dimension
        indexes time, with the zeroth entry corresponding to the earliest
        point in memory. Both results are symbolic expressions.'''
        idxs = (T.arange(self.n_history + 1) - self.n_history +
                self.time_counter) % (self.n_history + 1)
        return self.particles[idxs], self.weights[idxs]
def test_undefined_grad():
    """Differentiating an MRG sample w.r.t. its parameters must fail.

    For each of uniform, binomial, multinomial, choice and normal, a sample
    is built from symbolic parameters and ``theano.grad`` is expected to
    raise ``NullTypeGradError``.
    """
    stream = MRG_RandomStreams(seed=1234)

    def expect_null_grad(cost, wrt):
        # Every check in this test has exactly this shape.
        assert_raises(theano.gradient.NullTypeGradError, theano.grad, cost,
                      wrt)

    def as_pvals(entries):
        # Wrap one row of probabilities the way the samplers are called here.
        return [theano.tensor.as_tensor_variable(entries)]

    # checking uniform distribution
    lower = tensor.scalar()
    expect_null_grad(stream.uniform((), low=lower), lower)

    upper = tensor.scalar()
    expect_null_grad(stream.uniform((), low=0, high=upper), upper)

    expect_null_grad(stream.uniform((), low=lower, high=upper),
                     (lower, upper))

    # checking binomial distribution
    success_prob = tensor.scalar()
    expect_null_grad(stream.binomial((), p=success_prob), success_prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()

    draw = stream.multinomial(size=None,
                              pvals=as_pvals([prob1, 0.5, 0.25]),
                              n=4)[0]
    expect_null_grad(theano.tensor.sum(draw), prob1)

    draw = stream.multinomial(size=None,
                              pvals=as_pvals([prob1, prob2]),
                              n=4)[0]
    expect_null_grad(theano.tensor.sum(draw), (prob1, prob2))

    # checking choice
    picked = stream.choice(a=None,
                           size=1,
                           p=as_pvals([prob1, prob2, 0.1, 0.2]),
                           replace=False)[0]
    expect_null_grad(picked[0], (prob1, prob2))

    picked = stream.choice(a=None,
                           size=1,
                           p=as_pvals([prob1, prob2]),
                           replace=False)[0]
    expect_null_grad(picked[0], (prob1, prob2))

    picked = stream.choice(a=None,
                           size=1,
                           p=as_pvals([prob1, 0.2, 0.3]),
                           replace=False)[0]
    expect_null_grad(picked[0], prob1)

    # checking normal distribution
    mean = tensor.scalar()
    expect_null_grad(stream.normal((), avg=mean), mean)

    spread = tensor.scalar()
    expect_null_grad(stream.normal((), avg=0, std=spread), spread)

    expect_null_grad(stream.normal((), avg=mean, std=spread), (mean, spread))
class MaskGenerator(object):
    """Keeps per-layer connectivity vectors and derives masks from them.

    ``layers_connectivity`` is a list of shared variables: one for the input
    (``ordering + 1``), one per hidden layer, and finally ``ordering`` itself
    for the output. ``shuffle_ordering`` and ``sample_connectivity`` are
    compiled Theano functions that resample them in place.
    """

    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        """Create the shared variables and compile the sampling functions.

        ``l`` scales the logits of the softmax built in ``_get_p``.
        """
        self._random_seed = random_seed
        # The two streams are created in this order on purpose: it is the
        # order the rest of the class (and reset) relies on.
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        natural_order = np.arange(input_size, dtype=theano.config.floatX)
        self.ordering = theano.shared(natural_order, 'ordering', borrow=False)

        # Connectivity: input entry, one zero vector per hidden layer, then
        # the ordering itself as the last entry.
        input_connectivity = theano.shared((self.ordering + 1).eval(),
                                           'layer_connectivity_input',
                                           borrow=False)
        self.layers_connectivity = [input_connectivity]
        for idx, size in enumerate(self._hidden_sizes):
            hidden_connectivity = theano.shared(
                np.zeros((size), dtype=floatX),
                'layer_connectivity_hidden{0}'.format(idx),
                borrow=False)
            self.layers_connectivity.append(hidden_connectivity)
        self.layers_connectivity.append(self.ordering)

        # Compiled function that shuffles the ordering and keeps the input
        # connectivity in sync with it.
        shuffled = self._rng.shuffle_row_elements(self.ordering)
        ordering_updates = [(self.ordering, shuffled),
                            (self.layers_connectivity[0], shuffled + 1)]
        self.shuffle_ordering = theano.function(name='shuffle_ordering',
                                                inputs=[],
                                                updates=ordering_updates)

        # Each hidden layer's expression depends on the previous layer's, so
        # the list has to grow one entry at a time.
        self.layers_connectivity_updates = []
        for idx, _ in enumerate(self._hidden_sizes):
            expression = self._get_hidden_layer_connectivity(idx)
            self.layers_connectivity_updates.append(expression)

        connectivity_updates = list(
            zip(self.layers_connectivity[1:],
                self.layers_connectivity_updates))
        self.sample_connectivity = theano.function(
            name='sample_connectivity',
            inputs=[],
            updates=connectivity_updates)

        # Snapshot of the MRG stream so reset() can restore it.
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [
            pair[0].get_value() for pair in self._mrng.state_updates
        ]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        """Restore ordering, connectivity and both random streams to their
        initial state, then sample the connectivity once."""
        natural_order = np.arange(self._input_size,
                                  dtype=theano.config.floatX)
        self.ordering.set_value(natural_order)

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Input connectivity, zeroed hidden layers, output connectivity.
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        hidden_vars = self.layers_connectivity[1:-1]
        for shared_var, size in zip(hidden_vars, self._hidden_sizes):
            zeros = np.zeros((size), dtype=theano.config.floatX)
            shared_var.set_value(zeros)
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        saved_states = zip(self._mrng.state_updates,
                           self._initial_mrng_state_updates)
        for update_pair, saved in saved_states:
            update_pair[0].set_value(saved)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        """Return the symbolic probability vector used to sample a layer.

        The vector is ``start_choice - 1`` zeros followed by a softmax over
        ``l * arange(start_choice, input_size)``, with 1 then added at index
        ``start_choice - 1``.
        """
        first_idx = (start_choice - 1).astype('int32')
        logits = self._l * T.arange(start_choice, self._input_size,
                                    dtype=floatX)
        tail_probs = T.nnet.nnet.softmax(logits)[0]
        padded = T.concatenate([T.zeros((first_idx,)), tail_probs])
        return T.inc_subtensor(padded[first_idx], 1.)

    def _get_hidden_layer_connectivity(self, layerIdx):
        """Return the symbolic connectivity sample for hidden layer
        ``layerIdx``, based on the minimum of the preceding layer."""
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            preceding = self.layers_connectivity[layerIdx]
        else:
            preceding = self.layers_connectivity_updates[layerIdx - 1]
        p_vals = self._get_p(T.min(preceding))

        # One multinomial row per unit over the reversed probabilities; the
        # summed cumulative sum turns each drawn row into a single number.
        tiled = T.tile(p_vals[::-1][None, :], (layer_size, 1))
        draws = self._mrng.multinomial(pvals=tiled, dtype=floatX)
        return T.sum(T.cumsum(draws, axis=1), axis=1)

    def _get_mask(self, layerIdxIn, layerIdxOut):
        """Return the mask whose entry (i, j) is 1 when the i-th value of
        the input-side connectivity is <= the j-th value of the output-side
        connectivity."""
        incoming = self.layers_connectivity[layerIdxIn][:, None]
        outgoing = self.layers_connectivity[layerIdxOut][None, :]
        return (incoming <= outgoing).astype(floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        """Mask between layer ``layerIdx`` and the one after it."""
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        """Mask between the input entry and layer ``layerIdx``."""
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        """Mask between layer ``layerIdx`` and the output entry."""
        return self._get_mask(layerIdx, -1)
class RNNtsg(model):
    '''
        The attention-based NMT model for TSG
    '''

    def __init__(self, config, name=''):
        '''
            Store the configuration and create the layer factory and the
            random stream.

            :type config: dict
            :param config: model configuration, indexed by string keys

            :type name: string
            :param name: prefix prepended to every layer name
        '''
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def translate(self, x, T, beam_size=10, return_array=False):
        '''
            Decode with beam search.

            :type x: numpy array
            :param x: the indexed source sentence

            :param T: grammar object; must provide ``rule_idx_with_root(tag)``
                      and ``get_rule_from_idx(idx)``

            :type beam_size: int
            :param beam_size: beam size

            :type return_array: bool
            :param return_array: if True, return every finished hypothesis
                                 instead of only the best one

            :returns: the indexed translation with the lowest loss; the list
                      of finished hypotheses when ``return_array`` is True;
                      ``[]`` when nothing finished and ``beam_size > 100``
        '''
        num_vocab = self.config['num_vocab_trg']

        # initialize variables
        result = [[]]
        loss = [0.]
        result_eos = []
        loss_eos = []
        beam = beam_size
        # same length as result, nonterms for each hypothesis
        # (n_hyps, nonterm for each hyp)
        nonterms = [['S']]
        # (n_hyps, len(nonterm) for each hyp)
        par_state_time = [[0]]

        # get encoder states
        c, state = self.get_context_and_init(x)
        emb_y = numpy.zeros((1, self.config['dim_emb_trg']), dtype='float32')
        # (n_hyps, l)
        state_hist = [[numpy.zeros((1, self.config['dim_rec_enc']),
                                   dtype='float32')]]

        for step in range(x.shape[0] * 3):
            # flat (hypothesis, rule) indices allowed at this step: for each
            # hypothesis, the rules whose root is its last open nonterminal
            cur_nonterm_idx = []
            for i in range(len(nonterms)):
                if len(nonterms[i]) > 0:
                    # list of potential rules with the given lhs as root
                    potent_rules = T.rule_idx_with_root(nonterms[i][-1])
                    cur_nonterm_idx += [r + i * num_vocab
                                        for r in potent_rules]
                    nonterms[i].pop()

            # only take the first k results if we have k < beam_size
            # potential nonterms
            if len(cur_nonterm_idx) < beam_size:
                beam = len(cur_nonterm_idx)
            else:
                beam = beam_size

            # get word probability
            energy, ctx = self.get_probs(
                numpy.repeat(c, len(result), axis=1), state, emb_y)

            # multiply energy by cur_nonterm_idx mask
            energy_mask = numpy.zeros((energy.shape[0] * energy.shape[1]),
                                      dtype='float32')
            energy_mask[cur_nonterm_idx] = 1.
            energy_mask = energy_mask.reshape(
                (energy.shape[0], energy.shape[1]))
            energy = energy * energy_mask
            probs = tools.softmax(energy)
            losses = -numpy.log(probs)

            # prevent translation to be too short.
            if step < x.shape[0] / 2:
                losses[:, self.config['index_eos_trg']] = numpy.inf
            for i, accumulated in enumerate(loss):
                losses[i] += accumulated

            # get the n-best partial translations
            best_index_flatten = numpy.argpartition(losses.flatten(),
                                                    beam)[:beam]
            # Floor division: the first component indexes Python lists, so
            # it must stay integral under true division as well.
            best_index = [(index // num_vocab, index % num_vocab)
                          for index in best_index_flatten]

            # save the partial translations in the beam
            new_ctx = numpy.zeros((beam, 2 * self.config['dim_rec_enc']),
                                  dtype='float32')
            new_y = []
            new_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                    dtype='float32')
            new_result = []
            new_loss = []
            new_nonterms = []
            new_par_state_time = []
            new_state_hist = []
            new_par_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                        dtype='float32')
            for i in range(beam):
                hyp, rule_idx = best_index[i]
                new_result.append(result[hyp] + [rule_idx])
                new_loss.append(losses[hyp, rule_idx])
                new_ctx[i] = ctx[hyp]
                new_y.append(rule_idx)
                new_state[i] = state[hyp]
                par_state_t = par_state_time[hyp][-1]
                new_par_state[i] = state_hist[hyp][par_state_t]

                r = T.get_rule_from_idx(rule_idx)
                if r:
                    add_nonterms = r.get_expand_tags()[::-1]
                else:
                    add_nonterms = []
                new_nonterms.append(nonterms[hyp] + add_nonterms)
                # set the parent of expanded tags to be current
                # do not include last par_state_time[] for current hyp
                new_par_state_time.append(
                    par_state_time[hyp][:-1] +
                    [step + 1] * len(add_nonterms))
                new_state_hist.append(state_hist[hyp] + [state[hyp]])

            # get the next decoder hidden state
            new_emby = self.get_trg_embedding(
                numpy.asarray(new_y, dtype='int64'))[0]
            new_state = self.get_next(new_ctx, new_state, new_par_state,
                                      new_emby)

            # remove finished translation from the beam
            state = []
            emb_y = []
            result = []
            loss = []
            nonterms = []
            state_hist = []
            par_state_time = []
            for i in range(beam):
                if len(new_nonterms[i]) == 0:
                    # par_state_time and nonterms should have same length
                    # for each hyp; par_state_time records parent state
                    # timestep for each nonterm that needs to be expanded
                    assert len(new_par_state_time[i]) == 0
                    result_eos.append(new_result[i])
                    loss_eos.append(new_loss[i])
                    beam -= 1
                else:
                    result.append(new_result[i])
                    loss.append(new_loss[i])
                    state.append(new_state[i])
                    emb_y.append(new_emby[i])
                    nonterms.append(new_nonterms[i])
                    state_hist.append(new_state_hist[i])
                    par_state_time.append(new_par_state_time[i])

            if beam <= 0:
                break

            state = numpy.asarray(state, dtype='float32')
            emb_y = numpy.asarray(emb_y, dtype='float32')

        # only used in semi-supervised training
        if return_array:
            if len(result_eos) > 0:
                return result_eos
            else:
                return [result[-1][:1]]

        if len(result_eos) > 0:
            # return the best translation
            return result_eos[numpy.argmin(loss_eos)]
        elif beam_size > 100:
            # give up once the beam has grown past 100
            logging.warning('cannot find translation in beam size %d' %
                            beam_size)
            return []
        else:
            # double the beam size on failure; the grammar has to be
            # forwarded because it is a required argument
            logging.info('cannot find translation in beam size %d, try %d' %
                         (beam_size, beam_size * 2))
            return self.translate(x, T, beam_size=beam_size * 2)

    def sampling_step(self, state, prev, context, par_state):
        '''
            Build the computational graph which samples the next word.

            :type state: theano variables
            :param state: the previous hidden state

            :type prev: theano variables
            :param prev: the last generated word

            :type context: theano variables
            :param context: the context vectors.

            :type par_state: theano variables
            :param par_state: passed through to ``decoderGRU.decode_next``

            :returns: (new hidden state, sampled word, word probabilities)
        '''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs,
                                       dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb, par_state)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
            Build the decoder graph for sampling.

            :type state_init: theano variables
            :param state_init: the initial state of decoder

            :type c: theano variables
            :param c: the context vectors

            :type length: int
            :param length: the limitation of sample length

            :type n_samples: int
            :param n_samples: the number of samples

            :returns: (samples, probability of each sampled word with the
                      same shape as samples, scan updates)
        '''
        state = tensor.repeat(state_init, n_samples, axis=0)
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        # NOTE(review): sampling_step also declares `par_state`, which this
        # scan call does not appear to supply - confirm before re-enabling
        # the sampling graph in build().
        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        # pick, for every sampled word, its own probability
        y_idx = tensor.arange(samples.flatten().shape[0]) * \
            self.config['num_vocab_trg'] + samples.flatten()
        probs = probs.flatten()[y_idx]
        # reshape returns a new variable; keep it so that probs matches
        # the shape of samples
        probs = probs.reshape(samples.shape)

        return samples, probs, updates

    def build(self, verbose=False):
        '''
            Build the computational graph.

            :type verbose: bool
            :param verbose: only set to True on visualization
        '''
        config = self.config
        use_mrt = 'MRT' in config and config['MRT'] is True

        #create layers
        logging.info('initializing layers...')
        self.emb_src = self.creater.createLookupTable(
            self.name + 'emb_src', config['num_vocab_src'],
            config['dim_emb_src'], offset=True)
        self.emb_trg = self.creater.createLookupTable(
            self.name + 'emb_trg', config['num_vocab_trg'],
            config['dim_emb_trg'], offset=True)
        self.encoderGRU = self.creater.createGRU(
            self.name + 'GRU_enc', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(
            self.name + 'GRU_enc_back', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.decoderGRU = self.creater.createGRU_tsg(
            self.name + 'GRU_dec', config['dim_emb_trg'],
            2 * config['dim_rec_enc'], config['dim_rec_dec'],
            config['num_vocab_trg'], verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer', config['dim_rec_enc'],
            config['dim_rec_dec'], offset=True)

        # create input variables
        # size: (length, batchsize)
        self.x = tensor.matrix('x', dtype='int64')
        # size: (length, batchsize)
        self.xmask = tensor.matrix('x_mask', dtype='float32')
        # size: (length, batchsize)
        self.y_idx = tensor.matrix('y_idx', dtype='int64')
        # size: (length, batchsize)
        self.ymask = tensor.matrix('y_mask', dtype='float32')
        # size: (length, batchsize)
        self.y_parent_t = tensor.matrix('y_parent_t', dtype='int64')

        if use_mrt:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask,
                self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask
            ]

        # create computational graph for training
        logging.info('building computational graph...')

        # ----encoder-----
        # size: (length, batch_size, dim_emb)
        emb = self.emb_src.forward(self.x.flatten())
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        # size: (length, batch_size, dim)
        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)
        # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask[::-1])
        context_forward = self.encode_forward[0]
        context_backward = self.encode_backward[0][::-1]
        # size: (length, batch_size, 2*dim)
        self.context = tensor.concatenate(
            (context_forward, context_backward), axis=2)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        # size: (length, batch_size, dim_emb)
        emb = self.emb_trg.forward(self.y_idx.flatten())
        # size: (length, batch_size, dim)
        self.decode = self.decoderGRU.forward(
            emb, self.y_idx.shape[0], self.context, self.state_init,
            self.y_parent_t, batch_size=self.y_idx.shape[1],
            mask=self.ymask, cmask=self.xmask)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)

        # compute costs and grads
        y_idx = tensor.arange(self.y_idx.flatten().shape[0]) * \
            self.config['num_vocab_trg'] + self.y_idx.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y_idx.shape[0], self.y_idx.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)
        if use_mrt:
            # weight each sample's MRTLoss by a normalised exp(-alpha * cost)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')

        # (length, batch_size, dim_emb)
        emb_sample = self.emb_src.forward(self.x_sample.flatten())
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        # (length, batch_size, dim)
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])
        # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])
        # (length, batch_size, 2*dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])

        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        #self.samples, self.probs_sample, self.updates_sample = self.decode_sample(state_init_sample, context_sample,
        #                                    self.length_sample, self.n_samples)

        # parameter for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode',
                                             dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.par_state_decode = tensor.matrix('par_state_decode',
                                              dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
            Encode source sentence to context vector.
        '''
        # the theano function is compiled on first use and cached
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
            Get the embedding of target sentence.
        '''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
            Get the initial decoder hidden state with context vector.
        '''
        if not hasattr(self, "get_initer"):
            # build() feeds the initer with the first time step of the
            # backward encoder states, which it concatenates last along
            # axis 2 of self.context; recover that slice from the context.
            # NOTE(review): assumes the backward half is
            # config['dim_rec_enc'] wide (the initer's input size) - confirm.
            backward_first = self.context[0][:, -self.config['dim_rec_enc']:]
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(backward_first)])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
            Encode source sentence to context vectors and get the initial
            decoder hidden state.
        '''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
            Get the probability of the next target word.
        '''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[
                    self.context_decode, self.state_decode, self.emb_decode
                ],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, par_state, emb):
        '''
            Get the next hidden state.
        '''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[
                    self.c_decode, self.state_decode, self.par_state_decode,
                    self.emb_decode
                ],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.par_state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, par_state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
            Get the negative log-likelihood of parallel sentences.
        '''
        # NOTE(review): self.y is not assigned anywhere in the visible code
        # (build() creates self.y_idx and self.y_parent_t) - confirm.
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
            Get sampling results.
        '''
        # NOTE(review): self.samples, self.probs_sample and
        # self.updates_sample are only assigned by a line that is commented
        # out in build() - confirm before use.
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
            Get the attention weight of parallel sentences.
        '''
        # NOTE(review): self.y is not assigned anywhere in the visible code
        # (build() creates self.y_idx and self.y_parent_t) - confirm.
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
            Get the hidden states essential for visualization

            :returns: dict mapping a layer name to its computed value
        '''
        # NOTE(review): self.y is not assigned anywhere in the visible code
        # (build() creates self.y_idx and self.y_parent_t) - confirm.
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                tuple(self.decode[0]) + tuple(self.decode[1:]))
        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = [
            'h', 'gate', 'reset', 'state', 'reseted', 'state_in', 'gate_in',
            'reset_in'
        ]
        dec_names = [
            'h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate',
            'reset_cin', 'reset_preactive', 'reset', 'state_cin', 'reseted',
            'state_preactive', 'state'
        ]
        dec_names += [
            'outenergy', 'state_in', 'gate_in', 'reset_in', 'state_in_prev',
            'readout', 'maxout', 'outenergy_1', 'outenergy_2'
        ]
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i in range(len(layers)):
            # single-argument call form is valid on both Python 2 and 3
            print(layers[i].shape)
            if value_name[i] != '':
                result[value_name[i]] = layers[i]
        return result
class CRBM: """CRBM class. The class :class:`CRBM` implements functionality for a *convolutional restricted Boltzmann machine* (cRBM) that extracts redundant DNA sequence features from a provided set of sequences. The model can subsequently be used to study the sequence content of (e.g. regulatory) sequences, by visualizing the features in terms of sequence logos or in order to cluster the sequences based on sequence content. Parameters ----------- num_motifs : int Number of motifs. motif_length : int Motif length. epochs : int Number of epochs to train (Default: 100). input_dims :int Input dimensions aka alphabet size (Default: 4 for DNA). doublestranded : bool Single strand or both strands. If set to True, both strands are scanned. (Default: True). batchsize : int Batch size (Default: 20). learning_rate : float) Learning rate (Default: 0.1). momentum : float Momentum term (Default: 0.95). pooling : int Pooling factor (not relevant for cRBM, but for future work) (Default: 1). cd_k : int Number of Gibbs sampling iterations in each persistent contrastive divergence step (Default: 5). rho : float Target frequency of motif occurrences (Default: 0.01). lambda_rate : float Sparsity enforcement aka penality term (Default: 0.1). """ def __init__(self, num_motifs, motif_length, epochs = 100, input_dims=4, \ doublestranded = True, batchsize = 20, learning_rate = 0.1, \ momentum = 0.95, pooling = 1, cd_k = 5, rho = 0.01, lambda_rate = 0.1): # sanity checks: if num_motifs <= 0: raise Exception("Number of motifs must be positive.") if motif_length <= 0: raise Exception("Motif length must be positive.") if epochs < 0: raise Exception("Epochs must be non-negative.") if input_dims <= 0: raise Exception("input_dims must be positive.") elif input_dims != 4: warnings.warn( "input_dims != 4 was not comprehensively \ tested yet. 
Be careful when interpreting the results.", UserWarning) if batchsize <= 0: raise Exception("batchsize must be positive.") if learning_rate <= 0.0: raise Exception("learning_rate must be positive.") if not (momentum >= 0.0 and momentum < 1.): raise Exception("momentum must be between zero and one.") if pooling <= 0: raise Exception("pooling must be positive.") if cd_k <= 0: raise Exception("cd_k must be positive.") if not (rho >= 0.0 and rho < 1.): raise Exception("rho must be between zero and one.") if lambda_rate < 0.: raise Exception("lambda_rate must be non-negative.") # parameters for the motifs self.num_motifs = num_motifs self.motif_length = motif_length self.input_dims = input_dims self.doublestranded = doublestranded self.batchsize = batchsize self.learning_rate = learning_rate self.momentum = momentum self.rho = rho self.lambda_rate = lambda_rate self.pooling = pooling self.cd_k = cd_k self.epochs = epochs self.spmethod = 'entropy' self._gradientSparsityConstraint = \ self._gradientSparsityConstraintEntropy x = np.random.randn(self.num_motifs, 1, self.input_dims, self.motif_length).astype(theano.config.floatX) self.motifs = theano.shared(value=x, name='W', borrow=True) # determine the parameter rho for the model if not given if not rho: rho = 1. / (self.num_motifs * self.motif_length) if self.doublestranded: rho = rho / 2. 
self.rho = rho # cRBM parameters (2*x to respect both strands of the DNA) b = np.zeros((1, self.num_motifs)).astype(theano.config.floatX) # adapt the bias such that it will initially have rho motif hits in H # That is, we want to have rho percent of the samples positive # randn draws from 'standard normal', this is why we have 0 and 1 b = b + scipy.stats.norm.ppf(self.rho, 0, np.sqrt(self.motif_length)) self.bias = theano.shared(value=b, name='bias', borrow=True) c = np.zeros((1, self.input_dims)).astype(theano.config.floatX) self.c = theano.shared(value=c, name='c', borrow=True) # infrastructural parameters self.theano_rng = RS(seed=int(time.time())) self.rng_data_permut = theano.tensor.shared_randomstreams.RandomStreams( ) self.motif_velocity = theano.shared(value=np.zeros( self.motifs.get_value().shape).astype(theano.config.floatX), name='velocity_of_W', borrow=True) self.bias_velocity = theano.shared(value=np.zeros(b.shape).astype( theano.config.floatX), name='velocity_of_bias', borrow=True) self.c_velocity = theano.shared(value=np.zeros(c.shape).astype( theano.config.floatX), name='velocity_of_c', borrow=True) val = np.zeros((self.batchsize, self.num_motifs, 1, 200)).astype(theano.config.floatX) self.fantasy_h = theano.shared(value=val, name='fantasy_h', borrow=True) if self.doublestranded: self.fantasy_h_prime = theano.shared(value=\ np.zeros((self.batchsize, self.num_motifs, 1, 200)).astype(theano.config.floatX), \ name='fantasy_h_prime', borrow=True) self._compileTheanoFunctions() def saveModel(self, filename): """Save the model parameters and additional hyper-parameters. Parameters ----------- filename : str Pickle filename where the model parameters are stored. 
""" numpyParams = (self.motifs.get_value(), self.bias.get_value(), self.c.get_value()) hyperparams = (self.num_motifs, self.motif_length, self.input_dims, self.doublestranded, self.batchsize, self.learning_rate, self.momentum, self.rho, self.lambda_rate, self.pooling, self.cd_k, self.epochs, self.spmethod) pickleObject = (numpyParams, hyperparams) joblib.dump(pickleObject, filename, protocol=2) @classmethod def loadModel(cls, filename): """Load a model from a given pickle file. Parameters ----------- filename : str Pickle file containing the model parameters. returns : :class:`CRBM` object An instance of CRBM with reloaded parameters. """ numpyParams, hyperparams = joblib.load(filename) (num_motifs, motif_length, input_dims, \ doublestranded, batchsize, learning_rate, \ momentum, rho, lambda_rate, pooling, cd_k, epochs, spmethod) = hyperparams obj = cls(num_motifs, motif_length, epochs=epochs, input_dims=input_dims, doublestranded=doublestranded, batchsize=batchsize, learning_rate=learning_rate, momentum=momentum, pooling=pooling, cd_k=cd_k, rho=rho, lambda_rate=lambda_rate) motifs, bias, c = numpyParams obj.motifs.set_value(motifs) obj.bias.set_value(bias) obj.c.set_value(c) return obj def _bottomUpActivity(self, data, flip_motif=False): """Theano function for computing bottom up activity.""" out = conv(data, self.motifs, filter_flip=flip_motif) out = out + self.bias.dimshuffle('x', 1, 0, 'x') return out def _bottomUpProbability(self, activities): """Theano function for computing bottom up Probability.""" pool = self.pooling x = activities.reshape((activities.shape[0], \ activities.shape[1], activities.shape[2], \ activities.shape[3]//pool, pool)) norm = T.sum(1. 
+ T.exp(x), axis=4, keepdims=True) x = T.exp(x) / norm x=x.reshape((activities.shape[0], \ activities.shape[1], activities.shape[2], \ activities.shape[3])) return x def _bottomUpSample(self, probs): """Theano function for bottom up sampling.""" pool = self.pooling _probs = probs.reshape((probs.shape[0], probs.shape[1], probs.shape[2], probs.shape[3] // pool, pool)) _probs_reshape = _probs.reshape( (_probs.shape[0] * _probs.shape[1] * _probs.shape[2] * _probs.shape[3], pool)) samples = self.theano_rng.multinomial(pvals=_probs_reshape) samples = samples.reshape( (probs.shape[0], probs.shape[1], probs.shape[2], probs.shape[3])) return T.cast(samples, theano.config.floatX) def _computeHgivenV(self, data, flip_motif=False): """Theano function for complete bottom up pass.""" activity = self._bottomUpActivity(data, flip_motif) probability = self._bottomUpProbability(activity) sample = self._bottomUpSample(probability) return [probability, sample] def _topDownActivity(self, h, hprime): """Theano function for top down activity.""" W = self.motifs.dimshuffle(1, 0, 2, 3) C = conv(h, W, border_mode='full', filter_flip=True) out = T.sum(C, axis=1, keepdims=True) # sum over all K if hprime: C = conv(hprime, W[:,:,::-1,::-1], \ border_mode='full', filter_flip=True) out = out + T.sum(C, axis=1, keepdims=True) # sum over all K c_bc = self.c c_bc = c_bc.dimshuffle('x', 0, 1, 'x') activity = out + c_bc return activity def _topDownProbability(self, activity, softmaxdown=True): """Theano function for top down probability.""" if softmaxdown: return self._softmax(activity) else: return 1. / (1. 
- T.exp(-activity)) def _topDownSample(self, probability, softmaxdown=True): """Theano function for top down sample.""" if softmaxdown: pV_ = probability.dimshuffle(0, 1, 3, 2).reshape( \ (probability.shape[0]*probability.shape[3], probability.shape[2])) V_ = self.theano_rng.multinomial(n=1, pvals=pV_).astype( theano.config.floatX) V = V_.reshape((probability.shape[0], 1, probability.shape[3], probability.shape[2])).dimshuffle(0, 1, 3, 2) else: V=self.theano_rng.multinomial(n=1,\ pvals=probability).astype(theano.config.floatX) return V def _computeVgivenH(self, H_sample, H_sample_prime, softmaxdown=True): """Theano function for complete top down pass.""" activity = self._topDownActivity(H_sample, H_sample_prime) prob = self._topDownProbability(activity, softmaxdown) sample = self._topDownSample(prob, softmaxdown) return [prob, sample] def _collectVHStatistics(self, prob_of_H, data): """Theano function for collecting V*H statistics.""" # reshape input data = data.dimshuffle(1, 0, 2, 3) prob_of_H = prob_of_H.dimshuffle(1, 0, 2, 3) avh = conv(data, prob_of_H, border_mode="valid", filter_flip=False) avh = avh / T.prod(prob_of_H.shape[1:]) avh = avh.dimshuffle(1, 0, 2, 3).astype(theano.config.floatX) return avh def _collectVStatistics(self, data): """Theano function for collecting V statistics.""" # reshape input a = T.mean(data, axis=(0, 1, 3)).astype(theano.config.floatX) a = a.dimshuffle('x', 0) a = T.inc_subtensor(a[:, :], a[:, ::-1]) # match a-t and c-g occurances return a def _collectHStatistics(self, data): """Theano function for collecting H statistics.""" # reshape input a = T.mean(data, axis=(0, 2, 3)).astype(theano.config.floatX) a = a.dimshuffle('x', 0) return a def _collectUpdateStatistics(self, prob_of_H, prob_of_H_prime, data): """Theano function for collecting the complete update statistics.""" average_VH = self._collectVHStatistics(prob_of_H, data) average_H = self._collectHStatistics(prob_of_H) if prob_of_H_prime: average_VH_prime = 
self._collectVHStatistics(prob_of_H_prime, data) average_H_prime = self._collectHStatistics(prob_of_H_prime) average_VH = (average_VH + average_VH_prime[:, :, ::-1, ::-1]) / 2. average_H = (average_H + average_H_prime) / 2. average_V = self._collectVStatistics(data) return average_VH, average_H, average_V def _updateWeightsOnMinibatch(self, D, gibbs_chain_length): """Theano function that defines an SGD update step with momentum.""" # calculate the data gradient for weights (motifs), bias and c [prob_of_H_given_data, H_given_data] = self._computeHgivenV(D) if self.doublestranded: [prob_of_H_given_data_prime,H_given_data_prime] = \ self._computeHgivenV(D, True) else: [prob_of_H_given_data_prime, H_given_data_prime] = [None, None] # calculate data gradients G_motif_data, G_bias_data, G_c_data = \ self._collectUpdateStatistics(prob_of_H_given_data, \ prob_of_H_given_data_prime, D) # calculate model probs H_given_model = self.fantasy_h if self.doublestranded: H_given_model_prime = self.fantasy_h_prime else: H_given_model_prime = None for i in range(gibbs_chain_length): prob_of_V_given_model, V_given_model = \ self._computeVgivenH(H_given_model, H_given_model_prime) #sample up prob_of_H_given_model, H_given_model = \ self._computeHgivenV(V_given_model) if self.doublestranded: prob_of_H_given_model_prime, H_given_model_prime = \ self._computeHgivenV(V_given_model, True) else: prob_of_H_given_model_prime, H_given_model_prime = None, None # compute the model gradients G_motif_model, G_bias_model, G_c_model = \ self._collectUpdateStatistics(prob_of_H_given_model, \ prob_of_H_given_model_prime, V_given_model) mu = self.momentum alpha = self.learning_rate sp = self.lambda_rate reg_motif, reg_bias = self._gradientSparsityConstraint(D) vmotifs = mu * self.motif_velocity + \ alpha * (G_motif_data - G_motif_model - sp*reg_motif) vbias = mu * self.bias_velocity + \ alpha * (G_bias_data - G_bias_model - sp*reg_bias) vc = mu*self.c_velocity + \ alpha * (G_c_data - G_c_model) 
new_motifs = self.motifs + vmotifs new_bias = self.bias + vbias new_c = self.c + vc updates = [(self.motifs, new_motifs), (self.bias, new_bias), (self.c, new_c), (self.motif_velocity, vmotifs), (self.bias_velocity, vbias), (self.c_velocity, vc), (self.fantasy_h, H_given_model)] if self.doublestranded: updates.append((self.fantasy_h_prime, H_given_model_prime)) return updates def _gradientSparsityConstraintEntropy(self, data): """Theano function that defines the entropy-based sparsity constraint.""" # get expected[H|V] [prob_of_H, _] = self._computeHgivenV(data) q = self.rho p = T.mean(prob_of_H, axis=(0, 2, 3)) gradKernels = -T.grad(T.mean(q * T.log(p) + (1 - q) * T.log(1 - p)), self.motifs) gradBias = -T.grad(T.mean(q * T.log(p) + (1 - q) * T.log(1 - p)), self.bias) return gradKernels, gradBias def _compileTheanoFunctions(self): """This methods compiles all theano functions.""" print("Start compiling Theano training function...") D = T.tensor4('data') updates = self._updateWeightsOnMinibatch(D, self.cd_k) self.theano_trainingFct = theano.function([D], None, updates=updates, name='train_CRBM') #compute mean free energy mfe_ = self._meanFreeEnergy(D) #compute number of motif hits [_, H] = self._computeHgivenV(D) #H = self.bottomUpProbability(self.bottomUpActivity(D)) nmh_ = T.mean(H) # mean over samples (K x 1 x N_h) #compute norm of the motif parameters twn_ = T.sqrt(T.mean(self.motifs**2)) #compute information content pwm = self._softmax(self.motifs) entropy = -pwm * T.log2(pwm) entropy = T.sum(entropy, axis=2) # sum over letters ic_= T.log2(self.motifs.shape[2]) - \ T.mean(entropy) # log is possible information due to length of sequence medic_= T.log2(self.motifs.shape[2]) - \ T.mean(T.sort(entropy, axis=2)[:, :, entropy.shape[2] // 2]) self.theano_evaluateData = theano.function([D], [mfe_, nmh_], name='evaluationData') W = T.tensor4("W") self.theano_evaluateParams = theano.function([], [twn_, ic_, medic_], givens={W: self.motifs}, name='evaluationParams') fed = 
self._freeEnergyForData(D) self.theano_freeEnergy = theano.function([D], fed, name='fe_per_datapoint') fed = self._freeEnergyPerMotif(D) self.theano_fePerMotif = theano.function([D], fed, name='fe_per_motif') if self.doublestranded: self.theano_getHitProbs = theano.function([D], \ self._bottomUpProbability(self._bottomUpActivity(D))) else: self.theano_getHitProbs = theano.function([D], \ #self.bottomUpProbability( T.maximum(self.bottomUpActivity(D), self._bottomUpProbability( self._bottomUpActivity(D) + self._bottomUpActivity(D, True))) print("Compilation of Theano training function finished") def _evaluateData(self, data): """Evaluate performance on given numpy array. This is used to monitor training progress. """ return self.theano_evaluateData(data) def _trainingFct(self, data): """Train on mini-batch given numpy array.""" return self.theano_trainingFct(data) def _evaluateParams(self): """Evaluate parameters. This is used to monitor training progress. """ return self.theano_evaluateParams() def motifHitProbs(self, data): """Motif match probabilities. Parameters ----------- data : numpy-array 4D numpy array representing a DNA sequence in one-hot encoding. See :meth:`crbm.sequences.seqToOneHot`. returns : numpy-array Per-position motif match probabilities of all motifs as numpy array. """ return self.theano_getHitProbs(data) def freeEnergy(self, data, permotif=False): """Free energy determined on the given dataset. Parameters ----------- data : numpy-array 4D numpy array representing a DNA sequence in one-hot encoding. See :meth:`crbm.sequences.seqToOneHot`. permotif : boolean Indicates whether the free energy should be computed per motif. Default: The free energy is computed per sequence, by summing over the individual motif contributions. returns : numpy-array Free energy per sequence. 
""" if permotif: return self.theano_fePerMotif(data) else: return self.theano_freeEnergy(data) def fit(self, training_data, test_data=None): """Fits the cRBM to the provided training sequences. Parameters ----------- training_data : numpy-array 4D-Numpy array representing the training sequence in one-hot encoding. See :meth:`crbm.sequences.seqToOneHot`. test_data : numpy-array 4D-Numpy array representing the validation sequence in one-hot encoding. If no test_data is provided, the training progress will be reported on the training set itself. See :meth:`crbm.sequences.seqToOneHot`. """ # assert that pooling can be done without rest to the division # compute sequence length nseq=int((training_data.shape[3]-\ self.motif_length + 1)/\ self.pooling)*\ self.pooling+ \ self.motif_length -1 training_data = training_data[:, :, :, :nseq] if type(test_data) != type(None): nseq=int((test_data.shape[3]-\ self.motif_length + 1)/\ self.pooling)*\ self.pooling+ \ self.motif_length -1 test_data = test_data[:, :, :, :nseq] else: test_data = training_data print(("BatchSize: " + str(self.batchsize))) start = time.time() # compile training function # now perform training print("Start training the model...") starttime = time.time() for epoch in range(self.epochs): for [start,end] in self._iterateBatchIndices(\ training_data.shape[0],self.batchsize): self._trainingFct(training_data[start:end, :, :, :]) meanfe = 0.0 meannmh = 0.0 nb = 0 for [start,end] in self._iterateBatchIndices(\ test_data.shape[0],self.batchsize): [mfe_, nmh_] = self._evaluateData(test_data[start:end, :, :, :]) meanfe = meanfe + mfe_ meannmh = meannmh + nmh_ nb = nb + 1 [twn_, ic_, medic_] = self._evaluateParams() print(("Epoch {:d}: ".format(epoch) + \ "FE={:1.3f} ".format(meanfe/nb) + \ "NumH={:1.4f} ".format(meannmh/nb) + \ "WNorm={:2.2f} ".format(float(twn_)) + \ "IC={:1.3f} medIC={:1.3f}".format(float(ic_), float(medic_)))) # done with training print(("Training finished after: {:5.2f} seconds!".format(\ 
time.time()-starttime))) def _meanFreeEnergy(self, D): """Theano function for computing the mean free energy.""" return T.sum(self._freeEnergyForData(D)) / D.shape[0] def getPFMs(self): """Returns the weight matrices converted to *position frequency matrices*. Parameters ----------- returns: numpy-array List of position frequency matrices as numpy arrays. """ def softmax_(x): x_exp = np.exp(x) y = np.zeros(x.shape) for i in range(x.shape[1]): y[:, i] = x_exp[:, i] / np.sum(x_exp[:, i]) return y return [softmax_(m[0, :, :]) for m in self.motifs.get_value()] def _freeEnergyForData(self, D): """Theano function for computing the free energy (per position).""" pool = self.pooling x = self._bottomUpActivity(D) x = x.reshape( (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool)) free_energy = -T.sum(T.log(1. + T.sum(T.exp(x), axis=4)), axis=(1, 2, 3)) if self.doublestranded: x = self._bottomUpActivity(D, True) x = x.reshape( (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool)) free_energy = free_energy - T.sum( T.log(1. + T.sum(T.exp(x), axis=4)), axis=(1, 2, 3)) cMod = self.c cMod = cMod.dimshuffle('x', 0, 1, 'x') # make it 4D and broadcastable there free_energy = free_energy - T.sum(D * cMod, axis=(1, 2, 3)) return free_energy / D.shape[3] def _freeEnergyPerMotif(self, D): """Theano function for computing the free energy (per motif).""" pool = self.pooling x = self._bottomUpActivity(D) x = x.reshape( (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool)) free_energy = -T.sum(T.log(1. + T.sum(T.exp(x), axis=4)), axis=(2, 3)) if self.doublestranded: x = self._bottomUpActivity(D, True) x = x.reshape( (x.shape[0], x.shape[1], x.shape[2], x.shape[3] // pool, pool)) free_energy = free_energy - T.sum( T.log(1. 
+ T.sum(T.exp(x), axis=4)), axis=(2, 3)) cMod = self.c cMod = cMod.dimshuffle('x', 0, 1, 'x') # make it 4D and broadcastable there free_energy = free_energy - T.sum(D * cMod, axis=(1, 2, 3)).dimshuffle( 0, 'x') return free_energy def _softmax(self, x): """Softmax operation.""" return T.exp(x) / T.exp(x).sum(axis=2, keepdims=True) def __repr__(self): st = "Parameters:\n\n" st += "Number of motifs: {}\n".format(self.num_motifs) st += "Motif length: {}\n".format(self.motif_length) st += "\n" st += "Hyper-parameters:\n\n" st += "input dims: {:d}".format(self.input_dims) st += "doublestranded: {}".format(self.doublestranded) st += "batchsize: {:d}".format(self.batchsize) st += "learning rate: {:1.3f}".format(self.learning_rate) st += "momentum: {:1.3f}".format(self.momentum) st += "rho: {:1.4f}".format(self.rho) st += "lambda: {:1.3f}".format(self.lambda_rate) st += "pooling: {:d}".format(self.pooling) st += "cd_k: {:d}".format(self.cd_k) st += "epochs: {:d}".format(self.epochs) return st def _iterateBatchIndices(self, totalsize, nbatchsize): """Returns indices in batches.""" return [ [i,i+nbatchsize] if i+nbatchsize<=totalsize \ else [i,totalsize] for i in range(totalsize)[0::nbatchsize] ]
class Decoder(EncoderDecoderBase):
    """Recurrent decoder conditioned on the hierarchical state ``hs``.

    The ``mode`` constants select what :meth:`build_decoder` returns:
    NCE cost, target probabilities (EVALUATION), a sampled word (SAMPLING)
    or the full softmax for one step (BEAM_SEARCH).
    """
    NCE = 0
    EVALUATION = 1
    SAMPLING = 2
    BEAM_SEARCH = 3

    def __init__(self, state, rng, parent, encoder):
        EncoderDecoderBase.__init__(self, state, rng, parent)
        # Take as input the encoder instance for the embeddings..
        # To modify in the future
        self.encoder = encoder
        self.trng = MRG_RandomStreams(self.seed)
        self.init_params()

    def init_params(self):
        """Create and register the decoder weights.

        Parameters are appended to ``self.params`` in a fixed order; the
        order of the statements below is therefore significant.
        """

        def weight(name, n_in, n_out, init=NormalInit):
            # Shared (n_in, n_out) matrix drawn from `init`.
            return add_to_params(
                self.params,
                theano.shared(value=init(self.rng, n_in, n_out), name=name))

        def bias(name, size):
            # Shared float32 zero vector of length `size`.
            return add_to_params(
                self.params,
                theano.shared(value=np.zeros((size, ), dtype='float32'),
                              name=name))

        self.bd_out = bias('bd_out', self.idim)
        self.Wd_emb = weight('Wd_emb', self.idim, self.rankdim)

        self.Wd_hh = weight('Wd_hh', self.qdim, self.qdim, OrthogonalInit)
        self.bd_hh = bias('bd_hh', self.qdim)
        self.Wd_in = weight('Wd_in', self.rankdim, self.qdim)
        self.Wd_s_0 = weight('Wd_s_0', self.sdim, self.qdim)
        self.bd_s_0 = bias('bd_s_0', self.qdim)

        if self.decoder_bias_type == 'all':
            self.Wd_s_q = weight('Wd_s_q', self.sdim, self.qdim)

        if self.query_step_type == "gated":
            self.Wd_in_r = weight('Wd_in_r', self.rankdim, self.qdim)
            self.Wd_in_z = weight('Wd_in_z', self.rankdim, self.qdim)
            self.Wd_hh_r = weight('Wd_hh_r', self.qdim, self.qdim,
                                  OrthogonalInit)
            self.Wd_hh_z = weight('Wd_hh_z', self.qdim, self.qdim,
                                  OrthogonalInit)
            self.bd_r = bias('bd_r', self.qdim)
            self.bd_z = bias('bd_z', self.qdim)

            if self.decoder_bias_type == 'all':
                self.Wd_s_z = weight('Wd_s_z', self.sdim, self.qdim)
                self.Wd_s_r = weight('Wd_s_r', self.sdim, self.qdim)

        ######################
        # Output layer weights
        ######################
        out_target_dim = self.qdim
        if not self.maxout_out:
            out_target_dim = self.rankdim

        self.Wd_out = weight('Wd_out', self.qdim, out_target_dim)

        # Set up deep output
        if self.deep_out:
            self.Wd_e_out = weight('Wd_e_out', self.rankdim, out_target_dim)
            self.bd_e_out = bias('bd_e_out', out_target_dim)

            if self.decoder_bias_type != 'first':
                self.Wd_s_out = weight('Wd_s_out', self.sdim, out_target_dim)

        # Rank
        if hasattr(self, 'train_rank'):
            self.Wr_out = weight('Wr_out', self.sdim, 1)
            self.br_out = bias('br_out', 1)

    def build_rank_layer(self, hs):
        """Return the affine rank score ``hs . Wr_out + br_out``."""
        return T.dot(hs, self.Wr_out) + self.br_out

    def build_output_layer(self, hs, xd, hd):
        """Return the pre-softmax activation for decoder state ``hd``."""
        pre_activ = T.dot(hd, self.Wd_out)

        if self.deep_out:
            pre_activ += T.dot(xd, self.Wd_e_out) + self.bd_e_out

            if self.decoder_bias_type != 'first':
                pre_activ += T.dot(hs, self.Wd_s_out)
                # ^ if bias all, bias the deep output

        if self.maxout_out:
            pre_activ = Maxout(2)(pre_activ)

        return pre_activ

    def build_next_probs_predictor(self, hs, x, prev_hd):
        """
        Return output probabilities given prev_words x, hierarchical pass hs,
        and previous hd. hs should always be the same (and should not be
        updated).
        """
        return self.build_decoder(hs,
                                  x,
                                  mode=Decoder.BEAM_SEARCH,
                                  prev_hd=prev_hd)

    def approx_embedder(self, x):
        # Here we use the same embeddings learnt in the encoder.. !!!
        return self.encoder.approx_embedder(x)

    def output_softmax(self, pre_activ):
        # returns a (timestep, bs, idim) matrix (huge)
        return SoftMax(T.dot(pre_activ, self.Wd_emb.T) + self.bd_out)

    def output_nce(self, pre_activ, y, y_hat):
        """Return the noise-contrastive cost for targets ``y`` and noise
        words ``y_hat``."""
        # returns a (timestep, bs, pos + neg) matrix (very small)
        target_embedding = self.Wd_emb[y]
        # ^ target embedding is (timestep x bs, rankdim)
        noise_embedding = self.Wd_emb[y_hat]
        # ^ noise embedding is (10, timestep x bs, rankdim)

        # pre_activ is (timestep x bs x rankdim)
        pos_scores = (target_embedding * pre_activ).sum(2)
        neg_scores = (noise_embedding * pre_activ).sum(3)

        pos_scores += self.bd_out[y]
        neg_scores += self.bd_out[y_hat]

        pos_noise = self.parent.t_noise_probs[y] * 10
        neg_noise = self.parent.t_noise_probs[y_hat] * 10

        pos_scores = -T.log(T.nnet.sigmoid(pos_scores - T.log(pos_noise)))
        neg_scores = -T.log(
            1 - T.nnet.sigmoid(neg_scores - T.log(neg_noise))).sum(0)
        return pos_scores + neg_scores

    def build_decoder(self,
                      hs,
                      x,
                      xmask=None,
                      y=None,
                      y_neg=None,
                      mode=EVALUATION,
                      prev_hd=None,
                      step_num=None):
        """Build the decoder graph for the requested ``mode``.

        EVALUATION and NCE scan over the whole sequence and require ``y``;
        SAMPLING and BEAM_SEARCH compute a single step and require
        ``prev_hd``. ``step_num`` is accepted but not used here.

        Returns
        -------
        EVALUATION  : (target_probs, hd, raw scan result)
        NCE         : (nce cost, hd)
        BEAM_SEARCH : (softmax output, hd)
        SAMPLING    : (sample, log_prob, hd)
        """
        whole_sequence = (mode == Decoder.EVALUATION or mode == Decoder.NCE)

        # Check parameter consistency. Optional symbolic arguments are tested
        # against None explicitly: the truth value of a Theano comparison
        # result (e.g. a mask built with T.neq) raises TypeError.
        if whole_sequence:
            assert prev_hd is None
            assert y is not None
        else:
            assert y is None
            assert prev_hd is not None

        # if mode == EVALUATION
        #   xd = (timesteps, batch_size, qdim)
        #
        # if mode != EVALUATION
        #   xd = (n_samples, dim)
        xd = self.approx_embedder(x)
        if xmask is None:
            xmask = T.neq(x, self.eoq_sym)

        # we must zero out the </s> embedding
        # i.e. the embedding x_{-1} is the 0 vector
        # as well as hd_{-1} which will be reseted in the scan functions
        if xd.ndim != 3:
            assert mode != Decoder.EVALUATION
            xd = (xd.dimshuffle((1, 0)) * xmask).dimshuffle((1, 0))
        else:
            assert mode == Decoder.EVALUATION or mode == Decoder.NCE
            xd = (xd.dimshuffle((2, 0, 1)) * xmask).dimshuffle((1, 2, 0))

        # Run the decoder
        if whole_sequence:
            hd_init = T.alloc(np.float32(0), x.shape[1], self.qdim)
        else:
            hd_init = prev_hd

        if self.query_step_type == "gated":
            f_dec = self.gated_step
            o_dec_info = [hd_init, None, None, None]
        else:
            f_dec = self.plain_step
            o_dec_info = [hd_init]

        # If the mode of the decoder is EVALUATION
        # then we evaluate by default all the sentence
        # xd - i.e. xd.ndim == 3, xd = (timesteps, batch_size, qdim)
        if whole_sequence:
            _res, _ = theano.scan(f_dec,
                                  sequences=[xd, xmask, hs],
                                  outputs_info=o_dec_info)
        # else we evaluate only one step of the recurrence using the
        # previous hidden states and the previous computed hierarchical
        # states.
        else:
            _res = f_dec(xd, xmask, hs, prev_hd)

        if isinstance(_res, (list, tuple)):
            hd = _res[0]
        else:
            hd = _res

        # if we are using selective bias, we should update our hs
        # to the step-selective hs
        pre_activ = self.build_output_layer(hs, xd, hd)

        # EVALUATION : Return target_probs + all the predicted ranks
        # target_probs.ndim == 3
        if mode == Decoder.EVALUATION:
            target_probs = GrabProbs(self.output_softmax(pre_activ), y)
            return target_probs, hd, _res
        elif mode == Decoder.NCE:
            return self.output_nce(pre_activ, y, y_neg), hd
        # BEAM_SEARCH : Return output (the softmax layer) + the new hidden states
        elif mode == Decoder.BEAM_SEARCH:
            return self.output_softmax(pre_activ), hd
        # SAMPLING : Return a vector of n_sample from the output layer
        #                 + log probabilities + the new hidden states
        elif mode == Decoder.SAMPLING:
            outputs = self.output_softmax(pre_activ)
            if outputs.ndim == 1:
                outputs = outputs.dimshuffle('x', 0)

            # The original re-tested `outputs.ndim == 1` after this point;
            # that branch was unreachable because outputs is always 2-D here,
            # so it has been dropped without changing behaviour.
            sample = self.trng.multinomial(pvals=outputs,
                                           dtype='int64').argmax(axis=-1)

            log_prob = -T.log(T.diag(outputs.T[sample]))
            return sample, log_prob, hd

    def sampling_step(self, *args):
        """One scan step of the naive sampler (see :meth:`build_sampler`)."""
        args = iter(args)

        # Arguments that correspond to scan's "sequences" parameter:
        step_num = next(args)
        assert step_num.ndim == 0

        # Arguments that correspond to scan's "outputs" parameter:
        prev_word = next(args)
        assert prev_word.ndim == 1

        # skip the previous word log probability
        log_prob = next(args)
        assert log_prob.ndim == 1

        prev_h = next(args)
        assert prev_h.ndim == 2

        prev_hs = next(args)
        assert prev_hs.ndim == 2

        prev_hd = next(args)
        assert prev_hd.ndim == 2

        # When we sample we shall recompute the encoder for one step...
        encoder_args = dict(prev_hs=prev_hs, prev_h=prev_h)
        h, hs = self.parent.encoder.build_encoder(prev_word, **encoder_args)

        assert h.ndim == 2
        assert hs.ndim == 2

        # ...and decode one step.
        sample, log_prob, hd = self.build_decoder(hs,
                                                  prev_word,
                                                  prev_hd=prev_hd,
                                                  step_num=step_num,
                                                  mode=Decoder.SAMPLING)

        assert sample.ndim == 1
        assert log_prob.ndim == 1
        assert hd.ndim == 2

        return [sample, log_prob, h, hs, hd]

    def build_sampler(self, n_samples, n_steps):
        """Build the scan that draws ``n_samples`` sequences of ``n_steps``.

        Returns ``((samples, log_probs), updates)``; the updates must be
        passed to ``theano.function`` so the random stream advances.
        """
        # For the naive sampler, the states are:
        # 1) a vector [</q>] * n_samples to seed the sampling
        # 2) a vector of [ 0. ] * n_samples for the log_probs
        # 3) prev_h hidden layers
        # 4) prev_hs hidden layers
        # 5) prev_hd hidden layers
        states = [
            T.alloc(np.int64(self.eoq_sym), n_samples),
            T.alloc(np.float32(0.), n_samples),
            T.alloc(np.float32(0.), n_samples, self.qdim),
            T.alloc(np.float32(0.), n_samples, self.sdim),
            T.alloc(np.float32(0.), n_samples, self.qdim)
        ]
        outputs, updates = theano.scan(
            self.sampling_step,
            outputs_info=states,
            sequences=[T.arange(n_steps, dtype='int64')],
            n_steps=n_steps,
            name="sampler_scan")
        # Return sample, log_probs and updates (for tnrg multinomial)
        return (outputs[0], outputs[1]), updates

    def gated_step(self, xd_t, m_t, hs_t, hd_tm1):
        """Gated recurrent step; returns ``(hd_t, rd_t, zd_t, hd_tilde)``."""
        if m_t.ndim >= 1:
            m_t = m_t.dimshuffle(0, 'x')

        hd_tm1 = (m_t) * hd_tm1 + (
            1 - m_t) * T.tanh(T.dot(hs_t, self.Wd_s_0) + self.bd_s_0)
        # ^ iff x_{t - 1} = </s> (m_t = 0) then x_{t - 1} = 0
        # and hd_{t - 1} = tanh(W_s_0 hs_t + bd_s_0) else hd_{t - 1} is left unchanged (m_t = 1)

        # In the 'all' decoder bias type each hidden state of the decoder
        # RNN receives the hs_t vector as bias without modification;
        # otherwise the decoder is not biased at every step (forcing it to
        # store very useful information in the first state).
        bias_all = self.decoder_bias_type == 'all'

        r_pre = T.dot(xd_t, self.Wd_in_r) + T.dot(hd_tm1, self.Wd_hh_r)
        if bias_all:
            r_pre = r_pre + T.dot(hs_t, self.Wd_s_r)
        rd_t = T.nnet.sigmoid(r_pre + self.bd_r)

        z_pre = T.dot(xd_t, self.Wd_in_z) + T.dot(hd_tm1, self.Wd_hh_z)
        if bias_all:
            z_pre = z_pre + T.dot(hs_t, self.Wd_s_z)
        zd_t = T.nnet.sigmoid(z_pre + self.bd_z)

        h_pre = T.dot(xd_t, self.Wd_in) + T.dot(rd_t * hd_tm1, self.Wd_hh)
        if bias_all:
            h_pre = h_pre + T.dot(hs_t, self.Wd_s_q)
        hd_tilde = self.query_rec_activation(h_pre + self.bd_hh)

        hd_t = (np.float32(1.) - zd_t) * hd_tm1 + zd_t * hd_tilde
        return (hd_t, rd_t, zd_t, hd_tilde)

    def plain_step(self, xd_t, m_t, hs_t, hd_tm1):
        """Plain recurrent step; returns the 1-tuple ``(hd_t,)``.

        Raises
        ------
        ValueError
            If ``self.decoder_bias_type`` is neither ``'first'`` nor
            ``'all'`` (previously this surfaced as an unbound local).
        """
        if m_t.ndim >= 1:
            m_t = m_t.dimshuffle(0, 'x')

        # We already assume that xd are zeroed out
        hd_tm1 = (m_t) * hd_tm1 + (
            1 - m_t) * T.tanh(T.dot(hs_t, self.Wd_s_0) + self.bd_s_0)
        # ^ iff x_{t - 1} = </s> (m_t = 0) then x_{t-1} = 0
        # and hd_{t - 1} = tanh(W_s_0 hs_t + bd_s_0) else hd_{t - 1} is left unchanged (m_t = 1)

        if self.decoder_bias_type == 'first':
            # Do not bias all the decoder (force to store very useful information in the first state)
            hd_t = self.query_rec_activation(
                T.dot(xd_t, self.Wd_in) + T.dot(hd_tm1, self.Wd_hh) +
                self.bd_hh)
        elif self.decoder_bias_type == 'all':
            hd_t = self.query_rec_activation(
                T.dot(xd_t, self.Wd_in) + T.dot(hd_tm1, self.Wd_hh) +
                T.dot(hs_t, self.Wd_s_q) + self.bd_hh)
        else:
            raise ValueError(
                "plain_step does not support decoder_bias_type={!r}".format(
                    self.decoder_bias_type))
        return (hd_t, )
class RNNsearch(model):
    '''
        The attention-based NMT model
    '''

    def __init__(self, config, name='', fls=None):
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.fls = fls
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def sampling_step(self, state, prev, context):
        '''
            Build the computational graph which samples the next word.

            :type state: theano variables
            :param state: the previous hidden state

            :type prev: theano variables
            :param prev: the last generated word

            :type context: theano variables
            :param context: the context vectors.

            :returns: the new hidden state, the sampled word and the
                      distribution it was drawn from
        '''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs,
                                       dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
            Build the decoder graph for sampling.

            :type state_init: theano variables
            :param state_init: the initial state of decoder

            :type c: theano variables
            :param c: the context vectors

            :type length: int
            :param length: the limitation of sample length

            :type n_samples: int
            :param n_samples: the number of samples

            :returns: the sampled words, the per-step distributions and the
                      scan updates (needed to advance the random stream)
        '''
        state = tensor.repeat(state_init, n_samples,
                              axis=0)  # copy state n times
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        # NOTE: the full per-step distributions are returned; an unused
        # flat-index computation that used to sit here has been removed.
        return samples, probs, updates

    def build(self, verbose=False):
        '''
            Build the computational graph.

            :type verbose: bool
            :param verbose: only set to True on visualization
        '''
        config = self.config
        use_mrt = 'MRT' in config and config['MRT'] is True

        # create layers
        logging.info('Initializing layers')
        self.emb_src = self.creater.createLookupTable(self.name + 'emb_src',
                                                      config['num_vocab_src'],
                                                      config['dim_emb_src'],
                                                      offset=True)
        self.emb_trg = self.creater.createLookupTable(self.name + 'emb_trg',
                                                      config['num_vocab_trg'],
                                                      config['dim_emb_trg'],
                                                      offset=True)
        self.encoderGRU = self.creater.createGRU(self.name + 'GRU_enc',
                                                 config['dim_emb_src'],
                                                 config['dim_rec_enc'],
                                                 verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(
            self.name + 'GRU_enc_back',
            config['dim_emb_src'],
            config['dim_rec_enc'],
            verbose=verbose)
        self.decoderGRU = self.creater.createGRU_attention(
            self.name + 'GRU_dec',
            config['dim_emb_trg'],
            2 * config['dim_rec_enc'],
            config['dim_rec_dec'],
            config['num_vocab_trg'],
            verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer',
            config['dim_rec_enc'],
            config['dim_rec_dec'],
            offset=True)

        if self.fls:
            # Concatenate the weights of all feature layers into one
            # trainable vector.
            fl_weight = numpy.concatenate(
                [fl.feature_weight for fl in self.fls])
            self.feature_weight = theano.shared(fl_weight.astype('float32'),
                                                name="feature_weight")
            self.creater.params += [self.feature_weight]
            # add a broadcastable leading axis: (n,) --> (1, n)
            self.feature_weight_dim = self.feature_weight.dimshuffle('x', 0)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix(
            'x_mask', dtype='float32')  # size: (length, batchsize)
        self.y = tensor.matrix('y', dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix(
            'y_mask', dtype='float32')  # size: (length, batchsize)

        if use_mrt:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y, self.ymask, self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [self.x, self.xmask, self.y, self.ymask]

        if config['PR']:
            self.ans = tensor.scalar('ans', dtype='int64')
            self.features = tensor.matrix('features', dtype='float32')
            self.inputs += [self.features, self.ans]

        # create computational graph for training
        logging.info('Building computational graph')
        # ----encoder-----
        emb = self.emb_src.forward(
            self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb,
            self.x.shape[0],
            batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]  # only hiddens
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate(
            (context_forward, context_backward),
            axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(
            self.y.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb,
            self.y.shape[0],
            self.context,
            state_init=self.state_init,
            batch_size=self.y.shape[1],
            mask=self.ymask,
            cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)

        # compute costs and grads
        y_flat = self.y.flatten()
        y_idx = tensor.arange(
            y_flat.shape[0]) * self.config['num_vocab_trg'] + y_flat
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y.shape[0], self.y.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)

        if use_mrt:
            # Normalise the scaled sentence costs into a distribution and
            # weight the supplied per-sample losses with it.
            tmp = self.cost_per_sample * config['MRT_alpha']
            tmp = tmp - tmp.min()
            tmp = tensor.exp(-tmp)
            tmp = tmp / tmp.sum()
            tmp = tmp * self.MRTLoss
            self.cost = -tmp.sum()
        elif config['PR'] and self.fls:
            # calculate p
            self.cost_per_sample = self.cost_per_sample * config['alpha_PR']
            cost_min = self.cost_per_sample - self.cost_per_sample.min()
            probs = tensor.exp(-cost_min)
            log_probs = -cost_min - tensor.log(probs.sum())
            self.probs = log_probs

            # calculate q
            energy_q = self.features * self.feature_weight_dim
            energy_q = energy_q.sum(axis=1)
            self.energy_q = energy_q
            energy_q_min = energy_q - energy_q.max()
            probs_q = tensor.exp(energy_q_min)
            log_probs_q = energy_q_min - tensor.log(probs_q.sum())
            self.probs_q = log_probs_q

            # calculate KL divergence
            cost_KL = tensor.exp(log_probs_q) * (log_probs_q - log_probs)
            self.cost_KLs = cost_KL
            self.cost_KL = cost_KL.sum()
            self.cost_NMT = self.cost_per_sample[self.ans]
            self.cost = config['lambda_PR'] * self.cost_KL + config[
                'lambda_MLE'] * self.cost_NMT
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(
            self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])

        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        self.samples, self.probs_sample, self.updates_sample = \
            self.decode_sample(state_init_sample, context_sample,
                               self.length_sample, self.n_samples)

        # parameter for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode',
                                             dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
            Encode source sentence to context vector.
        '''
        if not hasattr(self, "encoder"):
            # compiled lazily on first use
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
            Get the embedding of target sentence.
        '''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
            Get the initial decoder hidden state with context vector.
        '''
        if not hasattr(self, "get_initer"):
            # `context_backward` is a local of build() and is not visible
            # here (the original raised NameError). The backward encoder
            # states are the second half of the last axis of self.context,
            # so they are recovered by slicing.
            dim_enc = self.config['dim_rec_enc']
            context_backward = self.context[:, :, dim_enc:]
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(context_backward[0])])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
            Encode source sentence to context vectors and get the initial decoder hidden state.
        '''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
            Get the probability of the next target word.
        '''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[
                    self.context_decode, self.state_decode, self.emb_decode
                ],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, emb):
        '''
            Get the next hidden state.
        '''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[self.c_decode, self.state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
            Get the negative log-likelihood of parallel sentences.
        '''
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
            Get sampling results.
        '''
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
            Get the attention weight of parallel sentences.
        '''
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
            Get the hidden states essential for visualization
        '''
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                tuple(self.decode[0]) + tuple(self.decode[1:]))
        layers = self.get_layerer(x, xmask, y, ymask)

        enc_names = [
            'h', 'gate', 'reset', 'state', 'reseted', 'state_in', 'gate_in',
            'reset_in'
        ]
        dec_names = [
            'h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate',
            'reset_cin', 'reset_preactive', 'reset', 'state_cin', 'reseted',
            'state_preactive', 'state'
        ]
        dec_names += [
            'outenergy', 'state_in', 'gate_in', 'reset_in', 'state_in_prev',
            'readout', 'maxout', 'outenergy_1', 'outenergy_2'
        ]
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]

        # Outputs are matched to names by position; an empty name would
        # mean "do not export this output".
        result = {}
        for i, layer in enumerate(layers):
            if value_name[i] != '':
                result[value_name[i]] = layer
        return result
class Categorical(Distribution):
    """Categorical distribution parameterised by a probability vector.

    Distribution info is a dict with the single key ``"prob"``. All
    reductions run over the last axis (the category axis), so the usual
    N * A layout and other batch ranks are both handled.
    """

    def __init__(self, dim):
        self._dim = dim
        self._srng = RandomStreams()

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * A
        log_ratio = TT.log(old_prob_var + TINY) - TT.log(new_prob_var + TINY)
        return TT.sum(old_prob_var * log_ratio, axis=-1)

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        log_ratio = np.log(old_prob + TINY) - np.log(new_prob + TINY)
        return np.sum(old_prob * log_ratio, axis=-1)

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars,
                             new_dist_info_vars):
        """Symbolic ratio new/old of the probabilities assigned to x_var.

        x_var is multiplied elementwise with the probabilities
        (presumably a one-hot encoding -- confirm against callers).
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        x_var = TT.cast(x_var, 'float32')
        # Assume layout is N * A
        new_likelihood = TT.sum(new_prob_var * x_var, axis=-1) + TINY
        old_likelihood = TT.sum(old_prob_var * x_var, axis=-1) + TINY
        return new_likelihood / old_likelihood

    def entropy(self, info):
        """Entropy of each distribution, reduced over the last axis."""
        probs = info["prob"]
        # axis=-1 (rather than 1) for consistency with kl / likelihood;
        # identical for the N * A layout.
        return -np.sum(probs * np.log(probs + TINY), axis=-1)

    def entropy_sym(self, dist_info_vars):
        """Symbolic entropy, reduced over the last axis."""
        prob_var = dist_info_vars["prob"]
        return -TT.sum(prob_var * TT.log(prob_var + TINY), axis=-1)

    def log_likelihood_sym(self, x_var, dist_info_vars):
        """Symbolic log-probability assigned to x_var."""
        probs = dist_info_vars["prob"]
        # Assume layout is N * A
        return TT.log(
            TT.sum(probs * TT.cast(x_var, 'float32'), axis=-1) + TINY)

    def log_likelihood(self, xs, dist_info):
        """Log-probability of the category indices recovered from ``xs``."""
        probs = dist_info["prob"]
        # Assume layout is N * A
        n = probs.shape[0]
        return np.log(probs[np.arange(n), from_onehot(np.asarray(xs))] + TINY)

    def sample_sym(self, dist_info):
        """Symbolic multinomial draw (dtype uint8) from ``dist_info["prob"]``."""
        probs = dist_info["prob"]
        return self._srng.multinomial(pvals=probs, dtype='uint8')

    @property
    def dist_info_keys(self):
        return ["prob"]
class ParticleFilter():
    ''' Implements particle filtering and smoothing for Markov Chains
    with arbitrary proposal/true distributions.

    Particles and weights are kept in shared variables holding
    n_history+1 time slots that are indexed modulo n_history+1 by a
    shared time counter. Most methods build symbolic Theano expressions
    and update dictionaries; the compiled callables are created in
    __init__ and recompile.
    '''

    def __init__(self, transition_model, observation_model, n_particles, observation_input=None, n_history=1):
        '''Allocates the particle/weight buffers and compiles the
        functions that do not depend on the proposal distribution.

        transition_model: object providing output_dims, rel_log_prob and
            get_samples_noprobs
        observation_model: object providing output_dims, rel_log_prob and
            get_samples_noprobs
        n_particles: number of particles per time slot
        observation_input: symbolic observation handed to sample_update
            and sr_step (presumably a vector, since it is dimshuffled
            with ('x',0) -- TODO confirm)
        n_history: number of past time slots kept besides the current one
        '''
        self.transition_model=transition_model
        self.observation_model=observation_model
        self.data_dims=observation_model.output_dims
        self.state_dims=transition_model.output_dims
        self.n_particles=n_particles
        self.n_history=n_history
        #this is used to keep track of what set of particles corresponds
        #to the previous point in time
        self.time_counter=theano.shared(0)
        self.theano_rng=RandomStreams()
        #init_particles=np.zeros((n_history+1, n_particles, self.state_dims)).astype(np.float32)
        init_particles=np.random.randn(n_history+1, n_particles, self.state_dims).astype(np.float32)
        #every slot starts with uniform weights 1/n_particles
        init_weights=(np.ones((n_history+1, n_particles))/float(n_particles)).astype(np.float32)
        self.particles=theano.shared(init_particles)
        self.weights=theano.shared(init_weights)
        #symbolic views of the circular buffers at the next, current and
        #previous slot
        self.next_state=self.particles[(self.time_counter+1)%(self.n_history+1)]
        self.current_state=self.particles[self.time_counter%(self.n_history+1)]
        self.previous_state=self.particles[(self.time_counter-1)%(self.n_history+1)]
        self.next_weights=self.weights[(self.time_counter+1)%(self.n_history+1)]
        self.current_weights=self.weights[self.time_counter%(self.n_history+1)]
        self.previous_weights=self.weights[(self.time_counter-1)%(self.n_history+1)]
        self.proposal_distrib=None
        self.true_log_transition_probs=self.transition_model.rel_log_prob
        self.true_log_observation_probs=self.observation_model.rel_log_prob
        #these are compiled later by recompile()
        self.perform_inference=None
        self.resample=None
        self.sample_joint=None
        self.observation_input=observation_input
        ess=self.compute_ESS()
        self.get_ESS=theano.function([],ess)
        n_samps=T.lscalar()
        n_T=T.lscalar()
        data_samples, state_samples, init_state_samples, data_sample_updates=self.sample_future(n_samps,n_T)
        self.sample_from_future=theano.function([n_samps, n_T],[data_samples,state_samples,init_state_samples],updates=data_sample_updates)
        self.get_current_particles=theano.function([],self.current_state)
        self.get_current_weights=theano.function([],self.current_weights)

    def recompile(self):
        '''This function compiles each of the theano functions that
        might change following a change of the model.

        sample_update unpacks self.proposal_distrib, so set_proposal
        has to be called before this method.
        '''
        samp_updates=self.sample_update(self.observation_input)
        self.perform_inference=theano.function([],updates=samp_updates)
        res_updates=self.resample_update()
        self.resample=theano.function([],updates=res_updates)
        nsamps=T.lscalar()
        joint_samples, joint_updates=self.sample_from_joint(nsamps)
        self.sample_joint=theano.function([nsamps],joint_samples,updates=joint_updates)
        new_ess, stddevhist, esshist, sr_updates=self.sequential_resample()
        self.perform_sequential_resampling=theano.function([],[new_ess,stddevhist,esshist],updates=sr_updates)
        csamps=self.sample_current(nsamps)
        self.sample_current_state=theano.function([nsamps],csamps)
        psamps=self.sample_prev(nsamps)
        self.sample_previous_state=theano.function([nsamps],psamps)
        return

    def set_proposal(self, proposal_distrib):
        '''Stores the proposal distribution; sample_update unpacks it as
        a (proposal_samples, log_proposal_probs) pair.'''
        self.proposal_distrib=proposal_distrib
        return

    def set_true_log_transition_probs(self, true_log_transition_probs):
        '''Replaces the callable used for the log transition
        probabilities.'''
        self.true_log_transition_probs=true_log_transition_probs
        return

    def set_true_log_observation_probs(self, true_log_observation_probs):
        '''Replaces the callable used for the log observation
        probabilities.'''
        self.true_log_observation_probs=true_log_observation_probs
        return

    def sample_update(self, data):
        '''Builds the updates for one filtering step.

        The proposal samples are reweighted by
        transition + observation - proposal log probabilities, scaled by
        the current weights and normalized. The result is written into
        the next slot of the buffers and the time counter is advanced.

        data: symbolic observation
        Returns an OrderedDict of updates.
        '''
        proposal_samples, log_proposal_probs=self.proposal_distrib
        #debugging switch: it is hard-coded to False, so only the else
        #branch below is ever used
        printing=False
        if printing:
            log_transition_probs=theano.printing.Print('1 log transition probs update')(self.true_log_transition_probs(self.current_state, proposal_samples))
            log_observation_probs=theano.printing.Print('2 log observation probs update')(self.true_log_observation_probs(proposal_samples, data.dimshuffle('x',0)))
            log_unnorm_weights=theano.printing.Print('3 log unnorm weights update')(log_transition_probs + log_observation_probs - log_proposal_probs)
            log_unnorm_weights_center=theano.printing.Print('4 log unnorm weights center update')(log_unnorm_weights-T.max(log_unnorm_weights))
            unnorm_weights=theano.printing.Print('5 unnorm weights update')(T.exp(log_unnorm_weights_center)*self.current_weights)
            normalizer=theano.printing.Print('6 normalizer update')(T.sum(unnorm_weights))
        else:
            log_transition_probs=self.true_log_transition_probs(self.current_state, proposal_samples)
            log_observation_probs=self.true_log_observation_probs(proposal_samples, data.dimshuffle('x',0))
            log_unnorm_weights=log_transition_probs + log_observation_probs - log_proposal_probs
            #the maximum is subtracted before exponentiating
            log_unnorm_weights_center=log_unnorm_weights-T.max(log_unnorm_weights)
            unnorm_weights=T.exp(log_unnorm_weights_center)*self.current_weights
            normalizer=T.sum(unnorm_weights)
        weights=unnorm_weights/normalizer
        updates=OrderedDict()
        updates[self.weights]=T.set_subtensor(self.next_weights, weights)
        updates[self.particles]=T.set_subtensor(self.next_state, proposal_samples)
        updates[self.time_counter]=self.time_counter+1
        return updates

    def compute_ESS(self):
        '''Returns the symbolic quantity 1/sum(w**2) computed from the
        current weights.'''
        return 1.0/T.sum(self.current_weights**2)

    def resample_update(self):
        '''Builds the updates that resample the current particles
        according to their weights and reset the current weights to
        1/n_particles.'''
        #shape: n_particles by n_particles
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),self.n_particles,axis=0))
        #the dot product with arange turns each sampled row into an index
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        updates=OrderedDict()
        updates[self.particles]=T.set_subtensor(self.current_state, self.current_state[idxs])
        updates[self.weights]=T.set_subtensor(self.current_weights, T.cast(T.ones_like(self.current_weights)/float(self.n_particles),'float32'))
        return updates

    def sample_step(self, future_samps, t, n_samples):
        '''Scan step used by sample_from_joint.

        Given samples at the following time point, draws particles from
        the slot t steps before the current one, with probabilities
        proportional to exp(transition log prob)*weight.

        n_samples is received from scan but is not used in the body.
        Returns [samples for that slot, t+1].
        '''
        particles_now=self.particles[(self.time_counter-t)%(self.n_history+1)]
        weights_now=self.weights[(self.time_counter-t)%(self.n_history+1)]
        #n_particles by n_samples
        rel_log_probs=self.true_log_transition_probs(particles_now, future_samps, all_pairs=True)
        unnorm_probs=T.exp(rel_log_probs)*weights_now.dimshuffle(0,'x')
        probs=unnorm_probs/T.sum(unnorm_probs, axis=0).dimshuffle('x',0)
        samps=self.theano_rng.multinomial(pvals=probs.T)
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        output_samples=particles_now[idxs]
        return [output_samples, t+1]

    def sample_from_joint(self, n_samples, output_2D=False):
        '''Samples from the joint posterior P(s_t-n_history:s_t | observations)

        n_samples: the number of samples to draw
        output_2D: if True, the samples are reshaped to
            ((n_history+1)*n_samples, state_dims)

        Returns (samples, updates). Unless output_2D is set, samples has
        shape (n_history+1, n_samples, state_dims), where samples[-1]
        corresponds to the current time. updates are the scan updates.
        '''
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        samps_t0=self.current_state[idxs]
        t0=T.as_tensor_variable(1)
        [samples, ts], updates = theano.scan(fn=self.sample_step, outputs_info=[samps_t0, t0], non_sequences=[n_samples], n_steps=self.n_history)
        #the variable "samples" that results from the scan is time-flipped
        #in the sense that samples[0] corresponds to the most recent point
        #in time, and higher indices correspond to points in the past.
        #I will stick to the convention that for any collection of points in
        #time, [-1] will index the most recent time, and [0] will index
        #the point farthest in the past. So, the first axis of "samples"
        #needs to be flipped.
        flip_idxs=T.cast(-T.arange(self.n_history)+self.n_history-1,'int64')
        samples=T.concatenate([samples[flip_idxs], samps_t0.dimshuffle('x',0,1)], axis=0)
        if output_2D:
            samples=T.reshape(samples, ((self.n_history+1)*n_samples, self.state_dims))
        return samples, updates

    def sample_future(self, n_samples, n_T):
        '''Samples from the "future" data distribution:
        P(s_t+1,...s_t+n_T, x_t+1,...x_t+n_T | s_t)

        n_samples: number of samples to draw
        n_T: the number of (future) time points to sample from

        Returns (data_samples, state_samples, samps_t0, updates). The
        first two have shapes (n_T, n_samples, data_dims) and
        (n_T, n_samples, state_dims), corresponding to samples of future
        observations and states, and the third has size
        (n_samples,state_dims), corresponding to the "initial" samples
        taken from the current state distribution. updates are the scan
        updates.
        '''
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        samps_t0=self.current_state[idxs]
        def fstep(states):
            #one step forward: sample the next states, then data from them
            next_states=self.transition_model.get_samples_noprobs(states)
            next_data=self.observation_model.get_samples_noprobs(next_states)
            return next_states, next_data
        [state_samples, data_samples], updates = theano.scan(fn=fstep, outputs_info=[samps_t0, None], n_steps=n_T)
        #data_samples=self.observation_model.get_samples_noprobs(state_samples)
        return data_samples, state_samples, samps_t0, updates

    def sample_model(self, n_samples, n_T):
        '''Runs the transition model forward for n_T steps, starting from
        samples of the current state distribution, and samples data from
        the final states only.

        n_samples: number of samples to draw
        n_T: the number of (future) time points to step through

        Returns (data_sample, last_states, previous_states, updates):
        a data sample drawn from the final states, the final states
        (state_samples[-1]), the states one step earlier
        (state_samples[-2]) and the scan updates.
        NOTE(review): state_samples[-2] presumably needs n_T >= 2 --
        confirm.
        '''
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        samps_t0=self.current_state[idxs]
        state_samples, updates = theano.scan(fn=self.transition_model.get_samples_noprobs, outputs_info=[samps_t0], n_steps=n_T)
        data_sample=self.observation_model.get_samples_noprobs(state_samples[-1])
        return data_sample, state_samples[-1], state_samples[-2], updates

    def sr_step(self, means, weights, stddev, ess, decay):
        '''Scan step used by sequential_resample.

        Draws new particles from a mixture of gaussians centered on the
        resampled means, reweights them against the previous particles
        and the observation, and adapts the proposal stddev.

        ess is the carried output of the previous step and is not read
        in the body.
        Returns [proposal_samples, new_weights, new_stddev, new_ess].
        '''
        #Sampling from a mixture of gaussians
        msamps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(weights.dimshuffle('x',0),means.shape[0],axis=0))
        idxs=T.cast(T.dot(msamps, T.arange(means.shape[0])),'int64')
        sample_means=T.cast(means[idxs],'float32')
        proposal_samples=self.theano_rng.normal(size=means.shape)*stddev.dimshuffle('x',0)+sample_means
        diffs=proposal_samples.dimshuffle(0,'x',1)-sample_means.dimshuffle('x',0,1)
        #debugging switch: it is hard-coded to False, so only the else
        #branch below is ever used
        printing=False
        if printing:
            log_proposal_probs=theano.printing.Print('1 log_proposal_probs')(T.log(T.dot(T.exp(-T.sum((1.0/(2.0*stddev**2)).dimshuffle('x','x',0)*diffs**2,axis=2)),weights)))
            log_transition_probs=theano.printing.Print('2 log transition probs')(self.true_log_transition_probs(self.previous_state, proposal_samples, all_pairs=True))
            log_transition_probs_2=theano.printing.Print('3 log transition probs 2')(T.log(T.dot(T.exp(log_transition_probs).T,self.previous_weights)))
            log_observation_probs=theano.printing.Print('4 log observation probs')(self.true_log_observation_probs(proposal_samples, self.observation_input.dimshuffle('x',0)))
            log_unnorm_weights=theano.printing.Print('5 log unnorm weights nomax')(log_transition_probs_2 + log_observation_probs - log_proposal_probs)
            log_unnorm_weights=theano.printing.Print('6 log unnorm weights')(log_unnorm_weights-T.max(log_unnorm_weights))
            unnorm_weights=theano.printing.Print('7 unnorm weights')(T.exp(log_unnorm_weights))
            normalizer=theano.printing.Print('8 normalizer')(T.sum(unnorm_weights))
        else:
            log_proposal_probs=T.log(T.dot(T.exp(-T.sum((1.0/(2.0*stddev**2)).dimshuffle('x','x',0)*diffs**2,axis=2)),weights))
            log_transition_probs=self.true_log_transition_probs(self.previous_state, proposal_samples, all_pairs=True)
            #combine the pairwise transition terms using the previous weights
            log_transition_probs=T.log(T.dot(T.exp(log_transition_probs).T,self.previous_weights))
            log_observation_probs=self.true_log_observation_probs(proposal_samples, self.observation_input.dimshuffle('x',0))
            log_unnorm_weights=log_transition_probs + log_observation_probs - log_proposal_probs
            log_unnorm_weights=log_unnorm_weights-T.max(log_unnorm_weights)
            unnorm_weights=T.exp(log_unnorm_weights)
            normalizer=T.sum(unnorm_weights)
        new_weights=unnorm_weights/normalizer
        new_ess=1.0/T.sum(new_weights**2)
        #weighted mean and variance of the new particles
        sampmean=T.dot(proposal_samples.T, new_weights)
        sampvar=T.dot(((proposal_samples-sampmean.dimshuffle('x',0))**2).T,new_weights)
        #propmean=T.mean(proposal_samples, axis=0)
        #propvar=T.mean((proposal_samples-propmean.dimshuffle('x',0))**2,axis=0)
        #new_stddev=stddev*T.clip(T.exp(decay*(1.0-propvar/sampvar)),0.5,2.0)
        #new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,2.0),0.0,4.0)
        #the multiplicative change is clipped to [0.5,1.5] and the
        #resulting stddev to [0.0,4.0]
        new_stddev=T.clip(stddev*T.clip(T.exp(decay*(1.0-stddev**2/sampvar)),0.5,1.5),0.0,4.0)
        return [proposal_samples, new_weights, new_stddev, T.cast(new_ess,'float32')]#, theano.scan_module.until(new_ess>100)

    def sequential_resample(self, init_stddev=4.0, max_steps=20, stddev_decay=0.1):
        '''Repeatedly resamples and then samples from a proposal
        distribution constructed from the current samples. Should be
        used when the main proposal distribution is poor or whenever the
        ESS is poor.

        init_stddev: initial proposal stddev, used for every state dimension
        max_steps: number of sr_step iterations run by the scan
        stddev_decay: passed to sr_step as decay

        Returns (ess of the final weights, stddev history, ess history,
        updates); the updates write the final samples and weights into
        the current slot.
        '''
        essT=T.as_tensor_variable(np.asarray(0.0,dtype='float32'))
        stddevT=T.as_tensor_variable(np.asarray(init_stddev*np.ones(self.state_dims),dtype='float32'))
        decayT=T.as_tensor_variable(np.asarray(stddev_decay,dtype='float32'))
        [samphist, weighthist, stddevhist, esshist], updates = theano.scan(fn=self.sr_step, outputs_info=[self.current_state, self.current_weights, stddevT, essT], non_sequences=decayT, n_steps=max_steps)
        end_samples=samphist[-1]
        end_weights=weighthist[-1]
        updates[self.particles]=T.set_subtensor(self.current_state, end_samples)
        updates[self.weights]=T.set_subtensor(self.current_weights, end_weights)
        return 1.0/T.sum(end_weights**2), stddevhist, esshist, updates

    def sample_current(self, nsamps):
        '''Returns nsamps particles drawn from the current slot with
        probabilities given by the current weights.'''
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),nsamps,axis=0))
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        samples=self.current_state[idxs]
        return samples

    def sample_prev(self, nsamps):
        '''Returns nsamps particles drawn from the previous slot with
        probabilities given by the previous weights.'''
        samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.previous_weights.dimshuffle('x',0),nsamps,axis=0))
        idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
        samples=self.previous_state[idxs]
        return samples

    def get_history(self):
        '''This function returns a 3-D array containing all the
        particles and a 2-D array of weights for the entire memory. The
        first dimension indexes time, with the zeroth entry
        corresponding to the earliest point in memory.'''
        idxs=(T.arange(self.n_history+1)-self.n_history+self.time_counter)%(self.n_history+1)
        return self.particles[idxs], self.weights[idxs]
class SLmodel():
    """Switching linear model filtered with a particle filter.

    This version adapts the proposal distribution by keeping a running
    estimate of the exact posterior covariance, parametrized as the
    matrix CC'.

    The model assumes one linear generative model (W, c) and a
    combination of nh linear dynamical models (M).  Methods build
    symbolic Theano expressions and update dictionaries; nothing is
    compiled inside this class.
    """

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):
        """Allocate parameters, proposal covariance root and particles.

        nx: dimensionality of observed variables
        ns: dimensionality of latent variables
        nh: number of (linear) dynamical modes
        npcl: number of particles in the particle filter
        xvar: observation noise scale; enters expressions as xvar**2
        """
        # generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        # always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))
        # observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')
        # dynamical matrices: nh identity matrices placed side by side
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')
        # state-variable variances
        # (covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')
        # switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')
        # priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)

        # square root of covariance matrix of proposal distribution,
        # initialized to the true root covariance
        init_cov_inv = np.dot(
            init_W.T, init_W) / (xvar**2) + np.eye(ns) * np.exp(-init_b)
        init_cov = spla.inv(init_cov_inv)
        init_C = spla.sqrtm(init_cov)
        # sqrtm may return a complex array; only the real part is kept
        init_C = np.asarray(np.real(init_C), dtype='float32')

        # particles: every particle starts at s=0 in mode 0, uniform weights
        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')
        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.C = theano.shared(init_C)
        # this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))
        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)
        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)
        self.xvar = np.asarray(xvar, dtype='float32')
        self.nx = nx
        self.ns = ns
        self.nh = nh
        self.npcl = npcl

        # for ease of use and efficient computation (these are used a lot)
        self.CCT = T.dot(self.C, self.C.T)
        self.cov_inv = T.dot(
            self.W.T, self.W) / (self.xvar**2) + T.eye(ns) * T.exp(-self.b)
        self.theano_rng = RandomStreams()
        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        # one relative learning rate per entry of self.params
        self.rel_lrates = np.asarray([0.1, 1.0, 1.0, 10.0, 1.0, 1.0],
                                     dtype='float32')
        self.meta_params = [self.C]
        self.meta_rel_lrates = [1.0]

    def sample_proposal_s(self, s, h, xp):
        """Sample latent states from the proposal distribution.

        s, h: current state and mode samples, passed to get_prediction
        xp: observation

        Returns (s_prop, s_pred, prop_term, prop_mean), the first three
        cast to float32.
        """
        s_pred = self.get_prediction(s, h)
        n = self.theano_rng.normal(size=T.shape(s))
        mean_term = T.dot(
            (xp - self.c), self.W) / (self.xvar**2) + s_pred * T.exp(-self.b)
        prop_mean = T.dot(mean_term, self.CCT)
        s_prop = prop_mean + T.dot(n, self.C)
        # the term inside the exponent for the pdf of the proposal distrib
        # NOTE(review): T.sum(n**2) has no axis, so this is one scalar
        # shared by all particles -- confirm whether a per-particle sum
        # (axis=1) was intended.
        prop_term = -T.sum(n**2) / 2.0
        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):
        """Return the mode probabilities for each state sample.

        s: np by ns matrix of s samples
        Returns an np by nh matrix whose rows sum to one (a softmax of
        dot(s, A) + ph over the nh modes).
        """
        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))
        # Re-center each row by its own maximum for numerical stability.
        # keepdims keeps the row maxima as an np by 1 column so that they
        # broadcast across the nh modes; the shift cancels in the
        # normalization below.
        exp_terms_recentered = exp_terms - T.max(
            exp_terms, axis=1, keepdims=True)
        # exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)
        return probs.T

    def forward_filter_step(self, xp):
        """Build the updates for one filtering step on observation xp.

        The current particles are moved to the *_past shared variables
        and replaced by reweighted proposal samples.
        Returns a dict of updates.
        """
        # need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)
        updates = {}
        # now that we have samples from the proposal distribution,
        # we need to reweight them
        h_probs = self.calc_h_probs(s_samps)
        h_samps = self.theano_rng.multinomial(pvals=h_probs)
        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))
        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2,
            axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0
        energies = x_terms + s_terms - prop_terms
        # to avoid exponentiating large or very small numbers, the
        # reweighting factors are "re-centered" by adding a constant,
        # as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)
        alpha = T.exp(energies_recentered)  # these are the reweighting factors
        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer
        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')
        return updates

    def proposal_loss(self, C):
        """Squared Frobenius distance of dot(CC', cov_inv) from identity.

        Measures how far CC' is from the true posterior covariance.
        """
        CCT = T.dot(C, C.T)
        prod = T.dot(CCT, self.cov_inv)
        diff = prod - T.eye(self.ns)
        tot = T.sum(T.sum(diff**2))
        return tot

    def prop_update_step(self, C_now, lr):
        """Scan step: one gradient-descent step on proposal_loss."""
        loss = self.proposal_loss(C_now)
        gr = T.grad(loss, C_now)
        return [C_now - lr * gr]

    def update_proposal_distrib(self, n_steps, lr):
        """Run n_steps of gradient descent on self.C with step size lr.

        Brings self.CCT closer to the true posterior covariance.
        Returns (loss at the final C, updates); the updates include the
        assignment of the final C to self.C.
        """
        C0 = self.C
        Cs, updates = theano.scan(fn=self.prop_update_step,
                                  outputs_info=[C0],
                                  non_sequences=[lr],
                                  n_steps=n_steps)
        updates[self.C] = Cs[-1]
        loss = self.proposal_loss(Cs[-1])
        return loss, updates

    def get_prediction(self, s, h):
        """Predict the next state from state samples s and modes h.

        Returns a float32 matrix that should be np by ns.
        """
        s_dot_M = T.dot(s, self.M)  # this is np by nh*ns
        # each mode indicator in h is repeated ns times so it masks the
        # matching block of s_dot_M; sum_mat then adds the blocks together
        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)
        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):
        """Scan step: draw one (past, now) pair from the joint posterior.

        sp: predictions made from the past particles
        Returns [s1_samp, h1_samp, s2_samp, h2_samp] where index 2 is
        the current particle and index 1 the past particle.
        """
        # draw the current particle as a one-hot column over particles
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        # reweight the past particles by how well they predict s2_samp
        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)
        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def calc_mean_h_energy(self, s, h):
        """Average energy of the given samples of s and h."""
        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  # np by nh
        energies = T.sum(h * exp_terms, axis=1) - T.log(
            T.sum(T.exp(exp_terms), axis=1))
        energy = T.mean(energies)
        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Sample from the joint posterior and take one ascent step.

        x1, x2: past and current observations (x1 only enters
            x1_recons, which the energy does not use)
        n_samps: number of joint samples drawn by the scan
        lrate: learning rate, multiplied by the per-parameter rel_lrates

        Returns (energy, updates).
        """
        sp = self.get_prediction(self.s_past,
                                 self.h_past)  # sp should be np by ns
        [s1_samps, h1_samps, s2_samps, h2_samps
         ], updates = theano.scan(fn=self.sample_joint,
                                  outputs_info=[None, None, None, None],
                                  non_sequences=[sp],
                                  n_steps=n_samps)
        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))
        s_pred = self.get_prediction(s1_samps, h1_samps)
        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        energy = hterm1 + xterm2 + sterm
        # the samples are treated as constants when differentiating
        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])
        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            updates[param] = T.cast(param + gparam * lrate * rel_lr, 'float32')
        return energy, updates

    def get_ESS(self):
        """Symbolic 1/sum(w**2) of the current weights."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Scan step: draw one particle according to the current weights.

        Returns its (s, h) pair cast to float32.
        """
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)
        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Build the updates that resample all npcl particles.

        The current weights are reset to 1/npcl.
        """
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)
        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  # dtype paranoia
        return updates

    def simulate_step(self, s):
        """Scan step: advance a single state s by one time step.

        Returns (predicted state, predicted observation, sampled mode).
        """
        s = T.reshape(s, (1, self.ns))
        # get h probabilities
        h_probs = self.calc_h_probs(s)
        h_samp = self.theano_rng.multinomial(pvals=h_probs)
        sp = self.get_prediction(s, h_samp)
        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))
        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):
        """Simulate n_steps ahead from the weighted mean of the particles.

        Returns (states, observations, modes, updates).
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)
        return sp, xp, hs, updates
class LatentPolicy(BaseNNModule):
    """Latent policy network.

    Policy network takes three inputs and produces a single system
    action embedding. Its use is heavily coupled with decoder.

    ``encode`` builds the symbolic (Theano) graph; ``decide`` and the
    ``_sample_from_*`` helpers are the NumPy counterparts and read the
    ``*_backup`` arrays filled by ``loadConverseParams``.
    """

    def __init__(self, latent_size, learn_mode, belief_size, degree_size,
                 ihidden_size, ohidden_size, tfEncoder, tbEncoder, sfEncoder,
                 sbEncoder):
        """Create the prior, posterior and decoder-embedding parameters.

        latent_size: latent variable dimension
        learn_mode: 'rl' selects prior sampling (top 5); anything else
            selects posterior sampling over all latent_size actions
        belief_size, degree_size, ihidden_size, ohidden_size: sizes of
            the belief, matching-degree, intent and decoder-hidden inputs
        tfEncoder, tbEncoder: target sentence encoders
        sfEncoder, sbEncoder: source sentence encoders
        """
        # latent variable dimension
        self.dl = latent_size
        hidden_size = 100

        def uniform_weight(shape):
            # Shared variable initialised from 0.3 * U(-1, 1) in floatX.
            return theano.shared(
                0.3 * np.random.uniform(-1.0, 1.0, shape).astype(
                    theano.config.floatX))

        # set default sampling mode: posterior, from all actions
        if learn_mode == 'rl':
            self.setSampleMode('prior', 5)
        else:
            self.setSampleMode('posterior', latent_size)
        # random seed
        self.srng = RandomStreams(seed=234)

        # NOTE: the draws below keep their original order so that a
        # seeded np.random produces the same initial weights.
        # decoder input embedding
        self.Wd1 = uniform_weight((latent_size, hidden_size))
        self.Wd2 = uniform_weight((latent_size, hidden_size))
        self.bd1 = theano.shared(2. * np.ones(
            (hidden_size)).astype(theano.config.floatX))
        self.Wd3 = uniform_weight((hidden_size * 2, ohidden_size))
        # for state construction
        # belief to state
        self.Ws1 = uniform_weight((belief_size, hidden_size))
        # matching degree to state
        self.Ws2 = uniform_weight((degree_size, hidden_size))
        # intent to state
        self.Ws3 = uniform_weight((ihidden_size * 2, hidden_size))
        # latent policy parameterisation, state -> action
        self.Wp1 = uniform_weight((hidden_size, hidden_size))
        self.Wp2 = uniform_weight((hidden_size, latent_size))
        self.bp1 = uniform_weight((hidden_size))
        # prior parameters P(z_t|S_t) and P(R_t|z_t)
        # loadConverseParams addresses these entries by position.
        self.params = [
            self.Wd1, self.Wd2, self.bd1, self.Wd3, self.Ws1, self.Ws2,
            self.Ws3, self.Wp1, self.Wp2, self.bp1
        ]

        # approximated posterior parameters Q(z_t|S_t,R_t)
        # sentence encoders
        self.sfEncoder, self.sbEncoder = sfEncoder, sbEncoder
        self.tfEncoder, self.tbEncoder = tfEncoder, tbEncoder
        # belief to posterior
        self.Wq1 = uniform_weight((belief_size, hidden_size))
        # matching degree to posterior
        self.Wq2 = uniform_weight((degree_size, hidden_size))
        # intent to posterior
        self.Wq3 = uniform_weight((ihidden_size * 2, hidden_size))
        # response to posterior
        self.Wq4 = uniform_weight((ihidden_size * 2, hidden_size))
        # MLP 2nd layer
        self.Wq5 = uniform_weight((hidden_size, latent_size))
        # posterior parameters Q(z_t|S_t,R_t)
        self.Qparams = [self.Wq1, self.Wq2, self.Wq3, self.Wq4, self.Wq5]
        self.Qparams.extend(self.tfEncoder.params + self.tbEncoder.params +
                            self.sfEncoder.params + self.sbEncoder.params)
        # add posterior also into parameter set
        self.params.extend(self.Qparams)

        # Reinforce baseline
        self.baseline = ReinforceBaseline(belief_size, degree_size,
                                          ihidden_size)

    def setSampleMode(self, sample_mode, topN):
        """Set the sampling mode ('prior' or 'posterior') and top-N size."""
        self.sample_mode = sample_mode
        self.topN = topN

    def encode(self, belief_t, degree_t, intent_t, masked_source_t,
               masked_source_len_t, masked_target_t, masked_target_len_t,
               utt_group_t, sample_t=None):
        """Build the symbolic policy step.

        Returns (actEmb_t, prior_t[0], posterior_t[0], z_t, b_t,
        posterior_t): the action embedding, the prior and posterior
        distributions, the chosen latent action, the baseline estimate
        and the posterior with its leading axis kept.
        """
        # prepare belief state vector (no gradient flows into the belief)
        belief_t = G.disconnected_grad(T.concatenate(belief_t, axis=0))

        # prior parameterisation
        hidden_t = T.tanh(
            T.dot(belief_t, self.Ws1) + T.dot(degree_t, self.Ws2) +
            T.dot(intent_t, self.Ws3))
        prior_t = T.nnet.softmax(
            T.dot(T.tanh(T.dot(hidden_t, self.Wp1) + self.bp1), self.Wp2))

        # posterior parameterisation
        # response encoding
        target_intent_t = bidirectional_encode(self.tfEncoder, self.tbEncoder,
                                               masked_target_t,
                                               masked_target_len_t)
        source_intent_t = bidirectional_encode(self.sfEncoder, self.sbEncoder,
                                               masked_source_t,
                                               masked_source_len_t)
        # scores before softmax layer
        q_logit_t = T.dot(
            T.tanh(
                T.dot(belief_t, self.Wq1) + T.dot(degree_t, self.Wq2) +
                T.dot(source_intent_t, self.Wq3) +
                T.dot(target_intent_t, self.Wq4)), self.Wq5)
        # the true posterior; built once, used for sampling and returned
        posterior_t = T.nnet.softmax(q_logit_t)

        # utt_group_t is used as the action when it is below dl - 1;
        # otherwise the action is sampled (posterior mode) or taken
        # from sample_t (prior mode).
        if self.sample_mode == 'posterior':
            print('\t\tSampling from posterior ...')
            z_t = T.switch(
                T.lt(utt_group_t, self.dl - 1), utt_group_t,
                G.disconnected_grad(
                    T.argmax(
                        self.srng.multinomial(pvals=posterior_t,
                                              dtype='float32')[0])))
        else:
            # choose to use the current sample or ground truth
            print('\t\tSampling from prior ...')
            z_t = T.switch(T.lt(utt_group_t, self.dl - 1), utt_group_t,
                           sample_t)

        # put sample into decoder to decode
        hidden_t = T.nnet.sigmoid(self.Wd2[z_t, :] + self.bd1) * hidden_t
        actEmb_t = T.tanh(
            T.dot(T.concatenate([T.tanh(self.Wd1[z_t, :]), hidden_t], axis=0),
                  self.Wd3)).dimshuffle('x', 0)

        # compute baseline estimate
        b_t = self.baseline.encode(belief_t, degree_t, source_intent_t,
                                   target_intent_t)
        return actEmb_t, prior_t[0], posterior_t[0], z_t, b_t, posterior_t

    def decide(self, belief_t, degree_t, intent_t, masked_source_t,
               masked_target_t, forced_sample=None):
        """Choose a latent action with NumPy and embed it for the decoder.

        forced_sample: if not None, used as the action (prob_t is None)
        masked_target_t: required (not None) for posterior sampling

        Returns (actEmb_t, z_t, prob_t).
        Raises ValueError when no action can be selected for the
        current sample mode and arguments.
        """
        # prepare belief state vector
        belief_t = np.concatenate(belief_t, axis=0)
        # sample how many actions
        n = 1
        # `is not None` rather than `!= None`: the arguments may be
        # arrays, for which `!=` compares element-wise.
        if forced_sample is not None:
            # forced sampling
            z_t = [forced_sample]
            prob_t = None
        elif self.sample_mode == 'posterior' and masked_target_t is not None:
            # training time, sample from posterior
            z_t, prob_t = self._sample_from_posterior(belief_t, degree_t,
                                                      intent_t,
                                                      masked_source_t,
                                                      masked_target_t)
        elif self.sample_mode == 'prior':
            # testing time, sample from prior
            z_t, prob_t = self._sample_from_prior(belief_t, degree_t, intent_t)
        else:
            raise ValueError(
                'cannot select an action: sample_mode=%r, no forced_sample '
                'and masked_target_t is None' % (self.sample_mode,))

        # state representation
        hidden_t = tanh(
            np.dot(belief_t, self.Ws1_backup) +
            np.dot(degree_t, self.Ws2_backup) +
            np.dot(intent_t, self.Ws3_backup))
        # put sample into decoder to decode
        hidden_t = np.multiply(
            sigmoid(self.Wd2_backup[z_t, :] + self.bd1_backup), hidden_t)
        hidden_t = np.repeat(hidden_t, n, axis=0)
        actEmb_t = tanh(
            np.dot(
                np.concatenate([tanh(self.Wd1_backup[z_t, :]), hidden_t],
                               axis=1), self.Wd3_backup))
        return actEmb_t, z_t, prob_t

    def _sample_from_prior(self, belief_t, degree_t, intent_t):
        """Sample an action from the top-N renormalised prior.

        Returns (z_t, prior over all actions).
        """
        # prior parameterisation
        hidden_t = tanh(
            np.dot(belief_t, self.Ws1_backup) +
            np.dot(degree_t, self.Ws2_backup) +
            np.dot(intent_t, self.Ws3_backup))
        p_logit_t = np.dot(
            tanh(np.dot(hidden_t, self.Wp1_backup) + self.bp1_backup),
            self.Wp2_backup)
        # sampling from prior: keep the topN highest logits
        sortedIndex = np.argsort(p_logit_t)[::-1][:self.topN]
        topN_prior_t = softmax(p_logit_t[sortedIndex])
        z_t = sortedIndex[np.argmax(
            np.random.multinomial(n=1, pvals=topN_prior_t))]
        z_t = np.expand_dims(z_t, axis=0)
        # choose the top N samples
        print('Sample : %s' % z_t)
        print('Prior dist.: %s' % sortedIndex)
        print('probability: %s' % topN_prior_t)
        print('')
        return z_t, softmax(p_logit_t)

    def _sample_from_posterior(self, belief_t, degree_t, intent_t,
                               masked_source_t, masked_target_t):
        """Sample an action from the top-N renormalised posterior.

        Returns (z_t, posterior over all actions).
        """
        # response encoding
        target_intent_t = bidirectional_read(self.tfEncoder, self.tbEncoder,
                                             masked_target_t)
        source_intent_t = bidirectional_read(self.sfEncoder, self.sbEncoder,
                                             masked_source_t)
        # posterior parameterisation
        q_logit_t = np.dot(
            tanh(
                np.dot(belief_t, self.Wq1_backup) +
                np.dot(degree_t, self.Wq2_backup) +
                np.dot(source_intent_t, self.Wq3_backup) +
                np.dot(target_intent_t, self.Wq4_backup)), self.Wq5_backup)
        # sampling from a scaled posterior: keep the topN highest logits
        sortedIndex = np.argsort(q_logit_t)[::-1][:self.topN]
        topN_posterior_t = softmax(q_logit_t[sortedIndex])
        z_t = sortedIndex[np.argmax(
            np.random.multinomial(n=1, pvals=topN_posterior_t))]
        z_t = np.expand_dims(z_t, axis=0)
        print(sortedIndex[:3])
        print(softmax(q_logit_t)[sortedIndex][:3])
        print('Posterior : %s' % sortedIndex)
        print('probability: %s' % topN_posterior_t)
        return z_t, softmax(q_logit_t)

    def loadConverseParams(self):
        """Copy the shared parameters into NumPy ``*_backup`` arrays.

        The positions follow the order in which __init__ fills
        ``self.params``. The four sentence encoders are asked to do the
        same.
        """
        # decoder
        self.Wd1_backup = self.params[0].get_value()
        self.Wd2_backup = self.params[1].get_value()
        self.bd1_backup = self.params[2].get_value()
        self.Wd3_backup = self.params[3].get_value()
        # state
        self.Ws1_backup = self.params[4].get_value()
        self.Ws2_backup = self.params[5].get_value()
        self.Ws3_backup = self.params[6].get_value()
        # latent policy (conditional prior)
        self.Wp1_backup = self.params[7].get_value()
        self.Wp2_backup = self.params[8].get_value()
        self.bp1_backup = self.params[9].get_value()
        # posterior
        self.Wq1_backup = self.params[10].get_value()
        self.Wq2_backup = self.params[11].get_value()
        self.Wq3_backup = self.params[12].get_value()
        self.Wq4_backup = self.params[13].get_value()
        self.Wq5_backup = self.params[14].get_value()
        # posterior sentence encoder
        self.tfEncoder.loadConverseParams()
        self.tbEncoder.loadConverseParams()
        self.sfEncoder.loadConverseParams()
        self.sbEncoder.loadConverseParams()
class SCLmodel():
    """Switched constrained linear (SCL) model tracked with a particle filter.

    One linear generative model (``W``, ``c``) maps the latent state ``s`` to
    the observation ``x``; ``nh`` linear dynamical models (the rows of ``M``)
    advance ``s``, and a one-hot switching variable ``h`` selects which of
    them is active.  The model was designed to eliminate state-space
    'explosions' that can occur when doing prediction - a serious issue in
    the basic SL model.
    """

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):
        """Create the shared parameters and the particle-filter state.

        nx   -- dimensionality of observed variables
        ns   -- dimensionality of latent variables
        nh   -- number of (linear) dynamical modes
        npcl -- number of particles in the particle filter
        xvar -- observation noise scale (stored as a float32 array)
        """
        # generative matrix; columns are normalized to unit length
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        # observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        # dynamical matrices, one flattened ns-by-ns matrix per row
        init_M = np.asarray(np.random.randn(nh, ns**2) / 2.0, dtype='float32')

        # state-variable variances
        # (covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        # means for switching variable
        init_mu = np.asarray(np.random.randn(nh, ns) / 1.0, dtype='float32')

        # (natural log of) the diagonal terms used for the switching
        # variable, stored as an nh-by-ns matrix
        init_A = np.asarray(np.zeros((nh, ns)), dtype='float32')

        # particle state: every particle starts at s=0 in dynamical mode 0
        # with uniform weight
        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.mu = theano.shared(init_mu)

        # Defined once to avoid repeated computation of the exponential of
        # the elements of A and of the normalizing constant for each h.
        self.exp_A = T.exp(self.A)
        self.ln_Z_h = T.reshape(0.5 * T.sum(self.A, axis=1), (nh, 1))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx      # dimensionality of observed variables
        self.ns = ns      # dimensionality of latent variables
        self.nh = nh      # number of (linear) dynamical modes
        self.npcl = npcl  # number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.mu]
        # per-parameter learning-rate multipliers, same order as self.params
        self.rel_lrates = np.asarray([1.0, 1.0, 0.01, 1.0, 1.0, 10.0],
                                     dtype='float32')

    def sample_proposal_s(self, s, h, xpred, sig):
        """Scan step: draw one particle's new state from the proposal.

        Returns (s_prop, s_pred, prop_term), all cast to float32, where
        s_pred is the dynamical prediction for (s, h) and prop_term is the
        term inside the exponent of the proposal pdf for the drawn sample.
        """
        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        # This is the proposal distribution that arises when one assumes
        # that W'W=I
        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig
        s_prop = mean + n * T.sqrt(sig)

        # term inside the exponent for the pdf of the proposal distribution
        prop_term = -T.sum(n**2) / 2.0

        return (T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'),
                T.cast(prop_term, 'float32'))

    def one_h_prob(self, exp_A_i, mu_i, s):
        """Scan-style reference for calc_h_probs: Gaussian term of one mode."""
        smi = s - mu_i  # should be np by ns
        smia = smi * T.reshape(exp_A_i, (1, self.ns))
        gaussian_term = -T.sum(smia * smi, axis=1)
        return gaussian_term

    def calc_h_probs(self, s):
        """Return the switching probabilities of every mode for each row of s.

        s is expected to be a matrix with one state per row; the result has
        one row per dynamical mode and one column per row of s, and each
        column sums to one.
        """
        # Vectorized expansion of sum_j exp_A[i, j] * (s[j] - mu[i, j])**2.
        t1 = T.dot(s * s, self.exp_A.T)
        t2 = -2.0 * T.dot(s, (self.exp_A * self.mu).T)
        t3 = T.sum((self.mu * self.mu) * self.exp_A, axis=1)
        # The Gaussian exponent is the NEGATIVE quadratic form, exactly as in
        # one_h_prob and h_energy_step.  The sign had been lost in the
        # vectorized version, which made distant modes the most probable.
        gterms = -(t1 + t2 + t3).T
        # gterms should be nh by np

        # need to multiply by relative partition functions
        exp_terms = gterms + self.ln_Z_h

        # Re-center each column before exponentiating.  The shift cancels in
        # the normalization below, but it has to actually be used (it used
        # to be computed and then ignored) for the exponentials to stay
        # finite.
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=0,
                                                 keepdims=True)

        # exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs / T.sum(rel_probs, axis=0)

        return probs

    def forward_filter_step(self, xp):
        """Advance the particle filter by one observation xp.

        Returns (h_samps, updates); updates moves the current particles to
        the *_past variables, stores the new samples and weights, and
        carries the random-stream updates produced by the scan.
        """
        # need to sample from the proposal distribution first

        # these terms are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar**2)
        sig = (1.0 / (self.b**2 + 1.0 / (2.0 * self.xvar**2))) / 2.0

        [s_samps, s_pred, prop_terms], updates = theano.scan(
            fn=self.sample_proposal_s,
            outputs_info=[None, None, None],
            sequences=[self.s_now, self.h_now],
            non_sequences=[xpred, sig],
            n_steps=self.npcl)

        # now that we have samples from the proposal distribution, we need
        # to reweight them

        # relative h probabilities for each particle
        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs.T)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum((recons - T.reshape(xp, (self.nx, 1)))**2,
                         axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1)

        energies = x_terms + s_terms - prop_terms

        # to avoid exponentiating large or very small numbers, I
        # "re-center" the reweighting factors by adding a constant,
        # as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  # these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        return h_samps, updates

    def get_prediction(self, s, h):
        """Return M_h . s, where M_h is the h-weighted combination of M's rows."""
        M_vec = T.sum(self.M * T.reshape(h, (self.nh, 1)), axis=0)
        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        return T.cast(sp, 'float32')

    def sample_joint(self, sp):
        """Scan step: draw one (past, current) particle pair.

        A current particle is drawn according to weights_now, then a past
        particle according to weights_past re-weighted by how well its
        prediction (a row of sp) matches the drawn current state.
        Returns [s1_samp, h1_samp, s2_samp, h2_samp].
        """
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T

        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T

        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def h_energy_step(self, s, h):
        """Helper for calc_mean_h_energy: switching energy of one (s, h) pair."""
        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(
            T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i

        return energy

    def calc_mean_h_energy(self, s, h, nsamps):
        """Return the average h_energy_step energy over nsamps samples of (s, h)."""
        energies, updates = theano.scan(fn=self.h_energy_step,
                                        outputs_info=[None],
                                        sequences=[s, h],
                                        n_steps=nsamps)
        energy = T.mean(energies)
        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Sample from the joint posterior and take one gradient-ascent step.

        x1, x2  -- the two consecutive observations the samples are scored on
        n_samps -- number of joint samples to draw
        lrate   -- global learning rate (scaled per parameter by rel_lrates)

        Returns (energy, updates); updates holds the parameter updates
        together with the updates returned by every scan used here.
        """
        sp, updates = theano.scan(fn=self.get_prediction,
                                  outputs_info=[None],
                                  sequences=[self.s_past, self.h_past],
                                  n_steps=self.npcl)
        # sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps], joint_updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)
        # sample_joint draws random numbers, so this scan's updates carry the
        # random-stream state.  They must be merged, not overwritten by the
        # next scan, or every call redraws the same samples.
        updates.update(joint_updates)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred, pred_updates = theano.scan(fn=self.get_prediction,
                                           outputs_info=[None],
                                           sequences=[s1_samps, h1_samps],
                                           n_steps=n_samps)
        updates.update(pred_updates)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps, n_samps)
        hterm2 = self.calc_mean_h_energy(s2_samps, h2_samps, n_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1))

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm

        # the samples are treated as fixed data when differentiating
        gparams = T.grad(
            energy, self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary (gradient ascent on the energy)
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        # NOTE: W is not re-normalized to unit-length columns here, and A
        # might need to be normalized as well.

        return energy, updates

    def get_ESS(self):
        """Return the effective sample size of the current particle weights."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Scan step: draw one particle (s, h) according to weights_now."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)
        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Return updates that resample all particles and reset the weights."""
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        # dtype paranoia
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')
        return updates

    def simulate_step(self, s):
        """Scan step: sample a mode for state s and roll the model forward.

        Returns (sp, xp, h_samp): the next state, its predicted observation
        and the sampled one-hot mode.
        """
        # get h probabilities.  calc_h_probs works on a matrix with one
        # state per row; a bare vector would broadcast against the (nh, 1)
        # ln_Z_h into an nh-by-nh result and break the reshape below.
        h_probs = self.calc_h_probs(T.reshape(s, (1, self.ns)))

        h_samp = self.theano_rng.multinomial(
            pvals=T.reshape(h_probs, (1, self.nh)))

        M_vec = T.sum(self.M * T.reshape(h_samp, (self.nh, 1)), axis=0)

        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        xp = T.dot(self.W, sp) + self.c

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):
        """Simulate n_steps ahead, starting from the weighted mean particle.

        Returns (sp, xp, hs, updates).
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
class StochasticPoolLayer(layers.Layer):
    """2-D stochastic pooling over the last two axes of a 4-D input.

    Each pooling window is turned into a distribution by dividing its
    entries by the window sum.  In stochastic mode one entry per window is
    drawn from that distribution; in deterministic mode the output is the
    probability-weighted sum of the window.
    """

    def __init__(self, incoming, ds, strides=None, ignore_border=False,
                 pad=(0, 0), random_state=42, **kwargs):
        super(StochasticPoolLayer, self).__init__(incoming, **kwargs)
        self.ds = ds
        self.ignore_border = ignore_border
        self.pad = pad
        # stride defaults to the pool size (non-overlapping windows)
        if strides is None:
            self.st = ds
        else:
            self.st = strides

        # anything exposing ``multinomial`` is used directly as the random
        # stream; otherwise random_state is taken as a seed
        self.rng = (random_state if hasattr(random_state, 'multinomial')
                    else RandomStreams(seed=random_state))

    def get_output_shape_for(self, input_shape):
        """Return input_shape with axes 2 and 3 replaced by pooled lengths."""
        pooled = list(input_shape)  # copy / convert to mutable list
        for axis, k in ((2, 0), (3, 1)):
            pooled[axis] = pool_output_length(
                input_shape[axis],
                ds=self.ds[k],
                st=self.st[k],
                ignore_border=self.ignore_border,
                pad=self.pad[k],
            )
        return tuple(pooled)

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Build the pooled output expression for ``input``."""
        # inspired by:
        # https://github.com/lisa-lab/pylearn2/blob/14b2f8bebce7cc938cfa93e640008128e05945c1/pylearn2/expr/stochastic_pool.py#L23
        batch, channels, nr, nc = self.input_shape
        pr, pc = self.ds
        sr, sc = self.st

        output_shape = self.get_output_shape()
        out_r, out_c = output_shape[2:]

        # size of the zero canvas that holds every (possibly partial) window
        canvas_shape = list(output_shape)
        canvas_shape[2] = (canvas_shape[2] - 1) * sr + pr
        canvas_shape[3] = (canvas_shape[3] - 1) * sc + pc
        canvas = T.alloc(0.0, *canvas_shape)

        # centre the input inside the canvas
        top = (canvas_shape[2] - nr) // 2
        left = (canvas_shape[3] - nc) // 2
        canvas = T.set_subtensor(
            canvas[:, :, top:(top + nr), left:(left + nc)], input)

        # window[b, c, i, j, r, q] is entry (r, q) of the pooling window
        # that produces output position (i, j)
        window = T.alloc(0.0, batch, channels, out_r, out_c, pr, pc)
        for r in xrange(pr):
            r_end = (output_shape[2] - 1) * sr + r + 1
            for q in xrange(pc):
                q_end = (output_shape[3] - 1) * sc + q + 1
                # strided slice: the (r, q) entry of every window at once
                cell = canvas[:, :, r:r_end:sr, q:q_end:sc]
                window = T.set_subtensor(window[:, :, :, :, r, q], cell)

        # sum across pooling regions; an all-zero region would divide by
        # zero, so its sum is replaced by one
        region_sum = window.sum(axis=[4, 5])
        region_sum = T.switch(T.eq(region_sum, 0.0), 1.0, region_sum)
        weights = window / region_sum.dimshuffle(0, 1, 2, 3, 'x', 'x')

        if deterministic:
            res = (window * weights).sum(axis=[4, 5])
        else:
            flat_weights = weights.reshape(
                (batch * channels * out_r * out_c, pr * pc))
            picks = self.rng.multinomial(pvals=flat_weights,
                                         dtype=theano.config.floatX)
            selected = window * picks.reshape(
                (batch, channels, out_r, out_c, pr, pc))
            # double max because of grad problems
            res = selected.max(axis=5).max(axis=4)

        return T.cast(res, theano.config.floatX)
class MultiRTRBM(DS_MRTRBM):
    """Multi-category recurrent temporal RBM."""

    def __init__(self, input, n_visible, n_hidden, time, n_cate,
                 W=None, Wt=None, vbias=None, hbias=None, h0=None):
        """Store the sizes and create any parameter that was not supplied.

        The original author's note says the input dimension should be
        (n_cate, N_sample * time * n_vis).
        """
        self.input = input
        self.n_vis = n_visible
        self.n_hid = n_hidden
        self.time = time
        self.n_cate = n_cate

        floatX = theano.config.floatX

        # Parameters of the machine.  The creation order is kept, since W
        # and Wt both draw from the global numpy generator.
        if W is None:
            w_init = np.random.normal(
                size=(self.n_cate, self.n_vis, self.n_hid))
            W = theano.shared(w_init.astype(floatX))

        if vbias is None:
            vbias_init = np.zeros(shape=(self.n_cate, self.time, self.n_vis))
            vbias = theano.shared(vbias_init.astype(floatX))

        if hbias is None:
            hbias_init = np.zeros(shape=(self.time, self.n_hid))
            hbias = theano.shared(hbias_init.astype(floatX))

        if Wt is None:
            wt_init = np.random.normal(size=(self.n_hid, self.n_hid))
            Wt = theano.shared(wt_init.astype(floatX))

        if h0 is None:
            h0_init = np.zeros(shape=(1, 1, self.n_hid))
            h0 = theano.shared(h0_init.astype(floatX))

        self.W = W
        self.Wt = Wt
        self.h0 = h0
        self.hbias = hbias
        self.vbias = vbias
        self.params = [self.W, self.Wt, self.h0, self.hbias, self.vbias]

        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = MRG_RandomStreams(self.numpy_rng.randint(2**30))

    def h_given_h_lag_vt(self, vt, h_lag, hbias):
        """Return [pre-activation, sigmoid] of the hidden units at one step."""
        visible_drive = T.batched_dot(vt, self.W)
        if h_lag == self.h0:
            # the initial hidden state is shared by all categories and
            # samples, so its contribution is made broadcastable
            x = visible_drive + T.addbroadcast(
                T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0), 0, 1)
        else:
            x = visible_drive + T.dot(h_lag, self.Wt) + \
                hbias.dimshuffle('x', 0)
        return [x, T.nnet.sigmoid(x)]

    def H_given_h_lag_vt(self, V):
        """Unroll the hidden recurrence over all time steps.

        Returns the hidden activations of every step concatenated along
        axis 2.
        """
        previous = self.h0
        activations = []
        for t in range(self.time):
            previous = self.h_given_h_lag_vt(V[t], previous,
                                             self.hbias[t])[1]
            activations.append(previous)
        return T.concatenate(activations, axis=2)

    def free_energy_given_hid_lag(self, vt, h_lag, hbias, vbias):
        """Return the free energy of vt given the lagged hidden state."""
        if h_lag == self.h0:
            wx_b = T.batched_dot(vt, self.W) + \
                T.addbroadcast(T.dot(h_lag, self.Wt) + hbias, 0, 1)
        else:
            wx_b = T.batched_dot(vt, self.W) + T.dot(h_lag, self.Wt) + \
                hbias.dimshuffle('x', 0)
        vbias_term = T.batched_dot(vt, vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
        return -hidden_term - vbias_term

    def free_energy_RTRBM(self, V):
        """Sum the per-step free energies (each summed over axis 0)."""
        H = self.H_given_h_lag_vt(V)
        for t in range(self.time):
            if t == 0:
                h_lag = self.h0
            else:
                # NOTE(review): this slice is the hidden block at index t,
                # not t - 1, although it is passed as the lagged state -
                # confirm whether that is intended.
                h_lag = H[:, :, t * (self.n_hid):(t + 1) * self.n_hid]
            step_energy = T.sum(
                self.free_energy_given_hid_lag(
                    V[t], h_lag, self.hbias[t], self.vbias[:, t, :]),
                axis=0)
            if t == 0:
                Et = step_energy
            else:
                Et = Et + step_energy
        return Et

    def propup_given_h_lag(self, vt, h_lag, hbias):
        """Return [pre-activation, sigmoid] for the visible-to-hidden pass."""
        if h_lag == self.h0:
            x = T.batched_dot(vt, self.W) + T.addbroadcast(
                T.dot(h_lag, self.Wt) + hbias, 0, 1)
        else:
            x = T.batched_dot(vt, self.W) + hbias + T.dot(h_lag, self.Wt)
        return [x, T.nnet.sigmoid(x)]

    def propdown_given_h_lag(self, ht, vbias):
        """Return [pre-activation, softmax over axis 0] for the visibles."""
        x = T.batched_dot(ht, self.W.dimshuffle(0, 2, 1)) + \
            vbias.dimshuffle((0, 'x', 1))
        # softmax with the maximum subtracted for numerical stability
        shifted = T.exp(x - x.max(axis=0, keepdims=True))
        out = shifted / shifted.sum(axis=0, keepdims=True)
        return [x, out]

    def sample_vt_given_ht_h_lag(self, ht, vbias):
        """Sample the visible units from their softmax, one unit at a time."""
        x, out = self.propdown_given_h_lag(ht, vbias)
        # one multinomial draw per visible unit, in unit order
        draws = [
            self.theano_rng.multinomial(
                n=1, pvals=out[:, :, v].T,
                dtype=theano.config.floatX).dimshuffle(1, 0, 'x')
            for v in range(self.n_vis)
        ]
        v_sample = T.concatenate(draws, axis=2)
        return [x, out, v_sample]

    def sample_ht_given_vt_hid_lag(self, vt, h_lag, hbias):
        """Sample binary hidden units given the visibles and the lag."""
        x, out = self.propup_given_h_lag(vt, h_lag, hbias)
        h_sample = self.theano_rng.binomial(n=1, p=out, size=out.shape,
                                            dtype=theano.config.floatX)
        return [x, out, h_sample]

    def gibbs_vhv_given_h_lag(self, v0, h_lag, hbias, vbias):
        """One visible -> hidden -> visible Gibbs step at a single time."""
        xh, ph, h0 = self.sample_ht_given_vt_hid_lag(v0, h_lag, hbias)
        xv, pv, v1 = self.sample_vt_given_ht_h_lag(h0, vbias)
        return [xh, ph, h0, xv, pv, v1]

    def gibbs_VhV(self, V0):
        """One Gibbs sweep over every time step; returns the new visibles."""
        H = self.H_given_h_lag_vt(V0)
        sweep = []
        for t in range(self.time):
            if t == 0:
                h_lag = self.h0
            else:
                # NOTE(review): same index-t slice as in free_energy_RTRBM.
                h_lag = H[:, :, t * self.n_hid:(t + 1) * self.n_hid]
            v_new = self.gibbs_vhv_given_h_lag(
                V0[t], h_lag, self.hbias[t], self.vbias[:, t, :])[-1]
            sweep.append(v_new.dimshuffle('x', 0, 1, 2))
        return T.concatenate(sweep, axis=0)

    def get_cost_updates(self, persistant, k=2, lr=0.01, l1=0., l2=0.01):
        """Build the contrastive-divergence updates.

        Runs k Gibbs sweeps from ``persistant`` and differentiates the
        free-energy difference plus L1/L2 penalties on W and Wt.  Returns
        (cost, updates), where cost is the pseudo-likelihood monitor.
        """
        V_burn_in, updates = theano.scan(fn=self.gibbs_VhV,
                                         outputs_info=[persistant],
                                         n_steps=k,
                                         name='MultiRTRBM Gibbs Smapler')
        chain_end = V_burn_in[-1]

        # Contrastive divergence (variational method cost) / approximated
        # likelihood
        L1 = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.Wt))
        L2 = T.sum(self.W**2) + T.sum(self.Wt**2)
        KL_diff = T.mean(self.free_energy_RTRBM(self.input) -
                         self.free_energy_RTRBM(chain_end)) +\
            T.cast(l1, theano.config.floatX) * L1 + \
            T.cast(l2, theano.config.floatX) * L2

        self.gparams = T.grad(KL_diff, self.params,
                              consider_constant=[chain_end])

        for param, gparam in zip(self.params, self.gparams):
            # W and Wt use a fixed step size; only the rest follow ``lr``
            step = 0.0001 if param in [self.W, self.Wt] else lr
            updates[param] = param - step * gparam

        cost, updates = self.get_pseudo_likelihood_cost(updates)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Return (pseudo-likelihood cost, updates) for monitoring."""
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')
        xi = T.round(self.input)
        fe_xi = self.free_energy_RTRBM(xi)

        # NOTE(review): every pass rebuilds xi_flip from xi, so only the
        # flip for the last category reaches the cost below - confirm
        # whether the flips were meant to accumulate.
        for k in range(self.n_cate):
            xi_flip = T.set_subtensor(xi[:, k, :, bit_i_idx],
                                      1 - xi[:, k, :, bit_i_idx])
            # calculate free energy with bit flipped
            fe_xi_flip = self.free_energy_RTRBM(xi_flip)

        # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
        cost = T.mean(self.n_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        # increment bit_i_idx % number as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_vis

        return cost, updates
class Network:
    """Attention-LSTM caption decoder: layers, training graph and sampler."""

    def __init__(self, options):
        """Create the embedding, the layers and the dropout wrappers.

        ``options`` must provide 'ctx_dim', 'dim', 'dim_word' and 'n_words'.
        """
        ctx_dim, dim, dim_word, n_words = [
            options[key] for key in ('ctx_dim', 'dim', 'dim_word', 'n_words')
        ]

        self.scale = 0.01
        embedding_init = self.scale * numpy.random.randn(n_words, dim_word)
        self.Wemb = theano.shared(embedding_init.astype('float32'),
                                  name='Wemb')

        self.trng = RandomStreams(1234)
        self.use_noise = theano.shared(numpy.float32(0.))

        self.FFInit = FFLayer(shape=[ctx_dim, ctx_dim], name='ff_init')
        self.FFState = FFLayer(shape=[ctx_dim, dim], name='ff_state')
        self.FFMemory = FFLayer(shape=[ctx_dim, dim], name='ff_memory')

        self.LSTMLayer = LSTMLayer(shape=[dim_word, dim, ctx_dim],
                                   name='decoder')
        self.FFLSTM = FFLayer(shape=[dim, dim_word], name='ff_logit_lstm')
        self.FFCtx = FFLayer(shape=[ctx_dim, dim_word], name='ff_logit_ctx')
        self.FFLogit = FFLayer(shape=[dim_word, n_words], name='ff_logit')

        self.Layers = [
            self.FFInit, self.FFState, self.FFMemory, self.LSTMLayer,
            self.FFLSTM, self.FFCtx, self.FFLogit
        ]

        # the embedding comes first, followed by each layer's parameters
        collected = [self.Wemb]
        for layer in self.Layers:
            collected = collected + layer.params()
        self._params = collected

        self.dropOutInit = DropOutLayer(self.use_noise, self.trng)
        self.dropOutLSTM = DropOutLayer(self.use_noise, self.trng)
        self.dropOutLogit = DropOutLayer(self.use_noise, self.trng)

    def params(self):
        """Return the list of trainable parameters."""
        return self._params

    def infer_init(self, ctx_mean):
        """Map the mean context to the initial LSTM (state, memory)."""
        hidden = self.FFInit(ctx_mean, activation='relu')
        hidden = self.dropOutInit(hidden)

        init_state = self.FFState(hidden, activation='tanh')
        init_memory = self.FFMemory(hidden, activation='tanh')

        return init_state, init_memory

    def infer_main(self, ctx, emb=None, mask=None, init_state=None,
                   init_memory=None, one_step=False):
        """Run the LSTM and compute the word logits.

        Returns (output_state, logit), where output_state is whatever the
        LSTM layer returned.
        """
        output_state = self.LSTMLayer(emb, ctx, init_memory, init_state,
                                      one_step, mask)

        hidden = self.dropOutLSTM(output_state[0])
        logit = self.FFLSTM(hidden, activation='linear')

        # prev2out
        logit = logit + emb

        # ctx2out
        logit = logit + self.FFCtx(output_state[3], activation='linear')
        logit = tensor.tanh(logit)
        logit = self.dropOutLogit(logit)

        logit = self.FFLogit(logit, activation='linear')

        return output_state, logit

    def build_training_graph(self, options):
        """Build the symbolic training cost.

        Returns (use_noise, [x, mask, ctx], alphas, cost), where cost is the
        masked negative log-likelihood summed over time for each sample.
        """
        # description string: #words x #samples,
        x = tensor.matrix('x', dtype='int64')
        mask = tensor.matrix('mask', dtype='float32')
        # context: #samples x #annotations x dim
        ctx = tensor.tensor3('ctx', dtype='float32')

        # n_timesteps == caption length. n_samples = number of captions.
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # index into the word embedding matrix, shift it forward in time so
        # that step t is fed the word of step t-1 (zeros at the first step)
        emb = self.Wemb[x.flatten()].reshape(
            [n_timesteps, n_samples, options['dim_word']])
        shifted = tensor.set_subtensor(tensor.zeros_like(emb)[1:], emb[:-1])
        emb = shifted

        # initial state/cell [top right on page 4]
        ctx_mean = ctx.mean(1)
        init_state, init_memory = self.infer_init(ctx_mean)

        output_state, logit = self.infer_main(ctx=ctx,
                                              emb=emb,
                                              mask=mask,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              one_step=False)

        logit_shp = logit.shape
        flat_logit = logit.reshape(
            [logit_shp[0] * logit_shp[1], logit_shp[2]])
        probs = tensor.nnet.softmax(flat_logit)

        # Index into the computed probability to give the log likelihood
        x_flat = x.flatten()
        p_flat = probs.flatten()
        target_positions = tensor.arange(x_flat.shape[0]) * probs.shape[1] + \
            x_flat
        cost = -tensor.log(p_flat[target_positions] + 1e-8)
        cost = cost.reshape([x.shape[0], x.shape[1]])
        cost = (cost * mask).sum(0)

        alphas = output_state[2]

        return self.use_noise, [x, mask, ctx], alphas, cost

    def infer(self):
        """Build the two sampling functions (f_init, f_next)."""
        # context: #annotations x dim
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        x = tensor.vector('x_sampler', dtype='int64')

        # initial state/cell
        ctx_mean = ctx.mean(0)
        init_state, init_memory = self.infer_init(ctx_mean)

        init_outputs = {
            'context': ctx,
            'state': init_state,
            'memory': init_memory
        }
        f_init = TFW([ctx], init_outputs, name='f_init', profile=False)

        # from here on the state and memory are inputs of the step function
        init_state = tensor.matrix('init_state', dtype='float32')
        init_memory = tensor.matrix('init_memory', dtype='float32')

        # for the first word (which is coded with -1), emb should be all zero
        zero_emb = tensor.alloc(0., 1, self.Wemb.shape[1])
        emb = tensor.switch(x[:, None] < 0, zero_emb, self.Wemb[x])

        output_state, logit = self.infer_main(ctx=ctx,
                                              emb=emb,
                                              mask=None,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              one_step=True)

        next_probs = tensor.nnet.softmax(logit)
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        next_state, next_memory = output_state[0], output_state[1]

        step_outputs = {
            'probs': next_probs,
            'sample': next_sample,
            'state': next_state,
            'memory': next_memory
        }
        f_next = TFW([x, ctx, init_state, init_memory], step_outputs,
                     name='f_next', profile=False)

        return f_init, f_next
class EncoderDecoder(object): def __init__(self, rng, **kwargs): self.n_in_src = kwargs.get('nembed_src') self.n_in_trg = kwargs.get('nembed_trg') self.n_hids_src = kwargs.get('nhids_src') self.n_hids_trg = kwargs.get('nhids_trg') self.src_vocab_size = kwargs.get('src_vocab_size') self.trg_vocab_size = kwargs.get('trg_vocab_size') self.method = kwargs.get('method') self.dropout = kwargs.get('dropout') self.maxout_part = kwargs.get('maxout_part') self.path = kwargs.get('saveto') self.clip_c = kwargs.get('clip_c') self.rng = rng self.trng = RandomStreams(rng.randint(1e5)) # added by Zhaopeng Tu, 2016-04-29 self.with_coverage = kwargs.get('with_coverage') self.coverage_dim = kwargs.get('coverage_dim') self.coverage_type = kwargs.get('coverage_type') self.max_fertility = kwargs.get('max_fertility') if self.coverage_type is 'linguistic': # make sure the dimension of linguistic coverage is always 1 self.coverage_dim = 1 # added by Zhaopeng Tu, 2016-05-30 self.with_context_gate = kwargs.get('with_context_gate') # added by Zhaopeng Tu, 2017-11-29 self.with_layernorm = kwargs.get('with_layernorm', False) self.params = [] self.layers = [] self.table_src = LookupTable(self.rng, self.src_vocab_size, self.n_in_src, name='table_src') self.layers.append(self.table_src) self.encoder = BidirectionalEncoder(self.rng, self.n_in_src, self.n_hids_src, self.table_src, name='birnn_encoder') self.layers.append(self.encoder) self.table_trg = LookupTable(self.rng, self.trg_vocab_size, self.n_in_trg, name='table_trg') self.layers.append(self.table_trg) self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg, 2*self.n_hids_src, \ maxout_part=self.maxout_part, name='rnn_decoder', \ # added by Zhaopeng Tu, 2016-04-29 with_coverage=self.with_coverage, coverage_dim=self.coverage_dim, coverage_type=self.coverage_type, max_fertility=self.max_fertility, \ # added by Zhaopeng Tu, 2016-05-30 with_context_gate=self.with_context_gate, \ with_layernorm=self.with_layernorm) 
self.layers.append(self.decoder) self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg, self.trg_vocab_size) self.layers.append(self.logistic_layer) # added by Zhaopeng Tu, 2016-07-12 # for reconstruction self.with_reconstruction = kwargs.get('with_reconstruction') if self.with_reconstruction: # added by Zhaopeng Tu, 2016-07-27 self.reconstruction_weight = kwargs.get('reconstruction_weight') # note the source and target sides are reversed self.inverse_decoder = Decoder(self.rng, self.n_in_src, 2*self.n_hids_src, self.n_hids_trg, \ maxout_part=self.maxout_part, name='rnn_inverse_decoder', \ with_layernorm=self.with_layernorm) self.layers.append(self.inverse_decoder) self.srng = RandomStreams(rng.randint(1e5)) self.inverse_logistic_layer = LogisticRegression( self.rng, self.n_in_src, self.src_vocab_size, name='inverse_LR') self.layers.append(self.inverse_logistic_layer) for layer in self.layers: self.params.extend(layer.params) def build_trainer(self, src, src_mask, trg, trg_mask): annotations = self.encoder.apply(src, src_mask) # init_context = annotations[0, :, -self.n_hids_src:] # modification #1 # mean pooling init_context = (annotations * src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None] trg_emb = self.table_trg.apply(trg) trg_emb_shifted = T.zeros_like(trg_emb) trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1]) results = self.decoder.run_pipeline(state_below=trg_emb_shifted, mask_below=trg_mask, init_context=init_context, c=annotations, c_mask=src_mask) hiddens, ctxs, readout, alignment = results[:4] # apply dropout if self.dropout < 1.0: logger.info('Apply dropout with p = {}'.format(self.dropout)) readout = Dropout(self.trng, readout, 1, self.dropout) p_y_given_x = self.logistic_layer.get_probs(readout) self.cost = self.logistic_layer.cost(p_y_given_x, trg, trg_mask) / trg.shape[1] # self.cost = theano.printing.Print('likilihood cost:')(self.cost) # added by Zhaopeng Tu, 2016-07-12 # for reconstruction if 
self.with_reconstruction: # now hiddens is the annotations inverse_init_context = (hiddens * trg_mask[:, :, None] ).sum(0) / trg_mask.sum(0)[:, None] src_emb = self.table_src.apply(src) src_emb_shifted = T.zeros_like(src_emb) src_emb_shifted = T.set_subtensor(src_emb_shifted[1:], src_emb[:-1]) inverse_results = self.inverse_decoder.run_pipeline( state_below=src_emb_shifted, mask_below=src_mask, init_context=inverse_init_context, c=hiddens, c_mask=trg_mask) inverse_hiddens, inverse_ctxs, inverse_readout, inverse_alignment = inverse_results[: 4] # apply dropout if self.dropout < 1.0: # logger.info('Apply dropout with p = {}'.format(self.dropout)) inverse_readout = Dropout(self.srng, inverse_readout, 1, self.dropout) p_x_given_y = self.inverse_logistic_layer.get_probs( inverse_readout) self.reconstruction_cost = self.inverse_logistic_layer.cost( p_x_given_y, src, src_mask) / src.shape[1] # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost) self.cost += self.reconstruction_cost * self.reconstruction_weight self.L1 = sum(T.sum(abs(param)) for param in self.params) self.L2 = sum(T.sum(param**2) for param in self.params) params_regular = self.L1 * 1e-6 + self.L2 * 1e-6 # params_regular = theano.printing.Print('params_regular:')(params_regular) # train cost train_cost = self.cost + params_regular # gradients grads = T.grad(train_cost, self.params) # apply gradient clipping here grads = grad_clip(grads, self.clip_c) # train function inps = [src, src_mask, trg, trg_mask] outs = [train_cost] if self.with_layernorm: inps = [src, src_mask, trg, trg_mask] lr = T.scalar(name='lr') print 'Building optimizers...', self.train_fn, self.update_fn = adam(lr, self.params, grads, inps, outs) else: # updates updates = adadelta(self.params, grads) # mode=theano.Mode(linker='vm') for ifelse # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch. 
self.train_fn = theano.function(inps, outs, updates=updates, name='train_function', mode=theano.Mode(linker='vm')) # self.train_fn = theano.function(inps, outs, updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) def build_sampler(self): x = T.lmatrix() # Build Networks # src_mask is None c = self.encoder.apply(x, None) #init_context = ctx[0, :, -self.n_hids_src:] # mean pooling init_context = c.mean(0) init_state = self.decoder.create_init_state(init_context) # compile function print 'Building compile_init_state_and_context function ...' self.compile_init_and_context = theano.function( [x], [init_state, c], name='compile_init_and_context') print 'Done' y = T.lvector() cur_state = T.matrix() # if it is the first word, emb should be a1l zero, and it is indicated by -1 trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg), self.table_trg.apply(y)) # added by Zhaopeng Tu, 2016-06-09 if self.with_coverage: cov_before = T.tensor3() if self.coverage_type is 'linguistic': print 'Building compile_fertility ...' 
fertility = self.decoder._get_fertility(c) fertility = T.addbroadcast(fertility, 1) self.compile_fertility = theano.function( [c], [fertility], name='compile_fertility') print 'Done' else: fertility = None else: cov_before = None fertility = None # apply one step # modified by Zhaopeng Tu, 2016-04-29 results = self.decoder.apply( state_below=trg_emb, init_state=cur_state, c=c, one_step=True, # added by Zhaopeng Tu, 2016-04-27 cov_before=cov_before, fertility=fertility) next_state, ctxs, alignment = results[:3] idx = 3 if self.with_coverage: cov = results[idx] idx += 1 readout = self.decoder.readout(next_state, ctxs, trg_emb) # maxout if self.maxout_part > 1: readout = self.decoder.one_step_maxout(readout) # apply dropout if self.dropout < 1.0: readout = Dropout(self.trng, readout, 0, self.dropout) # compute the softmax probability next_probs = self.logistic_layer.get_probs(readout) # sample from softmax distribution to get the sample next_sample = self.trng.multinomial(pvals=next_probs).argmax(1) # compile function print 'Building compile_next_state_and_probs function ...' inps = [y, cur_state, c] outs = [next_probs, next_state, next_sample, alignment] # added by Zhaopeng Tu, 2016-04-29 if self.with_coverage: inps.append(cov_before) if self.coverage_type is 'linguistic': inps.append(fertility) outs.append(cov) # mode=theano.Mode(linker='vm') for ifelse # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch. 
self.compile_next_state_and_probs = theano.function( inps, outs, name='compile_next_state_and_probs', mode=theano.Mode(linker='vm')) print 'Done' # added by Zhaopeng Tu, 2016-07-18 # for reconstruction if self.with_reconstruction: # Build Networks # trg_mask is None inverse_c = T.tensor3() # mean pooling inverse_init_context = inverse_c.mean(0) inverse_init_state = self.inverse_decoder.create_init_state( inverse_init_context) outs = [inverse_init_state] # compile function print 'Building compile_inverse_init_state_and_context function ...' self.compile_inverse_init_and_context = theano.function( [inverse_c], outs, name='compile_inverse_init_and_context') print 'Done' src = T.lvector() inverse_cur_state = T.matrix() trg_mask = T.matrix() # if it is the first word, emb should be all zero, and it is indicated by -1 src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src), self.table_src.apply(src)) # apply one step # modified by Zhaopeng Tu, 2016-04-29 inverse_results = self.inverse_decoder.apply( state_below=src_emb, init_state=inverse_cur_state, c=inverse_c, c_mask=trg_mask, one_step=True) inverse_next_state, inverse_ctxs, inverse_alignment = inverse_results[: 3] inverse_readout = self.inverse_decoder.readout( inverse_next_state, inverse_ctxs, src_emb) # maxout if self.maxout_part > 1: inverse_readout = self.inverse_decoder.one_step_maxout( inverse_readout) # apply dropout if self.dropout < 1.0: inverse_readout = Dropout(self.srng, inverse_readout, 0, self.dropout) # compute the softmax probability inverse_next_probs, inverse_next_energy = self.inverse_logistic_layer.get_probs( inverse_readout) # sample from softmax distribution to get the sample inverse_next_sample = self.srng.multinomial( pvals=inverse_next_probs).argmax(1) # compile function print 'Building compile_inverse_next_state_and_probs function ...' 
inps = [src, trg_mask, inverse_cur_state, inverse_c] outs = [ inverse_next_probs, inverse_next_state, inverse_next_sample, inverse_alignment ] self.compile_inverse_next_state_and_probs = theano.function( inps, outs, name='compile_inverse_next_state_and_probs') print 'Done' def save(self, path=None): if path is None: path = self.path filenpz = open(path, "w") val = dict([(value.name, value.get_value()) for index, value in enumerate(self.params)]) logger.info("save the model {}".format(path)) numpy.savez(path, **val) filenpz.close() def load(self, path=None): if path is None: path = self.path if os.path.isfile(path): logger.info("load params {}".format(path)) val = numpy.load(path) for index, param in enumerate(self.params): logger.info('Loading {} with shape {}'.format( param.name, param.get_value(borrow=True).shape)) if param.name not in val.keys(): logger.info('Adding new param {} with shape {}'.format( param.name, param.get_value(borrow=True).shape)) continue if param.get_value().shape != val[param.name].shape: logger.info("Error: model param != load param shape {} != {}".format(\ param.get_value().shape, val[param.name].shape)) raise Exception("loading params shape mismatch") else: param.set_value(val[param.name], borrow=True) else: logger.error("file {} does not exist".format(path)) self.save()
class RVal(Elem, TensorWrapped, Masked):
    """Random value element.

    Wraps a ``RandomStreams`` generator.  Every sampling method stores the
    symbolic sample in ``self.value`` and returns ``self`` so that calls can
    be chained; ``d`` exposes the symbolic value and ``v`` evaluates it.
    """

    def __init__(self, seed=None, **kw):
        super(RVal, self).__init__(**kw)
        # Without an explicit seed, draw one from numpy's global generator.
        chosen_seed = np.random.randint(0, 1e6) if seed is None else seed
        self.rng = RandomStreams(seed=chosen_seed)
        self.value = None

    def binomial(self, shape, n=1, p=0.5, ndim=None, dtype="int32"):
        """Store a binomial sample of the given shape; returns ``self``."""
        # An Elem is accepted as shape: its symbolic value ``d`` is used.
        dims = shape.d if isinstance(shape, Elem) else shape
        self.value = self.rng.binomial(dims, n, p, ndim, dtype)
        return self

    def normal(self, shape, avg=0.0, std=1.0, ndim=None, dtype=None):
        """Store a normal sample of the given shape; returns ``self``."""
        dims = shape.d if isinstance(shape, Elem) else shape
        self.value = self.rng.normal(dims, avg, std, ndim, dtype)
        return self

    def multinomial(self,
                    shape,
                    n=1,
                    pvals=None,
                    without_replacement=False,
                    ndim=None,
                    dtype="int32"):
        """Store a multinomial sample; returns ``self``.

        ``without_replacement`` selects the generator's
        ``multinomial_wo_replacement`` sampler instead of ``multinomial``.
        """
        dims = shape.d if isinstance(shape, Elem) else shape
        sampler = (self.rng.multinomial_wo_replacement
                   if without_replacement else self.rng.multinomial)
        self.value = sampler(dims, n, pvals, ndim, dtype)
        return self

    def gumbel(self, shape, eps=1e-10):
        """Store -log(-log(u + eps) + eps) for uniform u; returns ``self``."""
        dims = shape.d if isinstance(shape, Elem) else shape
        log = theano.tensor.log
        uniform = self.rng.uniform(dims, 0.0, 1.0)
        # eps keeps both logarithms away from log(0).
        self.value = -log(-log(uniform + eps) + eps)
        return self

    @property
    def d(self):
        # symbolic value of the last sample (None before any sampling call)
        return self.value

    @property
    def v(self):
        # numeric value obtained by evaluating the symbolic sample
        return self.value.eval()

    @property
    def allparams(self):
        # a random value carries no parameters
        return set()

    @property
    def allupdates(self):
        return {}

    @property
    def all_extra_outs(self):
        return {}
class SCLmodel():
    """Switched constrained linear model filtered with particles.

    This class defines the switched constrained linear model, which was
    designed to eliminate state-space 'explosions' that can occur when
    doing prediction - a serious issue in the basic SL model.

    One linear generative model (W, c) is combined with nh linear dynamical
    models (rows of M).  The switching variable h is scored against the
    state through per-mode means ``mu`` and log-precisions ``A``.
    """

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):
        """Create the shared parameters and the particle-filter state.

        nx   -- dimensionality of observed variables
        ns   -- dimensionality of latent variables
        nh   -- number of (linear) dynamical modes
        npcl -- number of particles in particle filter
        xvar -- observation noise parameter; enters as 2*xvar**2
        """
        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')
        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices (one flattened ns-by-ns matrix per row)
        init_M = np.asarray(np.random.randn(nh, ns**2) / 2.0, dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #means for switching variable
        init_mu = np.asarray(np.random.randn(nh, ns) / 1.0, dtype='float32')

        #(natural log of) covariance matrices for switching variable
        #I assume the covariance matrices to be diagonal, so I
        #store all the diagonal elements in a nh-by-ns matrix
        init_A = np.asarray(np.zeros((nh, ns)), dtype='float32')

        #particle state: every particle starts at s=0 in mode 0 with
        #uniform weight
        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.mu = theano.shared(init_mu)

        #I define these to avoid repeated computations of the exponential
        #of the elements of A and of the normalizing constants for each h
        self.exp_A = T.exp(self.A)
        self.ln_Z_h = T.reshape(0.5 * T.sum(self.A, axis=1), (nh, 1))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.mu]
        #relative learning rates, in the same order as self.params
        self.rel_lrates = np.asarray([1.0, 1.0, 0.01, 1.0, 1.0, 10.0],
                                     dtype='float32')

    def sample_proposal_s(self, s, h, xpred, sig):
        """Scan step: propose a new state for ONE particle.

        Returns (proposed state, predicted state, log-density exponent of
        the proposal), all cast to float32.
        """
        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        #This is the proposal distribution that arises when one assumes that W'W=I
        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig
        s_prop = mean + n * T.sqrt(sig)

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32')

    #This function is required if we allow multiple generative models
    #def get_recon(self, s, h):
    #W_vec=T.sum(self.W*h, axis=0)
    #W=W.reshape((self.nx, self.ns))
    #xr=T.dot(W, s)
    #return xr

    def one_h_prob(self, exp_A_i, mu_i, s):
        """Scan-style reference implementation of one row of calc_h_probs."""
        smi = s - mu_i  #should be np by ns
        smia = smi * T.reshape(exp_A_i, (1, self.ns))
        gaussian_term = -T.sum(smia * smi, axis=1)
        return gaussian_term

    def calc_h_probs(self, s):
        """Return the nh-by-np matrix of mode probabilities for samples s.

        s is an np-by-ns matrix of state samples; each column of the result
        sums to one.
        """
        #gterms, updates = theano.scan(fn=self.one_h_prob,
        #outputs_info=[None],
        #sequences=[self.exp_A, self.mu],
        #non_sequences=[s],
        #n_steps=self.nh)

        #vectorized version: expands sum(exp_A*(s-mu)**2) term by term
        t1 = T.dot(s * s, self.exp_A.T)
        t2 = -2.0 * T.dot(s, (self.exp_A * self.mu).T)
        t3 = T.sum((self.mu * self.mu) * self.exp_A, axis=1)
        #The quadratic form must enter with a negative sign, exactly as in
        #one_h_prob and h_energy_step; the previous vectorized code dropped
        #the minus and therefore favoured modes far away from s.
        gterms = -(t1 + t2 + t3).T
        #gterms should be nh by np

        #need to multiply by relative partition functions
        exp_terms = gterms + self.ln_Z_h

        #re-centering for numerical stability: subtract each particle's own
        #maximum (the constant cancels in the normalization below).  The
        #recentered values were previously computed but never used.
        exp_terms_recentered = exp_terms - T.max(
            exp_terms, axis=0, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs / T.sum(rel_probs, axis=0)

        return probs

    def forward_filter_step(self, xp):
        """Advance the particle filter by one observation xp.

        Returns (h_samps, updates); ``updates`` moves the current particle
        state into the *_past variables and stores the new samples/weights.
        """
        #need to sample from the proposal distribution first

        #these terms are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar**2)
        sig = (1.0 / (self.b**2 + 1.0 / (2.0 * self.xvar**2))) / 2.0

        [s_samps, s_pred, prop_terms], updates = theano.scan(
            fn=self.sample_proposal_s,
            outputs_info=[None, None, None],
            sequences=[self.s_now, self.h_now],
            non_sequences=[xpred, sig],
            n_steps=self.npcl)

        #now that we have samples from the proposal distribution, we need to reweight them

        #would use this if we have multiple generative models
        #recons, updates = theano.scan(fn=get_recon,
        #outputs_info=[None],
        #sequences=[s_samps, h_samps],
        #n_steps=self.npcl)

        #this computes relative h probabilities for each particle
        h_probs = self.calc_h_probs(s_samps)
        h_samps = self.theano_rng.multinomial(pvals=h_probs.T)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2,
            axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1)

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        return h_samps, updates

    def get_prediction(self, s, h):
        """Predict the next state of ONE particle from state s and mode h."""
        #combine the flattened dynamical matrices according to h
        M_vec = T.sum(self.M * T.reshape(h, (self.nh, 1)), axis=0)
        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        return T.cast(sp, 'float32')

    def sample_joint(self, sp):
        """Scan step: draw one (s1, h1, s2, h2) sample of the joint posterior.

        sp holds the predictions made from the past particles (np by ns).
        """
        #pick a current particle according to the current weights
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        #reweight the past particles by how well they predict s2
        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):
    #sp, updates = theano.scan(fn=self.get_prediction,
    #outputs_info=[None],
    #sequences=[self.s_past, self.h_past],
    #n_steps=self.npcl)
    ##sp should be np by ns
    #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
    #outputs_info=[None, None, None, None],
    #non_sequences=[sp],
    #n_steps=n_samps)
    #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):
        """Scan step for calc_mean_h_energy: energy of one (s, h) sample."""
        #select the precision, mean and log-normalizer of the active mode
        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(
            T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i

        return energy

    def calc_mean_h_energy(self, s, h, nsamps):
        """Average h-energy over nsamps samples of s and h."""
        #you give this function a set of samples of s and h,
        #it gives you the average energy of those samples
        energies, updates = theano.scan(fn=self.h_energy_step,
                                        outputs_info=[None],
                                        sequences=[s, h],
                                        n_steps=nsamps)

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Sample the joint posterior and take one gradient-ascent step.

        x1, x2  -- past and current observations
        n_samps -- number of joint samples to draw
        lrate   -- global learning rate (scaled per parameter by rel_lrates)

        Returns (energy, updates).
        """
        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp, updates = theano.scan(fn=self.get_prediction,
                                  outputs_info=[None],
                                  sequences=[self.s_past, self.h_past],
                                  n_steps=self.npcl)

        #sp should be np by ns

        #Each scan returns its own updates.  They are merged instead of
        #rebinding `updates`, otherwise the random-stream updates produced
        #by the sample_joint scan are lost when the next scan is built.
        [s1_samps, h1_samps, s2_samps, h2_samps], joint_updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)
        updates.update(joint_updates)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred, pred_updates = theano.scan(fn=self.get_prediction,
                                           outputs_info=[None],
                                           sequences=[s1_samps, h1_samps],
                                           n_steps=n_samps)
        updates.update(pred_updates)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps, n_samps)
        hterm2 = self.calc_mean_h_energy(s2_samps, h2_samps, n_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2, axis=1))

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm

        #the samples are treated as fixed data when differentiating
        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):
        """Effective sample size of the current particle weights."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Scan step: draw one particle according to the current weights."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Return updates that resample all particles and reset the weights."""
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):
        """Scan step: sample a mode for state s, then predict s and x."""
        #get h probabilities
        h_probs = self.calc_h_probs(s)

        h_samp = self.theano_rng.multinomial(
            pvals=T.reshape(h_probs, (1, self.nh)))

        M_vec = T.sum(self.M * T.reshape(h_samp, (self.nh, 1)), axis=0)

        #here I use the 'mean M' by combining the M's according to their probabilities
        #M_vec=T.sum(self.M*T.reshape(hprobs,(self.nh,1)),axis=0)

        M = M_vec.reshape((self.ns, self.ns))

        sp = T.dot(M, s)

        xp = T.dot(self.W, sp) + self.c

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):
        """Simulate n_steps ahead, starting from the weighted mean state.

        Returns (states, observations, sampled modes, scan updates).
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)

        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
class SLmodel():
    """Switched conditional linear model for integrating action with sensation.

    One linear generative model (W, c) is combined with nh linear dynamical
    modes.  Both the state prediction (M, C) and the switching probabilities
    (A, B, ph) depend on the latent state s and on an action vector a.
    """

    def __init__(self, nx, ns, nh, na, npcl, xvar=1.0):
        """Create the shared parameters and the particle-filter state.

        nx   -- dimensionality of observed variables
        ns   -- dimensionality of latent variables
        nh   -- number of (linear) dynamical modes
        na   -- dimensionality of action variables
        npcl -- number of particles in particle filter
        xvar -- observation noise parameter; enters as 2*xvar**2
        """
        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices (nh blocks laid out side by side)
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))),
                            dtype='float32')  #for state-based predictions
        init_C = np.asarray((np.tile(np.zeros((na, ns)), (1, nh))),
                            dtype='float32')  #for action-based predictions

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrices
        init_A = np.asarray(np.zeros((ns, nh)),
                            dtype='float32')  #associated with the state
        init_B = np.asarray(np.zeros((na, nh)),
                            dtype='float32')  #associated with actions

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        #sample_joint, resample_step and resample read/write self.h_now,
        #which this constructor used to leave undefined (AttributeError).
        #NOTE(review): forward_filter_step only writes h_past, so h_now
        #keeps this initial value until resample() overwrites it - confirm
        #that this is the intended role of h_now in this model.
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')
        init_a_past = np.asarray(np.zeros((1, na)), dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.C = theano.shared(init_C)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.B = theano.shared(init_B)
        self.ph = theano.shared(init_ph)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.a_past = theano.shared(init_a_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.na = na  #dimensionality of action variables
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [
            self.W, self.M, self.C, self.b, self.A, self.B, self.c, self.ph
        ]
        #relative learning rates, in the same order as self.params
        self.rel_lrates = np.asarray(
            [0.1, 1.0, 1.0, 0.01, 10.0, 10.0, 0.1, 1.0], dtype='float32')

    def sample_proposal_s(self, s, a, h, xpred, sig):
        """Propose new states; returns (proposal, prediction, log-density term).

        Not called by forward_filter_step, which inlines a vectorized
        version of the same computation.
        """
        s_pred = self.get_prediction(s, a, h)

        n = self.theano_rng.normal(size=T.shape(s))

        #This is the proposal distribution that arises when one assumes that W'W=I
        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig
        s_prop = mean + n * T.sqrt(sig)

        #I compute the term inside the exponent for the pdf of the proposal distrib
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32')

    #This function is required if we allow multiple generative models
    #def get_recon(self, s, h):
    #W_vec=T.sum(self.W*h, axis=0)
    #W=W.reshape((self.nx, self.ns))
    #xr=T.dot(W, s)
    #return xr

    def calc_h_probs(self, s, a):
        """Softmax mode probabilities for samples s under a single action a.

        s is an np-by-ns matrix of state samples; the result is np by nh
        and every row sums to one.
        """
        exp_terms = T.dot(s, self.A) + T.reshape(
            T.dot(a, self.B), (1, self.nh)) + T.reshape(self.ph,
                                                        (1, self.nh))

        #re-centering for numerical stability: subtract each row's maximum.
        #keepdims is needed for the (np, 1) result to broadcast against the
        #(np, nh) matrix; previously the recentered terms were computed
        #(with a non-broadcastable shape) and then never used.
        exp_terms_recentered = exp_terms - T.max(
            exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs / T.sum(rel_probs, axis=1, keepdims=True)

        return probs

    def forward_filter_step(self, a, xp):
        """Advance the particle filter by one (action, observation) pair.

        Returns the updates that shift the current particle state into the
        *_past variables and store the new samples and weights.
        """
        #first sample from h given s and a
        h_probs = self.calc_h_probs(self.s_now, a)
        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        #need to sample from the proposal distribution

        #these terms are the same for every particle
        xpred = T.dot(self.W.T, (xp - self.c)) / (2.0 * self.xvar**2)
        sig = (1.0 / (self.b**2 + 1.0 / (2.0 * self.xvar**2))) / 2.0
        #sig=1.0/(self.b**2)

        #vectorized version
        s_pred = self.get_prediction(self.s_now, a, h_samps)
        n = self.theano_rng.normal(size=T.shape(self.s_now))
        mean = 2.0 * (xpred + s_pred * (self.b**2)) * sig
        #mean=s_pred #trying out using solely predictive proposal distrib
        s_samps = mean + n * T.sqrt(sig)
        #per-particle exponent of the proposal density
        prop_terms = -T.sum(n**2, axis=1) / 2.0

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2,
            axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        #h_samps drove the transition out of the old s_now, hence h_past
        updates[self.h_past] = T.cast(h_samps, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')
        updates[self.a_past] = T.cast(a, 'float32')

        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, a, h):
        """Predict next states for samples s, a single action a and modes h."""
        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns
        #Reshaping to an explicit (1, nh*ns) row makes the action term
        #broadcast over particles even when `a` is the shared (1, na)
        #matrix a_past, whose leading axis is not marked broadcastable.
        a_dot_C = T.reshape(T.dot(a, self.C), (1, self.nh * self.ns))

        tot = s_dot_M + a_dot_C  #should be np by nh*ns

        #mask out the inactive modes, then sum the blocks with sum_mat
        s_pred = T.dot(tot * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):
        """Scan step: draw one (s1, h1, s2, h2) sample of the joint posterior.

        sp holds the predictions made from the past particles (np by ns).
        """
        #pick a current particle according to the current weights
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        #reweight the past particles by how well they predict s2
        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    def calc_mean_h_energy(self, s, a, h):
        """Average log-probability of modes h given samples s and action a."""
        #you give this function a set of samples of s, a, and h,
        #it gives you the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(
            T.dot(a, self.B), (1, self.nh)) + T.reshape(
                self.ph, (1, self.nh))  #np by nh

        energies = T.sum(h * exp_terms, axis=1) - T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Sample the joint posterior and take one gradient-ascent step.

        x1, x2  -- past and current observations
        n_samps -- number of joint samples to draw
        lrate   -- global learning rate (scaled per parameter by rel_lrates)

        Returns (energy, updates).
        """
        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.a_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        #get_prediction and calc_mean_h_energy both take the action as
        #their second argument; it was missing from these calls, which
        #raised TypeError.  The stored past action is the one that
        #accompanied the s_past -> s_now transition.
        s_pred = self.get_prediction(s1_samps, self.a_past, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, self.a_past, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum(
            (self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        xterm1 = -T.mean(
            T.sum((x1_recons - T.reshape(x1, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm1 + xterm2 + sterm

        #the samples are treated as fixed data when differentiating
        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):
        """Effective sample size of the current particle weights."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Scan step: draw one particle according to the current weights."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Return updates that resample all particles and reset the weights."""
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s, a):
        """One simulation step from state s under action a.

        Returns (next state, predicted observation, sampled mode).
        """
        s = T.reshape(s, (1, self.ns))
        a = T.reshape(a, (1, self.na))

        #get h probabilities
        h_probs = self.calc_h_probs(s, a)

        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, a, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, a, n_steps):
        """Simulate n_steps ahead under the action sequence a.

        a should be n_steps by na.  Returns (states, observations, sampled
        modes, scan updates).
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))

        #scan calls fn(sequence slice, previous output), i.e. (a_t, s_prev),
        #while simulate_step expects (s, a); passing the method directly
        #swapped the two arguments.
        [sp, xp, hs], updates = theano.scan(
            fn=lambda a_t, s_prev: self.simulate_step(s_prev, a_t),
            outputs_info=[s0, None, None],
            sequences=[a],
            n_steps=n_steps)

        return sp, xp, hs, updates
class SLmodel():
    """Switched linear model with an adaptive proposal distribution.

    This is a test of my idea to adapt the proposal distribution by
    maximizing the entropy of the weights.

    Besides the model parameters (W, c, M, b, A, ph) the class keeps
    proposal parameters (D, E, k, sig) that forward_filter_step adjusts by
    gradient descent on proposal_loss.
    """

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):
        """Create the shared parameters and the particle-filter state.

        nx   -- dimensionality of observed variables
        ns   -- dimensionality of latent variables
        nh   -- number of (linear) dynamical modes
        npcl -- number of particles in particle filter
        xvar -- observation noise parameter; enters as 2*xvar**2
        """
        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')
        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices (nh blocks laid out side by side)
        #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        #parameters for proposal distribution
        init_D = np.asarray(np.eye(ns), dtype='float32')
        init_E = np.asarray(np.random.randn(nx, ns) / 100.0, dtype='float32')
        init_k = np.asarray(np.zeros(ns), dtype='float32')
        init_sig = np.asarray(np.ones(ns), dtype='float32')

        #particle state: every particle starts at s=0 in mode 0 with
        #uniform weight
        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)

        self.D = theano.shared(init_D)
        self.E = theano.shared(init_E)
        self.k = theano.shared(init_k)
        self.sig = theano.shared(init_sig)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        #relative learning rates, in the same order as self.params
        self.rel_lrates = np.asarray([0.1, 1.0, 0.01, 10.0, 0.1, 1.0],
                                     dtype='float32')

        self.meta_params = [self.D, self.E, self.k, self.sig]
        self.meta_rel_lrates = [1.0, 1.0, 1.0, 1.0]

    def sample_proposal_s(self, s, h, xp):
        """Propose new states for ALL particles at once.

        s, h are the np-by-ns states and np-by-nh modes; xp is the current
        observation.  Returns (proposals, predictions, per-particle
        log-density exponents of the proposal, proposal means).
        """
        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        prop_mean = T.dot(s_pred, self.D) + T.reshape(
            T.dot(xp, self.E), (1, self.ns)) + self.k

        #sig is a log-variance, so exp(sig/2) is the standard deviation
        s_prop = prop_mean + n * T.reshape(T.exp(self.sig / 2.0),
                                           (1, self.ns))

        #I compute the term inside the exponent for the pdf of the proposal distrib
        #n is np by ns here, so the sum has to run over the state axis only;
        #summing every entry gave one scalar shared by all particles, which
        #cancels out of the normalized weights.
        prop_term = -T.sum(n**2, axis=1) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):
        """Softmax mode probabilities for an np-by-ns matrix of samples s.

        The result is np by nh and every row sums to one.
        """
        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))

        #re-centering for numerical stability: subtract each row's maximum.
        #keepdims is needed for the (np, 1) result to broadcast against the
        #(np, nh) matrix; previously the recentered terms were computed
        #(with a non-broadcastable shape) and then never used.
        exp_terms_recentered = exp_terms - T.max(
            exp_terms, axis=1, keepdims=True)

        #exponentiation and normalization
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs / T.sum(rel_probs, axis=1, keepdims=True)

        return probs

    def proposal_loss(self, s_pred, s_samps, xp, weights):
        """Weighted loss used to adapt the proposal parameters D, E, k, sig."""
        #estimates the KL divergence between the proposal distribution
        #and the true posterior (minus one term, which we assume does not
        #depend on the proposal distribution).

        #prop means should be symbolic variables since we need to
        #compute the derivatives of D and E through this function
        prop_means = T.dot(s_pred, self.D) + T.reshape(
            T.dot(xp, self.E), (1, self.ns)) + self.k  #np by ns
        diffs = (prop_means - s_samps)
        scl_diffs = diffs * T.reshape(T.exp(-self.sig), (1, self.ns))
        energies = 0.5 * T.sum(diffs * scl_diffs, axis=1)
        tot = T.sum(energies * weights) + 0.5 * T.sum(self.sig)
        return tot

    def forward_filter_step(self, xp):
        """Advance the particle filter by one observation xp.

        Returns the updates that store the new particles and weights and
        take one gradient step on the proposal parameters.
        """
        #need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them

        h_probs = self.calc_h_probs(s_samps)

        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2,
            axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        #gradient updates for the proposal distribution parameters
        lrate = 1e-2
        loss = self.proposal_loss(s_pred, s_samps, xp, new_weights)
        gparams = T.grad(
            loss,
            self.meta_params,
            consider_constant=[s_pred, s_samps, xp, new_weights])

        # constructs the update dictionary (descent: the loss is minimized)
        for gparam, param, rel_lr in zip(gparams, self.meta_params,
                                         self.meta_rel_lrates):
            updates[param] = T.cast(param - gparam * lrate * rel_lr,
                                    'float32')

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, h):
        """Predict next states for samples s (np by ns) and modes h (np by nh)."""
        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns

        #mask out the inactive modes, then sum the blocks with sum_mat
        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):
        """Scan step: draw one (s1, h1, s2, h2) sample of the joint posterior.

        sp holds the predictions made from the past particles (np by ns).
        """
        #pick a current particle according to the current weights
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

        #reweight the past particles by how well they predict s2
        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T
        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):
    #sp, updates = theano.scan(fn=self.get_prediction,
    #outputs_info=[None],
    #sequences=[self.s_past, self.h_past],
    #n_steps=self.npcl)
    ##sp should be np by ns
    #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
    #outputs_info=[None, None, None, None],
    #non_sequences=[sp],
    #n_steps=n_samps)
    #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):
        """Energy of one (s, h) sample under a Gaussian switching model.

        NOTE(review): this method reads self.exp_A, self.mu and
        self.ln_Z_h, none of which are created in __init__ of this class,
        and no other method of the class calls it.  It looks like a
        leftover from another model variant; calling it as-is raises
        AttributeError.
        """
        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(
            T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        ph_i = T.sum(self.ph * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -0.5 * T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i + ph_i

        return energy

    def calc_mean_h_energy(self, s, h):
        """Average log-probability of modes h given samples s."""
        #you give this function a set of samples of s and h,
        #it gives you the average energy of those samples

        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  #np by nh

        #log softmax probability = selected logit MINUS the log-partition
        #term.  The partition term used to be added, so gradient ascent on
        #this energy pushed the logits up without bound.
        energies = T.sum(h * exp_terms, axis=1) - T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Sample the joint posterior and take one gradient-ascent step.

        x1, x2  -- past and current observations (x1 is currently unused
                   because xterm1 is disabled)
        n_samps -- number of joint samples to draw
        lrate   -- global learning rate (scaled per parameter by rel_lrates)

        Returns (energy, updates).
        """
        #this function samples from the joint posterior and performs
        # a step of gradient ascent on the log-likelihood

        sp = self.get_prediction(self.s_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c, (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c, (self.nx, 1))

        s_pred = self.get_prediction(s1_samps, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum(
            (self.b * (s2_samps - s_pred))**2, axis=1)) / 2.0

        #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm2 + sterm

        #the samples are treated as fixed data when differentiating
        gparams = T.grad(
            energy,
            self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):
        """Effective sample size of the current particle weights."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Scan step: draw one particle according to the current weights."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Return updates that resample all particles and reset the weights."""
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):
        """Scan step: sample a mode for state s, then predict s and x."""
        s = T.reshape(s, (1, self.ns))

        #get h probabilities
        h_probs = self.calc_h_probs(s)

        #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):
        """Simulate n_steps ahead, starting from the weighted mean state.

        Returns (states, observations, sampled modes, scan updates).
        """
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))

        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
class SLmodel():
    """Switching linear dynamical model tracked with a particle filter.

    This is a test of the idea to adapt the proposal distribution by
    maximizing the entropy of the particle weights.

    The model has one linear generative mapping (W, c) from the latent
    state s to the observation x, and nh linear dynamical modes (stored
    side by side in M) selected by a one-hot switching variable h whose
    probabilities are a softmax of s*A + ph.  D, E, k and sig parametrize
    the Gaussian proposal distribution used by the filter.
    """

    def __init__(self, nx, ns, nh, npcl, xvar=1.0):
        """Allocate the shared variables of the model.

        nx   -- dimensionality of observed variables
        ns   -- dimensionality of latent variables
        nh   -- number of (linear) dynamical modes
        npcl -- number of particles in the particle filter
        xvar -- scale used in the observation term (it is squared where
                it is used)
        """
        #for this model I assume one linear generative model and a
        #combination of nh linear dynamical models

        #generative matrix
        init_W = np.asarray(np.random.randn(nx, ns) / 10.0, dtype='float32')
        #init_W=np.asarray(np.eye(2),dtype='float32')

        #always normalize the columns of W to be unit length
        init_W = init_W / np.sqrt(np.sum(init_W**2, axis=0))

        #observed variable means
        init_c = np.asarray(np.zeros(nx), dtype='float32')

        #dynamical matrices
        #init_M=np.asarray(np.random.randn(ns,ns*nh)/2.0,dtype='float32')
        init_M = np.asarray((np.tile(np.eye(ns), (1, nh))), dtype='float32')

        #state-variable variances
        #(covariance matrix of state variable noise assumed to be diagonal)
        init_b = np.asarray(np.ones(ns) * 10.0, dtype='float32')

        #Switching parameter matrix
        init_A = np.asarray(np.zeros((ns, nh)), dtype='float32')

        #priors for switching variable
        init_ph = np.asarray(np.zeros(nh), dtype='float32')

        #parameters for proposal distribution
        init_D = np.asarray(np.eye(ns), dtype='float32')
        init_E = np.asarray(np.random.randn(nx, ns) / 100.0, dtype='float32')
        init_k = np.asarray(np.zeros(ns), dtype='float32')
        init_sig = np.asarray(np.ones(ns), dtype='float32')

        #particle states: every particle starts at s=0 in mode 0 with
        #uniform weights
        init_s_now = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_now = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_now[:, 0] = 1.0
        init_weights_now = np.asarray(np.ones(npcl) / float(npcl),
                                      dtype='float32')

        init_s_past = np.asarray(np.zeros((npcl, ns)), dtype='float32')
        init_h_past = np.asarray(np.zeros((npcl, nh)), dtype='float32')
        init_h_past[:, 0] = 1.0
        init_weights_past = np.asarray(np.ones(npcl) / float(npcl),
                                       dtype='float32')

        self.W = theano.shared(init_W)
        self.c = theano.shared(init_c)
        self.M = theano.shared(init_M)
        self.b = theano.shared(init_b)
        self.A = theano.shared(init_A)
        self.ph = theano.shared(init_ph)
        self.D = theano.shared(init_D)
        self.E = theano.shared(init_E)
        self.k = theano.shared(init_k)
        self.sig = theano.shared(init_sig)

        #this is to help vectorize operations
        self.sum_mat = T.as_tensor_variable(
            np.asarray((np.tile(np.eye(ns), nh)).T, dtype='float32'))

        self.s_now = theano.shared(init_s_now)
        self.h_now = theano.shared(init_h_now)
        self.weights_now = theano.shared(init_weights_now)

        self.s_past = theano.shared(init_s_past)
        self.h_past = theano.shared(init_h_past)
        self.weights_past = theano.shared(init_weights_past)

        self.xvar = np.asarray(xvar, dtype='float32')

        self.nx = nx  #dimensionality of observed variables
        self.ns = ns  #dimensionality of latent variables
        self.nh = nh  #number of (linear) dynamical modes
        self.npcl = npcl  #number of particles in particle filter

        self.theano_rng = RandomStreams()

        #model parameters and their relative learning rates (same order)
        self.params = [self.W, self.M, self.b, self.A, self.c, self.ph]
        self.rel_lrates = np.asarray([0.1, 1.0, 0.01, 10.0, 0.1, 1.0],
                                     dtype='float32')

        #proposal-distribution parameters and their relative learning rates
        self.meta_params = [self.D, self.E, self.k, self.sig]
        self.meta_rel_lrates = [1.0, 1.0, 1.0, 1.0]

    def sample_proposal_s(self, s, h, xp):
        """Draw new latent states from the proposal distribution.

        Returns (s_prop, s_pred, prop_term, prop_mean): the proposed
        states, the dynamical prediction from (s, h), the exponent term of
        the proposal pdf, and the proposal mean.
        """
        s_pred = self.get_prediction(s, h)

        n = self.theano_rng.normal(size=T.shape(s))

        prop_mean = T.dot(s_pred, self.D) + T.reshape(
            T.dot(xp, self.E), (1, self.ns)) + self.k

        s_prop = prop_mean + n * T.reshape(T.exp(self.sig / 2.0),
                                           (1, self.ns))

        #I compute the term inside the exponent for the pdf of the proposal distrib
        #NOTE(review): T.sum is called without an axis, so this is a single
        #scalar shared by all particles; a per-particle term would need
        #axis=1 -- confirm which one is intended.
        prop_term = -T.sum(n**2) / 2.0

        return T.cast(s_prop, 'float32'), T.cast(s_pred, 'float32'), T.cast(
            prop_term, 'float32'), prop_mean

    def calc_h_probs(self, s):
        """Return the switching-variable probabilities for each sample.

        Takes an np by ns matrix of s samples and returns an np by nh
        matrix whose rows are softmax(s*A + ph).
        """
        exp_terms = T.dot(s, self.A) + T.reshape(self.ph, (1, self.nh))

        #re-centering for numerical stability: subtract each row's maximum.
        #keepdims=True keeps the result as an np by 1 column so that it
        #broadcasts across the nh entries of its own row.
        exp_terms_recentered = exp_terms - T.max(exp_terms, axis=1,
                                                 keepdims=True)

        #exponentiation and normalization (the row shift cancels in the
        #ratio, so the probabilities are mathematically unchanged)
        rel_probs = T.exp(exp_terms_recentered)
        probs = rel_probs.T / T.sum(rel_probs, axis=1)

        return probs.T

    def proposal_loss(self, s_pred, s_samps, xp, weights):
        """Weighted loss used to adapt the proposal parameters.

        Estimates the KL divergence between the proposal distribution
        and the true posterior (minus one term, which we assume does not
        depend on the proposal distribution).
        """
        #prop means should be symbolic variables since we need to
        #compute the derivatives of D and E through this function
        prop_means = T.dot(s_pred, self.D) + T.reshape(
            T.dot(xp, self.E), (1, self.ns)) + self.k  #np by ns

        diffs = (prop_means - s_samps)
        scl_diffs = diffs * T.reshape(T.exp(-self.sig), (1, self.ns))
        energies = 0.5 * T.sum(diffs * scl_diffs, axis=1)

        tot = T.sum(energies * weights) + 0.5 * T.sum(self.sig)

        return tot

    def forward_filter_step(self, xp):
        """Build the updates for one particle-filter step on observation xp.

        Samples new particles from the proposal, reweights them, takes one
        gradient-descent step on the proposal parameters and shifts the
        'now' particle set into the 'past' one.  Returns the update dict.
        """
        #need to sample from the proposal distribution first
        s_samps, s_pred, prop_terms, prop_means = self.sample_proposal_s(
            self.s_now, self.h_now, xp)

        updates = {}

        #now that we have samples from the proposal distribution, we need to reweight them
        h_probs = self.calc_h_probs(s_samps)
        h_samps = self.theano_rng.multinomial(pvals=h_probs)

        recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))

        x_terms = -T.sum(
            (recons - T.reshape(xp, (self.nx, 1)))**2,
            axis=0) / (2.0 * self.xvar**2)
        s_terms = -T.sum(((s_samps - s_pred) * self.b)**2, axis=1) / 2.0

        energies = x_terms + s_terms - prop_terms

        #to avoid exponentiating large or very small numbers, I
        #"re-center" the reweighting factors by adding a constant,
        #as this has no impact on the resulting new weights
        energies_recentered = energies - T.max(energies)

        alpha = T.exp(energies_recentered)  #these are the reweighting factors

        new_weights_unnorm = self.weights_now * alpha
        normalizer = T.sum(new_weights_unnorm)
        new_weights = new_weights_unnorm / normalizer  #need to normalize new weights

        #gradient updates for the proposal distribution parameters
        lrate = 1e-2
        loss = self.proposal_loss(s_pred, s_samps, xp, new_weights)
        gparams = T.grad(loss, self.meta_params,
                         consider_constant=[s_pred, s_samps, xp, new_weights])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.meta_params,
                                         self.meta_rel_lrates):
            updates[param] = T.cast(param - gparam * lrate * rel_lr,
                                    'float32')

        updates[self.h_past] = T.cast(self.h_now, 'float32')
        updates[self.s_past] = T.cast(self.s_now, 'float32')

        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.s_now] = T.cast(s_samps, 'float32')

        updates[self.weights_past] = T.cast(self.weights_now, 'float32')
        updates[self.weights_now] = T.cast(new_weights, 'float32')

        #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates
        #return normalizer, energies_recentered, updates
        #return h_samps, updates
        return updates

    def get_prediction(self, s, h):
        """Predict the next latent state for every (s, h) pair.

        Applies all nh dynamical matrices at once, masks the result with
        the one-hot h and sums the ns-wide blocks back together.
        """
        s_dot_M = T.dot(s, self.M)  #this is np by nh*ns

        s_pred = T.dot(s_dot_M * T.extra_ops.repeat(h, self.ns, axis=1),
                       self.sum_mat)  #should be np by ns

        return T.cast(s_pred, 'float32')

    def sample_joint(self, sp):
        """Draw one (s1, h1, s2, h2) sample from the two-step posterior.

        A 'now' particle is picked according to weights_now; a 'past'
        particle is then picked with probability proportional to
        weights_past times how well its prediction sp matches the chosen
        'now' state.
        """
        t2_samp = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T

        #the one-hot column t2_samp selects a single row of s_now / h_now
        s2_samp = T.cast(
            T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0),
            'float32')
        h2_samp = T.cast(
            T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0),
            'float32')

        diffs = self.b * (s2_samp - sp)
        sqr_term = T.sum(diffs**2, axis=1)
        alpha = T.exp(-sqr_term)
        probs_unnorm = self.weights_past * alpha
        probs = probs_unnorm / T.sum(probs_unnorm)

        t1_samp = self.theano_rng.multinomial(
            pvals=T.reshape(probs, (1, self.npcl))).T

        s1_samp = T.cast(
            T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')
        h1_samp = T.cast(
            T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0),
            'float32')

        return [s1_samp, h1_samp, s2_samp, h2_samp]

    #def sample_posterior(self, n_samps):

        #sp, updates = theano.scan(fn=self.get_prediction,
                                    #outputs_info=[None],
                                    #sequences=[self.s_past, self.h_past],
                                    #n_steps=self.npcl)

        ##sp should be np by ns

        #[s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(fn=self.sample_joint,
                                    #outputs_info=[None, None, None, None],
                                    #non_sequences=[sp],
                                    #n_steps=n_samps)

        #return [s1_samps, h1_samps, s2_samps, h2_samps]

    def h_energy_step(self, s, h):
        """Per-sample energy helper (appears to be unused legacy code).

        NOTE(review): this reads self.exp_A, self.mu and self.ln_Z_h, none
        of which are assigned in __init__, and no method of this class
        calls it -- confirm before relying on it.
        """
        #helper function for self.calc_mean_h_energy
        exp_A_i = T.reshape(
            T.sum(self.exp_A * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        mu_i = T.reshape(
            T.sum(self.mu * T.reshape(h, (self.nh, 1)), axis=0),
            (self.ns, 1))
        ln_Z_h_i = T.sum(self.ln_Z_h * T.reshape(h, (self.nh, 1)))
        ph_i = T.sum(self.ph * T.reshape(h, (self.nh, 1)))
        diff = T.reshape(T.reshape(s, (self.ns, 1)) - mu_i, (self.ns, 1))
        diff_dot_exp_A_i = diff * exp_A_i
        gterm = -0.5 * T.sum(T.sum(diff_dot_exp_A_i * diff))
        energy = gterm + ln_Z_h_i + ph_i

        return energy

    def calc_mean_h_energy(self, s, h):
        """Average switching-variable energy of a set of samples.

        you give this function a set of samples of s and h,
        it gives you the average energy of those samples
        """
        exp_terms = T.dot(s, self.A) + T.reshape(self.ph,
                                                 (1, self.nh))  #np by nh

        #NOTE(review): the log-sum-exp term is *added* here; a softmax
        #log-probability would subtract it -- confirm the intended sign.
        energies = T.sum(h * exp_terms, axis=1) + T.log(
            T.sum(T.exp(exp_terms), axis=1))  #should be np by 1

        energy = T.mean(energies)

        return energy

    def update_params(self, x1, x2, n_samps, lrate):
        """Build one gradient-ascent step on the model parameters.

        this function samples from the joint posterior and performs
        a step of gradient ascent on the log-likelihood.

        Returns (energy, updates).  x1 only appears in commented-out code
        and is currently unused.
        """
        sp = self.get_prediction(self.s_past, self.h_past)

        #sp should be np by ns

        [s1_samps, h1_samps, s2_samps, h2_samps], updates = theano.scan(
            fn=self.sample_joint,
            outputs_info=[None, None, None, None],
            non_sequences=[sp],
            n_steps=n_samps)

        x1_recons = T.dot(self.W, s1_samps.T) + T.reshape(self.c,
                                                          (self.nx, 1))
        x2_recons = T.dot(self.W, s2_samps.T) + T.reshape(self.c,
                                                          (self.nx, 1))

        s_pred = self.get_prediction(s1_samps, h1_samps)

        hterm1 = self.calc_mean_h_energy(s1_samps, h1_samps)
        #hterm2=self.calc_mean_h_energy(s2_samps, h2_samps)

        sterm = -T.mean(T.sum((self.b * (s2_samps - s_pred))**2,
                              axis=1)) / 2.0

        #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2))
        xterm2 = -T.mean(
            T.sum((x2_recons - T.reshape(x2, (self.nx, 1)))**2, axis=0) /
            (2.0 * self.xvar**2))

        #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2))
        energy = hterm1 + xterm2 + sterm

        gparams = T.grad(
            energy, self.params,
            consider_constant=[s1_samps, s2_samps, h1_samps, h2_samps])

        # constructs the update dictionary
        for gparam, param, rel_lr in zip(gparams, self.params,
                                         self.rel_lrates):
            #gnat=T.dot(param, T.dot(param.T,param))
            updates[param] = T.cast(param + gparam * lrate * rel_lr,
                                    'float32')

        #make sure W has unit-length columns
        #new_W=updates[self.W]
        #updates[self.W]=T.cast(new_W/T.sqrt(T.sum(new_W**2,axis=0)),'float32')

        #MIGHT NEED TO NORMALIZE A

        return energy, updates

    def get_ESS(self):
        """Return the effective sample size, 1 / sum(weights_now**2)."""
        return 1.0 / T.sum(self.weights_now**2)

    def resample_step(self):
        """Pick one particle (s, h) at random according to weights_now."""
        idx = self.theano_rng.multinomial(
            pvals=T.reshape(self.weights_now, (1, self.npcl))).T

        #idx is a one-hot column, so the weighted sums select a single row
        s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
        h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

        return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')

    def resample(self):
        """Build updates that resample npcl particles and reset the weights
        to uniform."""
        [s_samps, h_samps], updates = theano.scan(fn=self.resample_step,
                                                  outputs_info=[None, None],
                                                  n_steps=self.npcl)

        updates[self.s_now] = T.cast(s_samps, 'float32')
        updates[self.h_now] = T.cast(h_samps, 'float32')
        updates[self.weights_now] = T.cast(
            T.ones_like(self.weights_now) / T.cast(self.npcl, 'float32'),
            'float32')  #dtype paranoia

        return updates

    def simulate_step(self, s):
        """Simulate one step forward from state s.

        Returns the predicted next state, the corresponding observation
        and the sampled switching variable.
        """
        s = T.reshape(s, (1, self.ns))

        #get h probabilities
        h_probs = self.calc_h_probs(s)

        #h_samp=self.theano_rng.multinomial(pvals=T.reshape(h_probs,(self.nh,1)))
        h_samp = self.theano_rng.multinomial(pvals=h_probs)

        sp = self.get_prediction(s, h_samp)

        xp = T.dot(self.W, sp.T) + T.reshape(self.c, (self.nx, 1))

        return T.cast(sp, 'float32'), T.cast(xp, 'float32'), h_samp

    def simulate_forward(self, n_steps):
        """Simulate n_steps forward, starting from the weighted mean of the
        current particles.  Returns (sp, xp, hs, updates)."""
        s0 = T.sum(self.s_now * T.reshape(self.weights_now, (self.npcl, 1)),
                   axis=0)
        s0 = T.reshape(s0, (1, self.ns))

        [sp, xp, hs], updates = theano.scan(fn=self.simulate_step,
                                            outputs_info=[s0, None, None],
                                            n_steps=n_steps)

        return sp, xp, hs, updates
class EncoderDecoder(object):
    """Encoder-decoder translation model with an extra sentence-history
    encoder and optional attention, coverage, context gate and
    reconstruction components.

    All options are read from keyword arguments in ``__init__``; the
    Theano functions are compiled by ``build_trainer`` and
    ``build_sampler``.
    """

    def __init__(self, rng, **kwargs):
        """Create all layers and collect their parameters.

        rng    -- random state object; it is passed to every layer and
                  ``rng.randint`` seeds the Theano random streams
        kwargs -- model options; every key popped below is required
                  ('reconstruction_weight' only when 'with_reconstruction'
                  is true)
        """
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.rng = rng
        self.trng = RandomStreams(rng.randint(1e5))

        # added by Zhaopeng Tu, 2016-06-09
        self.with_attention = kwargs.pop('with_attention')

        # added by Zhaopeng Tu, 2016-04-29
        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        # compare by value: 'is' tests object identity, which is not
        # guaranteed to hold for equal strings
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng, self.src_vocab_size,
                                     self.n_in_src, name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng, self.n_in_src,
                                            self.n_hids_src, self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        # added by Longyue
        self.encoder_hist_1 = Encoder(self.rng, self.n_in_src,
                                      self.n_hids_src, self.table_src,
                                      name='rnn_encoder_hist_1')
        self.layers.append(self.encoder_hist_1)

        self.encoder_hist_2 = Encoder(self.rng, self.n_hids_src,
                                      self.n_hids_src, self.table_src,
                                      name='rnn_encoder_hist_2')
        self.layers.append(self.encoder_hist_2)

        self.table_trg = LookupTable(self.rng, self.trg_vocab_size,
                                     self.n_in_trg, name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(
            self.rng, self.n_in_trg, self.n_hids_trg,
            2 * self.n_hids_src, self.n_hids_src,
            # added by Zhaopeng Tu, 2016-06-09
            with_attention=self.with_attention,
            # added by Zhaopeng Tu, 2016-04-29
            with_coverage=self.with_coverage,
            coverage_dim=self.coverage_dim,
            coverage_type=self.coverage_type,
            max_fertility=self.max_fertility,
            # added by Zhaopeng Tu, 2016-05-30
            with_context_gate=self.with_context_gate,
            maxout_part=self.maxout_part,
            name='rnn_decoder')
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')

        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.pop('reconstruction_weight')

            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(
                self.rng, self.n_in_src, 2 * self.n_hids_src,
                self.n_hids_trg,
                # added by Zhaopeng Tu, 2016-06-09
                with_attention=self.with_attention,
                maxout_part=self.maxout_part,
                name='rnn_inverse_decoder')
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng, self.n_in_src, self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg,
                      trg_mask, ite):
        """Build the training graph and compile ``self.train_fn``.

        The compiled function takes (src, src_mask, src_hist,
        src_hist_mask, trg, trg_mask), returns the regularized training
        cost and applies the adadelta updates.  ``ite`` is accepted but
        not used by this method.
        """
        # added by Longyue
        # checked by Zhaopeng: sentence dim = n_steps, hist_len, batch_size (4, 3, 25)
        # hist = (bath_size, sent_num, sent_len) --.T-->
        # hist = (sent_len, sent_num, bath_size) --lookup table-->
        # (sent_len, sent_num, bath_size, word_emb) --reshape-->
        # (sent_len, sent_num*bath_size, word_emb) --word-level rnn-->
        # (sent_len, sent_num*bath_size, hidden_size) --reshape-->
        # (sent_len, sent_num, bath_size, hidden_size) --[-1]-->
        # (sent_num, bath_size, hidden_size) --sent-level rnn-->
        # (sent_num, bath_size, hidden_size) --[-1]-->
        # (bath_size, hidden_size) = cross-sent context vector
        annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
        annotations_1 = annotations_1[-1]  # get last hidden states
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]  # get last hidden states

        # modified by Longyue
        annotations = self.encoder.apply(src, src_mask, annotations_3)

        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations * src_mask[:, :, None]).sum(0) / \
            src_mask.sum(0)[:, None]

        # added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        # decoder input is the target embedding shifted one step to the
        # right, with zeros in the first position
        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])

        # modified by Longyue
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask,
            hist=annotations_3)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]).sum(0) / \
                trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])

            inverse_hiddens, inverse_readout, inverse_alignment = \
                self.inverse_decoder.run_pipeline(
                    state_below=src_emb_shifted,
                    mask_below=src_mask,
                    init_context=inverse_init_context,
                    c=hiddens,
                    c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        # modified by Longyue
        inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]

        self.train_fn = theano.function(inps, [train_cost], updates=updates,
                                        name='train_function')
        # self.train_fn = theano.function(inps, [train_cost], updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))

    def build_sampler(self):
        """Compile the step-wise functions used for sampling/decoding.

        Sets ``compile_init_and_context`` and
        ``compile_next_state_and_probs``; ``compile_fertility`` when
        attention, coverage and 'linguistic' coverage type are all on; and
        the ``compile_inverse_*`` counterparts when reconstruction is on.
        """
        # added by Longyue
        x_hist = T.ltensor3()
        x_hist_mask = T.tensor3()
        annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask)
        annotations_1 = annotations_1[-1]
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None, annotations_3)
        # init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        # added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c, annotations_3]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        print('Building compile_init_state_and_context function ...')
        self.compile_init_and_context = theano.function(
            [x, x_hist, x_hist_mask], outs, name='compile_init_and_context')
        print('Done')

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0,
                           T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        # for with_attention=False
        if self.with_attention and self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print('Building compile_fertility ...')
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print('Done')
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        # [next_state, ctxs] = self.decoder.apply(state_below=trg_emb,
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            # added by Zhaopeng Tu, 2016-06-09
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            hist=annotations_3,  # added by Longyue
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print('Building compile_next_state_and_probs function ...')
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)

        # added by Longyue
        inps.append(annotations_3)

        outs = [next_probs, next_state, next_sample]
        # added by Zhaopeng Tu, 2016-06-09
        if self.with_attention:
            outs.append(alignment)
            # added by Zhaopeng Tu, 2016-04-29
            if self.with_coverage:
                inps.append(cov_before)
                if self.coverage_type == 'linguistic':
                    inps.append(fertility)
                outs.append(cov)

        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs')
        print('Done')

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            print('Building compile_inverse_init_state_and_context function ...')
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print('Done')

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0,
                               T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                # added by Zhaopeng Tu, 2016-06-09
                init_context=(None if self.with_attention
                              else inverse_init_context),
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs, inverse_alignment = inverse_results[
                    1], inverse_results[2]
            else:
                # if with_attention=False, we always use the (inverse)
                # init context as the source representation.  It must be
                # inverse_init_context: the forward init_context depends on
                # x / x_hist, which are not inputs of the function compiled
                # below, while inverse_init_context is.
                inverse_ctxs = inverse_init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print('Building compile_inverse_next_state_and_probs function ...')
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)

            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample
            ]
            # added by Zhaopeng Tu, 2016-06-09
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print('Done')

    def save(self, path=None):
        """Save all parameters, keyed by parameter name, as an .npz archive.

        path -- destination file; defaults to ``self.path``.

        The archive is written through an open binary file handle so the
        data lands in exactly ``path`` (given a bare file name,
        ``numpy.savez`` would append '.npz' and ``load`` would then not
        find the data), and the handle is closed even if saving fails.
        """
        if path is None:
            path = self.path
        val = {value.name: value.get_value() for value in self.params}
        logger.info("save the model {}".format(path))
        with open(path, "wb") as filenpz:
            numpy.savez(filenpz, **val)

    def load(self, path=None):
        """Load parameters from ``path`` (default ``self.path``).

        Parameters missing from the archive keep their current values.
        Raises Exception when a stored array has a different shape than
        the model parameter.  When the file does not exist, an error is
        logged and the current parameters are saved to ``self.path``.
        """
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            for param in self.params:
                logger.info('Loading {} with shape {}'.format(
                    param.name, param.get_value(borrow=True).shape))
                if param.name not in val.keys():
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name, param.get_value(borrow=True).shape))
                    continue
                if param.get_value().shape != val[param.name].shape:
                    logger.info(
                        "Error: model param != load param shape {} != {}".format(
                            param.get_value().shape, val[param.name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    param.set_value(val[param.name], borrow=True)
        else:
            logger.error("file {} does not exist".format(path))
            self.save()
class RNNUnidirectionalEncDec(Model): def __init__(self, hyperparams, encoder_vocab, decoder_vocab): self.hyperparams = hyperparams self.encoder_vocab = encoder_vocab self.decoder_vocab = decoder_vocab # hyperparams.encoder_vocab_size and hyperparams.decoder_vocab_size setting to max # TODO: Uncomment this and throw error #hyperparams.encoder_vocab_size = min(hyperparams.encoder_vocab_size, encoder_vocab.vocab_size) #hyperparams.decoder_vocab_size = min(hyperparams.decoder_vocab_size, decoder_vocab.vocab_size) # Preparing and Initializing Network Weights & Biases self.setup() # TODO: Loading and storing params def setup(self): """ Setup the shared variables and model components """ self._params = OrderedDict() # Encoder embeddings self.encoder_embeddings = Embeddings( 'encoder_emb', self.hyperparams.encoder_vocab.vocab_size, self.hyperparams.encoder_emb_dim) self._params.update(self.encoder_embeddings.params()) # Decoder embeddings self.decoder_embeddings = Embeddings( 'decoder_emb', self.hyperparams.decoder_vocab.vocab_size, self.hyperparams.decoder_emb_dim, add_bos=True) self._params.update(self.decoder_embeddings.params()) ################ # Encoder Layer ################ # TODO: make a different class if self.hyperparams.rnn_cell == 'gru': from ..nn.layers.gru import GRU as RNN elif self.hyperparams.rnn_cell == 'lstm': raise NotImplementedError else: logger.error("Invalid RNN Cell Type:" + self.hyperparams.rnn_cell) self.encoder_rnn_layer_l2r = RNN( name='encoder' + self.hyperparams.rnn_cell + '0_l2r', in_dim=self.hyperparams.encoder_emb_dim, num_units=self.hyperparams.encoder_units) self._params.update(self.encoder_rnn_layer_l2r.params()) # Transform to prepare init state of decoder self.decoder_init_transform = Dense( name='decoder_init_transform', in_dim=self.hyperparams.encoder_units, num_units=self.hyperparams.decoder_units, activation=Activation.tanh) self._params.update(self.decoder_init_transform.params()) ################ # Decoder Layer 
############### # TODO: make a different class if self.hyperparams.rnn_cell == 'gru': from ..nn.layers.gru import ConditionalGRU as ConditionalRNN elif self.hyperparams.rnn_cell == 'lstm': raise NotImplementedError else: logger.error("Invalid RNN Cell Type:" + self.hyperparams.rnn_cell) self.decoder_rnn_layer = ConditionalRNN( name='decoder_' + self.hyperparams.rnn_cell + '0', in_dim=self.hyperparams.decoder_emb_dim, num_units=self.hyperparams.decoder_units, context_dim=self.hyperparams.encoder_units) self._params.update(self.decoder_rnn_layer.params()) # Read out words self.decoder_state_transform = Dense( name='decoder_state_transform', in_dim=self.hyperparams.decoder_units, num_units=self.hyperparams.decoder_emb_dim, activation=Activation.linear) self._params.update(self.decoder_state_transform.params()) self.prev_emb_transform = Dense( name='prev_emb_transform', in_dim=self.hyperparams.decoder_emb_dim, num_units=self.hyperparams.decoder_emb_dim, activation=Activation.linear) self._params.update(self.prev_emb_transform.params()) self.encoder_context_transform = Dense( name='encoder_context_transform', in_dim=self.hyperparams.encoder_units, num_units=self.hyperparams.decoder_emb_dim, activation=Activation.linear) self._params.update(self.encoder_context_transform.params()) self.word_probs_transform = Dense( name='word_probs_transform', in_dim=self.hyperparams.decoder_emb_dim, num_units=self.decoder_vocab.vocab_size, activation=Activation.linear) self._params.update(self.word_probs_transform.params()) # DEBUG #for k, v in self._params.iteritems(): # print k, v.get_value(), v.get_value().shape def build(self): self.trng = RandomStreams(1234) # dim(x) = (input_time_steps, num_samples) self.x = T.matrix('x', dtype='int64') # dim(x_mask) = (input_time_steps, num_samples) self.x_mask = T.matrix('x_mask', dtype='float32') # dim(y) = (output_time_steps, num_samples) self.y = T.matrix('y', dtype='int64') # dim(y_mask) = (output_time_steps, num_samples) self.y_mask = 
T.matrix('y_mask', dtype='float32') # get source word embeddings enc_emb = self.encoder_embeddings.Emb[self.x.flatten()] # dim(x) = timesteps x samples enc_emb = enc_emb.reshape([ self.x.shape[0], self.x.shape[1], self.hyperparams.encoder_emb_dim ]) # get decoder init state self.encoder_outputs = self.encoder_rnn_layer_l2r.build( enc_emb, self.x_mask)[0] last_encoder_output = self.encoder_outputs[ -1] # This will be the context at every input step # transform encoder output to get decoder init self.decoder_init = self.decoder_init_transform.build( last_encoder_output) # input # TODO: remove embedding shifting? dec_emb = self.decoder_embeddings.Emb[self.y.flatten()] dec_emb = dec_emb.reshape([ self.y.shape[0], self.y.shape[1], self.hyperparams.decoder_emb_dim ]) # Building the RNN layer self.decoder_outputs = self.decoder_rnn_layer.build( x=dec_emb, x_mask=self.y_mask, c=last_encoder_output, h_init=self.decoder_init)[ 0] # Only one output, hidden states at every time step # context with new axis. condition the output on encoder context as well. # TODO: remove axis adding? 
context = last_encoder_output[None, :, :] # dim(proj_h) = #timesteps x #samples x #num_units # Computing word probabilities logit_decoder_rnn = self.decoder_state_transform.build( self.decoder_outputs ) # dim(logit_rnn) = #timesteps x #samples x #emb_dim logit_prev_emb = self.prev_emb_transform.build( dec_emb) # dim(logit_prev) = #timesteps x #samples x #emb_dim logit_enc_context = self.encoder_context_transform.build(context) logit = self.word_probs_transform.build( Activation.tanh(logit_decoder_rnn + logit_prev_emb + logit_enc_context) ) # dim(logit) = #timesteps x #samples x #vocab_size # reshaping logit as (#timesteps*#samples) x vocab_size and performing softmax across vocabulary self.probs = T.nnet.softmax( logit.reshape([ logit.shape[0] * logit.shape[1], logit.shape[2] ])) #dim(probs) = (#timesteps*#samples) x vocab_size self.debug = [self.probs.shape, self.y.shape, self.y_mask.shape] #Building loss function self.build_loss() self._outputs = [self.probs] def build_loss(self): # TODO: Make it better? 
# y[0] is bos, remove it to calculate loss y_flat = self.y[1:].flatten( ) #x_flat: a linear array with size #timesteps*#samples y_flat_idx = T.arange( y_flat.shape[0]) * self.decoder_vocab.vocab_size + y_flat self._loss = -T.log(self.probs.flatten()[y_flat_idx]) self._loss = self._loss.reshape([self.y.shape[0] - 1, self.y.shape[1]]) self._loss = (self._loss * self.y_mask[1:]).sum(0) def build_sampler(self, sampling=True): initializer_input = [self.x, self.x_mask] initializer_output = [self.encoder_outputs, self.decoder_init] self.initializer = theano.function(initializer_input, initializer_output) sampler_input = [self.y, self.y_mask] + initializer_output if sampling == True: # sample a word from the output softmax, instead of selecting argmax next_token_index = self.trng.multinomial(pvals=self.probs).argmax( 1 ) # multinomial will represent 1 hot representation of the selected sample else: next_token_index = T.argmax(self.probs, axis=1) sampler_output = [self.probs, next_token_index, self.decoder_outputs] self.sampler = theano.function(sampler_input, sampler_output) def sample(self, batch, num_samples=5): source, source_mask, target, target_mask = self.prepare_train_input( batch) num_samples = np.minimum(1, source.shape[1]) # TODO: replace by random sampling: source = source[:, 0:num_samples] source_mask = source_mask[:, 0:num_samples] target = target[:, 0:num_samples] target_mask = target_mask[:, 0:num_samples] samples = [] for sample_index in xrange(num_samples): hypothesis = self.encode_decode([ source[:, sample_index:sample_index + 1], source_mask[:, sample_index:sample_index + 1] ]) hypothesis_sent = ' '.join(hypothesis) source_sent = ' '.join([ self.encoder_vocab.get_token(index) for index in source[:, sample_index] if index != self.encoder_vocab.get_index(self.encoder_vocab.eos) ]) #hypothesis_sent = ' '.join([self.decoder_vocab.get_token(index) for index in target[:, sample_index] if index != self.decoder_vocab.get_index(self.decoder_vocab.eos)]) 
target_sent = ' '.join([ self.decoder_vocab.get_token(index) for index in target[:, sample_index] if index != -1 and index != self.decoder_vocab.get_index(self.decoder_vocab.eos) ]) # TODO: change sample to dictionary, and in trainer display all keys and values samples.append( OrderedDict({ 'SRC': source_sent, 'HYP': hypothesis_sent, 'REF': target_sent })) return samples def encode_decode(self, test_input, max_length=50): encoding = self.initializer(*test_input) init_input = [np.array([[-1]]), np.array([[1.]], dtype='float32')] + encoding probs, next_token_index, decoder_outputs = self.sampler(*init_input) hypothesis = [] hyp_length = 0 while (next_token_index[0] != self.decoder_vocab.get_index( self.decoder_vocab.eos)): hypothesis.append(self.decoder_vocab.get_token( next_token_index[0])) hyp_length += 1 # This is if next_token_index is a scalar, i.e. when only one column is passed as test_input # [None] adds a new axis next_input = [ next_token_index[None], np.array([[1.]], dtype='float32') ] + encoding probs, next_token_index, decoder_outputs = self.sampler( *next_input) if hyp_length == max_length: break return hypothesis def loss(self): return self._loss.mean() def log_probs(self): # TODO: Make it better? 
return self._loss def outputs(self): return self._outputs def inputs(self): return [self.x, self.x_mask, self.y, self.y_mask] def params(self): return self._params def prepare_train_input(self, batch, max_length=None): # setting maxlen to length of longest sample max_length_input = max([len(sample[0]) for sample in batch]) max_length_target = max([len(sample[1]) for sample in batch]) # adding end of sentence marker inp = [[self.encoder_vocab.get_index(token) for token in sample[0]] + [self.encoder_vocab.get_index(self.encoder_vocab.eos)] for sample in batch] target = [[self.decoder_vocab.get_index(token) for token in sample[1]] + [self.decoder_vocab.get_index(self.decoder_vocab.eos)] for sample in batch] max_length_input += 1 max_length_target += 1 # preparing mask and input source_mask = np.array( [[1.] * len(inp_instance[:max_length_input]) + [0.] * (max_length_input - len(inp_instance)) for inp_instance in inp], dtype='float32').transpose() source = np.array([ inp_instance[:max_length_input] + [0.] * (max_length_input - len(inp_instance)) for inp_instance in inp ], dtype='int64').transpose() # taret preparation with -1 (beginning of sentence) row upfront target_mask = np.array( [[1.] + [1.] * len(target_instance[:max_length_target]) + [0.] * (max_length_target - len(target_instance)) for target_instance in target], dtype='float32').transpose() target = np.array([[-1] + target_instance[:max_length_target] + [0.] * (max_length_target - len(target_instance)) for target_instance in target], dtype='int64').transpose() return source, source_mask, target, target_mask
def stochastic_max_pool_bc01(bc01, pool_shape, pool_stride, image_shape, rng=None):
    """
    Stochastic max pooling for training, as defined in:

    Stochastic Pooling for Regularization of Deep Convolutional Neural Networks
    Matthew D. Zeiler, Rob Fergus

    One element of every pooling window is drawn with probability
    proportional to its value, and that element is the window's output.

    bc01: minibatch in format (batch size, channels, rows, cols),
        IMPORTANT: All values should be positive
    pool_shape: shape of the pool region (rows, cols)
    pool_stride: strides between pooling regions (row stride, col stride)
    image_shape: (rows, cols) of the input, given as plain numbers to avoid
        doing some of the arithmetic in theano
    rng: theano random stream; RandomStreams(2022) is used when None
    """
    r, c = image_shape
    pr, pc = pool_shape
    rs, cs = pool_stride

    batch = bc01.shape[0]
    channel = bc01.shape[1]

    if rng is None:
        rng = RandomStreams(2022)

    def _last_pool_start(extent, size, stride):
        # Index, in pooled space, of the last pool that is needed so that
        # every input pixel appears in at least one pool, converted to the
        # input coordinate at which that pool starts.
        index = int(numpy.ceil(float(extent - size) / stride))
        assert stride * index + size >= extent
        assert stride * (index - 1) + size < extent
        return index * stride

    # Start of the last pool and the padded extent needed along each axis.
    last_row_start = _last_pool_start(r, pr, rs)
    required_r = last_row_start + pr
    last_col_start = _last_pool_start(c, pc, cs)
    required_c = last_col_start + pc

    # Shape of the pooled result.
    res_r = int(numpy.floor(last_row_start / rs)) + 1
    res_c = int(numpy.floor(last_col_start / cs)) + 1

    for debug_value in get_debug_values(bc01):
        assert not numpy.any(numpy.isinf(debug_value))
        assert debug_value.shape[2] == r
        assert debug_value.shape[3] == c

    name = bc01.name
    if name is None:
        name = 'anon_bc01'

    # Zero-pad the input so that every pooling window is fully defined.
    canvas = tensor.alloc(0.0, batch, channel, required_r, required_c)
    padded_input = tensor.set_subtensor(canvas[:, :, 0:r, 0:c], bc01)
    padded_input.name = 'zero_padded_' + name

    # Unravel: windows[b, ch, i, j, dr, dc] is the element at offset (dr, dc)
    # inside pooling window (i, j).
    windows = tensor.alloc(0.0, batch, channel, res_r, res_c, pr, pc)
    windows.name = 'unravlled_winodows_' + name
    for dr in xrange(pr):
        row_slice = slice(dr, last_row_start + dr + 1, rs)
        for dc in xrange(pc):
            col_slice = slice(dc, last_col_start + dc + 1, cs)
            cells = padded_input[:, :, row_slice, col_slice]
            windows = tensor.set_subtensor(windows[:, :, :, :, dr, dc], cells)

    # Normalise each window into a probability vector; all-zero windows are
    # divided by 1 instead of 0.
    totals = windows.sum(axis=[4, 5])
    totals = tensor.switch(tensor.eq(totals, 0.0), 1.0, totals)
    pvals = windows / totals.dimshuffle(0, 1, 2, 3, 'x', 'x')

    # Draw one element per window.
    n_windows = batch * channel * res_r * res_c
    picks = rng.multinomial(pvals=pvals.reshape((n_windows, pr * pc)),
                            dtype='float32')
    picks = picks.reshape((batch, channel, res_r, res_c, pr, pc))

    # Keep only the drawn element of each window.
    pooled = (windows * picks).max(axis=5).max(axis=4)
    pooled.name = 'pooled_' + name

    return tensor.cast(pooled, theano.config.floatX)
), dtype = theano.config.floatX ), name = 'w_01', borrow = True, ) b_01 = theano.shared( value = np.zeros((4,), dtype=theano.config.floatX), name = 'b_01', borrow = True, ) h_0 = T.tanh(T.dot(x, w_00) + b_00) y_0 = T.dot(h_0, w_01) + b_01 distn_0 = T.nnet.softmax(y_0[:-1]) s_1 = T.nnet.sigmoid(y_0[-1:]) cate_chosen_0 = T.flatten(srng.multinomial(n = 1, pvals = distn_0)) ind_chosen_0 = T.argmax(cate_chosen_0) w_10 = theano.shared( value = np.asarray( rng.uniform( low = -np.sqrt(6. / 8), high = np.sqrt(6. / 8), size = (4, 4) ), dtype = theano.config.floatX ), name = 'w_10', borrow = True, ) b_10 = theano.shared(
class ImportanceSampler(object):
    '''Implements importance sampling/resampling.

    The particles and their normalised weights are held in Theano shared
    variables (float32). Call compile() to build the callables
    get_ESS, perform_resampling and perform_sampling.
    '''

    def __init__(self, ndims, n_particles, true_log_probs, proposal_func=None):
        '''
        ndims: the dimensionality of each particle
        n_particles: the number of particles to use
        true_log_probs: a function that returns the true relative log
            probabilities of a set of samples
        proposal_func: a function that returns
            (samples, relative_log_probabilities); may be supplied later
            through set_proposal_func
        '''
        self.true_log_probs = true_log_probs
        self.proposal_func = proposal_func
        self.n_particles = n_particles
        self.ndims = ndims

        # Start from all-zero particles carrying uniform weights.
        init_particles = np.zeros((n_particles, self.ndims))
        init_weights = np.ones(n_particles) / float(n_particles)
        self.particles = theano.shared(init_particles.astype(np.float32))
        self.weights = theano.shared(init_weights.astype(np.float32))

        self.theano_rng = RandomStreams()

        # Compiled functions; populated by compile().
        self.get_ESS = None
        self.perform_resampling = None
        self.perform_sampling = None

    def set_proposal_func(self, proposal_func):
        '''You might need to use this if you want to make the proposal
        function depend on the current particles'''
        self.proposal_func = proposal_func

    def sample_reweight(self):
        '''Samples new particles and reweights them.

        Returns an OrderedDict of updates for the weights and the particles.
        Raises TypeError if no proposal function has been set.
        '''
        if self.proposal_func is None:
            raise TypeError('proposal_func is not set; pass it to the '
                            'constructor or call set_proposal_func first')
        samples, prop_log_probs = self.proposal_func()
        true_log_probs = self.true_log_probs(samples)
        diffs = true_log_probs - prop_log_probs
        # Subtracting the maximum before exponentiating leaves the normalised
        # weights unchanged but keeps exp() from overflowing (inf/inf) or
        # underflowing everywhere (0/0).
        weights_unnorm = T.exp(diffs - T.max(diffs))
        weights = weights_unnorm / T.sum(weights_unnorm)
        updates = OrderedDict()
        updates[self.weights] = T.cast(weights, 'float32')
        updates[self.particles] = T.cast(samples, 'float32')
        return updates

    def compute_ESS(self):
        '''Returns the effective sample size, 1 / sum(weights ** 2)'''
        return 1.0 / T.sum(self.weights ** 2)

    def resample(self):
        '''Resamples using the current weights.

        Returns an OrderedDict of updates that replaces the particles by the
        resampled ones and resets the weights to uniform.
        '''
        # One row of pvals per particle to draw, each row being the current
        # weight vector.
        pvals = T.extra_ops.repeat(self.weights.dimshuffle('x', 0),
                                   self.n_particles, axis=0)
        samps = self.theano_rng.multinomial(pvals=pvals)
        # Each row of samps marks the selected particle; the dot product with
        # arange turns that marker into the particle's index.
        idxs = T.cast(T.dot(samps, T.arange(self.n_particles)), 'int64')
        updates = OrderedDict()
        updates[self.particles] = self.particles[idxs]
        updates[self.weights] = T.cast(
            T.ones_like(self.weights) / float(self.n_particles), 'float32')
        return updates

    def compile(self):
        '''Compiles the ESS, resampling, and sampling functions'''
        ess = self.compute_ESS()
        self.get_ESS = theano.function([], ess)
        resample_updates = self.resample()
        self.perform_resampling = theano.function([], updates=resample_updates)
        sample_updates = self.sample_reweight()
        self.perform_sampling = theano.function([], updates=sample_updates)