def filterbank_matrices(self, center_y, center_x, delta, sigma): """Create a Fy and a Fx Parameters ---------- center_y : T.vector (shape: batch_size) center_x : T.vector (shape: batch_size) Y and X center coordinates for the attention window delta : T.vector (shape: batch_size) sigma : T.vector (shape: batch_size) Returns ------- FY : T.tensor3 (shape: batch_size x N x img_height) FX : T.tensor3 (shape: batch_size x N x img_width) """ tol = 1e-4 N = self.N rng = T.arange(N, dtype=floatX)-N/2.+0.5 # e.g. [-1.5, -0.5, 0.5, 1.5] for N=4 muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng a = tensor.arange(self.img_width, dtype=floatX) b = tensor.arange(self.img_height, dtype=floatX) FX = tensor.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FY = tensor.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) return FY, FX
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed): """Create Theano tensor of approximate ELBO by Monte Carlo sampling. """ l = (uw.size / 2).astype('int64') u = uw[:l] w = uw[l:] # Callable tensor logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False) # Naive Monte-Carlo r = MRG_RandomStreams(seed=random_seed) if n_mcsamples == 1: n = r.normal(size=inarray.tag.test_value.shape) q = n * exp(w) + u elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) else: n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0])) qs = n * exp(w) + u logps, _ = theano.scan(fn=lambda q: logp_(q), outputs_info=None, sequences=[qs]) elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) return elbo
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8): """ batchnorm with support for not using scale and shift parameters, as well as inference values (u and s); will detect and use the convolutional or fully connected version """ g = rescale b = reshift if X.ndim == 4: if u is not None and s is not None: # use normalization params given a priori b_u = u.dimshuffle('x', 0, 'x', 'x') b_s = s.dimshuffle('x', 0, 'x', 'x') else: # compute normalization params from input b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x') b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x') # batch normalize X = (X - b_u) / T.sqrt(b_s + e) if g is not None and b is not None: # apply rescale and reshift X = X*T.exp(0.2*g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x') elif X.ndim == 2: if u is None and s is None: # compute normalization params from input u = T.mean(X, axis=0) s = T.mean(T.sqr(X - u), axis=0) # batch normalize X = (X - u) / T.sqrt(s + e) if g is not None and b is not None: # apply rescale and reshift X = X*T.exp(0.2*g) + b else: raise NotImplementedError return X
def K(self, x, y): l = tensor.exp(self.log_lenscale) d = ((x ** 2).sum(axis=1).dimshuffle(0, 'x') + (y ** 2).sum(axis=1) - 2 * tensor.dot(x, y.T)) K = tensor.exp(-tensor.sqrt(d) / l) return K
def learn_step(self): #this is a list of gradients w.r.t. every parameter in self.params gparams=T.grad(self.loss, self.params) updates=OrderedDict() #updates the momentums and parameter values i=0 for param, gparam, momentum, lrate, momentum_coeff in zip(self.params, gparams, self.momentums, self.lrates, self.momentum_coeffs): #if param.ndim==2: # gparam=T.dot(T.dot(param,param.T),gparam) if param.name=='log_stddev': gparam=gparam*2.0*T.exp(2.0*param) if param.name=='M': gparam=gparam*T.exp(1.0*self.params[i+2]).dimshuffle('x',0) if param.name=='b': gparam=gparam*T.exp(1.0*self.params[i+1]) new_momentum=momentum_coeff*momentum - lrate*gparam*self.global_lrate new_param=param + new_momentum updates[param]=new_param updates[momentum]=new_momentum i+=1 updates[self.global_lrate]=self.global_lrate*self.lrate_decay return updates
def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1): self.num_hidden = num_hidden self.learning_rate = learning_rate self.word_vec_width = word_vec_width self.batch_size = batch_size self.vocab_mat = T.fmatrix('vocab') self.word_onehot = T.fmatrix('word_onehot') b = T.fvector('b') W = T.fmatrix('W') f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat) + b)))) s = T.sum(f) self.exec_fn = theano.function( [self.word_onehot, b, W, self.vocab_mat], f, allow_input_downcast=True) self.word_onehot_c = T.fmatrix('word_onehot_c') f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b))) s_c = T.sum(f_c) J = T.largest(0, 1 - s + s_c) self.grad = theano.grad(J, [b, W, self.vocab_mat]) self.grad_fn = theano.function( [self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat], self.grad, allow_input_downcast=True)
def __init__(self, alpha, beta, *args, **kwargs): super(Weibull, self).__init__(*args, **kwargs) self.alpha = alpha self.beta = beta self.mean = beta * T.exp(gammaln(1 + 1./alpha)) self.median = beta * T.log(2)**(1./alpha) self.variance = (beta**2) * T.exp(gammaln(1 + 2./alpha)) - self.mean**2
def forward_init(self): obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], self.obs_.shape[-1]]) h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,None,:]) self.pi = [] for oi in xrange(self.n_out): pi = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:] pi = tensor.exp(pi - tensor.max(pi,-1,keepdims=True)) self.pi.append(pi / (pi.sum(-1, keepdims=True))) prev = tensor.matrix('prev', dtype='float32') #obs = tensor.matrix('obs', dtype='float32') obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], self.obs_.shape[-1]]) obs_ = obs_[0] self.h_init = lambda x: numpy.float32(0.) h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,:]) pi = [] for oi in xrange(self.n_out): pi_ = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:] pi_ = tensor.exp(pi_ - tensor.max(pi_,-1,keepdims=True)) pi.append(pi_ / (pi_.sum(-1, keepdims=True))) self.forward = theano.function([self.obs, prev], [h] + pi, name='forward', on_unused_input='ignore')
def filterbank_matrices(center_y, center_x, delta, sigma, N, imgshp): """Create a Fy and a Fx Parameters ---------- center_y : T.vector (shape: batch_size) center_x : T.vector (shape: batch_size) Y and X center coordinates for the attention window delta : T.vector (shape: batch_size) sigma : T.vector (shape: batch_size) Returns ------- FY, FX """ tol = 1e-4 img_height, img_width = imgshp muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5) muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5) a = T.arange(img_width) b = T.arange(img_height) FX = T.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FY = T.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) return FY, FX
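# Hedged usage sketch (an assumption, not part of the original source): filterbanks
# like FY/FX are typically applied DRAW-style to cut an N x N glimpse out of a batch
# of images, roughly gamma * FY . image . FX^T per batch element. The names below
# (images, gamma, read_glimpse) are illustrative only.
import theano.tensor as T

def read_glimpse(images, FY, FX, gamma):
    # images: (batch, img_height, img_width); FY: (batch, N, img_height)
    # FX: (batch, N, img_width); gamma: (batch,)  ->  glimpse: (batch, N, N)
    glimpse = T.batched_dot(T.batched_dot(FY, images), FX.dimshuffle(0, 2, 1))
    return gamma.dimshuffle(0, 'x', 'x') * glimpse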
def lp_norm(self, n, k, r, c, z): ''' Lp = ( 1/n * sum(|x_i|^p, 1..n))^(1/p) where p = 1 + ln(1+e^P) :param n: :param k: :param r: :param c: :param z: :return: ''' ds0, ds1 = self.pool_size st0, st1 = self.stride pad_h = self.pad[0] pad_w = self.pad[1] row_st = r * st0 row_end = T.minimum(row_st + ds0, self.img_rows) row_st = T.maximum(row_st, self.pad[0]) row_end = T.minimum(row_end, self.x_m2d + pad_h) col_st = c * st1 col_end = T.minimum(col_st + ds1, self.img_cols) col_st = T.maximum(col_st, self.pad[1]) col_end = T.minimum(col_end, self.x_m1d + pad_w) Lp = T.pow( T.mean(T.pow( T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1)), 1 + T.log(1 + T.exp(self.P)) )), 1 / (1 + T.log(1 + T.exp(self.P))) ) return T.set_subtensor(z[n, k, r, c], Lp)
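# Hedged illustration (not part of the class above): the docstring's pooling rule
# Lp = (1/n * sum(|x_i|^p))^(1/p) with p = 1 + ln(1 + e^P) in plain NumPy, so the
# effect of the learned parameter P is easy to inspect. The values of P used here
# are arbitrary.
import numpy as np

def lp_pool_numpy(window, P=0.0):
    p = 1.0 + np.log1p(np.exp(P))          # p = 1 + ln(1 + e^P) >= 1
    return np.mean(np.abs(window) ** p) ** (1.0 / p)

# Small P keeps p near 1 (close to mean-of-abs pooling); large P pushes p up,
# moving the result towards max pooling.
print(lp_pool_numpy(np.array([0.1, -0.5, 2.0, 0.3]), P=0.0))
print(lp_pool_numpy(np.array([0.1, -0.5, 2.0, 0.3]), P=5.0))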
def output_probabilistic(self, m_w_previous, v_w_previous): if (self.non_linear): m_in = self.m_w - m_w_previous v_in = self.v_w # We compute the mean and variance after the ReLU activation lam = self.lam v_1 = 1 + 2*lam*v_in v_1_inv = v_1**-1 s_1 = T.prod(v_1,axis=1)**-0.5 v_2 = 1 + 4*lam*v_in v_2_inv = v_2**-1 s_2 = T.prod(v_2,axis=1)**-0.5 v_inv = v_in**-1 exponent1 = m_in**2*(1 - v_1_inv)*v_inv exponent1 = T.sum(exponent1,axis=1) exponent2 = m_in**2*(1 - v_2_inv)*v_inv exponent2 = T.sum(exponent2,axis=1) m_a = s_1*T.exp(-0.5*exponent1) v_a = s_2*T.exp(-0.5*exponent2) - m_a**2 return (m_a, v_a) else: m_w_previous_with_bias = \ T.concatenate([ m_w_previous, T.alloc(1, 1) ], 0) v_w_previous_with_bias = \ T.concatenate([ v_w_previous, T.alloc(0, 1) ], 0) m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs) v_linear = (T.dot(self.v_w, v_w_previous_with_bias) + \ T.dot(self.m_w**2, v_w_previous_with_bias) + \ T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs return (m_linear, v_linear)
def flow(init_W,init_b,nData): import theano import theano.tensor as T n_layers = len(init_b) bias = [] weights = [] muStates = [] for layer_i in xrange(n_layers): bias.append(theano.shared(value=init_b[layer_i], name='b'+str(layer_i), borrow=True)) weights.append(theano.shared(value=init_W[layer_i], name='W'+str(layer_i), borrow=True)) muStates.append(T.matrix('mu'+str(layer_i))) flows = 0. for layer_i in xrange(n_layers): diffe = T.tile(bias[layer_i].copy(), (nData,1)) # All layers except top if layer_i < (n_layers-1): W_h = weights[layer_i].dot(muStates[layer_i+1].T).T diffe += W_h if layer_i > 0: vT_W = muStates[layer_i-1].dot(weights[layer_i-1]) diffe += vT_W exK = muStates[layer_i]*T.exp(.5*-diffe) + (1.-muStates[layer_i])*T.exp(.5*diffe) flows += exK.sum() return flows
def step(xinp_h1_t, xgate_h1_t, xinp_h2_t, xgate_h2_t, h1_tm1, h2_tm1, k_tm1, w_tm1, ctx): attinp_h1, attgate_h1 = att_to_h1.proj(w_tm1) h1_t = cell1.step(xinp_h1_t + attinp_h1, xgate_h1_t + attgate_h1, h1_tm1) h1inp_h2, h1gate_h2 = h1_to_h2.proj(h1_t) a_t = h1_t.dot(h1_to_att_a) b_t = h1_t.dot(h1_to_att_b) k_t = h1_t.dot(h1_to_att_k) a_t = tensor.exp(a_t) b_t = tensor.exp(b_t) k_t = k_tm1 + tensor.exp(k_t) ss4 = calc_phi(k_t, a_t, b_t, u) ss5 = ss4.dimshuffle(0, 1, 'x') ss6 = ss5 * ctx.dimshuffle(1, 0, 2) w_t = ss6.sum(axis=1) attinp_h2, attgate_h2 = att_to_h2.proj(w_t) h2_t = cell2.step(xinp_h2_t + h1inp_h2 + attinp_h2, xgate_h2_t + h1gate_h2 + attgate_h2, h2_tm1) return h1_t, h2_t, k_t, w_t
def _step(self,xg_t, xo_t, xc_t, mask_tm1,h_tm1, c_tm1, u_g, u_o, u_c): h_mask_tm1 = mask_tm1 * h_tm1 c_mask_tm1 = mask_tm1 * c_tm1 act = T.tensordot( xg_t + h_mask_tm1, u_g , [[1],[2]]) gate = T.nnet.softmax(act.reshape((-1, act.shape[-1]))).reshape(act.shape) c_tilda = self.activation(xc_t + T.dot(h_mask_tm1, u_c)) sigma_se = self.k_parameters[0] sigma_per = self.k_parameters[1] sigma_b_lin = self.k_parameters[2] sigma_v_lin = self.k_parameters[3] sigma_rq = self.k_parameters[4] l_se = self.k_parameters[5] l_per = self.k_parameters[6] l_lin = self.k_parameters[7] l_rq = self.k_parameters[8] alpha_rq = self.k_parameters[9] p_per = self.k_parameters[10] k_se = T.pow(sigma_se,2) * T.exp( -T.pow(c_mask_tm1 - c_tilda,2) / (2* T.pow(l_se,2) + self.EPS)) k_per = T.pow(sigma_per,2) * T.exp( -2*T.pow(T.sin( math.pi*(c_mask_tm1 - c_tilda)/ (p_per + self.EPS) ),2) / ( T.pow(l_per,2) + self.EPS )) k_lin = T.pow(sigma_b_lin,2) + T.pow(sigma_v_lin,2) * (c_mask_tm1 - l_lin) * (c_tilda - l_lin ) k_rq = T.pow(sigma_rq,2) * T.pow( 1 + T.pow( (c_mask_tm1 - c_tilda),2) / ( 2 * alpha_rq * T.pow(l_rq,2) + self.EPS), -alpha_rq) ops = [c_mask_tm1,c_tilda,k_se, k_per, k_lin,k_rq] yshuff = T.as_tensor_variable( ops, name='yshuff').dimshuffle(1,2,0) c_t = (gate.reshape((-1,gate.shape[-1])) * yshuff.reshape((-1,yshuff.shape[-1]))).sum(axis = 1).reshape(gate.shape[:2]) o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1, u_o)) h_t = o_t * self.activation(c_t) return h_t, c_t
def get_gradients(self, X, Y, weights=1.0): W_mean, W_ls, b_mean, b_ls = self.parameters mean, log_sigma = self.sample_expected(Y) sigma = tensor.exp(log_sigma) cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma) if weights != 1.0: cost = -weights.dimshuffle(0, "x") * cost cost_scaled = sigma ** 2 * cost cost_gscale = (sigma ** 2).sum(axis=1).dimshuffle([0, "x"]) cost_gscale = cost_gscale * cost gradients = OrderedDict() params = Selector(self.mlp).get_parameters() for pname, param in params.iteritems(): gradients[param] = tensor.grad(cost_gscale.sum(), param, consider_constant=[X, Y]) gradients[W_mean] = tensor.grad(cost_scaled.sum(), W_mean, consider_constant=[X, Y]) gradients[b_mean] = tensor.grad(cost_scaled.sum(), b_mean, consider_constant=[X, Y]) gradients[W_ls] = tensor.grad(cost_scaled.sum(), W_ls, consider_constant=[X, Y]) gradients[b_ls] = tensor.grad(cost_scaled.sum(), b_ls, consider_constant=[X, Y]) return gradients
def decoder(localt, stm1, cstm1, hmat, Wbeta, Ubeta, vbeta, Wzide, Wzfde, Wzcde, Wzode, Ede, Wxide, Wside, bide, Wxfde, Wsfde, bfde, Wxcde, Wscde, bcde, Wxode, Wsode, bode, L0, Ls, Lz): xt = theano.dot(localt, Ede) # get z from hmat (sentlen * nen), stm1 beta = \ theano.dot( act( theano.dot(hmat,Ubeta) + theano.dot(stm1,Wbeta) ) , vbeta ) alpha = T.exp(beta-T.max(beta)) / T.sum(T.exp(beta-T.max(beta)) ) zt = theano.dot(alpha, hmat) # it = sigma(theano.dot(xt,Wxide) + theano.dot(stm1,Wside) + theano.dot(zt,Wzide) + bide ) ft = sigma(theano.dot(xt,Wxfde) + theano.dot(stm1,Wsfde) + theano.dot(zt,Wzfde) + bfde ) cst = ft * cstm1 + it*act(theano.dot(xt,Wxcde)+theano.dot(stm1,Wscde)+ theano.dot(zt,Wzcde) +bcde ) ot = sigma(theano.dot(xt,Wxode) + theano.dot(stm1,Wsode) + theano.dot(zt,Wzode) +bode ) st = ot * act(cst) # winst = getwins() stfory = st * winst # yt0 = T.dot( (xt + T.dot(stfory, Ls) + T.dot(zt, Lz) ) , L0) #yt0 = theano.dot(st,Wsyde) yt0max = T.max(yt0) #yt0maxvec = T.maximum(yt0, yt0max) yt = T.exp(yt0-yt0max) / T.sum(T.exp(yt0-yt0max)) logyt = yt0-yt0max-T.log(T.sum(T.exp(yt0-yt0max))) #yt = T.exp(yt0-yt0maxvec) / T.sum(T.exp(yt0-yt0maxvec)) #logyt = yt0-yt0maxvec-T.log(T.sum(T.exp(yt0-yt0maxvec))) # yt = T.concatenate([addzero,tempyt],axis=0) return st, cst, yt, logyt
def nn2att(self, l): """Convert neural-net outputs to attention parameters Parameters ---------- l : tensor (batch_size x 5) Returns ------- center_y : vector (batch_size) center_x : vector (batch_size) delta : vector (batch_size) sigma : vector (batch_size) gamma : vector (batch_size) """ center_y = l[:,0] center_x = l[:,1] log_delta = l[:,2] log_sigma = l[:,3] log_gamma = l[:,4] delta = T.exp(log_delta) sigma = T.exp(log_sigma/2.) gamma = T.exp(log_gamma).dimshuffle(0, 'x') # normalize coordinates center_x = (center_x+1.)/2. * self.img_width center_y = (center_y+1.)/2. * self.img_height delta = (max(self.img_width, self.img_height)-1) / (self.N-1) * delta return center_y, center_x, delta, sigma, gamma
def test_elemwise1(): """ Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """ shape = (3, 4) a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.5, 'a') b = tensor.fmatrix() #let debugmode catch any mistakes print >> sys.stdout, "STARTING FUNCTION 1" f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu) for i, node in enumerate(f.maker.env.toposort()): print i, node f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3) print >> sys.stdout, "STARTING FUNCTION 2" #let debugmode catch any mistakes f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu) for i, node in enumerate(f.maker.env.toposort()): print i, node f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3) print >> sys.stdout, "STARTING FUNCTION 3" #let debugmode catch any mistakes f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))], mode=mode_with_gpu) f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
def softmax(x): if x.ndim == 2: e = TT.exp(x) return e / TT.sum(e, axis=1).dimshuffle(0, 'x') else: e = TT.exp(x) return e/ TT.sum(e)
def _step(self,h_tm1,p_x,p_xm,ctx): #visual attention #ctx=dropout_layer(ctx) v_a=T.exp(ctx+T.dot(h_tm1,self.W_v)) v_a=v_a/v_a.sum(1, keepdims=True) ctx_p=ctx*v_a #linguistic attention l_a=p_x+T.dot(h_tm1,self.W_l)[None,:,:] l_a=T.dot(l_a,self.U_att)+self.b_att l_a=T.exp(l_a.reshape((l_a.shape[0],l_a.shape[1]))) l_a=l_a/l_a.sum(0, keepdims=True) l_a=l_a*p_xm p_x_p=(p_x*l_a[:,:,None]).sum(0) h= T.dot(ctx_p,self.W_vh) + T.dot(p_x_p,self.W_lh) return h
def _step(self, x_tm1, u_tm1, inputs, x_prior, u_prior, *args): # x_prior are previous states # u_prior are causes from above outputs = self.activation(T.dot(x_tm1, self.W)) rec_error = T.sqr(inputs - outputs).sum() causes = (1 + T.exp(-T.dot(u_tm1, self.V))) * .5 if self.pool_flag: batch_size = inputs.shape[0] dim = causes.shape[1] imgs = T.cast(T.sqrt(dim), 'int64') causes_up = causes.reshape( (batch_size, 1, imgs, imgs)).repeat( self.pool_size, axis=2).repeat(self.pool_size, axis=3).flatten(ndim=2) else: causes_up = causes x = _IstaStep(rec_error, x_tm1, lambdav=self.gamma*causes_up, x_prior=x_prior) if self.pool_flag: dim = T.cast(T.sqrt(x.shape[1]), 'int64') x_pool = x.reshape((batch_size, 1, dim, dim)) x_pool = max_pool_2d(x_pool, ds=(self.pool_size, )*2).flatten(ndim=2) else: x_pool = x prev_u_cost = .01 * self.gamma * T.sqr(u_tm1-u_prior).sum() u_cost = causes * abs(x_pool) * self.gamma + prev_u_cost u = _IstaStep(u_cost.sum(), u_tm1, lambdav=self.gamma) causes = (1 + T.exp(-T.dot(u, self.V))) * .5 u_cost = causes * abs(x_pool) * self.gamma return (x, u, u_cost, outputs)
def createGradientFunctions(self): #create X = T.dmatrices("X") mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f", "R") mu = sharedX( np.random.normal(10, 10, (self.dimTheta, 1)), name='mu') logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)), name='logSigma') logLambd = sharedX(np.matrix(np.random.uniform(0, 10)),name='logLambd') logLambd = T.patternbroadcast(T.dmatrix("logLambd"),[1,1]) negKL = 0.5 * T.sum(1 + 2*logSigma - mu ** 2 - T.exp(logSigma) ** 2) theta = mu+T.exp(logSigma)*v W=theta y=X[:,0] X_sim=X[:,1:] f = (T.dot(X_sim,W)+u).flatten() gradvariables = [mu, logSigma, logLambd] logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 * ((y-f)/(T.exp(logLambd)))**2) logp = (negKL + logLike)/self.m optimizer = -logp self.negKL = th.function([mu, logSigma], negKL, on_unused_input='ignore') self.f = th.function(gradvariables + [X,u,v], f, on_unused_input='ignore') self.logLike = th.function(gradvariables + [X, u, v], logLike,on_unused_input='ignore') derivatives = T.grad(logp,gradvariables) derivatives.append(logp) self.gradientfunction = th.function(gradvariables + [X, u, v], derivatives, on_unused_input='ignore') self.lowerboundfunction = th.function(gradvariables + [X, u, v], logp, on_unused_input='ignore') self.optimizer = BatchGradientDescent(objective=optimizer, params=gradvariables,inputs = [X,u,v],conjugate=True,max_iter=1)
def bbox_transform_inv(boxes, deltas): if boxes.shape[0] == 0: return T.zeros((0, deltas.shape[1]), dtype=deltas.dtype) boxes = boxes.astype(deltas.dtype) widths = boxes[:, 2] - boxes[:, 0] + 1.0 heights = boxes[:, 3] - boxes[:, 1] + 1.0 ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights dx = deltas[:, 0::4] dy = deltas[:, 1::4] dw = deltas[:, 2::4] dh = deltas[:, 3::4] pred_ctr_x = dx * widths.dimshuffle(0,'x') + ctr_x.dimshuffle(0,'x') pred_ctr_y = dy * heights.dimshuffle(0,'x') + ctr_y.dimshuffle(0,'x') pred_w = T.exp(dw) * widths.dimshuffle(0,'x') pred_h = T.exp(dh) * heights.dimshuffle(0,'x') pred_boxes = T.zeros_like(deltas, dtype=deltas.dtype) # x1 pred_boxes = T.set_subtensor(pred_boxes[:, 0::4], pred_ctr_x - 0.5 * pred_w) # y1 pred_boxes = T.set_subtensor(pred_boxes[:, 1::4], pred_ctr_y - 0.5 * pred_h) # x2 pred_boxes = T.set_subtensor(pred_boxes[:, 2::4], pred_ctr_x + 0.5 * pred_w) # y2 pred_boxes = T.set_subtensor(pred_boxes[:, 3::4], pred_ctr_y + 0.5 * pred_h) return pred_boxes
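# Hedged check (an addition, not from the source): the same box decoding in NumPy,
# useful for sanity-checking the symbolic version above on a few boxes/deltas.
import numpy as np

def bbox_transform_inv_np(boxes, deltas):
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights
    dx, dy, dw, dh = deltas[:, 0::4], deltas[:, 1::4], deltas[:, 2::4], deltas[:, 3::4]
    pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
    pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
    pred_w = np.exp(dw) * widths[:, None]
    pred_h = np.exp(dh) * heights[:, None]
    pred = np.zeros_like(deltas, dtype=float)
    pred[:, 0::4] = pred_ctr_x - 0.5 * pred_w   # x1
    pred[:, 1::4] = pred_ctr_y - 0.5 * pred_h   # y1
    pred[:, 2::4] = pred_ctr_x + 0.5 * pred_w   # x2
    pred[:, 3::4] = pred_ctr_y + 0.5 * pred_h   # y2
    return pred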
def model(x, p, p_dropout, noise): input_size = x.shape[1] h0 = p.W_emb[x] # (seq_len, batch_size, emb_size) h0 = dropout(h0, p_dropout) cost, h1, c1, h2, c2 = [0., b1_h, b1_c, b2_h, b2_c] eps = srnd.normal((self.hp.seq_size, input_size, self.n_zpt), dtype=theano.config.floatX) for t in xrange(0, self.hp.seq_size): if t >= self.hp.warmup_size: pyx = softmax(T.dot(h2, T.transpose(p.W_emb))) cost += T.sum(T.nnet.categorical_crossentropy(pyx, theano_one_hot(x[t], n_tokens))) h_x = concatenate([h0[t], h2], axis=1) h1, c1 = lstm(h_x, h1, c1, p.W1, p.V1, p.b1) h1 = dropout(h1, p_dropout) mu_encoder = T.dot(h1, p.Wmu) + p.bmu if noise: log_sigma_encoder = 0.5*(T.dot(h1, p.Wsi) + p.bsi) cost += -0.5* T.sum(1 + 2*log_sigma_encoder - mu_encoder**2 - T.exp(2*log_sigma_encoder)) * 0.01 z = mu_encoder + eps[t]*T.exp(log_sigma_encoder) else: z = mu_encoder h2, c2 = lstm(z, h2, c2, p.W2, p.V2, p.b2) h2 = dropout(h2, p_dropout) h_updates = [(b1_h, h1), (b1_c, c1), (b2_h, h2), (b2_c, c2)] return cost, h_updates
def softmax_ratio(numer, denom): """ .. todo:: WRITEME properly Parameters ---------- numer : Variable Output of a softmax. denom : Variable Output of a softmax. Returns ------- ratio : Variable numer / denom, computed in a numerically stable way """ numer_Z = arg_of_softmax(numer) denom_Z = arg_of_softmax(denom) numer_Z -= numer_Z.max(axis=1).dimshuffle(0, 'x') denom_Z -= denom_Z.min(axis=1).dimshuffle(0, 'x') new_num = T.exp(numer_Z - denom_Z) * (T.exp(denom_Z).sum( axis=1).dimshuffle(0, 'x')) new_den = (T.exp(numer_Z).sum(axis=1).dimshuffle(0, 'x')) return new_num / new_den
def cost(self, Y, Y_hat): """ Y must be one-hot binary. Y_hat is a softmax estimate. of Y. Returns negative log probability of Y under the Y_hat distribution. """ y_probclass, y_probcluster = Y_hat #Y = self._group_dot.fprop(Y, Y_hat) CLS = self.array_clusters[T.cast(T.argmax(Y,axis=1),'int32')] #theano.printing.Print('value of cls')(CLS) assert hasattr(y_probclass, 'owner') owner = y_probclass.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 y_probclass, = owner.inputs owner = y_probclass.owner op = owner.op assert isinstance(op, T.nnet.Softmax) z_class ,= owner.inputs assert z_class.ndim == 2 assert hasattr(y_probcluster, 'owner') owner = y_probcluster.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 y_probcluster, = owner.inputs owner = y_probcluster.owner op = owner.op assert isinstance(op, T.nnet.Softmax) z_cluster ,= owner.inputs assert z_cluster.ndim == 2 z_class = z_class - z_class.max(axis=1).dimshuffle(0, 'x') log_prob = z_class - T.log(T.exp(z_class).sum(axis=1).dimshuffle(0, 'x')) # we use sum and not mean because this is really one variable per row # Y = OneHotFormatter(self.n_classes).theano_expr( # T.addbroadcast(Y,0,1).dimshuffle(0).astype('uint32')) log_prob_of = (Y * log_prob).sum(axis=1) assert log_prob_of.ndim == 1 # cluster z_cluster = z_cluster - z_cluster.max(axis=1).dimshuffle(0, 'x') log_prob_cls = z_cluster - T.log(T.exp(z_cluster).sum(axis=1).dimshuffle(0, 'x')) out = OneHotFormatter(self.n_clusters).theano_expr(CLS.astype('int32')) #CLS = OneHotFormatter(self.n_clusters).theano_expr( # T.addbroadcast(CLS, 1).dimshuffle(0).astype('uint32')) log_prob_of_cls = (out * log_prob_cls).sum(axis=1) assert log_prob_of_cls.ndim == 1 # p(w|history) = p(c|s) * p(w|c,s) log_prob_of = log_prob_of + log_prob_of_cls rval = log_prob_of.mean() return - rval
def initialise(self): rng = np.random.RandomState(23455) inpt = self.inpt w_shp = (self.in_dim,self.out_dim) w_bound = np.sqrt(self.out_dim) W_mu = theano.shared( np.asarray( rng.normal(0.,0.01,size=w_shp), dtype=inpt.dtype), name ='w_post_mu') b_shp = (self.out_dim,) b_mu = theano.shared(np.asarray( np.zeros(self.out_dim), dtype=inpt.dtype), name ='b_post_mu') W_sigma = theano.shared( np.asarray( rng.normal(0.,0.01,size=w_shp), dtype=inpt.dtype), name ='w_post_sigm') b_sigma = theano.shared(np.asarray( np.zeros(self.out_dim), dtype=inpt.dtype), name ='b_post_sigm') #Find the hidden variable z self.mu_encoder = T.dot(self.inpt,W_mu) +b_mu self.log_sigma_encoder =0.5*(T.dot(self.inpt,W_sigma) + b_sigma) self.output =self.mu_encoder +T.exp(self.log_sigma_encoder)*self.eps.astype(theano.config.floatX) self.prior = 0.5* T.sum(1 + 2*self.log_sigma_encoder - self.mu_encoder**2 - T.exp(2*self.log_sigma_encoder),axis=1).astype(theano.config.floatX) self.params = [W_mu,b_mu,W_sigma,b_sigma]
def softmax_neg(self, X): if hasattr(self, 'hack_matrix'): X = X * self.hack_matrix e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x')) * self.hack_matrix else: e_x = T.fill_diagonal(T.exp(X - X.max(axis=1).dimshuffle(0, 'x')), 0) return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def entropy_exp(X, g=None, b=None, u=None, s=None, a=1., e=1e-8): if X.ndim == 4: if u is not None and s is not None: b_u = u.dimshuffle('x', 0, 'x', 'x') b_s = s.dimshuffle('x', 0, 'x', 'x') else: b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x') b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x') if a != 1: b_u = (1. - a)*0. + a*b_u b_s = (1. - a)*1. + a*b_s X = (X - b_u) / T.sqrt(b_s + e) if g is not None and b is not None: X = X*T.exp(g.dimshuffle('x', 0, 'x', 'x'))+b.dimshuffle('x', 0, 'x', 'x') elif X.ndim == 2: if u is None and s is None: u = T.mean(X, axis=0) s = T.mean(T.sqr(X - u), axis=0) if a != 1: u = (1. - a)*0. + a*u s = (1. - a)*1. + a*s X = (X - u) / T.sqrt(s + e) if g is not None and b is not None: X = X*T.exp(g)+b else: raise NotImplementedError return X
def compute_f_mu(x, t, params): [centers, spreads, biases, M, b]=params diffs=x.dimshuffle(0,1,2,'x')-centers.dimshuffle('x','x',0,1) scaled_diffs=(diffs**2)*T.exp(spreads).dimshuffle('x','x',0,1) exp_terms=T.sum(scaled_diffs,axis=2)+biases.dimshuffle('x','x',0)*0.0 h=T.exp(-exp_terms) sumact=T.sum(h,axis=2) #Normalization hnorm=h/sumact.dimshuffle(0,1,'x') z=T.dot(hnorm,M) z=T.reshape(z,(t.shape[0],t.shape[1],ntgates,nx))+b.dimshuffle('x','x',0,1) #nt by nb by ntgates by nx #z=z+T.reshape(x,(t.shape[0],t.shape[1],1,nx)) tpoints=T.cast(T.arange(ntgates),'float32')/T.cast(ntgates-1,'float32') tpoints=T.reshape(tpoints, (1,1,ntgates)) #tgating=T.exp(T.dot(t,muWT)+mubT) #nt by nb by ntgates tgating=T.exp(-kT*(tpoints-t)**2) tgating=tgating/T.reshape(T.sum(tgating, axis=2),(t.shape[0], t.shape[1], 1)) tgating=T.reshape(tgating,(t.shape[0],t.shape[1],ntgates,1)) mult=z*tgating out=T.sum(mult,axis=2) #out=out+x return T.cast(out,'float32')
def _log_dot_tensor(x, z): log_dot = x.dimshuffle(1, 'x', 0) + z max_ = log_dot.max(axis=0) out = (T.log(T.sum(T.exp(log_dot - max_[None, :, :]), axis=0)) + max_) out = out.T return T.switch(T.isnan(out), -numpy.inf, out)
def htovMB(self, HsampM): """ computes visible unit outputs given hidden unit inputs ("half" a MCMC iteration) computes in parallel given input rows of hidden units args: HsampM (T.matrix): rows of hidden unit inputs returns: a T.matrix, rows of visible unit outputs """ T_omgH = T.matrix(name="T_omgH", dtype=theano.config.floatX) T_means = T.matrix(name="T_means", dtype=theano.config.floatX) htovMBres = T.matrix(name="htovMBres", dtype=theano.config.floatX) T_omgH = T.transpose(T.dot(self.T_omega, T.transpose(HsampM))) T_means = T.fill(T_omgH, self.T_a) + T_omgH htovMBres = self.T_rng.normal(size=T_means.shape, avg=T_means, std=T.fill(T_means,T.sqrt(T.exp(self.T_z))), dtype=theano.config.floatX) return htovMBres
def __init__(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01): ''' constructor RBMrv_T(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01): noOfVisibleUnits (int): must be perfect square noOfHiddenUnits (int): must be perfect square CD_n (int): no. of iterations in MCMC simulation during training, check if model means are used if CD_n = 1 aRate (float32): update rate of parameter \underline{a} during training bRate (float32): update rate of parameter \underline{b} during training omegaRate (float32): update rate of parameter \boldsymbol{\omega} during training sigmaRate (float32): update rate of parameter \underline{z} during training omega (numpy array of float32): \omega parameter matrix with noOfVisible unit rows x noOfHiddenUnits columns b (numpy array of float32): b parameter vector, size = noOfHiddenUnits a (numpy array of float32): b parameter vector, size = noOfVisibleUnits z (numpy array of float32): z parameter vector, size = noOfVisibleUnits rprop_e (float32): rprop_en (float32): sparseTargetp (float32): target mean hidden unit activation for training. between (0,1) ''' self.epsilon = 0.0000001 theano.config.exception_verbosity = 'high' #rprop parameters and variables, rprop not used self.T_rprop_e = theano.shared(value=np.float32(rprop_e), name='T_rprop_e', borrow = True, allow_downcast=True) self.T_rprop_en = theano.shared(value=np.float32(rprop_en), name='T_rprop_en', borrow = True, allow_downcast=True) self.T_posUpdate = theano.shared(value=np.float32(0.5*(1.0+rprop_e)), name='T_posUpdate', borrow = True, allow_downcast=True) self.T_negUpdate = theano.shared(value=np.float32(0.5*(1.0-rprop_en)), name='T_negUpdate', borrow = True, allow_downcast=True) #network geometry and training parameters self.miniBatchSize = 0 #will be set in self.trainMB(...) self.parameterLoaded = False self.parameterSaved = False self.sparseTargetp = sparseTargetp self.CD_n = CD_n self.nv = noOfVisibleUnits self.nh = noOfHiddenUnits self.dimV = int(math.sqrt(self.nv)) self.dimH = int(math.sqrt(self.nh)) self.aRate = np.float32(aRate) self.bRate = np.float32(bRate) self.omegaRate = np.float32(omegaRate) self.sigmaRate = np.float32(sigmaRate) #initialise v and h self.v = np.float32(np.random.uniform(0, 1.0, self.nv)) self.h = np.float32(np.random.binomial(1.0,0.5,self.nh)) self.logLikelihood = [] self.likelihood4plot = [] self.T_aRate = theano.shared(value=np.float32(aRate), name='T_aRate', borrow = True, allow_downcast=True) self.T_bRate = theano.shared(value=np.float32(bRate), name='T_bRate', borrow = True, allow_downcast=True) self.T_omgRate = theano.shared(value=np.float32(omegaRate), name='T_omgRate', borrow = True, allow_downcast = True) self.T_sigRate = theano.shared(value=np.float32(sigmaRate), name='T_sigRate', borrow = True, allow_downcast = True) self.loadedRates = [aRate, bRate, omegaRate, sigmaRate]#for load/saveparameters(), can load to see previous rates but differes from constructor declared rates self.T_rng = RandomStreams() #use_cuda parameter set if on GPU #succesive calls on this T_rng will keep returning new values, so for MCMC even with #same start v vector value called twice consecutively you'll have different outputs #this is normal as the same T_rng gets called, without reset, giving different outputs everytime. 
self.T_CD_n = theano.shared(value=CD_n, name='T_CD_n', borrow = True, allow_downcast=True) if omega is None: #careful! use "1.0" instead of "1" below else it all rounds to zeros!!! omega = np.float32(np.random.uniform((-1.0)*(1.0/(np.sqrt(self.nh+self.nv))),(1.0/(np.sqrt(self.nh+self.nv))),self.nv*self.nh).reshape((self.nv,self.nh))) self.omega = omega self.T_omega = theano.shared(value=omega,name='T_omega',borrow=True, allow_downcast=True) #rprop previous gradient self.Tomg_grad_prev = theano.shared(value=np.float32(np.abs(omega*omegaRate)+omegaRate), name='Tomg_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tomg_rmsH = theano.shared(value=omega,name='Tomg_rmsH', borrow=True, allow_downcast=True) if b is None: b = np.float32(np.random.uniform((-1.0)*(1.0/(self.nv)),(1.0/(self.nv)),self.nh)) self.b = b self.T_b = theano.shared(value=b,name='T_b',borrow=True, allow_downcast=True) #rprop previous gradient self.Tb_grad_prev = theano.shared(value=np.float32(np.abs(bRate*b)+bRate), name='Tb_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tb_rmsH = theano.shared(value = b, name = 'Tb_rmsH', borrow = True, allow_downcast = True) if a is None: a = np.float32(np.random.uniform((-1.0)*(1.0/(self.nh)),(1.0/(self.nh)),self.nv)) self.a = a self.T_a = theano.shared(value=a,name='T_a',borrow=True, allow_downcast=True) #rprop previous gradient self.Ta_grad_prev = theano.shared(value=np.float32(np.abs(aRate*a)+aRate), name='Ta_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Ta_rms = theano.shared(value=a, name='Ta_rms', borrow=True, allow_downcast=True) # for sigma parameter we train z instead with e^z = \sigma^2 if z is None: z = np.float32(np.random.normal(0.0,(1.0/(self.nh*self.nh)),self.nv))#np.asarray([0.0]*self.nv, dtype=theano.config.floatX) self.z = z self.T_z = theano.shared(value=z,name='T_z',borrow=True, allow_downcast=True) self.T_sigmaSqr = T.exp(self.T_z) #rprop previous gradient self.Tz_grad_prev = theano.shared(value=np.float32(np.float32(np.abs(z*sigmaRate)+sigmaRate)), name='Tz_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tz_rmsH = theano.shared(value=z, name = 'Tz_rmsH', borrow=True, allow_downcast=True) self.T_logZk = theano.shared(value = np.float32(0.0), name = 'T_logZk', borrow=True, allow_downcast=True) #will print in ipython notebook: print("RBMrv constructed for " + str(len(self.v)) + " visible units and " + str(len(self.h)) + " hidden units.")
from theano import function, config, shared, sandbox import theano.tensor as T import numpy import time vlen = 10 * 30 * 768 # 10 x #cores x # threads per core iters = 1000 rng = numpy.random.RandomState(22) x = shared(numpy.asarray(rng.rand(vlen), config.floatX)) f = function([], T.exp(x)) print(f.maker.fgraph.toposort()) t0 = time.time() for i in range(iters): r = f() t1 = time.time() print('Looping %d times took' % iters, t1 - t0, 'seconds') print('Result is', r) if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]): print('Used the cpu') else: print('Used the gpu')
def discountModel(alpha, length): """ discount model """ return tensor.exp(alpha * length * (-1))
def GESD(sum_uni_l, sum_uni_r): eucli = 1 / (1 + T.sum((sum_uni_l - sum_uni_r)**2)) kernel = 1 / (1 + T.exp(-(T.dot(sum_uni_l, sum_uni_r.T) + 1))) return (eucli * kernel).reshape((1, 1))
def RBF(sum_uni_l, sum_uni_r): eucli = T.sum((sum_uni_l - sum_uni_r)**2) return T.exp(-0.5 * eucli).reshape((1, 1))
def _log_add(a, b): # TODO: move functions like this to utils max_ = tensor.maximum(a, b) result = (max_ + tensor.log1p(tensor.exp(a + b - 2 * max_))) return T.switch(T.isnan(result), max_, result)
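# Hedged sanity check (an addition): _log_add computes log(exp(a) + exp(b)) in a
# numerically stable way. The NumPy equivalent below can be compared against
# np.logaddexp on a few values, including ones where exp() would underflow.
import numpy as np

def log_add_np(a, b):
    max_ = np.maximum(a, b)
    return max_ + np.log1p(np.exp(a + b - 2 * max_))  # a + b - 2*max_ == min_ - max_

a = np.array([-1000.0, 0.0, 3.0])
b = np.array([-1001.0, 1.0, 3.0])
print(np.allclose(log_add_np(a, b), np.logaddexp(a, b)))  # expected: True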
def laplace(x, mean, logvar): sd = T.exp(0.5 * logvar) return - abs(x - mean) / sd - 0.5 * logvar - np.log(2)
def _log_dot_matrix(x, z): y = x[:, :, None] + z[None, :, :] y_max = y.max(axis=1) out = T.log(T.sum(T.exp(y - y_max[:, None, :]), axis=1)) + y_max return T.switch(T.isnan(out), -numpy.inf, out)
def softmax(X): e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x')) return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def log_sum_exp(x): m = T.max(x, axis=0) return T.log(T.sum(T.exp(x - m))) + m
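# Hedged usage sketch: compiling log_sum_exp (defined above; assumes
# `import theano` and `import theano.tensor as T`) for a 1-D input and comparing
# it with SciPy's logsumexp. The max-shift keeps the result finite where a naive
# log(sum(exp(x))) would overflow.
import numpy as np
import theano
import theano.tensor as T
from scipy.special import logsumexp

x = T.dvector('x')
f = theano.function([x], log_sum_exp(x))
v = np.array([1000.0, 1001.0, 999.0])
print(f(v), logsumexp(v))  # both finite and approximately equal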
beta = T.minimum( 1, T.cast(total_iters, theano.config.floatX) / lib.floatX(BETA_ITERS)) return T.nnet.relu(logsig, alpha=beta) # Layer 1 mu_and_logsig1 = Enc1(images) mu1, logsig1 = split(mu_and_logsig1) if VANILLA: latents1 = mu1 else: eps = T.cast(theano_srng.normal(mu1.shape), theano.config.floatX) latents1 = mu1 + (eps * T.exp(logsig1)) outputs1 = Dec1(latents1, images) reconst_cost = T.nnet.categorical_crossentropy( T.nnet.softmax(outputs1.reshape((-1, 256))), images.flatten()).mean() # Layer 2 mu_and_logsig2 = Enc2(latents1) mu2, logsig2 = split(mu_and_logsig2) if VANILLA: latents2 = mu2 else: eps = T.cast(theano_srng.normal(mu2.shape), theano.config.floatX)
def normal2(x, mean, logvar): return c - logvar / 2 - (x - mean) ** 2 / (2 * T.exp(logvar))
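# Hedged note (an assumption): `c` in the snippet above is presumably the usual
# Gaussian normalisation constant -0.5*log(2*pi). A NumPy reference for the same
# log-density, checked against scipy.stats.norm:
import numpy as np
from scipy.stats import norm

c = -0.5 * np.log(2 * np.pi)

def normal2_np(x, mean, logvar):
    return c - logvar / 2 - (x - mean) ** 2 / (2 * np.exp(logvar))

x_val, mean_val, logvar_val = 0.3, -0.1, 0.4
print(np.isclose(normal2_np(x_val, mean_val, logvar_val),
                 norm.logpdf(x_val, loc=mean_val,
                             scale=np.exp(0.5 * logvar_val))))  # expected: True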
# initialize the weight vector w randomly w = theano.shared(rng.randn(feats), name="w") # this and the following bias variable b # are shared so they keep their values # between training iterations (updates) # initialize the bias term b = theano.shared(0., name="b") print("Initial model:") print(w.get_value()) print(b.get_value()) # Construct Theano expression graph p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1 prediction = p_1 > 0.5 # The prediction thresholded xent = -y * T.log(p_1) - (1 - y) * T.log( 1 - p_1) # Cross-entropy loss function cost = xent.mean() + 0.01 * (w**2).sum() # The cost to minimize gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost rmse = ((y - p_1)**2).mean() # w.r.t weight vector w and # bias term b # (we shall return to this in a # following section of this tutorial) # Compile train = theano.function(inputs=[x, y],
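# The theano.function call above is cut off. A hedged completion in the spirit of
# the standard Theano logistic-regression tutorial (the exact outputs/updates used
# by the original author are an assumption; x, y, prediction, xent, gw, gb come
# from the surrounding tutorial code):
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)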
def expr_generator(a, b): ra = [T.pow(a[i], i) for i in range(len(a))] return ra, T.exp(b)
def standard_prob(self, x, p=None): if p is None: p = self.get_prob(*self.get_params()) return T.exp(-self.neg_log_prob(x, p))
#! /usr/bin/env python3 # Taken from http://deeplearning.net/software/theano/tutorial/using_gpu.html from theano import function, config, shared, tensor import numpy import time vlen = 10 * 30 * 768 # 10 x #cores x # threads per core iters = 1000 rng = numpy.random.RandomState(22) x = shared(numpy.asarray(rng.rand(vlen), config.floatX)) f = function([], tensor.exp(x)) t0 = time.time() for i in range(iters): r = f() t1 = time.time() print("Looping %d times took %f seconds" % (iters, t1 - t0)) print("Result is %s" % (r, )) if numpy.any([ isinstance(x.op, tensor.Elemwise) and ("Gpu" not in type(x.op).__name__) for x in f.maker.fgraph.toposort() ]): print("Used the CPU") else: print("Used the GPU")
def _step( m_, x_, xx_, h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, W_comb_att2, U_att, c_att, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): # Do a step of classical GRU h1 = gru_step(m_, x_, xx_, h_, U, Ux) ########### # Attention ########### # h1 X W_comb_att # W_comb_att: dim -> dimctx # pstate_ should be 2D as we're working with unrolled timesteps pstate1_ = tensor.dot(h1, W_comb_att) pstate2_ = tensor.dot(h1, W_comb_att2) # Accumulate in pctx*__ and apply tanh() # This becomes the projected context(s) + the current hidden state # of the decoder, e.g. this is the information accumulating # into the returned original contexts with the knowledge of target # sentence decoding. pctx1__ = tanh(pctx1_ + pstate1_[None, :, :]) pctx2__ = tanh(pctx2_ + pstate2_[None, :, :]) # Affine transformation for alpha* = (pctx*__ X U_att) + c_att # We're now down to scalar alpha's for each accumulated # context (0th dim) in the pctx*__ # alpha1 should be n_timesteps, 1, 1 alpha1 = tensor.dot(pctx1__, U_att) + c_att alpha2 = tensor.dot(pctx2__, U_att) + c_att # Drop the last dimension, e.g. (n_timesteps, 1) alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) # Exponentiate alpha1 alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) # If there is a context mask, multiply with it to cancel unnecessary steps # We won't have a ctx_mask for image vectors if ctx1_mask: alpha1 = alpha1 * ctx1_mask # Normalize so that the sum makes 1 alpha1 = alpha1 / alpha1.sum(0, keepdims=True) alpha2 = alpha2 / alpha2.sum(0, keepdims=True) # Compute the current context ctx*_ as the alpha-weighted sum of # the initial contexts ctx*'s ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) # n_samples x ctxdim (2000) # Sum of contexts ctx_ = tanh(ctx1_ + ctx2_) ############################################ # ctx*_ and alpha computations are completed ############################################ #################################### # The below code is another GRU cell #################################### # Affine transformation: h1 X U_nl + b_nl # U_nl, b_nl: Stacked dim*2 preact = tensor.dot(h1, U_nl) + b_nl # Transform the weighted context sum with Wc # and add it to preact # Wc: dimctx -> Stacked dim*2 preact += tensor.dot(ctx_, Wc) # Apply sigmoid nonlinearity preact = sigmoid(preact) # Slice activations: New gates r2 and u2 r2 = tensor_slice(preact, 0, dim) u2 = tensor_slice(preact, 1, dim) preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 preactx += tensor.dot(ctx_, Wcx) # Candidate hidden h2_tilda = tanh(preactx) # Leaky integration between the new h2 and the # old h1 computed in line 285 h2 = u2 * h2_tilda + (1. - u2) * h1 h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 return h2, ctx_, alpha1.T, alpha2.T
def _softmax(x): axis = x.ndim - 1 e_x = T.exp(x - x.max(axis=axis, keepdims=True)) out = e_x / e_x.sum(axis=axis, keepdims=True) return out
def log_softmax(x): xdev = x - x.max(1, keepdims=True) return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True))
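# Hedged usage sketch (assumes `import theano.tensor as T` as in the snippet above):
# compiling log_softmax and checking it against a NumPy/SciPy reference built from
# logsumexp; the max-shift keeps the result finite even for large logits.
import numpy as np
import theano
import theano.tensor as T
from scipy.special import logsumexp

logits = T.dmatrix('logits')
f = theano.function([logits], log_softmax(logits))

big = np.array([[0.0, 1000.0, -1000.0]])
ref = big - logsumexp(big, axis=1, keepdims=True)
print(np.allclose(f(big), ref))  # expected: True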
def step_sample(self, epsilon, p): dim = p.shape[p.ndim - 1] // self.scale mu = _slice(p, 0, dim) log_sigma = _slice(p, 1, dim) return mu + epsilon * T.exp(log_sigma)
import theano import theano.tensor as T import numpy import random x = T.vector() w = theano.shared(numpy.array([-1.,1.])) b = theano.shared(0.) z = T.dot(w,x) + b y = 1 / (1 + T.exp(-z)) neuron = theano.function( inputs = [x], outputs = y) y_hat = T.scalar() # reference for the output variable cost = T.sum((y - y_hat) ** 2) # cost function dw, db = T.grad(cost,[w,b]) # gradient with respect to w and b gradient = theano.function( # function to compute the gradients inputs = [x,y_hat], outputs = [dw,db]) x = [1, -1] y_hat = 1 for i in range(100): print neuron(x) dw, db = gradient(x, y_hat)
def _step_slice(m_, x_, xx_, yg, h_, ctx_, alpha_, alpha_past_, beta, pctx_, cc_, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl, conv_Q, conv_Uf, conv_b, Whg, bhg, Umg, W_m_att, U_when_att, c_when_att): preact1 = tensor.dot(h_, U) preact1 += x_ preact1 = tensor.nnet.sigmoid(preact1) r1 = _slice(preact1, 0, dim) # reset gate u1 = _slice(preact1, 1, dim) # update gate preactx1 = tensor.dot(h_, Ux) preactx1 *= r1 preactx1 += xx_ h1 = tensor.tanh(preactx1) h1 = u1 * h_ + (1. - u1) * h1 h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ g_m = tensor.dot(h_, Whg) + bhg g_m += yg g_m = tensor.nnet.sigmoid(g_m) mt = tensor.dot(h1, Umg) mt = tensor.tanh(mt) mt *= g_m # attention pstate_ = tensor.dot(h1, W_comb_att) # converage vector cover_F = theano.tensor.nnet.conv2d(alpha_past_[:,None,:,None],conv_Q,border_mode='half') # batch x dim x SeqL x 1 cover_F = cover_F.dimshuffle(1,2,0,3) # dim x SeqL x batch x 1 cover_F = cover_F.reshape([cover_F.shape[0],cover_F.shape[1],cover_F.shape[2]]) assert cover_F.ndim == 3, \ 'Output of conv must be 3-d: #dim x SeqL x batch' #cover_F = cover_F[:,pad:-pad,:] cover_F = cover_F.dimshuffle(1, 2, 0) # cover_F must be SeqL x batch x dimctx cover_vector = tensor.dot(cover_F, conv_Uf) + conv_b # cover_vector = cover_vector * context_mask[:,:,None] pctx__ = pctx_ + pstate_[None, :, :] + cover_vector #pctx__ += xc_ pctx__ = tensor.tanh(pctx__) alpha = tensor.dot(pctx__, U_att)+c_tt # compute alpha_when pctx_when = tensor.dot(mt, W_m_att) pctx_when += pstate_ pctx_when = tensor.tanh(pctx_when) alpha_when = tensor.dot(pctx_when, U_when_att)+c_when_att # batch * 1 alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) # SeqL * batch alpha = tensor.exp(alpha) alpha_when = tensor.exp(alpha_when) if context_mask: alpha = alpha * context_mask if context_mask: alpha_mean = alpha.sum(0, keepdims=True) / context_mask.sum(0, keepdims=True) else: alpha_mean = alpha.mean(0, keepdims=True) alpha_when = concatenate([alpha_mean, alpha_when.T], axis=0) # (SeqL+1)*batch alpha = alpha / alpha.sum(0, keepdims=True) alpha_when = alpha_when / alpha_when.sum(0, keepdims=True) beta = alpha_when[-1, :] alpha_past = alpha_past_ + alpha.T ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context ctx_ = beta[:, None] * mt + (1. - beta)[:, None] * ctx_ preact2 = tensor.dot(h1, U_nl)+b_nl preact2 += tensor.dot(ctx_, Wc) preact2 = tensor.nnet.sigmoid(preact2) r2 = _slice(preact2, 0, dim) u2 = _slice(preact2, 1, dim) preactx2 = tensor.dot(h1, Ux_nl)+bx_nl preactx2 *= r2 preactx2 += tensor.dot(ctx_, Wcx) h2 = tensor.tanh(preactx2) h2 = u2 * h1 + (1. - u2) * h2 h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 return h2, ctx_, alpha.T, alpha_past, beta # pstate_, preact, preactx, r, u
def kmLossFunction(self, vMax, rnaConc, kDeg, isEndoRnase, alpha): ''' Generates the functions used for estimating the per-RNA affinities (Michaelis-Menten constants) to the endoRNAses. The optimization problem is formulated as a multidimensional root-finding problem; the goal is to find a set of Michaelis-Menten constants such that the endoRNAse-mediated degradation under basal concentrations is consistent with the experimentally observed half-lives, thus (nonlinear rate) = (linear rate) where the nonlinear rate is the rate as predicted from some kinetic rate law, and the linear rate is proportional to the inverse of the observed half-life. Then, reordering, 0 = (nonlinear rate) - (linear rate) is (for the moment) the root we wish to find, for each RNA species, giving us the multidimensional function R_aux = (nonlinear rate) - (linear rate) This is the unnormalized residual function; the normalized residuals are R = (nonlinear rate)/(linear rate) - 1 In addition to matching our half-lives we also desire the Michaelis-Menten constants to be non-negative (negative values have no physical meaning). Thus we introduce a penalty term for negative values. TODO (John): explain penalty term The two terms (the residuals R and the negative value penalty Rneg) are combined into one 'loss' function L (alpha is the weighting on the negative value penalty): L = ln((exp(R) + exp(alpha*Rneg))/2) = ln(exp(R) + exp(alpha*Rneg)) - ln(2) The loss function has one element for each RNA. This functional form is a soft (continuous and differentiable) approximation to L = max(R, alpha*Rneg) The root finder, provided with L, will attempt to make each element of L as close to zero as possible, and therefore minimize both R and Rneg. The third-party package Theano is used to create the functions and find an analytic expression for the Jacobian. Parameters ---------- vMax: scalar The total endoRNAse capacity, in dimensions of amount per volume per time. rnaConc: 1-D array, float Concentrations of RNAs (that will be degraded), in dimensions of amount per volume. kDeg: 1-D array, float Experimentally observed degradation rates (computed from half-lives), in dimensions of per unit time. isEndoRnase: 1-D array, bool A vector that is True everywhere that an RNA corresponds to an endoRNAse; that is, an endoRNAse (or endoRNAse subunit) mRNA. alpha: scalar, >0 Regularization weight, used to penalize for negative Michaelis-Menten value predictions during the course of the optimization. Typical value is 0.5. Returns ------- L: function The 'loss' function. Rneg: function The negative Michaelis-Menten constant penalty terms. R: function The residual error (deviation from steady-state). Lp: function The Jacobian of the loss function L with respect to the Michaelis-Menten constants. R_aux: function Unnormalized 'residual' function. L_aux: function Unnormalized 'loss' function. Lp_aux: function Jacobian of the unnormalized 'loss' function. Jacob: function Duplicate with Lp. Jacob_aux: function Duplicate with Lp_aux. Notes ----- The regularization term also includes a penalty for the endoRNAse residuals, as well as a fixed weighting (WFendoR = 0.1). TODO (John): Why is this needed? It seems redundant. TODO (John): How do we know this weight is sufficient? All of the outputs are Theano functions, and take a 1-D array of Michaelis-Menten constants as their sole inputs. All of the functions return a 1-D array, with the exception of the Jacobians, which return matrices. TODO (John): Remove the redundant outputs. 
TODO (John): Look into removing Theano, since it is no longer maintained. We could use another package with similar functionality (analytic differentiation on algebraic functions), or replace the Theano operations with hand-computed solutions (difficult, as the Jacobian is probably very complicated). TODO (John): Consider redesigning this as an objective minimization problem rather than a root finding problem. TODO (John): Consider replacing the Michaelis-Menten constants with logarithmic equivalents, thereby eliminating the requirement for the negative value penalty. TODO (John): Consider moving this method out of this class, as it is, in fact, a static method, and isn't utilized anywhere within this class. ''' N = rnaConc.size km = T.dvector() # Residuals of non-linear optimization residual = (vMax / km / kDeg) / (1 + (rnaConc / km).sum()) - np.ones(N) residual_aux = (vMax * rnaConc / km) / (1 + (rnaConc / km).sum()) - ( kDeg * rnaConc) # Counting negative Km's (first regularization term) regularizationNegativeNumbers = (np.ones(N) - km / np.abs(km)).sum() / N # Penalties for EndoR Km's, which might be potentially non-fitted regularizationEndoR = (isEndoRnase * np.abs(residual)).sum() # Multi objective-based regularization WFendoR = 0.1 # weighting factor to protect the optimized Km values of EndoRNases regularization = regularizationNegativeNumbers + (WFendoR * regularizationEndoR) # Loss function LossFunction = T.log(T.exp(residual) + T.exp(alpha * regularization)) - T.log(2) LossFunction_aux = T.log( T.exp(residual_aux) + T.exp(alpha * regularization)) - T.log(2) J = theano.gradient.jacobian(LossFunction, km) J_aux = theano.gradient.jacobian(LossFunction_aux, km) Jacob = theano.function([km], J) Jacob_aux = theano.function([km], J_aux) L = theano.function([km], LossFunction) L_aux = theano.function([km], LossFunction_aux) Rneg = theano.function([km], regularizationNegativeNumbers) R = theano.function([km], residual) Lp = theano.function([km], J) Lp_aux = theano.function([km], J_aux) R_aux = theano.function([km], residual_aux) return L, Rneg, R, Lp, R_aux, L_aux, Lp_aux, Jacob, Jacob_aux
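# Hedged usage sketch (not from the source): the docstring describes a
# multidimensional root-finding problem, so the compiled loss L and its Jacobian
# Lp would typically be handed to a root finder roughly as below. The object name,
# argument values, and starting point km0 are placeholders.
import numpy as np
import scipy.optimize

# L, Rneg, R, Lp, R_aux, L_aux, Lp_aux, Jacob, Jacob_aux = obj.kmLossFunction(
#     vMax, rnaConc, kDeg, isEndoRnase, alpha=0.5)
# km0 = np.full(rnaConc.size, 1e-3)          # hypothetical initial guess
# sol = scipy.optimize.root(L, km0, jac=Lp)  # sol.x: fitted Michaelis-Menten constants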
def gaussian(self, freq, numax, w, A): return A * tt.exp(-0.5 * tt.sqr((freq - numax)) / tt.sqr(w))
def mixture_model_mobile_centers( data_2d, N, # noqa: N803 M, std, lam_backg, nsteps, nchains ): """Define the mixture model and sample from it. This mobile centers model extends the above mixture model in that allows the center positions of each atom to vary slightly from the center of the lattice site. This should help in cases of lattice inhomogeneity. Parameters ---------- data_2d : ndarray of floats 2D intensity distribution of the collected light N : integer number of lattice sites along one axis M : integer number of pixels per lattice site along one axis std : float Gaussian width of the point spread function lam_backg: integer Expected value of the Poissonian background nsteps : integer number of steps taken by each walker in the pymc3 sampling nchains : integer number of walkers in the pymc3 sampling Returns ------- traces : pymc3 MultiTrace An object that contains the samples. df : dataframe Samples converted into a dataframe object """ # x-pixel locations for the entire image x = np.arange(0, N*M) # X, Y meshgrid of pixel locations X, Y = np.meshgrid(x, x) # noqa: N806 # atom center locations are explicitly supplied as the centers of # the lattice sites centers = np.linspace(0, (N-1)*M, N)+M/2 Xcent_mu, Ycent_mu = np.meshgrid(centers, centers) # noqa: N806 with pm.Model() as mixture_model: # noqa: F841 # Priors # continuous numbers characterizing if lattice sites are filled # or not. q = pm.Uniform('q', lower=0, upper=1, shape=(N, N)) # Allow centers to move but we expect them to be # pretty near their lattice centers Xcent = pm.Normal( # noqa: N806 'Xcent', mu=Xcent_mu, sigma=Xcent_mu/10, shape=(N, N) ) Ycent = pm.Normal( # noqa: N806 'Ycent', mu=Ycent_mu, sigma=Ycent_mu/10, shape=(N, N) ) # Amplitude of the Gaussian signal for the atoms aa = pm.Gamma('Aa', mu=3, sd=0.5) # Amplitude of the uniform background signal ab = pm.Gamma('Ab', mu=0.5, sd=0.1) # Width of the Gaussian likelihood for the atoms sigma_a = pm.Gamma('sigma_a', mu=1, sd=0.1) # Width of the Gaussian likelihood for the background sigma_b = pm.Gamma('sigma_b', mu=1, sd=0.1) # Width of the point spread function atom_std = pm.Gamma('std', mu=std, sd=0.1) # Instead of tiling a single_atom PSF with kronecker, use # broadcasting and summing along appropriate axis # to allow for spill over of one atom to neighboring sites. atom = tt.sum( tt.sum( q*aa * tt.exp( -((X[:, :, None, None] - Xcent)**2 + (Y[:, :, None, None] - Ycent)**2) / (2 * atom_std**2) ), axis=2 ), axis=2 ) atom += ab # background is just flat background = ab*np.ones((N*M, N*M)) # Log-likelihood good_data = pm.Normal.dist(mu=atom, sd=sigma_a).logp(data_2d) bad_data = pm.Normal.dist(mu=background, sd=sigma_b).logp(data_2d) log_like = good_data + bad_data pm.Potential('logp', log_like.sum()) # Sample traces = pm.sample(tune=nsteps, draws=nsteps, chains=nchains) # convert the PymC3 traces into a dataframe df = pm.trace_to_dataframe(traces) return traces, df
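# Hedged usage sketch: calling the model-sampling helper above on a synthetic image.
# All numbers here (lattice size, pixels per site, PSF width, step and chain counts)
# are placeholders, not values from the original analysis, and sampling will be slow.
import numpy as np

N, M = 4, 8
fake_image = np.random.poisson(lam=1.0, size=(N * M, N * M)).astype(float)
traces, df = mixture_model_mobile_centers(
    fake_image, N, M, std=1.0, lam_backg=1, nsteps=200, nchains=2)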
def define_layers(self): self.params = [] layer_id = "1" self.W_xh = init_weights((self.in_size, self.hidden_size), self.prefix + "W_xh" + layer_id) self.b_xh = init_bias(self.hidden_size, self.prefix + "b_xh" + layer_id) layer_id = "2" self.W_hu = init_weights((self.hidden_size, self.latent_size), self.prefix + "W_hu" + layer_id) self.b_hu = init_bias(self.latent_size, self.prefix + "b_hu" + layer_id) self.W_hsigma = init_weights((self.hidden_size, self.latent_size), self.prefix + "W_hsigma" + layer_id) self.b_hsigma = init_bias(self.latent_size, self.prefix + "b_hsigma" + layer_id) layer_id = "3" self.W_zh = init_weights((self.latent_size, self.hidden_size), self.prefix + "W_zh" + layer_id) self.b_zh = init_bias(self.hidden_size, self.prefix + "b_zh" + layer_id) self.params += [self.W_xh, self.b_xh, self.W_hu, self.b_hu, self.W_hsigma, self.b_hsigma, \ self.W_zh, self.b_zh] layer_id = "4" if self.continuous: self.W_hyu = init_weights((self.hidden_size, self.out_size), self.prefix + "W_hyu" + layer_id) self.b_hyu = init_bias(self.out_size, self.prefix + "b_hyu" + layer_id) self.W_hysigma = init_weights((self.hidden_size, self.out_size), self.prefix + "W_hysigma" + layer_id) self.b_hysigma = init_bias(self.out_size, self.prefix + "b_hysigma" + layer_id) self.params += [ self.W_hyu, self.b_hyu, self.W_hysigma, self.b_hysigma ] else: self.W_hy = init_weights((self.hidden_size, self.out_size), self.prefix + "W_hy" + layer_id) self.b_hy = init_bias(self.out_size, self.prefix + "b_hy" + layer_id) self.params += [self.W_hy, self.b_hy] # encoder h_enc = T.nnet.relu(T.dot(self.X, self.W_xh) + self.b_xh) self.mu = T.dot(h_enc, self.W_hu) + self.b_hu log_var = T.dot(h_enc, self.W_hsigma) + self.b_hsigma self.var = T.exp(log_var) self.sigma = T.sqrt(self.var) srng = T.shared_randomstreams.RandomStreams(234) eps = srng.normal(self.mu.shape) self.z = self.mu + self.sigma * eps # decoder h_dec = T.nnet.relu(T.dot(self.z, self.W_zh) + self.b_zh) if self.continuous: self.reconstruct = T.dot(h_dec, self.W_hyu) + self.b_hyu self.log_var_dec = T.dot(h_dec, self.W_hysigma) + self.b_hysigma self.var_dec = T.exp(self.log_var_dec) else: self.reconstruct = T.nnet.sigmoid( T.dot(h_dec, self.W_hy) + self.b_hy)
def f(t, x, u): # build up the 'collision' penalty terms for all objects ret = 0. for i, (a, b) in enumerate(bounds): ret += -tt.exp((u[i] - b) / width) - tt.exp((a - u[i]) / width) return ret
def free_energy(self, V): return -V.dot(self.b) - T.sum(T.log(1 + T.exp(V.dot(self.W) + self.c)), axis=1)
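# Hedged NumPy reference (an addition, not from the source class): the expression
# above is the standard free energy of an RBM with binary hidden units,
# F(v) = -v.b - sum_j log(1 + exp(v.W[:, j] + c_j)); a plain-NumPy port for a single
# visible vector makes it easy to check numerically against the symbolic version.
import numpy as np

def free_energy_np(v, W, b, c):
    return -v.dot(b) - np.sum(np.log1p(np.exp(v.dot(W) + c)))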