def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs):
    """
    Creates a continuous mixture distribution which can be efficiently
    evaluated and sampled from.

    Args:
        shape: Shape of the distribution. All components must have this
            same shape as well.
        cat_var: Categorical FreeRV whose distribution supplies the mixture
            weights (must be evaluable in the context of the model).
        mus: list of component means.
        taus: list of component precision matrices.
        model: optional model; resolved via modelcontext if omitted.
    """
    super(MvGaussianMixture, self).__init__(*args, shape=shape, **kwargs)
    assert isinstance(cat_var, FreeRV)
    assert isinstance(cat_var.distribution, Categorical)
    self.cat_var = cat_var
    self.model = modelcontext(model)
    weights = cat_var.distribution.p
    self.weights = weights
    self.mus = mus
    self.taus = taus
    self.mu_t = T.stacklists(mus)
    self.tau_t = T.stacklists(taus)
    self.shape = shape
    self.testval = np.zeros(self.shape, self.dtype)
    self.last_cov_value = {}
    self.last_tau_value = {}
    self.param_fn = None
def theano_rot(rx, ry, rz, rescale=True):
    '''Return a theano tensor representing a rotation matrix
    using the specified rotation angles rx, ry, rz.

    If rescale is True, treat the input angles as degrees.
    '''
    if rescale:
        rx = np.pi / 180. * (rx)
        ry = np.pi / 180. * (ry)
        rz = np.pi / 180. * (rz)

    sx = tt.sin(rx)
    sy = tt.sin(ry)
    sz = tt.sin(rz)
    cx = tt.cos(rx)
    cy = tt.cos(ry)
    cz = tt.cos(rz)
    Rx = [[1, 0, 0], [0, cx, -sx], [0, sx, cx]]
    Ry = [[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]]
    Rz = [[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]]
    Rxt = tt.stacklists(Rx)
    Ryt = tt.stacklists(Ry)
    Rzt = tt.stacklists(Rz)
    full_rotation = tt.dot(Rzt, tt.dot(Ryt, Rxt))
    return full_rotation
def theano_rot(rx, ry, rz, rescale=180./np.pi):
    '''Return a theano tensor representing a rotation matrix
    using the specified rotation angles rx, ry, rz.

    Rescale can be used to change the angular units, e.g. to degrees.
    '''
    rx = rx / rescale
    ry = ry / rescale
    rz = rz / rescale

    sx = tt.sin(rx)
    sy = tt.sin(ry)
    sz = tt.sin(rz)
    cx = tt.cos(rx)
    cy = tt.cos(ry)
    cz = tt.cos(rz)
    Rx = [[1., 0., 0.], [0., cx, -sx], [0., sx, cx]]
    Ry = [[cy, 0., sy], [0., 1., 0.], [-sy, 0., cy]]
    Rz = [[cz, -sz, 0.], [sz, cz, 0.], [0., 0., 1.]]
    Rxt = tt.stacklists(Rx)
    Ryt = tt.stacklists(Ry)
    Rzt = tt.stacklists(Rz)
    full_rotation = tt.dot(Rzt, tt.dot(Ryt, Rxt))
    return full_rotation
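# A minimal usage sketch for theano_rot above (hedged: assumes `tt` is
# theano.tensor and `np` is numpy as in the snippets; the angle values are
# illustrative). With the default rescale=180./np.pi the inputs are degrees.
import numpy as np
import theano
import theano.tensor as tt

rx, ry, rz = tt.dscalars('rx', 'ry', 'rz')
R = theano_rot(rx, ry, rz)
rotate = theano.function([rx, ry, rz], R)
m = rotate(90., 0., 0.)
print(np.allclose(np.dot(m, m.T), np.eye(3)))  # rotation matrices are orthonormal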
def sequential_drawing(self, num_examples):
    """Fetches the sequential output of GRAN at each timestep"""
    canvas = self.gen_network.get_samples(num_examples)[1]
    sequential_sams = []
    for i in xrange(self.num_steps):
        sequential_sams.append(T.nnet.sigmoid(T.sum(T.stacklists(canvas[:i+1]), axis=0)))
    return T.stacklists(sequential_sams)
def renet_layer_ud(X, rnn1, rnn2, w, h, wp, hp):
    # def recurrence1(x_t, h_tm1):
    #     dot = T.dot(Wx1, x_t)
    #     h_t = relu(dot + T.dot(h_tm1, Wh1) + Bh1)
    #     return h_t
    # def recurrence2(x_t, h_tm1):
    #     dot = T.dot(Wx2, x_t)
    #     h_t = relu(dot + T.dot(h_tm1, Wh2) + Bh2)
    #     return h_t
    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape((h/hp, X.shape[0]*wp*hp))
        # h1, _ = theano.scan(
        #     fn=recurrence1,
        #     sequences=x,
        #     outputs_info=[H01],
        #     n_steps=x.shape[0]
        # )
        # h2, _ = theano.scan(
        #     fn=recurrence2,
        #     sequences=x,
        #     outputs_info=[H02],
        #     n_steps=x.shape[0],
        #     go_backwards=True
        # )
        h1 = rnn1.output(x)
        h2 = rnn2.output(x, go_backwards=True)
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([h1.T, h2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def theano(self, x, mu, V, ndim, ncomp):
    cholesky = Cholesky(nofail=True, lower=True)
    solve_lower = tt.slinalg.Solve(A_structure="lower_triangular")
    if x.ndim == 1:
        onedim = True
        x = x[None, :]
    else:
        onedim = False
    delta = x[:, None, :] - mu[None, ...]
    logps = []
    for i in range(ncomp):
        _chol_cov = cholesky(V[i])
        k = floatX(ndim)
        diag = tt.nlinalg.diag(_chol_cov)
        # Check if the covariance matrix is positive definite.
        ok = tt.all(diag > 0)
        # If not, replace the diagonal. We return -inf later, but
        # need to prevent solve_lower from throwing an exception.
        chol_cov = tt.switch(ok, _chol_cov, 1)
        delta_trans = solve_lower(chol_cov, delta[:, i].T).T
        _quaddist = (delta_trans**2).sum(axis=-1)
        logdet = tt.sum(tt.log(diag))
        if onedim:
            quaddist = _quaddist[0]
        else:
            quaddist = _quaddist
        norm = -0.5 * k * floatX(np.log(2 * np.pi))
        logp = norm - 0.5 * quaddist - logdet
        # safe logp (-inf for invalid)
        safe_logp = tt.switch(alltrue_elemwise([ok]), logp, -np.inf)
        logps.append(safe_logp)
    return tt.stacklists(logps).T
def get_output_for(self, inputs, **kwargs):
    # see eq. (1) and sec 3.1 in [1]
    input, para = inputs
    num_batch, channels, height, width = input.shape
    _w = T.cast(width, dtype=self.dtype)
    _h = T.cast(height, dtype=self.dtype)
    mat = T.zeros((num_batch, 3, 3), dtype=self.dtype)
    mat = T.set_subtensor(mat[:, 0, 0], const(1.0))
    mat = T.set_subtensor(mat[:, 1, 1], const(1.0))
    mat = T.set_subtensor(mat[:, 2, 2], const(1.0))
    if self.method == 'perspective':
        mat = T.set_subtensor(mat[:, 2, 0], (para[:, 0] / 1e4 - 1e-3) * _w)
        mat = T.set_subtensor(mat[:, 2, 1], (para[:, 1] / 1e4 - 1e-3) * _h)
    elif self.method == 'angle':
        angle = T.cast(T.argmax(para, axis=1), dtype=self.dtype) * np.pi / 90 - np.pi / 3.0
        # ss = np.sqrt(2.0)
        mat = T.set_subtensor(mat[:, :, :], T.stacklists([
            [T.cos(angle), T.sin(angle),
             -(T.cos(angle) * _w + T.sin(angle) * _h - _w) / (2.0 * _w)],
            [-T.sin(angle), T.cos(angle),
             -(-T.sin(angle) * _w + T.cos(angle) * _h - _h) / (2.0 * _h)],
            [constv(0, num_batch, self.dtype), constv(0, num_batch, self.dtype),
             constv(1, num_batch, self.dtype)]]).dimshuffle(2, 0, 1))
        # return [mat, _w, _h]
    elif self.method == 'all':
        mat = T.reshape(para, [-1, 3, 3])
        mat = T.set_subtensor(mat[:, 0, 2], mat[:, 0, 2] / T.cast(width, self.dtype))
        mat = T.set_subtensor(mat[:, 1, 2], mat[:, 1, 2] / T.cast(height, self.dtype))
        mat = T.set_subtensor(mat[:, 2, 0], mat[:, 2, 0] * T.cast(width, self.dtype))
        mat = T.set_subtensor(mat[:, 2, 1], mat[:, 2, 1] * T.cast(height, self.dtype))
    else:
        raise Exception('method not understood.')
    return transform_affine(mat, input, self.method, scale_factor=self.scale_factor)
def jacobian(f: Sequence[Callable], x: Any, constants: list = []) -> TensorVariable:
    # Theano is doing some implicit casting black magic here
    sz = cast(int, shape(f))
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
def jacobian(f, x, constants=[]):
    sz = shape(f)
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
    # Unreachable alternative implementation kept from the original:
    # ret = th.gradient.jacobian(f, x, consider_constant=constants)
    # if isinstance(ret, list):
    #     ret = tt.concatenate(ret, axis=1)
    # return ret
def fixspeed(self, model, momentums):
    paramlayers = model.paramlayers()
    coeff = []
    outs = []
    pid = 0
    for paramlayer in paramlayers:
        # For W
        D = 0
        if isinstance(paramlayer, (layerbase.ConvLayer, layerbase.ConvMaxoutLayer, layerbase.ConvKeepLayer)):
            layershape = paramlayer.params[0].get_value().shape
            fan_in = layershape[1]*layershape[2]*layershape[3]
            fan_out = np.prod(layershape)
            D = 1
        elif isinstance(paramlayer, (layerbase.FullConnectLayer)):
            layershape = paramlayer.params[0].get_value().shape
            fan_in = layershape[0]
            fan_out = np.prod(layershape)
            D = 1
        else:
            coeff.append(1)
        if D:
            layerrate = (self.layertarget*fan_out/fan_in) / (T.sum(abs(momentums[pid]))+1e-10) * self.layerstr \
                + self.baserate * (1-self.layerstr) * self.basedynamic
            coeff.append(layerrate)
            outs.append(momentums[pid]*layerrate)
            pid += 1
        # For other params
        for i in paramlayer.params[D:]:
            outs.append(momentums[pid]*self.baserate)
            pid += 1
    return outs, T.stacklists(coeff)
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape((h/hp, X.shape[0]*wp*hp))
        [h1, s1], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0]
        )
        [h2, s2], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0],
            go_backwards=True
        )
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def f(x, u):
    return tt.stacklists([
        x[3]*tt.cos(x[2]),
        x[3]*tt.sin(x[2]),
        x[3]*u[0],
        u[1] - x[3]*friction
    ])
def logp_(value):
    logps = [tt.log(pi[i]) + logp_normal(mu, sd, value)
             for i, mu in enumerate(mus)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :], axis=0))
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in range(w / wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:, :, j * wp:(j * wp + wp)].dimshuffle(
            (2, 0, 1)).flatten().reshape((h / hp, X.shape[0] * wp * hp))
        [h1, s1], _ = theano.scan(fn=recurrence,
                                  sequences=x,
                                  outputs_info=[H0, None],
                                  n_steps=x.shape[0])
        [h2, s2], _ = theano.scan(fn=recurrence,
                                  sequences=x,
                                  outputs_info=[H0, None],
                                  n_steps=x.shape[0],
                                  go_backwards=True)
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def crossEntropy(self, y, m):
    return -T.sum(T.stacklists([
        T.mean(T.log(self.p_y_given_x)[i][T.arange(y[i].shape[0]), y[i]] * m[0])
        for i in xrange(200)
    ]))
def get_nade_k_rbm_cost_theano(self, x, input_mask, k):
    """
    log p(x_missing | x_observed)
    x is a matrix of column datapoints (mbxD)
    D = n_visible, mb = mini batch size
    """
    #x_ = utils.corrupt_with_salt_and_pepper(
    #    x, x.shape, self.noise, rng_theano)  # BxD
    print 'building cost function ...'
    output_mask = constantX(1) - input_mask
    D = constantX(self.n_visible)
    d = input_mask.sum(1)
    cost = constantX(0)
    costs_by_step = []
    print 'do %d steps of mean field inference' % k
    P = self.get_nade_k_mean_field(x, input_mask, k)
    costs = []
    for i, p in enumerate(P):
        # Loglikelihood on missing bits
        lp = ((x*T.log(p) + (constantX(1)-x)*T.log(constantX(1)-p))
              * output_mask).sum(1) * D / (D-d)
        this_cost = -T.mean(lp)
        costs.append(this_cost)
        costs_by_step.append(this_cost)
    costs_by_step = T.stack(costs_by_step)
    if not self.cost_from_last:
        cost = T.mean(T.stacklists(costs))
    else:
        cost = costs[-1]
    return cost, costs_by_step
def get_sensi_speci(y_hat, y):
    # y_hat = T.concatenate(T.sum(input=y_hat[:, 0:2], axis=1), T.sum(input=y_hat[:, 2:], axis=1))
    y_hat = T.stacklists([y_hat[:, 0] + y_hat[:, 1],
                          y_hat[:, 2] + y_hat[:, 3] + y_hat[:, 4]]).T
    y_hat = T.argmax(y_hat, axis=1)
    tag = 10 * y_hat + y
    tneg = T.cast((T.shape(tag[(T.eq(tag, 0.)).nonzero()]))[0], config.floatX)
    fneg = T.cast((T.shape(tag[(T.eq(tag, 1.)).nonzero()]))[0], config.floatX)
    fpos = T.cast((T.shape(tag[(T.eq(tag, 10.)).nonzero()]))[0], config.floatX)
    tpos = T.cast((T.shape(tag[(T.eq(tag, 11.)).nonzero()]))[0], config.floatX)
    # assert tneg + fneg + fpos + tpos == 1380
    # gotcha: a plain Python if/else does not work on symbolic scalars,
    # hence theano's ifelse is used for the zero-denominator guards.
    speci = ifelse(T.eq((tneg + fpos), 0), np.float64(float('inf')), tneg / (tneg + fpos))
    sensi = ifelse(T.eq((tpos + fneg), 0), np.float64(float('inf')), tpos / (tpos + fneg))
    return [sensi, speci]
def logp_(value):
    logps = [tt.log(pi[c]) + logp_normal(mus[c], tau, value)
             for c in category]
    return tt.sum(pm.math.logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
def _generate(self):
    '''Turn the specification of the transform into usable mathematical objects'''
    identity = {'tx': 0., 'ty': 0., 'tz': 0., 'rx': 0., 'ry': 0., 'rz': 0., 's': self.full_scale}
    trans = identity
    if self._trans is not None:
        for k in identity.keys():
            try:
                trans[k] = self._trans[k]
                if self._apply_factors_to_trans:
                    if k in ['tx', 'ty', 'tz']:
                        trans[k] *= self.translate_factor
                    elif k in ['rx', 'ry', 'rz']:
                        trans[k] *= self.rotation_scale
                    elif k == 's':
                        trans[k] *= self.full_scale
            except KeyError:
                pass
    if self._R is None:
        self._R = theano_rot(rx=trans['rx'], ry=trans['ry'], rz=trans['rz'],
                             rescale=self.rotation_scale)
    if self._tr is None:
        self._tr = tt.stacklists([trans['tx'], trans['ty'], trans['tz']])
    if self._s is None:
        self._s = trans['s']
def logp_(value):
    aux = tt.ones((n_samples, 1))
    pi = [tt.sum(tt.eq(aux3, aux * cat), axis=1) / 8.0 for cat in range(K)]
    logps = [(pi[i] - 1) * 2 + logp_normal(mu, tau, value)
             for i, mu in enumerate(mus)]
    return tt.sum(tt.stacklists(logps), axis=0)
def logp_(value):
    logps = [tt.log(pi[i]) + logp_normal(mus[i, :], taus[i], value)
             for i in range(n_components)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
def applySentenceAttention(self, premiseOutputs, finalHypothesisOutput, numTimestepsPremise):
    """
    Apply sentence level attention by attending over all premise outputs once
    with the final hypothesis output. Note this is different from word-by-word
    attention over the premise.

    :param premiseOutputs:
    :param finalHypothesisOutput:
    :return:
    """
    # Note: Notation follows that in Rocktaschel's attention mechanism explanation:
    # http://arxiv.org/pdf/1509.06664v2.pdf
    timestep, numSamp, dimHidden = premiseOutputs.shape
    Y = premiseOutputs.reshape((numSamp, timestep, dimHidden))
    WyY = T.dot(Y, self.W_y)  # Computing (WyY).T
    transformedHn = (T.dot(self.W_h, finalHypothesisOutput.T)).T
    repeatedHn = [transformedHn] * numTimestepsPremise  # TODO: Condense this later if it works
    repeatedHn = T.stacklists(repeatedHn)
    repeatedHn = repeatedHn.dimshuffle(1, 0, 2)  # (numSample, timestep, dimHidden)
    M = T.tanh(WyY + repeatedHn)
    # Hackery to make into 2d tensor of (numSamp, timestep)
    alpha = T.nnet.softmax(T.dot(M, self.w).flatten(2))
    Y = Y.dimshuffle(0, 2, 1)
    rOut, updates = theano.scan(
        fn=lambda Yt, alphat: T.dot(Yt, alphat),
        outputs_info=None,
        sequences=[Y, alpha],
        non_sequences=None
    )
    WxHn = T.dot(finalHypothesisOutput, self.W_x)
    WpR = T.dot(rOut, self.W_p)
    hstar = T.tanh(WxHn + WpR)
    return hstar
def f(state: np.ndarray, action: np.ndarray) -> tt.Tensor:
    return tt.stacklists([
        state[3] * tt.cos(state[2]),
        state[3] * tt.sin(state[2]),
        state[3] * action[0],
        action[1] - state[3] * friction,
    ])
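# A minimal sketch of compiling the dynamics above into a callable (hedged:
# the `friction` value and the state/action layout below are illustrative
# assumptions, not taken from the original code).
import numpy as np
import theano
import theano.tensor as tt

friction = 0.1  # assumed constant
x = tt.dvector('x')  # state vector
u = tt.dvector('u')  # action vector
xdot = theano.function([x, u], f(x, u))
print(xdot(np.array([0., 0., 0., 1.]), np.array([0.1, 0.5])))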
def _compute_nary_hessian_vector_product(self, gradients, arguments):
    """Returns a function accepting `2 * len(arguments)` arguments to compute
    a Hessian-vector product of a multivariate function.

    Notes
    -----
    The implementation is based on TensorFlow's '_hessian_vector_product'
    function in 'tensorflow.python.ops.gradients_impl'.
    """
    argument_types = [argument.type() for argument in arguments]
    try:
        Rop = T.Rop(gradients, arguments, argument_types)
    except NotImplementedError:
        proj = [
            T.sum(gradient * disconnected_grad(argument_type))
            for gradient, argument_type in zip(gradients, argument_types)
        ]
        proj_grad = [
            T.grad(proj_elem, arguments, disconnected_inputs="ignore",
                   return_disconnected="None")
            for proj_elem in proj
        ]
        proj_grad_transpose = map(list, zip(*proj_grad))
        proj_grad_stack = [
            T.stacklists([c for c in row if c is not None])
            for row in proj_grad_transpose
        ]
        Rop = [T.sum(stack, axis=0) for stack in proj_grad_stack]
    return self._compile_function_without_warnings(
        list(itertools.chain(arguments, argument_types)), Rop)
def memnn_cost(self, statements, question, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=statements,
        outputs_info=[
            #alloc_zeros_matrix(self.weights.shape[0])
            #alloc_zeros_matrix(self.weights.shape[0]), self.n_embedding)
            alloc_zeros_matrix(self.weights.shape[0], 4800)  # init as 3
            #alloc_zeros_matrix(self.weights.shape[0], 4800, 4)  # init as 4
            #alloc_zeros_matrix(4)
            #alloc_zeros_matrix(110, 4800)
        ],
        non_sequences=[
            #self.weights.dimshuffle(1, 0, 2),
            self.weights,
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    #memories = computed_memories
    #memories = T.stacklists(computed_memories)
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)
    #print computed_memories.shape[0]

    # Embed question
    #u1 = T.sum(self.weights[0][question], axis=0)
    #u1 = [question]
    u1 = question
    #u1 = u1.astype(np.float64)
    #sv = skipthoughts.encode(model, sentence)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    print "memnn_cost running"
    return output[0]
def solve_theano(self, A, bi):
    # closed-form inverse of a 2x2 matrix
    a = A[0, 0]
    b = A[0, 1]
    c = A[1, 0]
    d = A[1, 1]
    A_inv = T.stacklists([[d, -b], [-c, a]]) / (a * d - b * c)
    return T.dot(A_inv, bi).squeeze()
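# A small self-contained check of the closed-form 2x2 solve used above
# (hedged sketch: recomputes the same expression without the class context,
# with illustrative inputs).
import numpy as np
import theano
import theano.tensor as tt

A = tt.dmatrix('A')
b = tt.dvector('b')
a_, b_, c_, d_ = A[0, 0], A[0, 1], A[1, 0], A[1, 1]
A_inv = tt.stacklists([[d_, -b_], [-c_, a_]]) / (a_ * d_ - b_ * c_)
solve2 = theano.function([A, b], tt.dot(A_inv, b))
An = np.array([[3., 1.], [1., 2.]])
bn = np.array([9., 8.])
print(np.allclose(solve2(An, bn), np.linalg.solve(An, bn)))  # expect True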
def logp_(value):
    aux = tt.zeros((n_samples, 1))
    pi = [tt.sum(tt.eq(aux3, aux + cat), axis=1) / 8.0 for cat in range(K)]
    logps = [((pi[i] - 1) * 2 + mv.logp(value)) - tt.sum((pi[i] - 1) * 2 + mv.logp(value))
             for i, mv in enumerate(mus)]
    return tt.sum(tt.stacklists(logps), axis=0)
def __call__(self, X):
    #out = self.W[:, X]
    def step(x):
        return self.W[:x]
    stk = theano.map(lambda x: self.W[x], X)
    out = T.stacklists(stk[0])
    #return out.dimshuffle('x', 'x', 0, 1)
    return out
def __init__(self, rng, input, vocab_size, embed_dm, embeddings=None):
    """
    input: theano.tensor.dmatrix, (number of instances, sentence word number)
    vocab_size: integer, the size of vocabulary
    embed_dm: integer, the dimension of word vector representation
    embeddings: theano.tensor.TensorType, pretrained embeddings
    """
    if embeddings:
        print "Use pretrained embeddings: ON"
        assert embeddings.get_value().shape == (vocab_size, embed_dm), \
            "%r != %r" % (embeddings.get_value().shape, (vocab_size, embed_dm))
        self.embeddings = embeddings
    else:
        print "Use pretrained embeddings: OFF"
        embedding_val = np.asarray(rng.normal(0, 0.05, size=(vocab_size, embed_dm)),
                                   dtype=theano.config.floatX)
        # the <PADDING> character is initialized to 0
        embedding_val[vocab_size - 1, :] = 0
        self.embeddings = theano.shared(np.asarray(embedding_val,
                                                   dtype=theano.config.floatX),
                                        borrow=True, name='embeddings')

    self.params = [self.embeddings]
    self.param_shapes = [(vocab_size, embed_dm)]

    # Return:
    # :type, theano.tensor.tensor4
    # :param, dimension (1, 1, word embedding dimension, number of words in sentence)
    # made to be 4D to fit into the dimension of the convolution operation
    sent_embedding_list, updates = theano.map(
        lambda sent: self.embeddings[sent], input)
    sent_embedding_tensor = T.stacklists(sent_embedding_list)  # make it into a 3D tensor
    self.output = sent_embedding_tensor.dimshuffle(0, 'x', 2, 1)  # make it a 4D tensor
def step(self, x_t, H_x, H_y, M_x, M_y, W_i, W_f, W_o, W_c):
    #H_t = T.ones_like(H_x)
    #M_t = T.ones_like(H_x)
    #H = T.ones_like(H_x)
    H = T.stacklists([x_t, H_x[1], H_y[2]])
    M = T.stacklists([x_t, M_x[1], M_y[2]])
    for i in range(self.n_dim):
        (H_temp, M_temp) = self.LTSM(H, M[i], W_i[i], W_f[i], W_o[i], W_c[i])
        if i == 0:
            H_t = H_temp
            M_t = M_temp
        else:
            H_t = T.concatenate([H_t, H_temp], axis=0)
            M_t = T.concatenate([M_t, M_temp], axis=0)
    return H_t, M_t
def update_fun(param, grad, penaltyparam, dataset, history, opt, params,
               globalLR1, globalLR2, momentParam1, momentParam2):
    epsilon = np.asarray(0.0, dtype=theano.config.floatX)

    def separateLR(params, sharedName, globalLR1, globalLR2):
        sharedName = sharedName[:-2]
        customizedLR = globalLR2
        if (sharedName in params.rglrzLR.keys()) or (not params.adaptT2LR):
            customizedLR = globalLR2 * params.rglrzLR[sharedName]
        return customizedLR

    assert dataset in ['T1', 'T2']
    lr = globalLR1 if dataset == 'T1' else separateLR(params, param.name, globalLR1, globalLR2)

    # Standard update
    if opt is None:
        updates = []
        if params.trackGrads:
            old_grad = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                                     broadcastable=param.broadcastable,
                                     name='oldgrad_%s' % param.name)
            updates += [(old_grad, grad)]
            grad_mean = T.mean(T.sqrt(grad**2))
            grad_rel = T.mean(T.sqrt((grad/(param+1e-12))**2))
            grad_angle = T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+1e-12)
            check = T.stacklists([grad_mean, grad_rel, grad_angle])
            other = [grad]
        else:
            check = grad
            other = [grad]
        up = -lr * grad
    else:
        up, updates, check, other = opt.up(param, grad, params, lr=lr, dataset=dataset)

    # dictionary param to grad (first time around)
    if params.useT2 and dataset == 'T1':
        history['grad'][param] = grad
        history['up'][param] = up

    # add momentum to update
    if params.use_momentum:
        oldup = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                              broadcastable=param.broadcastable,
                              name='oldup_%s' % param.name)
        momentParam = momentParam1 if dataset == 'T1' else momentParam2
        up += momentParam * oldup
        updates += [(oldup, up)]

    # New parameter
    newparam = param + up

    # min value | NOTE assumption: all hyperparams can only be positive
    if dataset == 'T2':
        newparam = T.maximum(epsilon, newparam)

    updates += [(param, newparam)]
    paramUpPair = [(param, check)]
    adamGrad = [other]
    return updates, paramUpPair, adamGrad
def step(*args):
    """
    z_tmp, ..., z_tm1 \in R^{1, n_hidden}
    """
    z_stack = T.stacklists(args)
    z_merge = z_stack * self.W
    z_t = T.sum(z_merge, axis=0)
    y_t = T.dot(z_t, self.W_o) + self.b_o
    return z_t, y_t
def f(x, u):
    return tt.stacklists([
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * tt.cos(x[2]) + x[0],
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * tt.sin(x[2]) + x[1],
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * u[0] + x[2],
        (u[1] - friction * x[3]**2) * dt + x[3]
    ])
def TestStack():
    x = T.matrix('x')
    y = T.matrix('y')
    z = T.matrix('z')
    f = theano.function([x, y, z], T.stacklists([x, y, z]))
    a = np.ones((5, 4), dtype=np.float32)
    b = np.ones((5, 4), dtype=np.float32)
    c = np.ones((5, 4), dtype=np.float32)
    d = f(a, b, c)
    print(d.shape)
    print(d)
def jacobian(f, x, constants=[]):
    #sz = shape(f)  # this produced a bug
    #sz = shape(f)[0]  # alternative formulation found later in code, should get the same result
    sz = int(shape(f))  # put in in response to bug. This seems to work
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
    # Unreachable alternative implementation kept from the original:
    # ret = th.gradient.jacobian(f, x, consider_constant=constants)
    # if isinstance(ret, list):
    #     ret = tt.concatenate(ret, axis=1)
    # return ret
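# A minimal sketch of the same stacklists-based Jacobian built directly with
# theano (hedged: `shape` and `grad` in the helper above come from its
# enclosing module; plain theano calls and illustrative costs are used here).
import theano
import theano.tensor as tt

x = tt.dvector('x')
fs = [tt.sum(x ** 2), tt.prod(x)]  # two scalar outputs
J = tt.stacklists([tt.grad(fi, x) for fi in fs])  # one gradient per row
jac = theano.function([x], J)
print(jac([1., 2., 3.]))  # 2x3 Jacobian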
def compute_hessian(self, objective, argument):
    """
    Computes the directional derivative of the gradient (which is equal to
    the Hessian multiplied by direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same
    # dimensionality) as argument.
    is_product_manifold = isinstance(argument, (list, tuple))
    if not is_product_manifold:
        A = argument.type()
    else:
        A = [arg.type() for arg in argument]

    # First attempt efficient 'R-op', this directly calculates the
    # directional derivative of the gradient.
    try:
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        # Implementation based on
        # tensorflow.python.ops.gradients_impl._hessian_vector_product
        if not is_product_manifold:
            proj = T.sum(g * disconnected_grad(A))
            R = T.grad(proj, argument)
        else:
            proj = [T.sum(g_elem * disconnected_grad(a_elem))
                    for g_elem, a_elem in zip(g, A)]
            proj_grad = [T.grad(proj_elem, argument,
                                disconnected_inputs="ignore",
                                return_disconnected="None")
                         for proj_elem in proj]
            proj_grad_transpose = map(list, zip(*proj_grad))
            proj_grad_stack = [T.stacklists([c for c in row if c is not None])
                               for row in proj_grad_transpose]
            R = [T.sum(stack, axis=0) for stack in proj_grad_stack]

    if not is_product_manifold:
        hess = theano.function([argument, A], R, on_unused_input="warn")
    else:
        hess_prod = theano.function(argument + A, R, on_unused_input="warn")

        def hess(x, a):
            return hess_prod(*(x + a))

    return hess
def convert2class(self, y_hat, y):
    # y_hat = T.set_subtensor(y_hat[(y_hat < 1).nonzero()], 0)
    # y_hat = T.set_subtensor(y_hat[(y_hat >= 1).nonzero()], 1)
    # y_hat = T.stacklists([y_hat[:, 0] + y_hat[:, 1], y_hat[:, 2] + y_hat[:, 3] + y_hat[:, 4]])
    y_hat = T.stacklists([T.sum(y_hat[:, 0:2], axis=1),
                          T.sum(y_hat[:, 2:], axis=1)]).T
    y_hat = T.argmax(y_hat, axis=1)
    # y_hat = T.set_subtensor(y_hat[(y_hat < 2).nonzero()], 0)
    # y_hat = T.set_subtensor(y_hat[(y_hat >= 2).nonzero()], 1)
    y = T.set_subtensor(y[(y < 2).nonzero()], 0)
    y = T.set_subtensor(y[(y >= 2).nonzero()], 1)
    return [y_hat, y]
def special_SaP_noise_4_jyc(rng, input, corruption_level):
    # salt and pepper noise
    print 'DAE uses salt and pepper noise'
    a = MRG.binomial(size=input.shape, n=1,
                     p=1 - corruption_level, dtype=theano.config.floatX)
    b = MRG.binomial(size=input.shape, n=1,
                     p=corruption_level, dtype=theano.config.floatX)
    c = T.eq(a, 0) * b
    mask = -a + c
    CX = input * a + c
    return T.stacklists([CX, mask])
def get_samples(self, num_sam, scanF=True):
    """
    Retrieves the samples for the current time step.
    Uncomment parts when the time step changes.
    """
    print 'Get_sample func: Number of steps iterate over ::: %d' % self.num_steps
    H_Ct = T.alloc(0., num_sam, self.dim_sample)
    #Z = MRG.normal(size=(num_sam, self.dim_sample), avg=0., std=1.)
    Zs = MRG.normal(size=(self.num_steps, num_sam, self.dim_sample), avg=0., std=1.)
    Canvases = self.apply_recurrence(self.num_steps, Zs, H_Ct)
    C = T.sum(T.stacklists(Canvases), axis=0)
    return activation_fn_th(C, atype='sigmoid'), Canvases
def memnn_cost(self, statements, question, ans, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
        ],
        non_sequences=[
            self.weights.dimshuffle(1, 0, 2),
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = T.sum(self.weights[0][question], axis=0)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Score answers
    u4 = o3 + T.dot(u3, self.H)

    # Embed answers
    a1 = T.sum(self.A[ans[0]], axis=0)
    a2 = T.sum(self.A[ans[1]], axis=0)
    a3 = T.sum(self.A[ans[2]], axis=0)
    a4 = T.sum(self.A[ans[3]], axis=0)
    a = T.stack(a1, a2, a3, a4)

    scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
    output = T.nnet.softmax(scores)
    return output[0]
def _output(self, input, *args, **kwargs):
    # randomly roll each image along both spatial axes
    x = srng.uniform(size=(self.batch_size,), high=self.img_size)
    y = srng.uniform(size=(self.batch_size,), high=self.img_size)
    x = T.cast(x, 'int32')
    y = T.cast(y, 'int32')
    r = []
    for i in range(self.batch_size):
        item = input[i]
        item = T.concatenate([item[:, x[i]:, :], item[:, :x[i], :]], axis=1)
        item = T.concatenate([item[:, :, y[i]:], item[:, :, :y[i]]], axis=2)
        r.append(item)
    r = T.stacklists(r)
    return r
def ff_step(single_q, single_m, ev1, ev2, ev3, evo, single_proj):
    qemb_t = tensor.dot(single_q, tparams['ff_q_emb'])

    # layer 1 (normalized with gamma/beta, batch statistics at train time,
    # running statistics ev1 at test time)
    l1_t_linear = tensor.dot(single_proj, tparams['W_ff_h1']) + tensor.dot(qemb_t, tparams['W_ff_q'])
    print 'l1_t_linear.ndim: %d' % (l1_t_linear.ndim)
    e_l1_t_ = l1_t_linear.mean(axis=0)
    print 'e_l1_t_.ndim: %d' % (e_l1_t_.ndim)
    v_l1_t_ = ((l1_t_linear - e_l1_t_) ** 2).mean(axis=0)
    print 'v_l1_t_.ndim: %d' % (v_l1_t_.ndim)
    e_l1_t = tensor.switch(use_noise, e_l1_t_, ev1[0])
    print 'ev1[0].ndim: %d' % (ev1[0].ndim)
    print 'e_l1_t.ndim: %d' % (e_l1_t.ndim)
    v_l1_t = tensor.switch(use_noise, v_l1_t_, ev1[1])
    print 'ev1[1].ndim: %d' % (ev1[1].ndim)
    print 'v_l1_t.ndim: %d' % (v_l1_t.ndim)
    l1_t_hat = tparams['gamma_l1'] * ((l1_t_linear - e_l1_t) / (v_l1_t + 0.0001) ** 0.5) + tparams['b_ff_h1']
    print 'l1_t_hat.ndim: %d' % (l1_t_hat.ndim)
    h1_t = tensor.nnet.sigmoid(l1_t_hat)
    print 'h1_t.ndim: %d' % (h1_t.ndim)

    # layer 2
    l2_t_linear = tensor.dot(h1_t, tparams['W_ff_h2'])
    e_l2_t_ = l2_t_linear.mean(axis=0)
    v_l2_t_ = ((l2_t_linear - e_l2_t_) ** 2).mean(axis=0)
    e_l2_t = tensor.switch(use_noise, e_l2_t_, ev2[0])
    v_l2_t = tensor.switch(use_noise, v_l2_t_, ev2[1])
    l2_t_hat = tparams['gamma_l2'] * ((l2_t_linear - e_l2_t) / (v_l2_t + 0.0001) ** 0.5) + tparams['b_l2']
    h2_t = tensor.nnet.sigmoid(l2_t_hat)

    # layer 3
    l3_t_linear = tensor.dot(h2_t, tparams['W_ff_h3'])
    e_l3_t_ = l3_t_linear.mean(axis=0)
    v_l3_t_ = ((l3_t_linear - e_l3_t_) ** 2).mean(axis=0)
    e_l3_t = tensor.switch(use_noise, e_l3_t_, ev3[0])
    v_l3_t = tensor.switch(use_noise, v_l3_t_, ev3[1])
    l3_t_hat = tparams['gamma_l3'] * ((l3_t_linear - e_l3_t) / (v_l3_t + 0.0001) ** 0.5) + tparams['b_l3']
    h3_t = tensor.nnet.softplus(l3_t_hat)

    # output layer
    o_t_linear = tensor.dot(h3_t, tparams['W_ff_o'])
    e_o_t_ = o_t_linear.mean(axis=0)
    v_o_t_ = ((o_t_linear - e_o_t_) ** 2).mean(axis=0)
    e_o_t = tensor.switch(use_noise, e_o_t_, evo[0])
    v_o_t = tensor.switch(use_noise, v_o_t_, evo[1])
    o_t_hat = tparams['gamma_o'] * ((o_t_linear - e_o_t) / (v_o_t + 0.0001) ** 0.5) + tparams['b_ff_o']
    o_t = o_t_hat * single_m[:, None]

    return (o_t, qemb_t, single_proj, h1_t, h2_t, h3_t,
            tensor.stacklists([e_l1_t, v_l1_t]), tensor.stacklists([e_l2_t, v_l2_t]),
            tensor.stacklists([e_l3_t, v_l3_t]), tensor.stacklists([e_o_t, v_o_t]))
def __init__(self, input_train, input_test, input_shape, seq_max_len, n_out=10):
    super(SequenceSoftmax, self).__init__(None, input_train, input_test)
    self.n_softmax = seq_max_len + 1
    self.input_shape = input_shape
    n_in = np.prod(input_shape[1:])
    self.n_out = n_out

    # generate n_softmax W matrices
    def gen_W(out, k):
        return theano.shared(value=np.zeros((n_in, out), dtype=theano.config.floatX),
                             name='W' + str(k), borrow=True)
    self.Ws = [gen_W(seq_max_len, 0)]
    self.Ws.extend([gen_W(self.n_out, _ + 1) for _ in range(seq_max_len)])

    # generate n_softmax b vectors
    def gen_b(out, k):
        return theano.shared(value=np.zeros((out,), dtype=theano.config.floatX),
                             name='b' + str(k), borrow=True)
    self.bs = [gen_b(seq_max_len, 0)]
    self.bs.extend([gen_b(n_out, _ + 1) for _ in range(seq_max_len)])

    assert len(self.Ws) == self.n_softmax
    assert len(self.bs) == self.n_softmax

    # p_y_given_x[k]: kth output for all y, each of size (batch_size * n_out)
    self.p_y_given_x = [T.nnet.softmax(T.dot(self.input_test, self.Ws[k]) + self.bs[k])
                        for k in xrange(self.n_softmax)]
    # self.pred[idx]: output labels of the 'idx' input
    self.pred = [T.argmax(self.p_y_given_x[k], axis=1) for k in xrange(self.n_softmax)]
    self.pred = T.stacklists(self.pred).dimshuffle(1, 0)
    if self.has_dropout_input:
        self.p_y_given_x = [T.nnet.softmax(T.dot(self.input_train, self.Ws[k]) + self.bs[k])
                            for k in xrange(self.n_softmax)]

    self.params = copy(self.Ws)
    self.params.extend(self.bs)
def memnn_cost(self, statements, question, pe_matrix):
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=statements,
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], 4800)  # init as 3
        ],
        non_sequences=[
            #self.weights.dimshuffle(1, 0, 2),
            self.weights,
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = question
    #u1 = weights[0] * question
    #sv = skipthoughts.encode(model, sentence)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    print "memnn_cost running"
    return output[0]
def __init__(self, input_list, n_in, n_out, n_total, mask, batch, W=None, b=None, M=None):
    w = np.zeros((n_in, n_out))
    np.fill_diagonal(w, 1)
    if W is None:
        #W = theano.shared(np.random.randn(n_in, n_out).astype(dtype=theano.config.floatX)/np.sqrt(n_in))
        W = theano.shared(w.astype(dtype=theano.config.floatX)/np.sqrt(n_in))
    if b is None:
        b = theano.shared(np.zeros(n_out).astype(dtype=theano.config.floatX))
    if M is None:
        M = theano.shared(0.5 * np.ones((n_total, 2)).astype(dtype=theano.config.floatX))
    self.W = W
    self.b = b
    self.M = M
    self.v_W = theano.shared(np.zeros((n_in, n_out)).astype(dtype=theano.config.floatX))
    self.v_b = theano.shared(np.zeros(n_out).astype(dtype=theano.config.floatX))
    self.v_M = theano.shared(np.zeros((n_total, 2)).astype(dtype=theano.config.floatX))
    self.input_list = input_list
    self.input_list[0] = self.input_list[0]
    self.input_list[1] = (self.input_list[1])[::-1]

    '''
    def Merge(input_seq1, input_seq2, merger):
        return T.dot((input_seq1 * merger[0] + input_seq2 * merger[1]), self.W) + self.b
    self.temp_y = a.softmax((theano.scan(Merge, sequences=[self.input_list[0], self.input_list[1], self.M], outputs_info=None))[0])
    '''
    def Merge(input_seq1, input_seq2):
        return T.dot((input_seq1 * 1 + input_seq2 * 0), self.W) + self.b
    self.temp_y = a.softmax((theano.scan(Merge, sequences=[self.input_list[0], self.input_list[1]], outputs_info=None))[0])
    self.temp_y = self.temp_y.dimshuffle(1, 0, 2)
    self.mask = mask
    self.batch = batch
    y_pred_list = []
    for i in range(self.batch):
        y_pred_list.append(T.set_subtensor(T.argmax(self.temp_y[i], axis=1)[self.mask[i]:], 0))
    self.y_pred = T.stacklists(y_pred_list)
    self.params = [self.W, self.b, self.M]
    self.velo = [self.v_W, self.v_b, self.v_M]
def grad_monitor(param, grad, updates, params, opt, g_t=0., m=0., v=0., e=1e-10):
    zero = np.float32(0.)
    eps = 1e-10
    old_grad = theano.shared(np.float32(param.get_value()) * zero,
                             name="old_grad_%s" % param.name)
    updates.append((old_grad, grad))
    sharedName, _ = param.name.split('_')

    # tracked gradient values when adaptive learning rate
    if opt == 'adam':
        old_g_t = m / (T.sqrt(v) + e)
        all_grads = {
            'grad': T.mean(T.sqrt(grad**2)),
            #'grad_rel': T.mean(T.sqrt((grad/(param+1e-12))**2)),
            'grad_angle': T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+eps),
            #'grad_max': T.max(T.sqrt(grad**2)),
            'p_t': T.mean(T.sqrt((g_t)**2)),
            #'p_t_rel': T.mean(T.sqrt((g_t/(param+1e-12))**2)),
            'p_t_angle': T.sum(g_t*old_g_t)/(T.sqrt(T.sum(g_t**2))*T.sqrt(T.sum(old_g_t**2)+eps)),
            #'p_t_max': T.max(T.sqrt(grad**2))
        }
    # tracked gradient values when regular SGD (+momentum)
    elif opt is None:
        all_grads = {
            'grad': T.mean(T.sqrt(grad**2)),
            #'grad_rel': T.mean(T.sqrt((grad/(param+1e-12))**2)),
            'grad_angle': T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+eps),
            #'grad_max': T.max(T.sqrt(grad**2))
        }

    # store tracked grads for output
    temp = []
    if params.listGrads == 'all':
        for grad_type in all_grads.keys():
            temp += [all_grads[grad_type]]
    else:
        for grad_type in filter(lambda name: name in all_grads.keys(), params.listGrads):
            temp += [all_grads[grad_type]]
    trackGrads = T.stacklists(temp)

    return updates, trackGrads
def get_aggregator(self):
    initialized = shared_like(0.)
    numerator_acc = shared_like(self.numerator)
    denominator_acc = shared_like(self.denominator)
    squared_num_acc = shared_like(self.squared_num)

    conditional_update_num = ifelse(initialized,
                                    self.numerator + numerator_acc,
                                    self.numerator)
    conditional_update_den = ifelse(initialized,
                                    self.denominator + denominator_acc,
                                    self.denominator)
    conditional_update_sqn = ifelse(initialized,
                                    self.squared_num + squared_num_acc,
                                    self.squared_num)

    initialization_updates = [(numerator_acc, tensor.zeros_like(numerator_acc)),
                              (denominator_acc, tensor.zeros_like(denominator_acc)),
                              (squared_num_acc, tensor.zeros_like(squared_num_acc)),
                              (initialized, 0.)]
    accumulation_updates = [(numerator_acc, conditional_update_num),
                            (denominator_acc, conditional_update_den),
                            (squared_num_acc, conditional_update_sqn),
                            (initialized, 1.)]
    # readout is [mean, variance], with the variance computed as E[x^2] - E[x]^2
    readout_variable = tensor.stacklists(
        [(numerator_acc / denominator_acc),
         ((squared_num_acc / denominator_acc) - (numerator_acc / denominator_acc)**2)])

    aggregator = Aggregator(aggregation_scheme=self,
                            initialization_updates=initialization_updates,
                            accumulation_updates=accumulation_updates,
                            readout_variable=readout_variable)
    return aggregator
def multi_grad(costs, params):
    """
    Computes the gradient for several different costs separately and provides
    a rank+1 parameter gradient for each gradient with dimension 0
    corresponding to a different cost.
    """
    all_grads = []
    for param in params:
        if len(costs) > 1:
            param_grads = []
            for cost in costs:
                gparam = T.grad(cost, param)
                param_grads.append(gparam)
            all_grads.append(T.stacklists(param_grads))
        else:
            gparam = T.grad(costs[0], param)
            all_grads.append(gparam)
    return all_grads
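# A small usage sketch for multi_grad (hedged: the shared variable and the
# two costs below are illustrative): with more than one cost, each parameter
# gets a stacked gradient with one row per cost.
import numpy as np
import theano
import theano.tensor as tt

w = theano.shared(np.array([1., 2.]), name='w')
costs = [tt.sum(w ** 2), tt.sum(w)]
grads = multi_grad(costs, [w])
f = theano.function([], grads)
print(f()[0])  # rows: d(sum(w^2))/dw = 2w, d(sum(w))/dw = 1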
def memnn_cost(self, statements, question, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
        ],
        non_sequences=[
            self.weights.dimshuffle(1, 0, 2),
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = T.sum(self.weights[0][question], axis=0)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    return output[0]
def renet_layer_lr_noscan(X, rnn1, rnn2, w, h, wp, hp):
    list_of_images = []
    for i in xrange(h/hp):
        # x = X[:,i*hp:(i*hp + hp),:].dimshuffle((2, 0, 1)).flatten().reshape((w/wp, X.shape[0]*wp*hp))
        h_tm1 = rnn1.H0
        hr_tm1 = rnn2.H0
        h1 = []
        h2 = []
        for j in xrange(w/wp):
            x = X[:,i*hp:(i*hp + hp),j*wp:(j*wp + wp)].flatten()
            h_t = rnn1.recurrence(x, h_tm1)
            h1.append(h_t)
            h_tm1 = h_t

            jr = w/wp - j - 1
            xr = X[:,i*hp:(i*hp + hp),jr*wp:(jr*wp + wp)].flatten()
            hr_t = rnn2.recurrence(xr, hr_tm1)
            h2.append(hr_t)
            hr_tm1 = hr_t

        img = T.concatenate([h1, h2])
        list_of_images.append(img)

    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def grid_lstm_cube(tparams, origin_data, options, prefix='lstm', mask=None):
    # size_1 = origin_data.shape[0]
    size_1 = options['grid_depth_1']
    size_2 = options['grid_depth_2']
    size_3 = options['grid_depth_3']
    dim_hidden = options['dim_hidden']
    if origin_data.ndim == 3:
        n_samples = origin_data.shape[1]
    else:
        n_samples = 1
    assert mask is not None

    input_data = tensor.dot(origin_data, tparams[_p(prefix, 'W')])

    h_list_all = []  # four dim tensor of hidden states
    c_list_all = []
    h_input_all = []
    for i in range(size_1):
        h_list_all.append([])
        c_list_all.append([])
        for j in range(size_2):
            h_list_all[i].append([])
            c_list_all[i].append([])
            for k in range(size_3):
                #print i, j, k
                if i < 1:
                    h_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    c_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_1 = h_list_all[i-1][j][k][0]
                    c_1 = c_list_all[i-1][j][k][0]
                if j < 1:
                    c_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    #if k >= 1:
                    #    h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    #else:
                    #    h_2 = input_data[i]
                    #    h_input_all.append(h_2)
                    h_2 = input_data[i]
                    h_input_all.append(h_2)
                    #h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_2 = h_list_all[i][j-1][k][1]
                    c_2 = c_list_all[i][j-1][k][1]
                if k < 1:
                    c_3 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    h_3 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_3 = h_list_all[i][j][k-1][2]
                    c_3 = c_list_all[i][j][k-1][2]

                h1, h2, h3, c1, c2, c3 = grid_lstm_block(tparams, h_1, h_2, h_3,
                                                         c_1, c_2, c_3, options,
                                                         prefix, mask[i, :])
                #print h1.ndim, h2.ndim, h3.ndim
                h_list_sides = tensor.stack([h1, h2, h3])
                #print h_list_sides.ndim
                h_list_all[i][j].append(h_list_sides)
                c_list_sides = tensor.stack([c1, c2, c3])
                c_list_all[i][j].append(c_list_sides)

    # h_list_all: first three indices are the cube position; the last is the
    # output dimension of that block, from 0 to 2.
    out_list = [h_list_all[i][-1][-1][2] for i in range(size_1)]
    print 'every h to stack is in dim: %d' % (h_list_all[-1][-1][-1][2].ndim)
    proj = tensor.stack(out_list)
    print 'proj.ndim is %d' % (proj.ndim)
    all_medium_states = tensor.stacklists(h_list_all)
    print 'all_medium_states.ndim is %d' % (all_medium_states.ndim)
    h_input_all = tensor.stacklists(h_input_all)
    return proj, all_medium_states, h_input_all
def logp_(value):
    logps = [tt.log(pi[k]) + logp_normal(mus[k], taus[k], value)
             for k in range(K)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
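# The logp_ closures above call a `logsumexp` helper that the snippets do not
# define. A standard numerically stable version (an assumption matching
# pm.math.logsumexp, not necessarily the original helper) would be:
import theano.tensor as tt

def logsumexp(x, axis=None):
    # subtract the max before exponentiating to avoid overflow
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max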
def grid_lstm_cube(use_noise, population, tparams, origin_data, options, prefix='lstm', mask=None):
    # size_1 = origin_data.shape[0]
    size_1 = options['grid_depth_1']
    size_2 = options['grid_depth_2']
    # size_3 = options['grid_depth_3']
    dim_hidden = options['dim_hidden']
    if origin_data.ndim == 3:
        n_samples = origin_data.shape[1]
    else:
        n_samples = 1
    assert mask is not None

    input_data = tensor.dot(origin_data, tparams[_p(prefix, 'W')])

    h_list_all = []  # four dim tensor of hidden states
    c_list_all = []
    h_input_all = []
    bnstates_all = []
    for i in range(size_1):
        h_list_all.append([])
        c_list_all.append([])
        bnstates_all.append([])
        for j in range(size_2):
            if i < 1:
                h_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                c_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
            else:
                h_1 = h_list_all[i-1][j][0]
                c_1 = c_list_all[i-1][j][0]
            if j < 1:
                c_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                #if k >= 1:
                #    h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                #else:
                #    h_2 = input_data[i]
                #    h_input_all.append(h_2)
                #    #h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                h_2 = input_data[i]
                h_input_all.append(h_2)
            else:
                h_2 = h_list_all[i][j-1][1]
                c_2 = c_list_all[i][j-1][1]

            h1, h2, c1, c2, bnstates = grid_lstm_block(use_noise, population, i, j,
                                                       tparams, h_1, h_2, c_1, c_2,
                                                       options, prefix, mask[i, :])
            #print h1.ndim, h2.ndim, h3.ndim
            h_list_sides = tensor.stacklists([h1, h2])
            #print h_list_sides.ndim
            h_list_all[i].append(h_list_sides)
            c_list_sides = tensor.stacklists([c1, c2])
            c_list_all[i].append(c_list_sides)
            print type(bnstates)
            print 'bnstates[1].ndim is %d' % (bnstates[1].ndim)
            bnstates_ = tensor.stacklists(bnstates)
            bnstates_all[i].append(bnstates_)

    # h_list_all: first two indices are the grid position; the last is the
    # output dimension of that block, from 0 to 1.
    out_list_1 = [h_list_all[i][-1][1] for i in range(size_1)]
    out_list_0 = [h_list_all[i][-1][0] for i in range(size_1)]
    out_list_c1 = [c_list_all[i][-1][1] for i in range(size_1)]
    out_list_c0 = [c_list_all[i][-1][0] for i in range(size_1)]
    print 'every h to stacklists is in dim: %d' % (h_list_all[-1][-1][1].ndim)
    proj_h1 = tensor.stacklists(out_list_1)
    proj_h0 = tensor.stacklists(out_list_0)
    proj_c1 = tensor.stacklists(out_list_c1)
    proj_c0 = tensor.stacklists(out_list_c0)
    proj = tensor.concatenate([proj_h1, proj_h0, proj_c1, proj_c0], axis=2)
    print 'proj.ndim is %d' % (proj.ndim)
    all_medium_states = tensor.stacklists(h_list_all)
    all_bn_states = tensor.stacklists(bnstates_all)
    print 'all_medium_states.ndim is %d' % (all_medium_states.ndim)
    h_input_all = tensor.stacklists(h_input_all)
    return proj, all_medium_states, h_input_all, all_bn_states
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {None: 0} self.ivocab = {0: None} self.word2vec = word2vec self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in self.max_inp_sent_len = 0 self.max_q_len = 0 """ #To Use All Vocab self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 
'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #self.vocab = {'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 
88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} #self.ivocab = {1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 
    self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw)
    self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw)
    self.vocab_size = len(self.vocab)

    # symbolic inputs: a story is an int matrix of word ids (one row per
    # sentence), a question is an int vector, the answer is a single word id
    self.input_var = T.imatrix('input_var')
    self.q_var = T.ivector('question_var')
    self.answer_var = T.iscalar('answer_var')
    self.input_mask_var = T.ivector('input_mask_var')
    self.attentions = []

    self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
    self.pe_matrix_q = self.pe_matrix(self.max_q_len)

    print "==> building input module"
    # positional encoder weights (word embeddings)
    self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))

    # biGRU input fusion weights: one set of reset/update/hidden (res/upd/hid)
    # GRU matrices per direction
    self.W_inp_res_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_upd_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_hid_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_res_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_upd_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_hid_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))

    # encode each sentence of the story into a single vector via the
    # positional-encoding sum, then fuse the sentence vectors with the
    # bidirectional GRU defined above
    self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in,
                                        sequences=self.input_var)
    self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)
    #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked)
    self.inp_c = self.input_module_full(self.inp_sent_reps)

    self.q_q = self.sum_pos_encodings_q(self.q_var)
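    # Hedged sketch (assumption -- `pe_matrix` is defined elsewhere in this
    # class): the positional encoding used by DMN+ (Xiong et al., 2016;
    # originally from end-to-end memory networks) weights word j of an M-word
    # sentence in embedding dimension k of D as
    #
    #   l[j, k] = (1 - j/M) - (k/D) * (1 - 2*j/M)    (1-indexed j, k)
    #
    # A NumPy reference implementation would look like:
    #
    #   def pe_matrix_reference(num_words, dim):
    #       pe = np.zeros((num_words, dim), dtype=floatX)
    #       for j in range(1, num_words + 1):
    #           for k in range(1, dim + 1):
    #               pe[j - 1, k - 1] = ((1.0 - float(j) / num_words)
    #                                   - (float(k) / dim) * (1.0 - 2.0 * float(j) / num_words))
    #       return pe
    #
    # The sentence vector is then sum_j l[j] * W_pe[word_j], which is
    # presumably what sum_pos_encodings_in / sum_pos_encodings_q compute.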
    print "==> creating parameters for memory module"
    # untied memory weights: a separate GRU weight slice per memory hop
    self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
    # attention gate weights; the gate's input feature vector has size 4 * dim
    self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim))
    self.W_2 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, 1, self.dim))
    self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))
    self.b_2 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, 1,))

    print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
    memory = [self.q_q.copy()]
    for hop in range(1, self.memory_hops + 1):
        # select this hop's weight slices (weights are untied across hops)
        self.mem_weight_num = hop - 1
        current_episode = self.new_episode(memory[hop - 1])
        memory.append(self.GRU_update(memory[hop - 1], current_episode,
                                      self.W_mem_res_in[self.mem_weight_num], self.W_mem_res_hid[self.mem_weight_num], self.b_mem_res[self.mem_weight_num],
                                      self.W_mem_upd_in[self.mem_weight_num], self.W_mem_upd_hid[self.mem_weight_num], self.b_mem_upd[self.mem_weight_num],
                                      self.W_mem_hid_in[self.mem_weight_num], self.W_mem_hid_hid[self.mem_weight_num], self.b_mem_hid[self.mem_weight_num]))

    last_mem_raw = memory[-1].dimshuffle(('x', 0))

    net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
    if self.dropout > 0 and self.mode == 'train':
        net = layers.DropoutLayer(net, p=self.dropout)
    last_mem = layers.get_output(net)[0]
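    # Hedged sketch (assumption -- new_episode and the attention bookkeeping
    # live elsewhere in this class): the 4 * dim input size of W_1 above
    # matches the DMN+ attention feature vector built from a fact c, the
    # question q, and the previous memory m:
    #
    #   z = T.concatenate([c * q, c * m, T.abs_(c - q), T.abs_(c - m)])
    #   g = T.dot(self.W_2[hop], T.tanh(T.dot(self.W_1[hop], z) + self.b_1[hop])) + self.b_2[hop]
    #
    # with the scores g softmax-normalised over facts to give the episode's
    # attention weights. The commented-out 7 * dim variant corresponds to the
    # larger feature set of the original DMN paper.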
    print "==> building answer module"
    self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))

    if self.answer_module == 'feedforward':
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

    elif self.answer_module == 'recurrent':
        self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        def answer_step(prev_a, prev_y):
            a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
            y = nn_utils.softmax(T.dot(self.W_a, a))
            return [a, y]

        # add conditional ending?
        dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
        results, updates = theano.scan(fn=answer_step,
                                       outputs_info=[last_mem, T.zeros_like(dummy)],
                                       n_steps=1)
        self.prediction = results[1][-1]

    else:
        raise Exception("invalid answer_module")

    print "==> collecting all parameters"
    self.params = [self.W_pe,
                   self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd,
                   self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd,
                   self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd,
                   self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd,
                   self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd,
                   self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd,
                   self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
                   self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                   self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
                   #self.W_b,
                   self.W_1, self.W_2, self.b_1, self.b_2, self.W_a]

    if self.answer_module == 'recurrent':
        self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                                     self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                                     self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid]

    print "==> building loss layer and computing updates"
    self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x', 0),
                                                   T.stack([self.answer_var]))[0]
    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0

    self.loss = self.loss_ce + self.loss_l2

    #updates = lasagne.updates.adadelta(self.loss, self.params)
    #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)
    updates = lasagne.updates.adam(self.loss, self.params, learning_rate=0.0001, beta1=0.5)  # from DCGAN paper

    self.attentions = T.stack(self.attentions)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var],
                                        outputs=[self.prediction, self.loss, self.attentions],
                                        updates=updates,
                                        on_unused_input='warn',
                                        allow_input_downcast=True)

    print "==> compiling test_fn"
    self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var],
                                   outputs=[self.prediction, self.loss, self.attentions],
                                   on_unused_input='warn',
                                   allow_input_downcast=True)
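    # Hedged usage sketch (assumption -- example variable names only): once
    # constructed, the compiled functions are called once per
    # (story, question, answer) example, e.g.
    #
    #   pred, loss, att = model.train_fn(story_word_ids,     # int matrix, one row per sentence
    #                                    question_word_ids,  # int vector
    #                                    answer_word_id,     # int scalar
    #                                    input_mask)         # int vector
    #
    # test_fn takes the same arguments but applies no parameter updates.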