def simple_upsample3d(inpt, up_factor):
    inpt = T.repeat(inpt, up_factor[0], axis=3)
    inpt = T.repeat(inpt, up_factor[1], axis=4)
    inpt = T.repeat(inpt, up_factor[2], axis=1)
    # rep = [1, up_factor[2], 1, up_factor[0], up_factor[1]]
    # inpt = T.tile(inpt, rep, ndim=5)
    return inpt
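
# Usage sketch (my addition, not from the original source) for
# simple_upsample3d above, assuming the 5D layout implied by the repeated
# axes (3 and 4 are spatial, axis 1 is the third upsampled dimension):
import numpy as np
import theano
import theano.tensor as T

inpt = T.TensorType(theano.config.floatX, (False,) * 5)('inpt')
f = theano.function([inpt], simple_upsample3d(inpt, (2, 2, 2)))
print(f(np.ones((1, 3, 1, 4, 5), dtype=theano.config.floatX)).shape)  # (1, 6, 1, 8, 10)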
def run(self, h):
    channels = self.channels  # images.shape[1]
    if not self.test:
        gx, gy, dx, dy, s2, g = self.get_params(h)
    else:
        gx, gy, dx, dy, s2, g = self.get_params_test(h)

    w = self.w_transform.run(h)
    w = w.reshape((self.batch_size * self.channels, self.N, self.N))

    muX = gx.dimshuffle([0, 'x']) + dx.dimshuffle([0, 'x']) * (
        T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
    muY = gy.dimshuffle([0, 'x']) + dy.dimshuffle([0, 'x']) * (
        T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)

    a = T.arange(self.width).astype(theano.config.floatX)
    b = T.arange(self.height).astype(theano.config.floatX)
    Fx = T.exp(-(a - muX.dimshuffle([0, 1, 'x'])) ** 2 / 2. / s2.dimshuffle([0, 'x', 'x']) ** 2)
    Fy = T.exp(-(b - muY.dimshuffle([0, 1, 'x'])) ** 2 / 2. / s2.dimshuffle([0, 'x', 'x']) ** 2)
    Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
    Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
    self.Fx = T.repeat(Fx, channels, axis=0)
    self.Fy = T.repeat(Fy, channels, axis=0)

    self.fint = self.batched_dot(self.Fy.transpose((0, 2, 1)), w)
    self.fim = self.batched_dot(self.fint, self.Fx).reshape(
        (self.batch_size, self.channels * self.width * self.height))
    return 1. / g * self.fim, (gx, gy, dx, dy, self.fint)
def neglog_2d(output, target):
    i = T.arange(target.shape[0]).reshape((target.shape[0], 1))
    i = T.repeat(i, target.shape[1], axis=1).flatten()
    j = T.arange(target.shape[1]).reshape((1, target.shape[1]))
    j = T.repeat(j, target.shape[0], axis=0).flatten()
    k = target.flatten()
    return -T.mean(T.log(output)[i, j, k])
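
# Hedged sketch (my addition) of what neglog_2d above computes: with
# predictions of shape (batch, positions, classes) and integer targets of
# shape (batch, positions), it averages -log output[i, j, target[i, j]].
import numpy as np
import theano
import theano.tensor as T

output = T.tensor3('output')
target = T.imatrix('target')
nll = theano.function([output, target], neglog_2d(output, target))
probs = np.full((2, 3, 4), 0.25, dtype=theano.config.floatX)
print(nll(probs, np.zeros((2, 3), dtype='int32')))  # ~1.386 == -log(0.25)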
def run(self, images, h):  # , error_images, h):
    channels = self.channels  # images.shape[1]
    if not self.test:
        gx, gy, dx, dy, s2, g = self.get_params(h)
    else:
        gx, gy, dx, dy, s2, g = self.get_params_test(h)

    # how to handle variable sized input images? (mask??)
    I = images.reshape((self.batch_size * self.channels, self.height, self.width))

    muX = gx.dimshuffle([0, 'x']) + dx.dimshuffle([0, 'x']) * (
        T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
    muY = gy.dimshuffle([0, 'x']) + dy.dimshuffle([0, 'x']) * (
        T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)

    a = T.arange(self.width).astype(theano.config.floatX)
    b = T.arange(self.height).astype(theano.config.floatX)
    Fx = T.exp(-(a - muX.dimshuffle([0, 1, 'x'])) ** 2 / 2. / s2.dimshuffle([0, 'x', 'x']) ** 2)
    Fy = T.exp(-(b - muY.dimshuffle([0, 1, 'x'])) ** 2 / 2. / s2.dimshuffle([0, 'x', 'x']) ** 2)
    Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
    Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
    self.Fx = T.repeat(Fx, channels, axis=0)
    self.Fy = T.repeat(Fy, channels, axis=0)

    self.fint = self.batched_dot(self.Fy, I)
    # self.efint = T.dot(self.Fx, error_images)
    self.fim = self.batched_dot(self.fint, self.Fx.transpose([0, 2, 1])).reshape(
        (self.batch_size, self.channels * self.N * self.N))
    # self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape(
    #     (self.batch_size, channels, self.N, self.N))
    return g * self.fim, (gx, gy, dx, dy, self.fint)  # T.concatenate([self.fim, self.feim], axis=1)
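
# Standalone sketch (my own, with made-up sizes) of the Gaussian filter bank
# that both run() methods above rely on: each of N grid points gets a
# normalised 1-D Gaussian over the image axis, and a glimpse is then read as
# batched_dot(batched_dot(Fy, image), Fx.transpose(0, 2, 1)) per channel.
import numpy as np
import theano
import theano.tensor as T

N, width = 4, 10
gx, dx, s2 = T.vector('gx'), T.vector('dx'), T.vector('s2')  # per-example centre, stride, width
muX = gx.dimshuffle(0, 'x') + dx.dimshuffle(0, 'x') * (
    T.arange(N).astype(theano.config.floatX) - N / 2 - 0.5)
a = T.arange(width).astype(theano.config.floatX)
Fx = T.exp(-(a - muX.dimshuffle(0, 1, 'x')) ** 2 / 2. / s2.dimshuffle(0, 'x', 'x') ** 2)
Fx = Fx / (Fx.sum(axis=-1).dimshuffle(0, 1, 'x') + 1e-4)
f = theano.function([gx, dx, s2], Fx)
one = np.ones(1, dtype=theano.config.floatX)
print(f(5 * one, one, one).shape)  # (1, 4, 10)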
def output(self, train):
    X = self.get_input(train)
    # shape: (nb_samples, time (padded with zeros at the end), input_dim)
    # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension
    X = X.dimshuffle((1, 0, 2))
    xf = self.activation(T.dot(X, self.W_if) + self.b_if)
    xb = self.activation(T.dot(X, self.W_ib) + self.b_ib)
    b_o = self.b_o
    b_on = T.repeat(
        T.repeat(b_o.reshape((1, self.output_dim)), X.shape[0], axis=0).reshape(
            (1, X.shape[0], self.output_dim)),
        X.shape[1], axis=0)

    # Iterate forward over the first dimension of the x array (=time).
    outputs_f, updates_f = theano.scan(
        self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
        sequences=xf,  # tensors to iterate over, inputs to _step
        # initialization of the output. Input to _step with default tap=-1.
        outputs_info=alloc_zeros_matrix(X.shape[1], self.output_dim),
        non_sequences=[self.W_ff, self.b_f],  # static inputs to _step
        truncate_gradient=self.truncate_gradient
    )
    # Iterate backward over the first dimension of the x array (=time).
    outputs_b, updates_b = theano.scan(
        self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
        sequences=xb,  # tensors to iterate over, inputs to _step
        # initialization of the output. Input to _step with default tap=-1.
        outputs_info=alloc_zeros_matrix(X.shape[1], self.output_dim),
        non_sequences=[self.W_bb, self.b_b],  # static inputs to _step
        truncate_gradient=self.truncate_gradient,
        go_backwards=True  # Iterate backwards through time
    )

    # return outputs_f.dimshuffle((1, 0, 2))
    if self.return_sequences:
        return T.add(
            T.tensordot(
                T.add(outputs_f.dimshuffle((1, 0, 2)),
                      outputs_b[::-1].dimshuffle((1, 0, 2))),
                self.W_o, [[2], [0]]),
            b_on)
    return T.concatenate((outputs_f[-1], outputs_b[0]))
def keep_max(input, theta, k, sent_mask):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x')
    sig_input = sig_input * sent_mask
    # sig_input = T.dot(input, theta)
    if k == 0:
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def __theano__unpool(self, inp, us, dim=None, issequence=False):
    # Determine the dimensionality of convolution (2 or 3?)
    if dim is None:
        dim = 3 if not issequence and len(us) == 3 and inp.ndim == 5 else 2

    # Reshape 2D sequential data if required
    # Log input shape
    inpshape = inp.shape
    reallyissequential = issequence and inp.ndim == 5
    if issequence:
        if reallyissequential:
            # Reshape
            inp = inp.reshape((inpshape[0] * inpshape[1], inpshape[2], inpshape[3], inpshape[4]), ndim=4)
            us = us[0:2]
        else:
            warn("Expected 5D sequential output, but got 4D non-sequential instead.")

    if dim == 2:
        y = T.repeat(T.repeat(inp, us[0], axis=2), us[1], axis=3)
    elif dim == 3:
        y = T.repeat(T.repeat(T.repeat(inp, us[0], axis=3), us[1], axis=4), us[2], axis=1)
    else:
        raise NotImplementedError("Upsampling is implemented in 2D and 3D.")

    if issequence and reallyissequential:
        # Reshape sequential data (and remember that the spatial size has doubled)
        y = y.reshape((inpshape[0], inpshape[1], inpshape[2],
                       us[0] * inpshape[3], us[1] * inpshape[4]), ndim=5)

    return y
def keep_max(input, theta, k):
    """
    :type input: theano.tensor.tensor4
    :param input: the input data

    :type theta: theano.tensor.matrix
    :param theta: the parameter for sigmoid function

    :type k: int
    :param k: the number k used to define top k sentence to remain
    """
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    if k == 0:
        # using all the sentences
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()
    # construct masked data
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
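
# Hedged usage sketch for keep_max above (the shape convention is my
# assumption): input is (batch, n_maps, n_sent, dim) with n_maps == 1 and
# theta is (dim, 1), so the sigmoid gate has shape (batch, 1, n_sent, 1) and
# only the k highest-gated sentences keep a non-zero (gated) representation.
import numpy as np
import theano
import theano.tensor as T

inp, theta = T.tensor4('inp'), T.matrix('theta')
out_var, gate_var = keep_max(inp, theta, 2)
f = theano.function([inp, theta], [out_var, gate_var])
x = np.random.randn(3, 1, 5, 4).astype(theano.config.floatX)
w = np.random.randn(4, 1).astype(theano.config.floatX)
res, gate = f(x, w)
print(res.shape, gate.shape)  # (3, 1, 5, 4) (3, 1, 5, 1)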
def initial_states(self, batch_size, *args, **kwargs):
    return [
        tensor.repeat(self.initial_state_[None, :], batch_size, 0),
        tensor.repeat(self.initial_cells[None, :], batch_size, 0),
        tensor.repeat(self.initial_location[None, :], batch_size, 0),
        tensor.repeat(self.initial_scale[None, :], batch_size, 0),
    ]
def create_prediction(self):  # runs one full prediction pass
    gfs = self.gfs
    pm25in = self.pm25in
    # first forward pass
    x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2], pm25in[:, 0], pm25in[:, 1], self.cnt[:, :, 0]], axis=1)
    if self.celltype == RNN:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size,
                                                                name="RNN.initial_hidden_state")),
                                  x.shape[0], axis=0)
                         if x.ndim > 1
                         else create_shared(layer.hidden_size, name="RNN.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    if self.celltype == LSTM:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size * 2,
                                                                name="LSTM.initial_hidden_state")),
                                  x.shape[0], axis=0)
                         if x.ndim > 1
                         else create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    self.layerstatus = self.model.forward(x, init_hiddens)
    # results.shape? 40*1
    self.results = self.layerstatus[-1]
    if self.steps > 1:
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, 1], gfs[:, 2], gfs[:, 3], pm25in[:, 1], self.results, self.cnt[:, :, 1]], axis=1),
            self.layerstatus)
        self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
    # remaining steps-2 forward passes
    for i in xrange(2, self.steps):
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, i], gfs[:, i + 1], gfs[:, i + 2],
                           T.shape_padright(self.results[:, i - 2]),
                           T.shape_padright(self.results[:, i - 1]),
                           self.cnt[:, :, i]], axis=1),
            self.layerstatus)
        # need T.shape_padright???
        self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
    return self.results
def fprop(self, X):
    w, z = X
    batch_size, num_channel, height, width = self.glimpse_shape
    w = w.reshape((batch_size * num_channel, height, width))
    centey = z[:, 0]
    centex = z[:, 1]
    logdel = z[:, 2]
    logsig = z[:, 3]
    loggam = z[:, 4]
    centy = 0.5 * (self.input_shape[2] + 1) * (centey + 1)
    centx = 0.5 * (self.input_shape[3] + 1) * (centex + 1)
    delta = T.exp(logdel)
    delta = ((max(self.input_shape[2], self.input_shape[3]) - 1) * delta /
             (max(self.glimpse_shape[2], self.glimpse_shape[3]) - 1))
    sigma = T.exp(0.5 * logsig)
    gamma = T.exp(loggam).dimshuffle(0, 'x')
    Fy, Fx = self.filter_bank(centx, centy, delta, sigma)
    if num_channel > 1:
        Fx = T.repeat(Fx, num_channel, axis=0)
        Fy = T.repeat(Fy, num_channel, axis=0)
    I = batched_dot(batched_dot(Fy.transpose(0, 2, 1), w), Fx)
    reshape_shape = (batch_size, num_channel * self.input_shape[2] * self.input_shape[3])
    return I.reshape(reshape_shape) / gamma
def fprop(self, X):
    x, x_hat, z = X
    batch_size, num_channel, height, width = self.input_shape
    x = x.reshape((batch_size * num_channel, height, width))
    x_hat = x_hat.reshape((batch_size * num_channel, height, width))
    centey = z[:, 0]
    centex = z[:, 1]
    logdel = z[:, 2]
    logsig = z[:, 3]
    loggam = z[:, 4]
    centy = 0.5 * (self.input_shape[2] + 1) * (centey + 1)
    centx = 0.5 * (self.input_shape[3] + 1) * (centex + 1)
    delta = T.exp(logdel)
    delta = ((max(self.input_shape[2], self.input_shape[3]) - 1) * delta /
             (max(self.glimpse_shape[2], self.glimpse_shape[3]) - 1))
    sigma = T.exp(0.5 * logsig)
    gamma = T.exp(loggam).dimshuffle(0, 'x')
    Fy, Fx = self.filter_bank(centx, centy, delta, sigma)
    if num_channel > 1:
        Fx = T.repeat(Fx, num_channel, axis=0)
        Fy = T.repeat(Fy, num_channel, axis=0)
    x = batched_dot(batched_dot(Fy, x), Fx.transpose(0, 2, 1))
    x_hat = batched_dot(batched_dot(Fy, x_hat), Fx.transpose(0, 2, 1))
    reshape_shape = (batch_size, num_channel * self.glimpse_shape[2] * self.glimpse_shape[3])
    return gamma * T.concatenate([x.reshape(reshape_shape), x_hat.reshape(reshape_shape)], axis=1)
def get_output(self, train=False):
    X = self.get_input(train)
    # mask = self.get_padded_shuffled_mask(train, X, pad=0)
    mask = self.get_input_mask(train=train)
    ind = T.switch(T.eq(mask[:, -1], 1.), mask.shape[-1], T.argmin(mask, axis=-1)).astype('int32').ravel()
    max_time = T.max(ind)
    X = X.dimshuffle((1, 0, 2))
    Y = T.dot(X, self.W) + self.b
    # h0 = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
    h0 = T.repeat(self.h_m1, X.shape[1], axis=0)
    c0 = T.repeat(self.c_m1, X.shape[1], axis=0)

    [outputs, _], updates = theano.scan(
        self._step,
        sequences=Y,
        outputs_info=[h0, c0],
        non_sequences=[self.R],
        n_steps=max_time,
        truncate_gradient=self.truncate_gradient,
        strict=True,
        allow_gc=theano.config.scan.allow_gc)

    res = T.concatenate([h0.dimshuffle('x', 0, 1), outputs], axis=0).dimshuffle((1, 0, 2))
    if self.return_sequences:
        return res
    # return outputs[-1]
    return res[T.arange(mask.shape[0], dtype='int32'), ind]
def construct_graph_ref(self, args, x, length, popstats=None): p = self.allocate_parameters(args) if args.baseline: def bn(x, gammas, betas): return x + betas else: def bn(x, gammas, betas): mean, var = x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True) # if only mean.tag.batchstat, var.tag.batchstat = True, True #var = T.maximum(var, args.epsilon) var = var + args.epsilon return (x - mean) / T.sqrt(var) * gammas + betas def stepfn(x, dummy_h, dummy_c, h, c): # a_mean, b_mean, c_mean, # a_var, b_var, c_var): a_mean, b_mean, c_mean = 0, 0, 0 a_var, b_var, c_var = 0, 0, 0 atilde = T.dot(h, p.Wa) btilde = x a_normal = bn(atilde, p.a_gammas, p.ab_betas) b_normal = bn(btilde, p.b_gammas, 0) ab = a_normal + b_normal g, f, i, o = [fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden]) for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid])] c = dummy_c + f * c + i * g c_normal = bn(c, p.c_gammas, p.c_betas) h = dummy_h + o * self.activation(c_normal) return h, c, atilde, btilde, c_normal xtilde = T.dot(x, p.Wx) if args.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise) elif args.summarize: # prime h with mean of example h_prime = x.mean(axis=[0, 2])[:, None] else: h_prime = 0 dummy_states = dict(h=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)), c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden))) [h, c, atilde, btilde, htilde], _ = theano.scan( stepfn, sequences=[xtilde, dummy_states["h"], dummy_states["c"]], outputs_info=[T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime, T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None, None]) return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), [], dummy_states, popstats
def initial_state_with_taps(self, num=None):
    if num is not None:
        cell = T.repeat(self.default_cell, num, axis=0)
        output = T.repeat(self.default_output, num, axis=0)
    else:
        cell = self.default_cell
        output = self.default_output
    return dict(initial=output, taps=[-1]), dict(initial=cell, taps=[-1])
def softmax(y):
    y_max = T.max(y, axis=2)
    y_max_rep = y_max.reshape((y_max.shape[0], y_max.shape[1], 1))
    y_opt = y - T.repeat(y_max_rep, y.shape[2], axis=2)
    y_sum = T.sum(T.exp(y_opt), axis=2)
    y_reshape = y_sum.reshape((y_sum.shape[0], y_sum.shape[1], 1))
    a = T.exp(y_opt) / T.repeat(y_reshape, y.shape[2], axis=2)
    return a
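
# Quick numerical check (my addition) that the 3-D softmax above matches a
# standard max-shifted softmax taken over the last axis:
import numpy as np
import theano
import theano.tensor as T

y = T.tensor3('y')
f = theano.function([y], softmax(y))
v = np.random.randn(2, 3, 5).astype(theano.config.floatX)
ref = np.exp(v - v.max(axis=2, keepdims=True))
ref = ref / ref.sum(axis=2, keepdims=True)
print(np.allclose(f(v), ref, atol=1e-5))  # True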
def apply(self, x): # lazy hack h0 = self.parameters[0] c0 = self.parameters[1] Wa = self.parameters[2] Wx = self.parameters[3] if self.baseline: ab_betas = self.parameters[4] h_betas = self.parameters[5] a_gammas = None b_gammas = None h_gammas = None else: a_gammas = self.parameters[4] b_gammas = self.parameters[5] h_gammas = self.parameters[6] ab_betas = self.parameters[7] h_betas = self.parameters[8] xtilde = tensor.dot(x, Wx) if self.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], self.state_dim), std=args.noise) #elif args.summarize: # # prime h with summary of example # Winit = theano.shared(orthogonal((nclasses, self.state_dim)), name="Winit") # parameters.append(Winit) # h_prime = tensor.dot(x, Winit).mean(axis=0) else: h_prime = 0 dummy_states = dict(h=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)), c=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim))) def stepfn(xtilde, dummy_h, dummy_c, h, c): atilde = tensor.dot(h, Wa) btilde = xtilde a = self.bn(atilde, a_gammas, ab_betas) b = self.bn(btilde, b_gammas, 0) ab = a + b g, f, i, o = [fn(ab[:, j * self.state_dim:(j + 1) * self.state_dim]) for j, fn in enumerate([self.children[0].apply] + 3 * [tensor.nnet.sigmoid])] c = dummy_c + f * c + i * g htilde = c h = dummy_h + o * self.children[0].apply(self.bn(htilde, h_gammas, h_betas)) return h, c, atilde, btilde, htilde [h, c, atilde, btilde, htilde], _ = theano.scan( stepfn, sequences=[xtilde, dummy_states["h"], dummy_states["c"]], outputs_info=[tensor.repeat(h0[None, :], xtilde.shape[1], axis=0) + h_prime, tensor.repeat(c0[None, :], xtilde.shape[1], axis=0), None, None, None]) #return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), dummy_states, parameters return h
def output(self, train):
    X = self.get_input(train)
    X = X.dimshuffle((1, 0, 2))

    if self.is_entity:
        Entity = X[-1:].dimshuffle(1, 0, 2)
        X = X[:-1]
    b_y = self.b_y
    b_yn = T.repeat(
        T.repeat(b_y.reshape((1, self.output_dim)), X.shape[0], axis=0).reshape(
            (1, X.shape[0], self.output_dim)),
        X.shape[1], axis=0)

    xif = T.dot(X, self.W_if) + self.b_if
    xib = T.dot(X, self.W_ib) + self.b_ib
    xff = T.dot(X, self.W_ff) + self.b_ff
    xfb = T.dot(X, self.W_fb) + self.b_fb
    xcf = T.dot(X, self.W_cf) + self.b_cf
    xcb = T.dot(X, self.W_cb) + self.b_cb
    xof = T.dot(X, self.W_of) + self.b_of
    xob = T.dot(X, self.W_ob) + self.b_ob

    [outputs_f, memories_f], updates_f = theano.scan(
        self._step,
        sequences=[xif, xff, xof, xcf],
        outputs_info=[
            alloc_zeros_matrix(X.shape[1], self.output_dim),
            alloc_zeros_matrix(X.shape[1], self.output_dim)
        ],
        non_sequences=[self.U_if, self.U_ff, self.U_of, self.U_cf],
        truncate_gradient=self.truncate_gradient
    )
    [outputs_b, memories_b], updates_b = theano.scan(
        self._step,
        sequences=[xib, xfb, xob, xcb],
        outputs_info=[
            alloc_zeros_matrix(X.shape[1], self.output_dim),
            alloc_zeros_matrix(X.shape[1], self.output_dim)
        ],
        non_sequences=[self.U_ib, self.U_fb, self.U_ob, self.U_cb],
        truncate_gradient=self.truncate_gradient
    )

    if self.return_sequences:
        y = T.add(T.add(
            T.tensordot(outputs_f.dimshuffle((1, 0, 2)), self.W_yf, [[2], [0]]),
            T.tensordot(outputs_b[::-1].dimshuffle((1, 0, 2)), self.W_yb, [[2], [0]])),
            b_yn)
        # y = T.add(T.tensordot(
        #     T.add(outputs_f.dimshuffle((1, 0, 2)),
        #           outputs_b[::-1].dimshuffle((1, 0, 2))),
        #     self.W_y, [[2], [0]]), b_yn)
        if self.is_entity:
            return T.concatenate([y, Entity], axis=1)
        else:
            return y
    return T.concatenate((outputs_f[-1], outputs_b[0]))
def apply(self, x):
    x_to_inter = T.concatenate([self.x_to_f, self.x_to_i, self.x_to_g, self.x_to_o], axis=1)
    h_to_inter = T.concatenate([self.h_to_f, self.h_to_i, self.h_to_g, self.h_to_o], axis=1)
    b_inter = T.concatenate([self.b_f, self.b_i, self.b_g, self.b_o])
    x_feat = x.dot(x_to_inter) + b_inter.dimshuffle('x', 'x', 0)
    x_feat = x_feat.dimshuffle(1, 0, 2)

    initial_h = T.repeat(self.h, x.shape[0], axis=0)
    initial_c = T.repeat(self.c, x.shape[0], axis=0)

    def step(x_feat, h, c, h_to_inter):
        intermediates = T.tanh(x_feat + h.dot(h_to_inter))
        i = intermediates[:, :self.num_hidden]
        o = intermediates[:, self.num_hidden:2 * self.num_hidden]
        f = intermediates[:, 2 * self.num_hidden:3 * self.num_hidden]
        g = intermediates[:, 3 * self.num_hidden:]
        i = T.nnet.sigmoid(i)
        o = T.nnet.sigmoid(o)
        f = T.nnet.sigmoid(f)
        g = T.tanh(g)
        new_c = f * c + i * g
        new_h = o * new_c
        return new_h, new_c

    outputs, _ = theano.scan(fn=step,
                             sequences=[x_feat],
                             outputs_info=[dict(initial=initial_h), dict(initial=initial_c)],
                             non_sequences=[h_to_inter])
    _, states = outputs
    return states.dimshuffle(1, 0, 2)
def __init__(self, rng, x, n_in, n_h, p, training, rnn_batch_training=False):
    """ This is to initialise a standard RNN hidden unit

    :param rng: random state, fixed value for random state for reproducible objective results
    :param x: input data to current layer
    :param n_in: dimension of input data
    :param n_h: number of hidden units/blocks
    :param p: the probability of dropout
    :param training: a binary value to indicate training or testing (for dropout training)
    """
    self.input = x

    if p > 0.0:
        if training == 1:
            srng = RandomStreams(seed=123456)
            self.input = T.switch(srng.binomial(size=x.shape, p=p), x, 0)
        else:
            self.input = (1 - p) * x  # (1-p) *

    self.n_in = int(n_in)
    self.n_h = int(n_h)

    self.rnn_batch_training = rnn_batch_training

    # random initialisation
    Wx_value = np.asarray(rng.normal(0.0, 1.0 / np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
    Wh_value = np.asarray(rng.normal(0.0, 1.0 / np.sqrt(n_h), size=(n_h, n_h)), dtype=config.floatX)

    # Input gate weights
    self.W_xi = theano.shared(value=Wx_value, name='W_xi')
    self.W_hi = theano.shared(value=Wh_value, name='W_hi')

    # bias
    self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')

    # initial value of hidden and cell state
    if self.rnn_batch_training:
        self.h0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='c0')

        self.h0 = T.repeat(self.h0, x.shape[1], 0)
        self.c0 = T.repeat(self.c0, x.shape[1], 0)
    else:
        self.h0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='c0')

    self.Wix = T.dot(self.input, self.W_xi)

    [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function,
                                      sequences=[self.Wix],
                                      outputs_info=[self.h0, self.c0])

    self.output = self.h

    self.params = [self.W_xi, self.W_hi, self.b_i]

    self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum()
def seq_score(out_matrix, img_matrix):
    out_len = out_matrix.shape[0]
    img_len = img_matrix.shape[0]
    k_mat = T.repeat(T.arange(out_len).reshape((1, out_len)), img_len, axis=0)
    j_mat = T.repeat(T.arange(img_len).reshape((img_len, 1)), out_len, axis=1)
    # entityscore = T.dot(entity, img_matrix.T)
    eye = T.eye(out_len, img_len)
    eye = eye / T.sum(eye)
    return T.sum(T.dot(out_matrix, img_matrix.T) * eye)
def mmd_full(x_t, y_t, alpha=0.5):
    """ Implementation of the full kernel MMD statistic (gaussian kernel)"""
    N = x_t.shape[1]
    M = y_t.shape[1]

    term1 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, N) - T.tile(x_t, N))))
    term2 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, M) - T.tile(y_t, N))))
    term3 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(y_t, M) - T.tile(y_t, M))))
    return term1 - 2 * term2 + term3
def get_input_vectors(shape, phases, scaling, offset):
    x = T.repeat(offset[0] + T.arange(shape[0]) / scaling, shape[1] * phases).reshape(
        (shape[0], shape[1], phases)) * T.pow(2, T.arange(phases))
    y = T.repeat(T.tile(offset[1] + T.arange(shape[1]) / scaling, shape[0]).reshape(
        (shape[0], shape[1], 1)), phases, axis=2) * T.pow(2, T.arange(phases))
    z = T.tile(offset[2] + 10 * T.arange(phases), shape[0] * shape[1]).reshape(
        (shape[0], shape[1], phases, 1))
    x = x.reshape((shape[0], shape[1], phases, 1))
    y = y.reshape((shape[0], shape[1], phases, 1))
    return T.concatenate([x, y, z], axis=3).reshape(
        (shape[0] * shape[1] * phases, 3)).astype('float32')
def get_output_for(self, input, **kwargs):
    mu = input[0]
    sigma = input[1]
    x_range = T.arange(0, self.max_support).dimshuffle('x', 0)
    mu = T.repeat(mu, self.max_support, axis=1)
    sigma = T.repeat(sigma, self.max_support, axis=1)
    x = (x_range - mu) / (sigma * T.sqrt(2.) + 1e-16)
    cdf = (T.erf(x) + 1.) / 2.
    return cdf
def get_output_for(self, input, **kwargs):
    mu = input
    batch_size, num_latent = mu.shape
    shp = (batch_size, self.eq_samples, self.iw_samples, num_latent)
    mu_shp = mu.dimshuffle(0, 'x', 'x', 1)
    mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples)
    mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples)
    samples = self._srng.binomial(size=shp, p=mu_shp, dtype=theano.config.floatX)
    return samples.reshape((-1, num_latent))
def micro_activate(x, w, b, act):
    if x.ndim > 1:
        if act is None:
            return T.dot(x, w) + T.repeat(b, x.shape[0], axis=0)
        return act(T.dot(x, w) + T.repeat(b, x.shape[0], axis=0))
    else:
        # fall back to a linear output when no activation is given
        if act is None:
            res = T.dot(w.T, x) + b
        else:
            res = act(T.dot(w.T, x) + b)
        return res.flatten()
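
# Hedged usage sketch (the bias shape is my assumption): for a 2-D input,
# micro_activate above expects b as a (1, n_out) row that T.repeat stretches
# across the batch before the optional activation is applied.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
w = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX))
b = theano.shared(np.zeros((1, 3), dtype=theano.config.floatX))
f = theano.function([x], micro_activate(x, w, b, T.nnet.sigmoid))
print(f(np.random.randn(5, 4).astype(theano.config.floatX)).shape)  # (5, 3)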
def _prepare_outputs_info(self, x_dot_w):
    if self.learn_init_state:
        outputs_info = [
            T.repeat(self.init_c.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
            T.repeat(self.init_h.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
        ]
    else:
        outputs_info = [self.init_c, self.init_h]
    return outputs_info
def sqdist(a, b, data_num=59000, dimen=80):
    a = T.transpose(a)
    b = T.transpose(b)
    aa = T.reshape(T.sum(a ** 2, 0), (1, data_num))
    bb = T.reshape(T.sum(b ** 2, 0), (1, dimen))
    ab = T.dot(T.transpose(a), b)
    d = T.repeat(T.transpose(aa), bb.shape[1], axis=1) + T.repeat(bb, aa.shape[1], axis=0) - 2 * ab
    sigma = T.mean(d)
    d = T.exp(-d / (2 * sigma))
    mvec = T.reshape(T.mean(d, 0), (1, dimen))
    d = d - T.repeat(mvec, d.shape[0], axis=0)
    return d, sigma, mvec
def __init__(self, dnodex,inputdim,dim): X=T.ivector() Y=T.ivector() Z=T.lscalar() NP=T.ivector() lambd = T.scalar() eta = T.scalar() temperature=T.scalar() num_input = inputdim self.umatrix=theano.shared(floatX(np.random.rand(dnodex.nuser,inputdim, inputdim))) self.pmatrix=theano.shared(floatX(np.random.rand(dnodex.npoi,inputdim))) self.p_l2_norm=(self.pmatrix**2).sum() self.u_l2_norm=(self.umatrix**2).sum() num_hidden = dim num_output = inputdim inputs = InputPLayer(self.pmatrix[X,:], self.umatrix[Z,:,:], name="inputs") lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1") #lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2") #lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3") softmax = SoftmaxPLayer(num_hidden, num_output, self.umatrix[Z,:,:], input_layer=lstm1, name="yhat", temperature=temperature) Y_hat = softmax.output() self.layers = inputs, lstm1,softmax params = get_params(self.layers) #caches = make_caches(params) tmp_u=T.mean(T.dot(self.pmatrix[X,:],self.umatrix[Z,:,:]),axis=0) tr=T.dot(tmp_u,(self.pmatrix[X,:]-self.pmatrix[NP,:]).transpose()) pfp_loss1=sigmoid(tr) pfp_loss=pfp_loss1*(T.ones_like(pfp_loss1)-pfp_loss1) tmp_u1=T.reshape(T.repeat(tmp_u,X.shape[0]),(inputdim,X.shape[0])).T pfp_lossv=T.reshape(T.repeat(pfp_loss,inputdim),(inputdim,X.shape[0])).T cost = lambd*10*T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(self.pmatrix[Y,:],self.umatrix[Z,:,:])))+lambd*self.p_l2_norm+lambd*self.u_l2_norm # updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta) updates = [] grads = T.grad(cost=cost, wrt=params) updates.append([self.pmatrix,T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]-eta*grads[0])]) updates.append([self.umatrix,T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]-eta*grads[1])]) for p,g in zip(params[2:], grads[2:]): updates.append([p, p - eta * g]) rlist=T.argsort(T.dot(tmp_u,self.pmatrix.T))[::-1] n_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[NP,:],self.pmatrix[NP,:]-eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[NP,:]))] p_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]+eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[X,:])),(self.umatrix, T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]+eta*T.mean(pfp_loss)*(T.reshape(tmp_u,(tmp_u.shape[0],1))*T.mean(self.pmatrix[X,:]-self.pmatrix[NP,:],axis=0)))-eta*lambd*self.umatrix[Z,:,:])] self.train = theano.function([X,Y,Z, eta, lambd, temperature], cost, updates=updates, allow_input_downcast=True) self.trainpos=theano.function([X,NP,Z,eta, lambd],tmp_u, updates=p_updates,allow_input_downcast=True) self.trainneg=theano.function([X,NP,Z,eta, lambd],T.mean(pfp_loss), updates=n_updates,allow_input_downcast=True) self.predict_pfp = theano.function([X,Z], rlist, allow_input_downcast=True)
def log_prob_correct(mem, desired_output, cost_mask, max_int):
    """Compute log-probability of correctness over all registers."""
    cost = 0
    # Add epsilon to every log to avoid having inf in costs.
    epsilon = 1e-100
    samples = mem.shape[0]
    sample_idxs = repeat(shape_padright(arange(samples), 1), max_int, axis=1)
    cell_idxs = repeat(shape_padleft(arange(max_int), 1), samples, axis=0)
    vals = mem[sample_idxs, cell_idxs, desired_output]
    cost = (cost_mask * tensor.log(vals + epsilon)).sum(axis=1, keepdims=True)
    return cost
def initial_states(self, batch_size, *args, **kwargs):
    return [
        tensor.repeat(self.parameters.initial_state[None, :], batch_size, 0)
    ]
def MyRepeat(x, reps, axes):
    assert len(reps) == len(axes)
    y = x
    for r, a in zip(reps, axes):
        y = T.repeat(y, [r], axis=a)
    return y
def training_cost_weighted(self, y, weights=None):
    """ Wrapper for standard name """
    loss = self.hinge_sq(y)
    weights = T.repeat(weights.dimshuffle('x', 0), y.shape[0], axis=0)
    factors = weights[T.arange(y.shape[0]), y]
    return T.sum(loss * factors)
def call(self, x, mask):
    Mean = x
    Std = T.repeat(T.exp(self.logstd)[None, :], Mean.shape[0], axis=0)
    return T.concatenate([Mean, Std], axis=1)
def get_rupture_times_theano(slownesses, patch_size, nuc_x, nuc_y): """ Does the same calculation as get_rupture_times_numpy just with symbolic variable input and output for theano graph implementation optimization. """ [step_dip_max, step_str_max] = slownesses.shape StartTimes = tt.ones((step_dip_max, step_str_max)) * 1e8 StartTimes = tt.set_subtensor(StartTimes[nuc_y, nuc_x], 0) # Stopping check var epsilon = theano.shared(0.1) err_val = theano.shared(1e6) # Iterator matrixes dip1 = tt.repeat(tt.arange(step_dip_max), step_str_max) str1 = tt.tile(tt.arange(step_str_max), step_dip_max) dip2 = tt.repeat(tt.arange(step_dip_max), step_str_max) str2 = tt.tile(tt.arange(step_str_max - 1, -1, -1), step_dip_max) dip3 = tt.repeat(tt.arange(step_dip_max - 1, -1, -1), step_str_max) str3 = tt.tile(tt.arange(step_str_max - 1, -1, -1), step_dip_max) dip4 = tt.repeat(tt.arange(step_dip_max - 1, -1, -1), step_str_max) str4 = tt.tile(tt.arange(step_str_max), step_dip_max) DIP = tt.concatenate([dip1, dip2, dip3, dip4]) STR = tt.concatenate([str1, str2, str3, str4]) ### Upwind scheme ### def upwind(dip_ind, str_ind, StartTimes, slownesses, patch_size): [n_patch_dip, n_patch_str] = slownesses.shape zero = theano.shared(0) s1 = str_ind - 1 d1 = dip_ind - 1 s2 = str_ind + 1 d2 = dip_ind + 1 # if a < b return b checked_s1 = ifelse(tt.lt(s1, zero), zero, s1) checked_d1 = ifelse(tt.lt(d1, zero), zero, d1) # if a =< b return a-1 checked_s2 = ifelse(tt.le(n_patch_str, s2), n_patch_str - 1, s2) checked_d2 = ifelse(tt.le(n_patch_dip, d2), n_patch_dip - 1, d2) ST_xmin = tt.min( (StartTimes[checked_d1, str_ind], StartTimes[checked_d2, str_ind])) ST_ymin = tt.min( (StartTimes[dip_ind, checked_s1], StartTimes[dip_ind, checked_s2])) ### Eikonal equation solver ### # The unique solution to the equation # [(x-a)^+]^2 + [(x-b)^+]^2 = f^2 * h^2 # where a = u_xmin, b = u_ymin, is # # | min(a,b) + f*h, |a-b|>= f*h # xnew = | # |0.5 * [ a+b+sqrt( 2*f^2*h^2 - (a-b)^2 ) ], |a-b| < f*h start_new = ifelse( tt.le(slownesses[dip_ind, str_ind] * patch_size, tt.abs_(ST_xmin - ST_ymin)), tt.min((ST_xmin, ST_ymin)) + slownesses[dip_ind, str_ind] * \ patch_size, (ST_xmin + ST_ymin + \ tt.sqrt(2 * tt.pow(slownesses[dip_ind, str_ind], 2) * \ tt.pow(patch_size, 2) - \ tt.pow((ST_xmin - ST_ymin), 2) )) / 2 ) # if a < b return a output = ifelse(tt.lt(start_new, StartTimes[dip_ind, str_ind]), start_new, StartTimes[dip_ind, str_ind]) return tt.set_subtensor( StartTimes[dip_ind:dip_ind + 1, str_ind:str_ind + 1], output) def loop_upwind(StartTimes, PreviousTimes, err_val, iteration, epsilon): [results, updates] = theano.scan(fn=upwind, sequences=[DIP, STR], outputs_info=[StartTimes], non_sequences=[slownesses, patch_size]) StartTimes = results[-1] err_val = tt.sum(tt.sum(tt.pow((StartTimes - PreviousTimes), 2))) PreviousTimes = StartTimes.copy() return (StartTimes, PreviousTimes, err_val, iteration + 1), \ theano.scan_module.until(err_val < epsilon) # while loop until err < epsilon iteration = theano.shared(0) PreviousTimes = StartTimes.copy() ([result, PreviousTimes, errs, Iteration], updates) = theano.scan( fn=loop_upwind, outputs_info=[StartTimes, PreviousTimes, err_val, iteration], non_sequences=[epsilon], n_steps=500) # arbitrary set, stops after few iterations return result[-1]
def _prepare_outputs_info(self, x_dot_w):
    outputs_info = [
        T.repeat(self.init_c.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
        T.repeat(self.init_h.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
    ]
    return outputs_info
def matrixify(vector, n):
    # Cast n to int32 if necessary to prevent error on 32 bit systems
    return T.repeat(T.shape_padleft(vector),
                    n if (theano.configdefaults.local_bitwidth() == 64) else T.cast(n, 'int32'),
                    axis=0)
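
# Small check (my addition) that matrixify above stacks the vector n times:
import numpy as np
import theano
import theano.tensor as T

v, n = T.vector('v'), T.iscalar('n')
f = theano.function([v, n], matrixify(v, n))
print(f(np.arange(3, dtype=theano.config.floatX), 4).shape)  # (4, 3)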
def heaviside(x):
    return T.arange(0, 600).dimshuffle('x', 0) - T.repeat(x, 600, axis=1) >= 0
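
# Behaviour sketch (my addition): heaviside above turns a (batch, 1) column
# of thresholds into a (batch, 600) int8 step mask that switches on at the
# threshold index.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f = theano.function([x], heaviside(x))
print(f(np.array([[3.0]], dtype=theano.config.floatX))[0, :5])  # [0 0 0 1 1]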
def get_initial_hidden(self):
    return [T.repeat(self.hidden[None, :], self.batch_size, 0),
            T.repeat(self.cells[None, :], self.batch_size, 0)]
    dis_layers[-1], {
        dis_in_x: T.concatenate([sym_x_l, sym_x_u_d], axis=0),
        dis_in_y: T.concatenate([sym_y, cla_out_y_d_hard], axis=0)
    },
    deterministic=False)
dis_out_p_g = ll.get_output(dis_layers[-1], {
    dis_in_x: gen_out_x,
    dis_in_y: sym_y_g
}, deterministic=False)

if objective_flag == 'integrate':
    # integrate
    dis_out_p_c = ll.get_output(
        dis_layers[-1], {
            dis_in_x: T.repeat(sym_x_u, num_classes, axis=0),
            dis_in_y: np.tile(np.arange(num_classes), batch_size_u_c)
        },
        deterministic=False)
elif objective_flag == 'argmax':
    # argmax approximation
    cla_out_y_hard = cla_out_y.argmax(axis=1)
    dis_out_p_c = ll.get_output(dis_layers[-1], {
        dis_in_x: sym_x_u,
        dis_in_y: cla_out_y_hard
    }, deterministic=False)
else:
    raise Exception('Unknown objective flags')

image = ll.get_output(gen_layers[-1], {
def _interpolate(im, x, y, out_height, out_width, border_mode): # *_f are floats num_batch, height, width, channels = im.shape height_f = T.cast(height, theano.config.floatX) width_f = T.cast(width, theano.config.floatX) # clip coordinates to [-1, 1] if border_mode == 'nearest': x = T.clip(x, -1, 1) y = T.clip(y, -1, 1) # 0.9 1.0 1.1 -> 0.9 1.0 0.9 elif border_mode == 'mirror': xa = T.mod(x + 1, 4) - 1 ya = T.mod(y + 1, 4) - 1 x = T.minimum(xa, 2 - xa) y = T.minimum(ya, 2 - ya) # 0.9 1.0 1.1 -> 0.9 1.0 -0.9 elif border_mode == 'wrap': x = T.mod(x + 1, 2) - 1 y = T.mod(y + 1, 2) - 1 else: raise ValueError("border_mode must be one of " "'nearest', 'mirror', 'wrap'") # scale coordinates from [-1, 1] to [0, width/height - 1] x = (x + 1) / 2 * (width_f - 1) y = (y + 1) / 2 * (height_f - 1) # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates; # we need those in floatX for interpolation and in int64 for indexing. for # indexing, we need to take care they do not extend past the image. x0_f = T.floor(x) y0_f = T.floor(y) x1_f = x0_f + 1 y1_f = y0_f + 1 x0 = T.cast(x0_f, 'int64') y0 = T.cast(y0_f, 'int64') x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64') y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64') # The input is [num_batch, height, width, channels]. We do the lookup in # the flattened input, i.e [num_batch*height*width, channels]. We need # to offset all indices to match the flat version dim2 = width dim1 = width * height base = T.repeat( T.arange(num_batch, dtype='int64') * dim1, out_height * out_width) base_y0 = base + y0 * dim2 base_y1 = base + y1 * dim2 idx_a = base_y0 + x0 idx_b = base_y1 + x0 idx_c = base_y0 + x1 idx_d = base_y1 + x1 # use indices to lookup pixels for all samples im_flat = im.reshape((-1, channels)) Ia = im_flat[idx_a] Ib = im_flat[idx_b] Ic = im_flat[idx_c] Id = im_flat[idx_d] # calculate interpolated values wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x') wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x') wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x') wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x') output = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0) return output
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train( sent_len, claim_len, cand_size) train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev( sent_len, claim_len, cand_size, word2id) test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test( sent_len, claim_len, cand_size, word2id) dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) train_sents = np.asarray(train_sents, dtype='int32') train_3th_sents = np.asarray(train_3th_sents, dtype='int32') joint_train_sents = np.concatenate((train_sents, train_3th_sents)) test_sents = np.asarray(test_sents, dtype='int32') test_3th_sents = np.asarray(test_3th_sents, dtype='int32') joint_test_sents = np.concatenate((test_sents, test_3th_sents)) dev_sents = np.asarray(dev_sents, dtype='int32') dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32') joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents)) train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX) train_3th_sent_masks = np.asarray(train_3th_sent_masks, dtype=theano.config.floatX) joint_train_sent_masks = np.concatenate( (train_sent_masks, train_3th_sent_masks)) test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX) test_3th_sent_masks = np.asarray(test_3th_sent_masks, dtype=theano.config.floatX) joint_test_sent_masks = np.concatenate( (test_sent_masks, test_3th_sent_masks)) dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX) dev_3th_sent_masks = np.asarray(dev_3th_sent_masks, dtype=theano.config.floatX) joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks)) train_sent_labels = np.asarray(train_sent_labels, dtype='int32') train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32') joint_train_sent_labels = np.concatenate( (train_sent_labels, train_3th_sent_labels)) test_sent_labels = np.asarray(test_sent_labels, dtype='int32') test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32') joint_test_sent_labels = np.concatenate( (test_sent_labels, test_3th_sent_labels)) dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32') dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32') joint_dev_sent_labels = np.concatenate( (dev_sent_labels, dev_3th_sent_labels)) train_claims = 
np.asarray(train_claims, dtype='int32') train_3th_claims = np.asarray(train_3th_claims, dtype='int32') joint_train_claims = np.concatenate((train_claims, train_3th_claims)) test_claims = np.asarray(test_claims, dtype='int32') test_3th_claims = np.asarray(test_3th_claims, dtype='int32') joint_test_claims = np.concatenate((test_claims, test_3th_claims)) dev_claims = np.asarray(dev_claims, dtype='int32') dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32') joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims)) train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX) train_3th_claim_mask = np.asarray(train_3th_claim_mask, dtype=theano.config.floatX) joint_train_claim_mask = np.concatenate( (train_claim_mask, train_3th_claim_mask)) test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX) test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=theano.config.floatX) joint_test_claim_mask = np.concatenate( (test_claim_mask, test_3th_claim_mask)) dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX) dev_3th_claim_mask = np.asarray(dev_3th_claim_mask, dtype=theano.config.floatX) joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask)) train_labels = np.asarray(train_labels, dtype='int32') train_3th_labels = np.asarray(train_3th_labels, dtype='int32') joint_train_labels = np.concatenate((train_labels, train_3th_labels)) test_labels = np.asarray(test_labels, dtype='int32') test_3th_labels = np.asarray(test_3th_labels, dtype='int32') joint_test_labels = np.concatenate((test_labels, test_3th_labels)) dev_labels = np.asarray(dev_labels, dtype='int32') dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32') joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels)) joint_train_size = len(joint_train_claims) joint_test_size = len(joint_test_claims) joint_dev_size = len(joint_dev_claims) train_size = len(train_claims) test_size = len(test_claims) dev_size = len(dev_claims) test_3th_size = len(test_3th_claims) dev_3th_size = len(dev_3th_claims) vocab_size = len(word2id) + 1 print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.itensor3() #(batch, cand_size, sent_len) sents_mask = T.ftensor3() sents_labels = T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() joint_sents_ids = T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask = T.ftensor3() joint_sents_labels = T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_att_conv_W, task1_att_conv_b = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_conv_W_context, task1_conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W, att_conv_b, task1_conv_W_context, conv_W_context ] conv_model_sents = Conv_with_Mask( rng, input_tensor3=embed_input_sents, mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_sent_emb = sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask( rng, input_tensor3=embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings = conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1) ''' attentive conv for task1 ''' task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= embed_input_sents, #batch_size*cand_size, emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=task1_att_conv_W, b=task1_att_conv_b, W_context=task1_conv_W_context, b_context=task1_conv_b_context) task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r concate_claim_sent = T.concatenate([ batch_claim_emb, batch_sent_emb, T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x') ], axis=2) concate_2_matrix = concate_claim_sent.reshape( (batch_size * cand_size, hidden_size[0] * 2 + 1)) LR_input = T.concatenate([ concate_2_matrix, task1_attentive_sent_embeddings_l, 
task1_attentive_sent_embeddings_r ], axis=1) LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2 # LR_input = concate_2_matrix # LR_input_size = hidden_size[0]*2+1 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para = [U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(LR_input.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) # loss = -T.mean(T.log(prob_pos)) #f1 as loss batch_overlap = T.sum(sents_labels * inter_matrix, axis=1) batch_recall = batch_overlap / T.sum(sents_labels, axis=1) batch_precision = batch_overlap / T.sum(inter_matrix, axis=1) batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision) loss = -T.mean(T.log(batch_f1)) # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean() ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[ joint_claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_sent_embeddings = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_batch_sent_emb = joint_sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) joint_premise_emb = T.sum(joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=joint_claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 
1).reshape( (batch_size, cand_size * sent_len, emb_size)) joint_sents_dot = T.batched_dot( joint_sents_tensor3, joint_sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) joint_sents_dot_2_matrix = T.nnet.softmax( joint_sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) joint_sents_context = T.batched_dot( joint_sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0), mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle( 0, 1, 'x') masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle( 0, 1, 'x') fine_max = T.concatenate([ T.max(masked_sents_attconv, axis=1), T.max(masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) "Logistic Regression layer" joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector joint_loss = joint_layer_LR.negative_log_likelihood( joint_labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
''' testing ''' # binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size) test_premise_emb = T.sum(batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1) test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings], axis=1) #fine-maxsum sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) sents_dot_2_matrix = T.nnet.softmax( sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) sents_context = T.batched_dot( sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) add_sents_context = embed_input_sents + sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_fine_max = T.concatenate([ T.max(test_masked_sents_attconv, axis=1), T.max(test_masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1) test_LR_input_size = joint_LR_input_size test_layer_LR = LogisticRegression( rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector params = [init_embeddings] + NN_para + LR_para + joint_LR_para cost = loss + joint_loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids, 
joint_claim_mask, joint_labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False joint_n_train_batches = joint_train_size / batch_size joint_train_batch_start = list( np.arange(joint_n_train_batches) * batch_size) + [joint_train_size - batch_size] n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] n_test_3th_batches = test_3th_size / batch_size test_3th_batch_start = list(np.arange(n_test_3th_batches) * batch_size) + [test_3th_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_dev_3th_batches = dev_3th_size / batch_size dev_3th_batch_start = list(np.arange(n_dev_3th_batches) * batch_size) + [dev_3th_size - batch_size] max_acc = 0.0 max_test_f1 = 0.0 max_test_acc = 0.0 cost_i = 0.0 joint_train_indices = range(joint_train_size) train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( joint_train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed random.Random(100).shuffle(train_indices) iter_accu = 0 for joint_batch_id in joint_train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1 iter_accu += 1 joint_train_id_batch = joint_train_indices[ joint_batch_id:joint_batch_id + batch_size] for i in range(3): batch_id = random.choice(train_batch_start) train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch], #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels joint_train_sents[joint_train_id_batch], joint_train_sent_masks[joint_train_id_batch], joint_train_sent_labels[joint_train_id_batch], joint_train_claims[joint_train_id_batch], joint_train_claim_mask[joint_train_id_batch], joint_train_labels[joint_train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for test_batch_id in test_batch_start: # for each test batch batch_prob, error_i, pred_i = test_model( 
test_sents[test_batch_id:test_batch_id + batch_size], test_sent_masks[test_batch_id:test_batch_id + batch_size], test_sent_labels[test_batch_id:test_batch_id + batch_size], test_claims[test_batch_id:test_batch_id + batch_size], test_claim_mask[test_batch_id:test_batch_id + batch_size], test_labels[test_batch_id:test_batch_id + batch_size]) error_sum += error_i batch_sent_labels = test_sent_labels[ test_batch_id:test_batch_id + batch_size] batch_sent_names = test_sent_names[ test_batch_id:test_batch_id + batch_size] batch_ground_names = test_ground_names[ test_batch_id:test_batch_id + batch_size] batch_ground_labels = test_labels[ test_batch_id:test_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 # test_f1=f1_sum/(len(test_batch_start)*batch_size) for test_batch_id in test_3th_batch_start: # for each test batch _, error_i, pred_i = test_model( test_3th_sents[test_batch_id:test_batch_id + batch_size], test_3th_sent_masks[test_batch_id:test_batch_id + batch_size], test_3th_sent_labels[test_batch_id:test_batch_id + batch_size], test_3th_claims[test_batch_id:test_batch_id + batch_size], test_3th_claim_mask[test_batch_id:test_batch_id + batch_size], test_3th_labels[test_batch_id:test_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 if f1 > max_test_f1 or strict_score > max_test_acc: if f1 > max_test_f1: max_test_f1 = f1 if strict_score > max_test_acc: max_test_acc = strict_score #test print '....................\n' f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] fine_grained_sent_predictions = { 1: [], 2: [], 3: [], 4: [], 5: [] } fine_grained_page_predictions = { 1: [], 2: [], 3: [], 4: [] } for dev_batch_id in dev_batch_start: # for each test batch batch_prob, error_i, pred_i = dev_model( dev_sents[dev_batch_id:dev_batch_id + batch_size], dev_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_sent_labels[dev_batch_id:dev_batch_id + batch_size], 
dev_claims[dev_batch_id:dev_batch_id + batch_size], dev_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_labels[dev_batch_id:dev_batch_id + batch_size]) error_sum += error_i batch_sent_labels = dev_sent_labels[ dev_batch_id:dev_batch_id + batch_size] batch_sent_names = dev_sent_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_names = dev_ground_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_labels = dev_labels[ dev_batch_id:dev_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) evi_sent_size, evi_page_size = count_sent_page( gold_sent_names) fine_grained_sent_predictions.get( evi_sent_size).append(instance_i) fine_grained_page_predictions.get( evi_page_size).append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 print '......sent...\n' for i in range(1, 6): predictions_i = fine_grained_sent_predictions.get(i) if len(predictions_i) > 0: strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions_i) print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1 else: print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0 print '......page...\n' for i in range(1, 5): predictions_i = fine_grained_page_predictions.get(i) if len(predictions_i) > 0: strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions_i) print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1 else: print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0 for dev_batch_id in dev_3th_batch_start: # for each test batch _, error_i, pred_i = dev_model( dev_3th_sents[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_labels[dev_batch_id:dev_batch_id + batch_size], dev_3th_claims[dev_batch_id:dev_batch_id + batch_size], dev_3th_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_3th_labels[dev_batch_id:dev_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, 
precision, recall, f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_test_acc
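# A minimal stand-alone sketch (not from the original script; names are illustrative)
# of the batch-start bookkeeping used above: each index list is built from full
# batches plus one extra start at size - batch_size, so the tail of the data is
# covered by a final overlapping full-size batch instead of a ragged partial batch.
import numpy as np

def make_batch_starts(size, batch_size):
    n_batches = size // batch_size
    return list(np.arange(n_batches) * batch_size) + [size - batch_size]

print(make_batch_starts(23, 5))   # -> [0, 5, 10, 15, 18]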
def BasicTheano(): #REPEAT vs TILE x = theano.tensor.fmatrix("x") z = theano.tensor.repeat(x, 1, axis = 0) z_one_more = theano.tensor.repeat(z, 2, axis = 1) foo = theano.function([x], z) foo_one_more = theano.function([z], z_one_more) a = np.array([[1, 2, 3]]).astype("float32") print('a.shape: ') print(a.shape) c = foo(a) c_one_more = foo_one_more(c) print("applying repeat along axis 0") print(c) print(c.shape) print("applying one more along axis 1") print(c_one_more) print(c_one_more.shape) z_tile = theano.tensor.tile(x, (3,2)) foo_tile = theano.function([x], z_tile) c_tile = foo_tile(a) print("applying tile along axis 0") print(c_tile) #TRANSPOSE vs RESHAPE vs DIMSHUFFLE v = theano.tensor.ivector("v") u = theano.tensor.ivector("u") u_dot_v = theano.tensor.dot(u, theano.tensor.transpose(v)) v_trans = theano.tensor.transpose(v) u_dot_v_no_transpose = theano.tensor.dot(u, v) foo_dot = theano.function([u, v], u_dot_v) foo_trans = theano.function([v], v_trans) foo_dot_no_transpose = theano.function([u, v], u_dot_v_no_transpose) v_value = np.array([1, 2, 3]).astype("int32") u_value = np.array([1, 2, 3]).astype("int32") foo_dot_value = foo_dot(u_value, v_value) foo_trans_value = foo_trans(v_value) foo_dot_no_transpose_value = foo_dot_no_transpose(u_value, v_value) print('dot product') print(foo_dot_value) print('dot product no transpose: ') print(foo_dot_no_transpose_value) print('transpose: ') print(foo_trans_value) print(foo_trans_value.shape) print('original shape') print(v_value.shape) print('v reshape') v_reshape = v.reshape((v.shape[0], 1)) print(v_reshape.type) foo_reshape = theano.function([v], v_reshape) foo_reshape_value = foo_reshape(v_value) print(foo_reshape_value.shape) #SUM v_sum_0 = v.sum(axis = 0) foo_sum_0 = theano.function([v], v_sum_0) foo_sum_0_value = foo_sum_0(v_value) print(foo_sum_0_value) #v_sum_1 = v_reshape.sum(axis = 1) #foo_sum_1 = theano.function([v], v_sum_1) #foo_sum_1_value = foo_sum_0(v_value) #print(foo_sum_1_value) #test reshape y = theano.tensor.ftensor3("y") y_shape = y.shape y_reshape = y.reshape((y_shape[1], y_shape[2]))#, y_shape[0] function_reshape = theano.function([y], y_reshape) y_value = np.ones((1,3,2)).astype("float32") print(y_value.shape) y_reshape_value = function_reshape(y_value) print('y_reshape:') print(y_reshape_value) #reshape matrix to tensor3 matrix = theano.tensor.fmatrix("matrix") print(matrix.type) mat_shape = matrix.shape mat_reshape = matrix.reshape((-1, mat_shape[0], mat_shape[1])) print(mat_reshape.type) mat_reshape_func= theano.function([matrix], mat_reshape) mat_value = np.ones((3,2)).astype("float32") mat_reshape_func_out = mat_reshape_func(mat_value) print(mat_value.shape) print("matrix to 3D tensor") print(mat_reshape_func_out.shape) print(mat_reshape_func_out) #creating a square matrix with the given vector as its diagonal given_vec = theano.tensor.fvector("given_vec") diag_mat = theano.tensor.nlinalg.AllocDiag()(given_vec) diag_function = theano.function([given_vec], diag_mat) given_vec_value = np.array([1, 2, 3]).astype("float32") diag_function_value = diag_function(given_vec_value) print("diagonal matrix is: ") print(diag_function_value) print(diag_function_value.shape) #multiply an element of vector (1*N) with a row/column of a matrix (N*D*1) multiply_vector_matrix = T.dot(diag_mat, y_reshape) result_function = theano.function([diag_mat, y_reshape], multiply_vector_matrix) output_value = result_function(diag_function_value, y_reshape_value) print(output_value.shape) print(output_value) #Reshape to convert tensor from 
matrix to vector/column/row/3D print("matrix to vector/row/column/3D") matrix_origin = theano.tensor.fmatrix('Mat') mat_2_vector = matrix_origin.reshape((matrix_origin.shape[0]*matrix_origin.shape[1], )) print(mat_2_vector.type) mat_2_row = matrix_origin.reshape((1, matrix_origin.shape[0]*matrix_origin.shape[1])) print(mat_2_row.type) mat_2_column = matrix_origin.reshape((matrix_origin.shape[0]*matrix_origin.shape[1], 1)) print(mat_2_column.type) mat_2_3dtensor= matrix_origin.reshape((-1, matrix_origin.shape[0], matrix_origin.shape[1])) print(mat_2_3dtensor.type) f = theano.function([matrix_origin], [mat_2_vector, mat_2_column, mat_2_row, mat_2_3dtensor]) input_value = np.array([[1.,2.], [3.,4.]]).astype("float32") print(input_value.shape) for output in f(np.array([[1., 2.], [3., 4.]]).astype("float32")): print(output.shape) print(output) #REPEAT for 3D tensor print("repeat 3D tensor") h_t = theano.tensor.tensor3("h_t") axis_scalar = theano.tensor.dscalar("axis") h_t_repeat = theano.tensor.repeat(h_t, 3,axis= 0) repeat_func = theano.function([h_t], h_t_repeat) input_value = np.ones((1,3,2)).astype("float32") input_value[0, 1, 1] = 5. input_value[0, 2, 1] = 3. repeat_func_out = repeat_func(input_value) print(repeat_func_out.shape) print("input value:") print(input_value) print("element in output: ") print(repeat_func_out[0, :, :]) print("out: ") print(repeat_func_out) #test (3,1, 2) -> (3, 3, 2) h_t_repeat_1 = theano.tensor.repeat(h_t, 3,axis= 1) repeat_func_1 = theano.function([h_t], h_t_repeat_1) print("repeat (312) to (332)") input_values_312 = np.ones((3,1, 2)).astype("float32") input_values_312[2, 0, 0] = 9. repeat_func_out_332 = repeat_func_1(input_values_312) print(repeat_func_out_332.shape) print("input value") print(input_values_312) print("out value") print(repeat_func_out_332) # print("repeat 2 times: ") # #Repeat 3D tensor 2 times with 2 different axes # b_value = np.ones((1, 1 ,5)).astype("float32") # b_value[0, 0, 2] = 3. # b_value[0, 0, 4] = 5. 
# print("1st time: ") # print(repeat_func(b_value)) # print(repeat_func(b_value).shape) # h_t_repeat_2x = theano.tensor.repeat(h_t_repeat, 2, axis = 1) # repeat_func_2x = theano.function([h_t_repeat], h_t_repeat_2x) # print("2nd time") # print(repeat_func_2x(repeat_func(b_value))) # print(repeat_func_2x(repeat_func(b_value)).shape) #Theano tensor concatenate z_t = theano.tensor.tensor3("z_t") concat = theano.tensor.concatenate([h_t, z_t], axis = 0) concat_func = theano.function([h_t, z_t], concat) z_t_input = np.ones((3,2,1)).astype("float32") h_t_input = np.ones((3,2,1)).astype("float32") print("concat : ") print(concat_func(h_t_input, z_t_input)) print(concat_func(h_t_input, z_t_input).shape) #T.arange, T.mean, T.log, T.neq print("T.arange function:") mat_y = theano.tensor.fmatrix("mat_y") colum_vector = mat_y[theano.tensor.arange(mat_y.shape[0]), :] t_arange_function = theano.function([mat_y], colum_vector) mat_y_value = np.random.randn(3,2).astype("float32") t_arange_out = t_arange_function(mat_y_value) print('input value:') print(mat_y_value) print('output value:') print(t_arange_out.shape) print(t_arange_out) #NUMPY example A(N, M, K) B(N, M) -> C(N, M) = A[arange(N), arange(M), B] using B as indexing matrix A = np.arange(4*2*5).reshape(4,2,5) B = np.arange(4*2).reshape(4,2)%5 # print('arange: ') # print(np.arange(A.shape[0])[:, np.newaxis]) # print(np.arange(A.shape[1])) C = A[np.arange(A.shape[0])[:, np.newaxis], np.arange(A.shape[1]), B] # print(A) print(B) print(C) print(C.shape) #Theano tensor slicing and assigning print("slicing theano") x_vector = theano.tensor.vector() y_slicing = x_vector[0::2] print(y_slicing.eval({x_vector: np.array([1,2, 3, 4]).astype("float32")})) #Theano split---------------------------------------- # print("split theano") # def split_half(x, axis = 0): # if theano.tensor.le(x.shape[axis], 1): # return x # size1 = x.shape[axis]/2 # size2 = x.shape[axis] - size1 # split_out = theano.tensor.split(x, [size1, size2], 2, axis = axis) # first_part= split_out[0] # second_part = split_out[1] # return (split_half(first_part), split_half(second_part)) # def split_6_along_axis(x, axis = 0): # size = [] # for i in range(6): # size.append(1) # return theano.tensor.split(x, size, 6, axis = axis) # split_x = theano.tensor.matrix("split_x") # axis_split = theano.tensor.lscalar() # split_y_first, split_y_second = split_half(split_x, axis= axis_split) # f_split = theano.function([split_x, axis_split], split_y_first, split_y_second) # print(f_split(np.arange(12).reshape(6, 2).astype("float32"), 0)) # # print(split_y.eval({split_x: np.arange(12).reshape(6, 2).astype("float32"), axis_split: 0})) # split_y_individual = split_6_along_axis(split_x, axis = axis_split) # f_split_individual = theano.function([split_x, axis_split], split_y_individual) # print(f_split_individual(np.arange(12).reshape(6, 2).astype("float32"), 0)) #T.dot between two 3D tensors------------------------- tensor_1 = theano.tensor.tensor3("tensor_1") tensor_2 = theano.tensor.tensor3("tensor_2") dot_2_tensors = theano.tensor.dot(tensor_1, tensor_2) dot_2_tensor_func = theano.function([tensor_1, tensor_2], dot_2_tensors) tensor_1_in = np.ones((3,2,2)).astype("float32") tensor_2_in = np.ones((2,2,3)).astype("float32") #2,1,3 -wrong out_dot_2_tensors = dot_2_tensor_func(tensor_1_in, tensor_2_in) print("dot between two 3D tensors") print(out_dot_2_tensors.shape) # print(out_dot_2_tensors) #Theano tensor identity_like print('tensor identity like') identity_3D = T.identity_like(tensor_1) identity_out = 
identity_3D.eval({tensor_1: tensor_1_in}) print(identity_out.shape) print(identity_out) print(tensor_1_in.shape) print(tensor_1_in) #T.repeat itself # bi = T.tensor3("bi") # bi = T.repeat(bi, 3, axis = 0) # out = bi.eval({bi: np.ones((1,3,2)).astype("float32")}) # print(out.shape) # out_func = theano.function([bi], bi) # print(out_func(np.ones((1,3,2)).astype("float32")).shape) #i_t = i_t + a_t - Add itself -------------------------- print("adding itself") a_t = T.fmatrix("a_t") h_t = T.fmatrix("h_t") i_t = h_t + a_t i_t = i_t + a_t # function_itself = i_t.eval({i_t: np.ones((2,2)).astype("float32"), a_t: np.ones((2,2)).astype("float32")}) function_itself = theano.function([h_t, a_t], i_t) function_itself_out = function_itself(np.zeros((2,2)).astype("float32"), np.ones((2,2)).astype("float32")) print(function_itself_out) # ------------------------------------------------------ #shared variable repeat - It works shared_var = theano.shared(name = "shared", value = np.ones((1,3, 2)).astype("float32"), borrow = True) shared_var = T.repeat(shared_var, 3, axis = 0) shared_var_reshape_out = shared_var.eval() print(shared_var_reshape_out.shape) #Test Max, Min, and along axis print("Test max, min") value_mat = np.asarray([[1.0, 2.0],[3.0, 4.0]]).astype("float32") test_tensor = theano.tensor.fmatrix("tensor") c = test_tensor.min() function_max = theano.function([test_tensor], c) out = function_max(value_mat) print(out) c_along = test_tensor.min(axis = 1) function_max_along = theano.function([test_tensor], c_along) out = function_max_along(value_mat) print(out) #rescale 3D tensor values to range [0, 1] print("scaling value of a tensor to range [0, 1]") def rescale_step(input_tensor): min_value = input_tensor.min() max_value = input_tensor.max() out_rescale = (input_tensor - min_value)/(max_value- min_value) return out_rescale input_rescale = theano.tensor.tensor3("in_rescale", dtype = theano.config.floatX) output_rescale, updates = theano.scan(fn=rescale_step, outputs_info=[], sequences=[input_rescale], non_sequences=[]) rescale_func = theano.function([input_rescale], output_rescale) input_rescale_value = np.linspace(1, 30, num = 2*5*3, dtype = theano.config.floatX).reshape(2, 5, 3) out_rescale = rescale_func(input_rescale_value) print(out_rescale) print(out_rescale.shape) print("input value") print(input_rescale_value)
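# A brief complementary sketch (not part of the original demo) making the
# repeat-vs-tile difference concrete: T.repeat duplicates each element in place
# along one axis, whereas T.tile replicates the whole block.
import numpy as np
import theano
import theano.tensor as T

x = T.fmatrix('x')
rep = T.repeat(x, 2, axis=1)   # each column duplicated next to itself
til = T.tile(x, (1, 2))        # whole matrix appended after itself
f = theano.function([x], [rep, til])

a = np.array([[1., 2., 3.]], dtype='float32')
r, t = f(a)
print(r)   # [[1. 1. 2. 2. 3. 3.]]
print(t)   # [[1. 2. 3. 1. 2. 3.]]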
def __init__(self, z_n, z_k, encoder_net, decoder_net, opt, iw=False, iw_samples=10, val_iw=False, val_iw_samples=100, regularizer=None, initializer=uniform_initializer(0.05), hard=True, tau0=5., tau_min=0.25, tau_decay=1e-6, srng=RandomStreams(123), eps=1e-9): self.z_n = z_n self.z_k = z_k self.encoder_net = encoder_net self.decoder_net = decoder_net self.srng = srng self.hard = hard self.iw = iw self.iw_samples = iw_samples self.val_iw = val_iw self.val_iw_samples = val_iw_samples self.ceps = T.constant(eps, name='epsilon', dtype='float32') # Temperature self.iteration = K.variable(0, dtype='int32', name='iteration') iter_updates = [(self.iteration, self.iteration + 1)] tau = T.constant(tau0, dtype='float32', name='tau0') if tau_decay > 0: tau_decay = T.constant(tau_decay, name='tau_decay', dtype='float32') tau_min = T.constant(tau_min, name='tau_min', dtype='float32') tau = tau / (1. + (tau_decay * self.iteration)) tau = T.nnet.relu(tau - tau_min) + tau_min self.tau = tau # Prior self.z_prior = T.ones((z_n, z_k), dtype='float32') / z_k pz_params = [] # Quantization span = (z_k - 1.) / 2. self.quant_np = (np.arange(z_k, dtype=np.float32) - span) / span self.quant = T.constant(self.quant_np, name='quant', dtype='float32') print("Quantization: {}".format(self.quant_np)) # Input input_x = T.fmatrix(name='input_x') # (n, input_units) rnd = srng.uniform(size=input_x.shape, low=0., high=1., dtype='float32') input_x_binary = T.gt(input_x, rnd) # (n, input_units) (train_loss, mean_nll_x, mean_kl, encode_updates, decode_updates) = self.calc_nll_tot(iw=iw, iw_samples=iw_samples, input_x_binary=input_x_binary, validation=False) val_loss, val_mean_nll_x, val_mean_kl, _1, _2 = self.calc_nll_tot( iw=val_iw, iw_samples=val_iw_samples, input_x_binary=input_x_binary, validation=True) # Validation function val_function = theano.function([input_x], [val_mean_nll_x, val_mean_kl, val_loss]) val_headers = ['Val NLL X', 'KL', 'Val NLL'] # Regularization self.params = pz_params + encoder_net.params + decoder_net.params reg_loss = T.constant(0.) 
if regularizer: for p in self.params: reg_loss += regularizer(p) # Training loss = train_loss + reg_loss train_updates = opt.get_updates(loss, self.params) all_updates = train_updates + iter_updates + decode_updates + encode_updates train_function = theano.function( [input_x], [mean_nll_x, mean_kl, reg_loss, loss, self.tau], updates=fix_updates(all_updates)) train_headers = ['NLL X', 'KL', 'Reg', 'Loss', 'Tau'] weights = (self.params + opt.weights + [self.iteration] + encoder_net.non_trainable_weights + decoder_net.non_trainable_weights) # Generation input_n = T.iscalar() logitrep = T.log(self.ceps + T.repeat( T.reshape(self.z_prior, (1, z_n, z_k)), repeats=input_n, axis=0)) zsamp = sample_one_hot(logits=logitrep, srng=srng) zqsamp = T.dot(zsamp, self.quant) # (n, z_n) xgen, _ = self.decode(zqsamp, validation=True) # rnd = srng.uniform(size=xgen.shape, low=0., high=1., dtype='float32') # xsamp = T.cast(T.gt(xgen, rnd), 'int32') generate_function = theano.function([input_n], xgen) # xsamp for binarized self.sample_z_function = theano.function([input_n], zqsamp) # Decoding input_zq = T.fmatrix() xgen, _ = self.decode(input_zq, validation=True) self.decode_function = theano.function([input_zq], xgen) # Autoencode # rnd = srng.uniform(low=0., high=1., dtype='float32', size=val_xpred.shape) # xout = T.cast(T.gt(val_xpred, rnd), dtype='float32') pz, z, _ = self.encode(input_x_binary, validation=True) # (n, z_n, z_k) zq = T.dot(z, self.quant) xpred, _ = self.decode(zq, validation=True) # (n, input_units) autoencode_function = theano.function( [input_x], [input_x_binary, xpred]) # xout for binarized super(GumbelQuantizedAutoencoder, self).__init__(train_headers=train_headers, val_headers=val_headers, train_function=train_function, generate_function=generate_function, val_function=val_function, autoencode_function=autoencode_function, weights=weights)
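# A minimal NumPy sketch of the temperature schedule built symbolically above
# (assuming the same defaults tau0=5.0, tau_decay=1e-6, tau_min=0.25): tau decays as
# tau0 / (1 + decay * iteration) and is clamped from below at tau_min, which is what
# relu(tau - tau_min) + tau_min computes.
import numpy as np

def tau_schedule(iteration, tau0=5.0, tau_decay=1e-6, tau_min=0.25):
    tau = tau0 / (1.0 + tau_decay * iteration)
    return np.maximum(tau - tau_min, 0.0) + tau_min   # == max(tau, tau_min)

for it in [0, 10**6, 10**7, 10**8]:
    print('iteration %d -> tau %.4f' % (it, tau_schedule(it)))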
def initial_states(self, batch_size, *args, **kwargs): return [ tensor.repeat(self.initial_state_[None, :], batch_size, 0), tensor.repeat(self.initial_cells[None, :], batch_size, 0) ]
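# A small illustrative sketch (hypothetical shared variable, not the class above) of
# the pattern used by initial_states: a learned per-unit vector is lifted to a
# (batch_size, n_units) matrix by indexing with None and repeating along axis 0.
import numpy as np
import theano
import theano.tensor as T

n_units = 4
initial_state = theano.shared(np.zeros(n_units, dtype='float32'), name='h0')
batch_size = T.iscalar('batch_size')
h0_batch = T.repeat(initial_state[None, :], batch_size, axis=0)  # (batch, n_units)

f = theano.function([batch_size], h0_batch)
print(f(np.int32(3)).shape)   # (3, 4)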
def __init__(self, rng, x, n_in, n_out, p=0.0, training=1, rnn_batch_training=False): """ This is to initialise a standard RNN hidden unit :param rng: random state, fixed value for randome state for reproducible objective results :param x: input data to current layer :param n_in: dimension of input data :param n_out: dimension of output data :param p: the probability of dropout :param training: a binary value to indicate training or testing (for dropout training) """ self.input = x if p > 0.0: if training == 1: srng = RandomStreams(seed=123456) self.input = T.switch(srng.binomial(size=x.shape, p=p), x, 0) else: self.input = (1 - p) * x #(1-p) * self.n_in = int(n_in) self.n_out = int(n_out) self.rnn_batch_training = rnn_batch_training # random initialisation Wx_value = np.asarray(rng.normal(0.0, old_div(1.0, np.sqrt(n_in)), size=(n_in, n_out)), dtype=config.floatX) Wy_value = np.asarray(np.zeros((n_out, n_out)), dtype=config.floatX) # Input gate weights self.W_xi = theano.shared(value=Wx_value, name='W_xi') self.W_yi = theano.shared(value=Wy_value, name='W_yi') # bias self.b_y = theano.shared(value=np.zeros((n_out, ), dtype=config.floatX), name='b_y') # initial value of output if self.rnn_batch_training: self.y0 = theano.shared(value=np.zeros((1, n_out), dtype=config.floatX), name='y0') self.y0 = T.repeat(self.y0, x.shape[1], 0) else: self.y0 = theano.shared(value=np.zeros((n_out, ), dtype=config.floatX), name='y0') self.Wix = T.dot(self.input, self.W_xi) self.y, _ = theano.scan(self.recurrent_as_activation_function, sequences=self.Wix, outputs_info=self.y0) self.output = self.y self.params = [self.W_xi, self.W_yi, self.b_y]
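# A hedged, stand-alone sketch of the dropout branches in the layer above (names are
# illustrative). Note that srng.binomial(p=p) emits 1 with probability p, so in the
# training branch p behaves as the keep probability, while the test branch scales by
# (1 - p); which convention is intended is worth double-checking against the caller.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

x = T.fmatrix('x')
p = 0.5
srng = RandomStreams(seed=123456)
train_out = T.switch(srng.binomial(size=x.shape, p=p), x, 0)  # random zeroing
test_out = (1 - p) * x                                        # deterministic scaling

f_train = theano.function([x], train_out)
f_test = theano.function([x], test_out)
v = np.ones((2, 4), dtype='float32')
print(f_train(v))
print(f_test(v))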
def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient, learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.learning_rate = learning_rate self.truncate_gradient = truncate_gradient self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.mode = mode self.patches = patches self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind(self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind(self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) # Since this is pretty expensive, we will pass a story each time. # We assume that the input has been processed such that the sequences of patches # are snake like path. self.input_var = T.tensor4('input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.matrix('q_var') # Now, it's a batch * image_sieze. self.answer_var = T.imatrix('answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3('answer_inp_var') # answer of example in minibatch print "==> building input module" self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # First, we embed the visual features before sending it to the bi-GRUs. inp_rhp = T.reshape(self.input_var, (self.batch_size* self.story_len* self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1,0) inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1,0) inp_emb_raw = T.reshape(inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh(inp_emb_raw) # Just follow the paper DMN for visual and textual QA. # Now, we use a bi-directional GRU to produce the input. # Forward GRU. self.inp_dim = self.dim/2 # since we have forward and backward self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) # Backward GRU. 
self.W_inpb_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) self.W_inpb_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) self.W_inpb_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,)) # Now, we use the GRU to build the inputs. # Two-level of nested scan is unnecessary. It will become too complicated. Just use this one. inp_dummy = theano.shared(np.zeros((self.inp_dim, self.story_len), dtype = floatX)) for i in range(self.batch_size): if i == 0: inp_1st_f, _ = theano.scan(fn = self.input_gru_step_forward, sequences = inp_emb[i,:].dimshuffle(1,2,0), outputs_info=T.zeros_like(inp_dummy)) inp_1st_b, _ = theano.scan(fn = self.input_gru_step_backward, sequences = inp_emb[i,:,::-1,:].dimshuffle(1,2,0), outputs_info=T.zeros_like(inp_dummy)) # Now, combine them. inp_1st = T.concatenate([inp_1st_f.dimshuffle(2,0,1), inp_1st_b.dimshuffle(2,0,1)], axis = -1) self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2) else: inp_f, _ = theano.scan(fn = self.input_gru_step_forward, sequences = inp_emb[i,:].dimshuffle(1,2,0), outputs_info=T.zeros_like(inp_dummy)) inp_b, _ = theano.scan(fn = self.input_gru_step_backward, sequences = inp_emb[i,:,::-1,:].dimshuffle(1,2,0), outputs_info=T.zeros_like(inp_dummy)) # Now, combine them. inp_fb = T.concatenate([inp_f.dimshuffle(2,0,1), inp_b.dimshuffle(2,0,1)], axis = -1) self.inp_c = T.concatenate([self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis = 0) # Done, now self.inp_c should be batch_size x story_len x patches x cnn_dim # Eventually, we can flattern them. # Now, the input dimension is 1024 because we have forward and backward. inp_c_t = T.reshape(self.inp_c, (self.batch_size, self.story_len * self.patches, self.dim)) inp_c_t_dimshuffled = inp_c_t.dimshuffle(0,'x', 1, 2) inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis = 1) # Now, its ready for all the 5 images in the same story. # 50 * 980 * 512 self.inp_batch = T.reshape(inp_batch, (inp_batch.shape[0] * inp_batch.shape[1], inp_batch.shape[2], inp_batch.shape[3])) self.inp_batch_dimshuffled = self.inp_batch.dimshuffle(1,2,0) # 980 x 512 x 50 # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) print "==> building question module" # Now, share the parameter with the input module. self.W_inp_emb_q = nn_utils.normal_param(std = 0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_inp_emb_q = nn_utils.normal_param(std = 0.1, shape=(self.dim,)) q_var_shuffled = self.q_var.dimshuffle(1,0) inp_q = T.dot(self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle(0,'x') # 512 x 50 self.q_q = T.tanh(inp_q) # Since this is used to initialize the memory, we need to make it tanh. 
print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) logging.info('last_mem size') print last_mem.shape.eval({self.input_var: np.random.rand(10,5,196,512).astype('float32'), self.q_var: np.random.rand(50, 4096).astype('float32')}) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1,2,0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.dim * 2)) self.W_inp_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.vocab_size + 1)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb,_ = theano.scan(fn = _dot2, sequences = answer_inp_var_shuffled, non_sequences = self.W_inp_emb ) # seq x dim x batch # Now, we also need to embed the image and use it to do the memory. #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch. init_ans = T.concatenate([self.q_q, last_mem], axis = 0) mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize. mem_ans_dim = mem_ans.dimshuffle('x',0,1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0) # Now, we have both embedding. We can let them go to the rnn. # We also need to map the input layer as well. 
dummy = theano.shared(np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) logging.info('answer_inp size') #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32')}) #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn = self.answer_gru_step, sequences = answer_inp, outputs_info = [ dummy ]) # Assume there is a start token #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') results = results[1:-1,:,:] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob,_ = theano.scan(fn = lambda x, w: T.dot(w, x), sequences = results, non_sequences = self.W_a ) prob_shuffled = prob.dimshuffle(2,0,1) # b * len * vocab logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) self.prediction = prob_sm mask = T.reshape(self.answer_mask, (n,)) lbl = T.reshape(self.answer_var, (n,)) self.params = [self.W_inp_emb_in, #self.b_inp_emb_in, self.W_inpf_res_in, self.W_inpf_res_hid,self.b_inpf_res, self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd, self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid, self.W_inpb_res_in, self.W_inpb_res_hid, self.b_inpb_res, self.W_inpb_upd_in, self.W_inpb_upd_hid, self.b_inpb_upd, self.W_inpb_hid_in, self.W_inpb_hid_hid, self.b_inpb_hid, self.W_inp_emb_q, self.b_inp_emb_q, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_mem_emb, self.W_inp_emb, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec ).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate = self.learning_rate) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss])
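# A compact, self-contained sketch (illustrative shapes and names, not the DMN graph
# above) of the masked sequence loss assembled at the end of this constructor:
# per-token cross-entropy is multiplied by the answer mask and normalised by the
# number of unmasked tokens.
import numpy as np
import theano
import theano.tensor as T

probs = T.fmatrix('probs')     # (n_tokens, vocab), rows already softmax-normalised
labels = T.ivector('labels')   # (n_tokens,)
mask = T.fvector('mask')       # (n_tokens,), 1 for real tokens, 0 for padding

loss_vec = T.nnet.categorical_crossentropy(probs, labels)
masked_loss = (mask * loss_vec).sum() / mask.sum()
f = theano.function([probs, labels, mask], masked_loss)

p = np.full((3, 4), 0.25, dtype='float32')
y = np.array([0, 1, 2], dtype='int32')
m = np.array([1., 1., 0.], dtype='float32')
print(f(p, y, m))   # mean of -log(0.25) over the two unmasked tokens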
def construct_graph_popstats(self, args, x, drops, length, popstats=None): p = self.allocate_parameters(args) def stepfn(x, drops, dummy_h, dummy_c, pop_means_a, pop_means_b, pop_means_c, pop_vars_a, pop_vars_b, pop_vars_c, h, c): atilde = T.dot(h, p.Wa) btilde = x if args.baseline: a_normal, a_mean, a_var = bn(atilde, 1.0, p.ab_betas, pop_means_a, pop_vars_a, args) b_normal, b_mean, b_var = bn(btilde, 1.0, 0, pop_means_b, pop_vars_b, args) else: a_normal, a_mean, a_var = bn(atilde, p.a_gammas, p.ab_betas, pop_means_a, pop_vars_a, args) b_normal, b_mean, b_var = bn(btilde, p.b_gammas, 0, pop_means_b, pop_vars_b, args) ab = a_normal + b_normal g, f, i, o = [ fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden]) for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid]) ] if args.elephant: c_n = dummy_c + f * c + drops * (i * g) else: c_n = dummy_c + f * c + i * g if args.baseline: c_normal, c_mean, c_var = bn(c_n, 1.0, p.c_betas, pop_means_c, pop_vars_c, args) else: c_normal, c_mean, c_var = bn(c_n, p.c_gammas, p.c_betas, pop_means_c, pop_vars_c, args) h_n = dummy_h + o * self.activation(c_normal) ## Zoneout if args.zoneout: h = h_n * drops + (1 - drops) * h c = c_n * drops + (1 - drops) * c else: h = h_n c = c_n return (h, c, atilde, btilde, c_normal, a_mean, b_mean, c_mean, a_var, b_var, c_var) xtilde = T.dot(x, p.Wx) if args.noise: # prime h with white noise Trng = MRG_RandomStreams() h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise) elif args.summarize: # prime h with mean of example h_prime = x.mean(axis=[0, 2])[:, None] else: h_prime = 0 dummy_states = dict(h=T.zeros( (xtilde.shape[0], xtilde.shape[1], args.num_hidden)), c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden))) if popstats is None: popstats = OrderedDict() for key, size in zip( "abc", [4 * args.num_hidden, 4 * args.num_hidden, args.num_hidden]): for stat, init in zip("mean var".split(), [0, 1]): name = "%s_%s" % (key, stat) popstats[name] = theano.shared(init + np.zeros( ( length, size, ), dtype=theano.config.floatX), name=name) popstats_seq = [ popstats['a_mean'], popstats['b_mean'], popstats['c_mean'], popstats['a_var'], popstats['b_var'], popstats['c_var'] ] [ h, c, atilde, btilde, htilde, batch_mean_a, batch_mean_b, batch_mean_c, batch_var_a, batch_var_b, batch_var_c ], _ = theano.scan( stepfn, sequences=[xtilde, drops, dummy_states["h"], dummy_states["c"]] + popstats_seq, outputs_info=[ T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime, T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None, None, None, None, None, None, None, None ]) batchstats = OrderedDict() batchstats['a_mean'] = batch_mean_a batchstats['b_mean'] = batch_mean_b batchstats['c_mean'] = batch_mean_c batchstats['a_var'] = batch_var_a batchstats['b_var'] = batch_var_b batchstats['c_var'] = batch_var_c updates = OrderedDict() if not args.use_population_statistics: alpha = 1e-2 for key in "abc": for stat, init in zip("mean var".split(), [0, 1]): name = "%s_%s" % (key, stat) popstats[name].tag.estimand = batchstats[name] updates[popstats[name]] = (alpha * batchstats[name] + (1 - alpha) * popstats[name]) return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), updates, dummy_states, popstats
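# A small numeric sketch of the population-statistics update applied above when
# population statistics are not being used directly: each stored statistic is an
# exponential moving average of the corresponding batch statistic, with rate alpha
# (1e-2 in the code above).
import numpy as np

def update_popstat(pop_stat, batch_stat, alpha=1e-2):
    return alpha * batch_stat + (1.0 - alpha) * pop_stat

pop_mean = np.zeros(4)
for step in range(1000):
    batch_mean = np.random.randn(4) + 2.0   # pretend batch means hover around 2
    pop_mean = update_popstat(pop_mean, batch_mean)
print(pop_mean)   # drifts toward ~2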
def matrixify(vector, n): return T.repeat(T.shape_padleft(vector), n, axis=0)
def onestep_attend_tell(x_t, pre_h, pre_c, pre_z, Wi, Wf, Wc, Wo, Ui, Uf, Uc, Uo, Zi, Zf, Zc, Zo, Zcontext, Hcontext, Va, bi, bf, bc, bo, image_feature_region, weight_y): #------------------------------------------------- # pre_h = T.tensor3(name = 'h0_initial', dtype = theano.config.floatX) # x_t = T.tensor3(name ='x', dtype=theano.config.floatX) # pre_z = T.tensor3(name= 'z0_initial', dtype = theano.config.floatX) # Wi, Ui, Zi = T.fmatrices(3) # bi = T.ftensor3("bi") #------------------------------------------------- i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi) i_t_shape = T.shape(i_t) #------------------------------------------------------------------ # i_t_test = i_t.eval({x_t: x_theano, pre_h: h0_theano, pre_z: z0_theano, Wi: Wx[:, :H], Ui: Wh[:, :H], Zi: Wz[:, :H]}) # print(i_t_test.shape) # pdb.set_trace() #------------------------------------------------------------------ bi_reshape = T.repeat(bi, i_t_shape[0], 0) bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1) # ----------------------------------------------------------------- # bi_test = bi_reshape_2x.eval({bi: b_theano[:,:,:H], i_t: i_t.eval({i_t: np.zeros((1,2,4)).astype("float32")})}) # print(bi_test.shape) # pdb.set_trace() # ------------------------------------------------------------------ bf_reshape = T.repeat(bf, i_t_shape[0], 0) bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1) bc_reshape = T.repeat(bc, i_t_shape[0], 0) bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1) bo_reshape = T.repeat(bo, i_t_shape[0], 0) bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1) i_t_new= sigmoid(i_t + bi_reshape_2x) # ------------------------------------------------------------------ # i_t_new_eval = i_t_new.eval({i_t: np.zeros((1,2,4)).astype("float32"), bi: b_theano[:, : , :H]}) # print(i_t_new_eval.shape) # pdb.set_trace() # -------------------------------------------------------------------- f_t= sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x) # -------------------------------------------------------------------- # f_t_eval = f_t.eval({x_t:x_theano, pre_h: h0_theano, pre_z: z0_theano, # Wf: Wx[:, H:2*H], # Uf: Wh[:, H:2*H], # Zf: Wz[:, H:2*H], # bf: b_theano[:, :, H:2*H], # i_t: np.zeros((1,2,4)).astype("float32")}) # print(f_t_eval.shape) # pdb.set_trace() # -------------------------------------------------------------------- o_t= sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x) c_th = tanh(T.dot(x_t, Wc) + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x) c_t = f_t*pre_c + i_t_new*c_th h_t = o_t*T.tanh(c_t) #shape (1, N, h_dim) # ------------------------------------------------------------------ # ht_test = h_t.eval({x_t:x_theano, pre_h: h0_theano, pre_c: c0_theano, pre_z: z0_theano, # Wi: Wx[:, :H], Wf: Wx[:, H:2*H], Wo: Wx[:, 2*H:3*H], Wc: Wx[:, 3*H:], # Ui: Wh[:, :H], Uf: Wh[:, H:2*H], Uo: Wh[:, 2*H:3*H], Uc: Wh[:, 3*H:], # Zi: Wz[:, :H], Zf: Wz[:, H:2*H], Zo: Wz[:, 2*H:3*H], Zc: Wz[:, 3*H:], # bi: b_theano[:,:,:H], bf: b_theano[:, :, H:2*H], bo: b_theano[ :, :, 2*H:3*H], bc: b_theano[:,:, 3*H:]}) # print(ht_test.shape) # pdb.set_trace() # ------------------------------------------------------------------ h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis = 0) #new shape (No_region, N, h_dim) image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2)) #compute non-linear correlation between h_t(current text) to image_feature_region (64 for 128*128 and 196 for 224*224) # pdb.set_trace() m_t = T.tanh(T.dot(h_t_context, 
Hcontext) + T.dot(image_feature_reshape, Zcontext)) #shape (No_region, N, context_dim) # ------------------------------------------------------------------ # N = 2 #number of sample # D = 5 #dimension of input # H = 4 #dimension of hidden # T_new = 1 #length of per each sample # context_dim = 3 # K = 5 # x = np.linspace(-0.4, 0.6, num=N*T_new*D, dtype = theano.config.floatX).reshape(T_new, N, D) # h0= np.linspace(-0.4, 0.8, num=N*H, dtype = theano.config.floatX).reshape(N, H) # Wx= np.linspace(-0.2, 0.9, num=4*D*H, dtype = theano.config.floatX).reshape(D, 4*H) # Wh= np.linspace(-0.3,0.6, num =4*H*H, dtype = theano.config.floatX).reshape(H,4*H) # b = np.linspace(0.0, 0.0, num = 4*H, dtype = theano.config.floatX) # Wz= np.linspace(-0.3, 0.6, num=4*H*context_dim, dtype = theano.config.floatX).reshape(context_dim, 4*H) # Hcontext_in = np.linspace(-0.2, 0.6, num=H*K, dtype = theano.config.floatX).reshape(H, K) # Zcontext_in = np.linspace(-0.2, 0.5, num=context_dim*K, dtype= theano.config.floatX).reshape(context_dim, K) # Va= np.linspace(0.1, 0.4, num=K, dtype = theano.config.floatX) # Va_reshape = Va.reshape(K,1) # image_feature_3D = np.linspace(-0.2, 0.5, num=10*N*context_dim, dtype = theano.config.floatX).reshape(N,10, context_dim) # h0_theano = h0.reshape(1, N, H) # # h0_symb = theano.tensor.ftensor3("h_symb") # # lstm_theano_layer.h_m1.set_value(h0_theano) # c0_theano = np.zeros((1, N, H), dtype = theano.config.floatX) # # c0_symb = theano.tensor.ftensor3("c_symb") # # lstm_theano_layer.c_m1.set_value(c0_theano) # z0_theano = np.zeros((1, N, context_dim), dtype = theano.config.floatX) # x_theano = x.reshape(T_new, N, D) # image_feature_input = image_feature_3D # weight_y_in_value = np.zeros(( 10, context_dim) , dtype= theano.config.floatX) # b_theano= b.reshape(1, 1, 4*H) # h_t_context_eval = m_t.eval({h_t: np.ones((1,2,4)).astype("float32"), image_feature_region: image_feature_input, Hcontext: Hcontext_in, Zcontext: Zcontext_in}) # print(h_t_context_eval.shape) # pdb.set_trace() # ------------------------------------------------------------------ e = T.dot(m_t, Va) #No_region, N, 1 e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:]))) # ------------------------------------------------------------------ # Va_in= np.linspace(0.1, 0.4, num=5*1, dtype = theano.config.floatX).reshape(5,1) # Va_reshape = Va_in.reshape(5,1).astype("float32") # # print(Va_reshape) # e_val = e_reshape.eval({m_t: np.ones((10,2,5)).astype("float32"), Va: Va_reshape}) #np.ones((10,2,5)).astype("float32") # print(e_val.shape) # ------------------------------------------------------------------ e_softmax = softmax_along_axis(e_reshape, axis = 0) #shape No_region, N # ------------------------------------------------------------------- # pdb.set_trace() # e_softmax_eval = e_softmax.eval({e_reshape: np.random.randn(10,2).astype("float32")}) # print(e_softmax_eval.shape) # ------------------------------------------------------------------- e_t = T.transpose(e_softmax, (1,0)) #shape N, No_region e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]]) #3D tensor 1, N, No_region e_t_r_t = T.transpose(e_t_r, (1,0, 2)) # shape N, 1, No_region e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis = 1) #shape N, No_region, No_region image_feature_region.shape[1] e_3D_t = T.transpose(e_3D, (1,2,0)) #No_region, No_region, N # --------------------------------------------------------------------- # image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3) # e_3D_t_eval = 
e_3D_t.eval({e_softmax: np.random.randn(10,2).astype("float32")}) # print(e_3D_t_eval.shape) # pdb.set_trace() # --------------------------------------------------------------------- identity_2D = T.identity_like(e_3D_t)# shape No_region, No_region identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]]) # shape 1, No_region, No_region identity_3D_t = T.repeat(identity_3D, image_feature_region.shape[0], axis = 0) e_3D_diagonal = e_3D*identity_3D_t #diagonal tensor 3D (N, No_region, No_region) # ---------------------------------------------------------------------- # image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3) # e_3D_diagonal_eval = e_3D_diagonal.eval({e_3D_t: np.ones((10, 10, 2)).astype("float32"), # image_feature_region: image_feature_3D, # e_3D: np.ones((2, 10, 10)).astype("float32")}) # print(e_3D_diagonal_eval) # pdb.set_trace() # ---------------------------------------------------------------------- # weight_y = T.fmatrix("weight_y") out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply, outputs_info=[weight_y], sequences=[e_3D_diagonal, image_feature_region], non_sequences=[]) #out_weight_y shape (N, No_region, feature_dim) z_t = T.sum(out_weight_y, axis = 1) #shape (N, feature_dim) z_t_r = z_t.reshape((-1,z_t.shape[0],z_t.shape[1])) #------------------------------------------------------------------------ # pdb.set_trace() # image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3) # z_t_r_eval = z_t_r.eval({e_3D_diagonal: np.ones((2,10,10)).astype("float32"), image_feature_region: image_feature_3D, weight_y: np.zeros((10,3)).astype("float32")}) # print(z_t_r_eval.shape) # pdb.set_trace() # ----------------------------------------------------------------------- return [h_t, c_t, z_t_r]
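# A minimal sketch of the bias-broadcast idiom used in the step function above: a
# bias stored as a (1, 1, H) tensor is expanded to (T, N, H) with two T.repeat calls
# before being added to the gate pre-activations.  (A broadcastable (1, 1, H) bias
# would also broadcast automatically under addition; the explicit repeats just make
# the shapes concrete.)
import numpy as np
import theano
import theano.tensor as T

b = T.ftensor3('b')          # (1, 1, H)
gates = T.ftensor3('gates')  # (T, N, H)
b_full = T.repeat(T.repeat(b, gates.shape[0], axis=0), gates.shape[1], axis=1)
out = gates + b_full

f = theano.function([gates, b], out)
g = np.zeros((2, 3, 4), dtype='float32')
bias = np.arange(4, dtype='float32').reshape(1, 1, 4)
print(f(g, bias).shape)   # (2, 3, 4)
print(f(g, bias)[0, 0])   # [0. 1. 2. 3.]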
def initial_states(self, batch_size, *args, **kwargs): return tensor.repeat( tensor.ones(self.parameters[1][None, :].shape), batch_size, 0)
def _get_initial_states(self, batch_size): init_h = T.repeat(self.init_h.dimshuffle('x', 0), batch_size, axis=0) init_o = apply_model(self.readout, init_h) return init_h, init_o
def __init__(self, x_h_0, v_h_0, t_h_0, x_t_0, v_t_0, a_t_0, t_t_0, time_steps, exist, is_leader, x_goal, turn_vec_h, turn_vec_t, n_steps, lr, game_params, arch_params, solver_params, params): self._init_layers(params, arch_params, game_params) self._connect(game_params, solver_params) def _dist_from_rail(pos, rail_center, rail_radius): d = tt.sqrt(((pos - rail_center)**2).sum()) return tt.sum((d - rail_radius)**2) def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_t, ctrl, exist, time_step): a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_h, exist, time_step) t_h = common.disconnected_grad(t_h) t_t = common.disconnected_grad(t_t) # approximated dynamic of the un-observed parts in the state a_t_a = tt.zeros(shape=(3, 2), dtype=np.float32) v_t_a = v_t_ x_t_a = x_t_ + self.dt * v_t_a # difference in predictions n_v_t = v_t_e - v_t_a n_a_t = a_t_e - a_t_a n_x_t = x_t_e - x_t_a # disconnect the gradient of the noise signals n_v_t = common.disconnected_grad(n_v_t) n_a_t = common.disconnected_grad(n_a_t) n_x_t = common.disconnected_grad(n_x_t) # add the noise to the approximation a_t = a_t_a + n_a_t v_t = v_t_a + n_v_t x_t = x_t_a + n_x_t # update the observed part of the state delta_steer = ctrl[0] accel = ctrl[1] delta_steer = tt.clip(delta_steer, -np.pi / 4, np.pi / 4) angle = angle_ + delta_steer speed = speed_ + accel * self.dt speed = tt.clip(speed, 0, self.v_max) v_h_x = speed * tt.sin(angle) v_h_y = speed * tt.cos(angle) v_h = tt.stack([v_h_x, v_h_y]) x_h = x_h_ + self.dt * v_h x_h = tt.clip(x_h, -self.bw, self.bw) return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t def _recurrence(time_step, x_h_, v_h_, angle_, speed_, t_h_, x_t_, v_t_, a_t_, t_t_, exist, is_leader, x_goal, turn_vec_h, turn_vec_t): # state ''' 1. host 1.1 position (2) - (x,y) coordinates in cross coordinate system 1.2 speed (2) - (v_x,v_y) # 1.3 acceleration (2) - (a_x,a_y) # 1.4 waiting time (1) - start counting on full stop. stop counting when clearing the junction 1.5 x_goal (2) - destination position (indicates different turns) total = 5 2. right lane car 2.1 position (2) - null value = (-1,-1) 2.2 speed (2) - null value = (0,0) 2.3 acceleration (2) - null value = (0,0) 2.4 waiting time (1) - null value = 0 total = 7 3. front lane car 3.1 position (2) 3.2 speed (2) 3.3 acceleration (2) 3.4 waiting time (1) total = 7 4. target 3 4.1 position (2) 4.2 speed (2) 4.3 acceleration (2) 4.4 waiting time (1) total = 7 total = 26 ''' # host_state_vec = tt.concatenate([x_h_, v_h_, t_h_]) ang_spd = tt.stack([angle_, speed_]) host_state_vec = tt.concatenate([x_h_, ang_spd, x_goal]) # target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), tt.flatten(t_t_)]) target_state_vec = tt.concatenate([ tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), is_leader ]) state = tt.concatenate([host_state_vec, target_state_vec]) h0 = tt.dot(state, self.W_0) + self.b_0 relu0 = tt.nnet.relu(h0) h1 = tt.dot(relu0, self.W_1) + self.b_1 relu1 = tt.nnet.relu(h1) h2 = tt.dot(relu1, self.W_2) + self.b_2 relu2 = tt.nnet.relu(h2) a_h = tt.dot(relu2, self.W_c) x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t = _step_state( x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_t, a_h, exist, time_step) # cost: discount_factor = 0.99**time_step # 0. smooth driving policy cost_steer = discount_factor * a_h[0]**2 cost_accel = discount_factor * a_h[1]**2 # 1. 
forcing the host to move forward dist_from_goal = tt.mean((x_goal - x_h)**2) cost_progress = discount_factor * dist_from_goal # 2. keeping distance from in front vehicles d_t_h = x_t - x_h h_t_dists = (d_t_h**2).sum(axis=1) # v_h_norm = tt.sqrt((v_h**2).sum()) # d_t_h_norm = tt.sqrt((d_t_h**2).sum(axis=1)) # # denominator = v_h_norm * d_t_h_norm # # host_targets_orientation = tt.dot(d_t_h, v_h) / (denominator + 1e-3) # # in_fornt_targets = tt.nnet.sigmoid(5 * host_targets_orientation) # # close_targets = tt.sum(tt.abs_(d_t_h)) # # cost_accident = tt.mean(in_fornt_targets * close_targets) cost_accident = tt.sum( tt.nnet.relu(self.require_distance - h_t_dists)) # 3. rail divergence cost_right_rail = _dist_from_rail( x_h, self.right_rail_center, self.right_rail_radius) * turn_vec_h[0] cost_front_rail = (x_h[0] - self.lw / 2)**2 * turn_vec_h[1] cost_left_rail = _dist_from_rail( x_h, self.left_rail_center, self.left_rail_radius) * turn_vec_h[2] cost_rail = cost_right_rail + cost_left_rail + cost_front_rail return (x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t, cost_steer, cost_accel, cost_progress, cost_accident, cost_rail, a_h), t.scan_module.until(dist_from_goal < 0.001) [ x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t, costs_steer, costs_accel, costs_progress, costs_accident, costs_rail, a_hs ], scan_updates = t.scan( fn=_recurrence, sequences=time_steps, outputs_info=[ x_h_0, v_h_0, 0., 0., t_h_0, x_t_0, v_t_0, a_t_0, t_t_0, None, None, None, None, None, None ], non_sequences=[exist, is_leader, x_goal, turn_vec_h, turn_vec_t], n_steps=n_steps, name='scan_func') # 3. right of way cost term T = x_h.shape[0] x_h_rpt_1 = tt.repeat(x_h, T, axis=1) # (Tx2T) x_h_rpt_1_3d = x_h_rpt_1.dimshuffle(0, 1, 'x') # (Tx2Tx1) x_h_3D = tt.repeat(x_h_rpt_1_3d, 3, axis=2) # (Tx2Tx3) x_t_rshp_1 = tt.zeros(shape=(2 * T, 3), dtype=np.float32) # (2Tx3) x_t_rshp_1_x = tt.set_subtensor(x_t_rshp_1[:T, :], x_t[:, :, 0]) x_t_rshp_1_xy = tt.set_subtensor(x_t_rshp_1_x[T:, :], x_t[:, :, 1]) x_t_rshp_1_3d = x_t_rshp_1_xy.dimshuffle(0, 1, 'x') # (2Tx3x1) x_t_rpt_2_3d = tt.repeat(x_t_rshp_1_3d, T, axis=2) # (2Tx3xT) x_t_3D = x_t_rpt_2_3d.dimshuffle(2, 0, 1) # (Tx2Tx3) # abs_diff_mat = tt.abs_(x_h_3D - x_t_3D) # (Tx2Tx3) abs_diff_mat = (x_h_3D - x_t_3D)**2 # (Tx2Tx3) dists_mat = abs_diff_mat[:, : T, :] + abs_diff_mat[:, T:, :] # d_x+d_y: (TxTx3) # punish only when cutting a leader host_effective_dists = (tt.triu(dists_mat[:, :, 0]) * is_leader[0] + tt.triu(dists_mat[:, :, 1]) * is_leader[1] + tt.triu(dists_mat[:, :, 2]) * is_leader[2]) costs_row = tt.mean( tt.nnet.sigmoid(self.eps_row - host_effective_dists)) self.cost_steer = tt.mean(costs_steer) self.cost_accel = tt.mean(costs_accel) self.cost_progress = tt.mean(costs_progress) self.cost_accident = tt.mean(costs_accident) self.cost_row = tt.mean(costs_row) self.cost_rail = tt.mean(costs_rail) self.weighted_cost = ( self.w_delta_steer * self.cost_steer + self.w_accel * self.cost_accel + self.w_progress * self.cost_progress + self.w_accident * self.cost_accident + # self.w_row * self.cost_row self.w_rail * self.cost_rail) self.cost = ( self.cost_steer + self.cost_accel + self.cost_progress + self.cost_accident + # self.cost_row self.cost_rail) objective = self.weighted_cost objective = common.weight_decay(objective=objective, params=self.params, l1_weight=self.l1_weight) objective = t.gradient.grad_clip(objective, -self.grad_clip_val, self.grad_clip_val) gradients = tt.grad(objective, self.params) self.updates = optimizers.optimizer(lr=lr, param_struct=self, 
gradients=gradients, solver_params=solver_params) self.x_h = x_h self.v_h = v_h self.x_t = x_t self.v_t = v_t self.max_a = tt.max(abs(a_hs)) self.max_grad_val = 0 self.grad_mean = 0 for g in gradients: self.grad_mean += tt.mean(tt.abs_(g)) self.max_grad_val = tt.maximum(self.max_grad_val, tt.max(g)) self.params_abs_norm = self._calc_params_norm()
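The right-of-way term above assembles a (T x 2T x 3) tensor with tt.repeat and dimshuffle and then folds it into per-pair squared host-target distances. A minimal, hedged sketch of the same quantity using broadcasting only; the shapes and variable names below are illustrative assumptions, not the author's API:

import numpy as np
import theano
import theano.tensor as tt

x_h = tt.matrix('x_h')    # (T, 2): host position over time
x_t = tt.tensor3('x_t')   # (T, 3, 2): the three target cars over time

# broadcast (T, 1, 1, 2) against (1, T, 3, 2), then sum the squared x/y gaps
diff = x_h.dimshuffle(0, 'x', 'x', 1) - x_t.dimshuffle('x', 0, 1, 2)
dists_mat = (diff ** 2).sum(axis=-1)   # (T, T, 3)

f = theano.function([x_h, x_t], dists_mat)
T_steps = 5
print(f(np.zeros((T_steps, 2), dtype=theano.config.floatX),
        np.ones((T_steps, 3, 2), dtype=theano.config.floatX)).shape)  # (5, 5, 3)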
def training_cost_weighted(self, y, weights=None): """ Class-weighted negative log-likelihood: each example's log-likelihood is scaled by the weight of its gold class (a weights vector must be supplied despite the None default). """ LL = T.log(self.p_y_given_x)[T.arange(y.shape[0]), y] weights = T.repeat(weights.dimshuffle('x', 0), y.shape[0], axis=0) factors = weights[T.arange(y.shape[0]), y] return -T.mean(LL * factors)
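A hedged, self-contained usage sketch of the weighting pattern above, with assumed shapes (batch of 4, 3 classes) and without the surrounding class:

import numpy as np
import theano
import theano.tensor as T

p_y_given_x = T.matrix('p_y_given_x')      # (batch, n_classes) softmax output
y = T.ivector('y')                         # (batch,) gold labels
class_weights = T.vector('class_weights')  # (n_classes,) one weight per class

LL = T.log(p_y_given_x)[T.arange(y.shape[0]), y]
w = T.repeat(class_weights.dimshuffle('x', 0), y.shape[0], axis=0)
cost = -T.mean(LL * w[T.arange(y.shape[0]), y])

f = theano.function([p_y_given_x, y, class_weights], cost)
probs = np.full((4, 3), 1.0 / 3, dtype=theano.config.floatX)
print(f(probs, np.array([0, 1, 2, 0], dtype='int32'),
        np.ones(3, dtype=theano.config.floatX)))  # ~1.0986 (= log 3)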
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=40, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' test_file_path = '/save/wenpeng/datasets/LORELEI/il5-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il5_system_output_forfun_w2.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_lines, word2id = load_official_testData( word2id, maxSentLen, test_file_path) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) ''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) # test_labels=np.asarray(all_labels[2], dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, 
input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + 
HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], [binarize_prob, ensemble_scores, sum_tensor3], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 # max_meanf1_test=0.0 # max_weightf1_test=0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_types = [] pred_confs = [] pred_others = [] for i, test_batch_id in enumerate( test_batch_start): # for each test batch pred_types_i, pred_conf_i, pred_fields_i = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) if i < len(test_batch_start) - 1: pred_types.append(pred_types_i) pred_confs.append(pred_conf_i) pred_others.append(pred_fields_i) else: pred_types.append(pred_types_i[-n_test_remain:]) pred_confs.append(pred_conf_i[-n_test_remain:]) pred_others.append(pred_fields_i[-n_test_remain:]) pred_types = np.concatenate(pred_types, axis=0) pred_confs = np.concatenate(pred_confs, axis=0) pred_others = np.concatenate(pred_others, axis=0) mean_frame = generate_output_for_EDL(test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame) # mean_frame = generate_2017_official_output(test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame) if mean_frame < min_mean_frame: min_mean_frame = mean_frame print '\t\t\t test over, min_mean_frame:', min_mean_frame print 'Epoch ', epoch, 'uses ', (time.time() - 
mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
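A short, hedged note on the batching bookkeeping above: appending size - batch_size as one extra start index lets the last, possibly partial batch be served by a full-size window that overlaps the previous one (the test loop then keeps only the last n_test_remain rows of that final batch). Toy numbers, assumed for illustration:

import numpy as np

size, batch_size = 23, 5
n_batches = size // batch_size                     # 4 full batches (integer division)
starts = list(np.arange(n_batches) * batch_size) + [size - batch_size]
print(starts)   # starts at 0, 5, 10, 15 and 18; the window [18, 23) re-covers items 18-19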
def repeat(self, t, n): return T.repeat(t, n)
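A hedged usage sketch of T.repeat as called by the thin wrapper above: with axis=None the input is flattened and every element is repeated, while a given axis repeats along that axis only. Shapes below are illustrative assumptions:

import numpy as np
import theano
import theano.tensor as T

t = T.matrix('t')
flat_repeat = T.repeat(t, 2)            # axis=None: flatten, then repeat each element
axis_repeat = T.repeat(t, 2, axis=1)    # repeat along the column axis
f = theano.function([t], [flat_repeat, axis_repeat])
a, b = f(np.arange(6).reshape(2, 3).astype(theano.config.floatX))
print(a.shape)   # (12,)
print(b.shape)   # (2, 6)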
def evaluate_lenet5(learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=70, useAllSamples=0, kmax=30, ktop=4, filter_size=[7, 5], L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=60): #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7], # L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1): root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/" embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt' embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt' rng = numpy.random.RandomState(23455) datasets, embedding_size, embeddings, embeddings_Q, unigram = read_data_WP( root + str(task) + 'classes/' + str(corpus) + 'train.txt', root + str(task) + 'classes/' + str(corpus) + 'dev.txt', root + str(task) + 'classes/' + str(corpus) + 'test.txt', embeddingPath, maxSentLength, useEmb, dataMode) #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60) #datasets = load_data(dataset) indices_train, trainY, trainLengths, trainLeftPad, trainRightPad = datasets[ 0] indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1] indices_test, testY, testLengths, testLeftPad, testRightPad = datasets[2] n_train_batches = indices_train.shape[0] / batch_size n_valid_batches = indices_dev.shape[0] / batch_size n_test_batches = indices_test.shape[0] / batch_size remain_train = indices_train.shape[0] % batch_size train_batch_start = [] dev_batch_start = [] test_batch_start = [] if useAllSamples: train_batch_start = list( numpy.arange(n_train_batches) * batch_size) + [indices_train.shape[0] - batch_size] dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size) + [ indices_dev.shape[0] - batch_size ] test_batch_start = list(numpy.arange(n_test_batches) * batch_size) + [ indices_test.shape[0] - batch_size ] n_train_batches = n_train_batches + 1 n_valid_batches = n_valid_batches + 1 n_test_batches = n_test_batches + 1 else: train_batch_start = list(numpy.arange(n_train_batches) * batch_size) dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_theano = theano.shared(numpy.asarray( indices_train, dtype=theano.config.floatX), borrow=True) indices_dev_theano = theano.shared(numpy.asarray( indices_dev, dtype=theano.config.floatX), borrow=True) indices_test_theano = theano.shared(numpy.asarray( indices_test, dtype=theano.config.floatX), borrow=True) indices_train_theano = T.cast(indices_train_theano, 'int32') indices_dev_theano = T.cast(indices_dev_theano, 'int32') indices_test_theano = T.cast(indices_test_theano, 'int32') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix( 'x_index') # now, x is the index matrix, must be integer y = T.ivector('y') z = T.ivector('z') left = T.ivector('left') right = T.ivector('right') x = embeddings[x_index.flatten()].reshape( (batch_size, maxSentLength, embedding_size)).transpose(0, 2, 1).flatten() ishape = (embedding_size, maxSentLength ) # this is the size of MNIST images filter_size1 = (embedding_size, filter_size[0]) filter_size2 = (embedding_size / 2, filter_size[1]) #poolsize1=(1, ishape[1]-filter_size1[1]+1) #????????????????????????????? 
poolsize1 = (1, ishape[1] + filter_size1[1] - 1) ''' left_after_conv=T.maximum(0,left-filter_size1[1]+1) right_after_conv=T.maximum(0, right-filter_size1[1]+1) ''' left_after_conv = left right_after_conv = right #kmax=30 # this can not be too small, like 20 #ktop=6 #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6) poolsize2 = (1, kmax + filter_size2[1] - 1) #(1,6) dynamic_lengths = T.maximum(ktop, z / 2 + 1) # dynamic k-max pooling ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1])) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) ''' layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=kmax) ''' layer0 = Conv_Fold_DynamicK_PoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=dynamic_lengths, unifiedWidth=kmax, left=left_after_conv, right=right_after_conv, firstLayer=True) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) ''' layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0], kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop) ''' ''' left_after_conv=T.maximum(0, layer0.leftPad-filter_size2[1]+1) right_after_conv=T.maximum(0, layer0.rightPad-filter_size2[1]+1) ''' left_after_conv = layer0.leftPad right_after_conv = layer0.rightPad dynamic_lengths = T.repeat([ktop], batch_size) # dynamic k-max pooling ''' layer1 = ConvFoldPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0]/2, kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop, left=left_after_conv, right=right_after_conv) ''' layer1 = Conv_Fold_DynamicK_PoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0] / 2, kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=dynamic_lengths, unifiedWidth=ktop, left=left_after_conv, right=right_after_conv, firstLayer=False) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) dropout = dropout_from_layer(rng, layer2_input, dropout_p) # construct a fully-connected sigmoidal layer, the output of layers has nkerns[1]=50 images, each is 4*4 size #layer2 = FullyConnectedLayer(rng, input=dropout, n_in=nkerns[1] * (embedding_size/4) * ktop, n_out=task) layer3 = LogisticRegression(rng, input=dropout, n_in=nkerns[1] * (embedding_size / 4) * ktop, n_out=task) #layer3=SoftMaxlayer(input=layer2.output) #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2) # the cost we minimize during training is the NLL of the model #L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer1.W).sum()+abs(layer0.W).sum()+abs(embeddings).sum() L2_reg = (layer3.W**2).sum() + (layer1.W**2).sum() + ( layer0.W**2).sum() + (embeddings**2).sum() #L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum()+(layer0.W** 2).sum()+(embeddings**2).sum() #cost must have L2, otherwise, will produce nan, while with L2, each word embedding will be updated cost = layer3.negative_log_likelihood(y) + L2_weight * L2_reg #cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x_index: indices_test_theano[index:index + batch_size], y: testY[index:index + batch_size], z: testLengths[index:index + batch_size], left: testLeftPad[index:index + batch_size], right: testRightPad[index:index + batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x_index: indices_dev_theano[index:index + batch_size], y: devY[index:index + batch_size], z: devLengths[index:index + batch_size], left: devLeftPad[index:index + batch_size], right: devRightPad[index:index + batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer1.params + layer0.params + [embeddings] #params = layer3.params + layer2.params + layer0.params+[embeddings] accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. ''' updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) ''' updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) if param_i == embeddings: updates.append( (param_i, T.set_subtensor( (param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(embedding_size))))) #AdaGrad else: updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, layer3.errors(y)], updates=updates, givens={ x_index: indices_train_theano[index:index + batch_size], y: trainY[index:index + batch_size], z: trainLengths[index:index + batch_size], left: trainLeftPad[index:index + batch_size], right: trainRightPad[index:index + batch_size] }) ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches / 50, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train cost_ij, error_ij = train_model(batch_start) #if iter ==1: # exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str(iter) + ' cost: ' + str( cost_ij) + ' error: ' + str(error_ij) if iter % validation_frequency == 0: # compute zero-one loss on validation set #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] validation_losses = [ validate_model(i) for i in dev_batch_start ] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(( '\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
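A hedged sketch of the embedding update used in the training function above: an AdaGrad step followed by T.set_subtensor to pin the zero-padding row (index 0) back to zero after every update. Sizes and the toy cost below are assumptions for illustration:

import numpy as np
import theano
import theano.tensor as T

emb_size, lr = 4, 0.1
embeddings = theano.shared(np.ones((3, emb_size), dtype=theano.config.floatX))
acc = theano.shared(np.zeros((3, emb_size), dtype=theano.config.floatX))

cost = (embeddings ** 2).sum()          # toy cost standing in for the model loss
grad = T.grad(cost, embeddings)
new_acc = acc + T.sqr(grad)             # accumulated squared gradients (AdaGrad)
step = embeddings - lr * grad / T.sqrt(new_acc)
step = T.set_subtensor(step[0], T.zeros((emb_size,)))   # keep the pad row at zero

update = theano.function([], [], updates=[(embeddings, step), (acc, new_acc)])
update()
print(embeddings.get_value()[0])   # [0. 0. 0. 0.]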
def evaluate_lenet5(self): #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7], # L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1): rng = numpy.random.RandomState(23455) #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60) #datasets = load_data(dataset) indices_train, trainLengths, trainLeftPad, trainRightPad= self.datasets[0] #indices_dev, devLengths, devLeftPad, devRightPad= self.datasets[1] ''' print 'indices_train shapes:' print indices_train.shape[0], indices_train.shape[1] print indices_train ''' #create embedding matrix to store the final embeddings sentences_embs=numpy.zeros((indices_train.shape[0],self.sentEm_length), dtype=theano.config.floatX) n_train_batches=indices_train.shape[0]/self.batch_size #n_valid_batches=indices_dev.shape[0]/self.batch_size remain_train=indices_train.shape[0]%self.batch_size train_batch_start=[] dev_batch_start=[] if self.useAllSamples: train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size)+[indices_train.shape[0]-self.batch_size] #dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size)+[indices_dev.shape[0]-self.batch_size] n_train_batches=n_train_batches+1 #n_valid_batches=n_valid_batches+1 else: train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size) #dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size) ''' print 'train_batch_start:' print train_batch_start ''' indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True) #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_train_theano=T.cast(indices_train_theano, 'int32') ''' print 'target_matrix shape' print self.target_matrix.shape[0], self.target_matrix.shape[1] print self.target_matrix ''' indices_target_theano=theano.shared(numpy.asarray(self.target_matrix, dtype=theano.config.floatX), borrow=True) #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_target_theano=T.cast(indices_target_theano, 'int32') #print 'context_matrix shape' #print self.context_matrix.shape[0], self.context_matrix.shape[1] #print self.context_matrix[:,0:300], self.context_matrix[:,300:600], self.context_matrix[:,600:900], self.context_matrix[:,900:] indices_context_theano=theano.shared(numpy.asarray(self.context_matrix, dtype=theano.config.floatX), borrow=True) #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_context_theano=T.cast(indices_context_theano, 'int32') #indices_dev_theano=T.cast(indices_dev_theano, 'int32') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix('x_index') # now, x is the index matrix, must be integer #y = T.ivector('y') z = T.ivector('z') # sentence length left=T.ivector('left') right=T.ivector('right') iteration= T.lscalar() t_index=T.imatrix('t_index') c_index=T.imatrix('c_index') x_index=debug_print(x_index,'x_index') x_transpose=debug_print(self.embeddings_R[x_index.flatten()].reshape((self.batch_size,self.maxSentLength, self.context_embedding_size)).transpose(0, 2, 1),'x_transpose') x=debug_print(x_transpose.flatten(),'x') ishape = (self.context_embedding_size, self.maxSentLength) # this is the size of MNIST images 
filter_size1=(self.context_embedding_size,self.filter_size[0]) filter_size2=(self.context_embedding_size/2,self.filter_size[1]) #poolsize1=(1, ishape[1]-filter_size1[1]+1) #????????????????????????????? poolsize1=(1, ishape[1]+filter_size1[1]-1) ''' left_after_conv=T.maximum(0,left-filter_size1[1]+1) right_after_conv=T.maximum(0, right-filter_size1[1]+1) ''' left_after_conv=left right_after_conv=right #kmax=30 # this can not be too small, like 20 #ktop=6 #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6) poolsize2=(1, self.kmax+filter_size2[1]-1) #(1,6) dynamic_lengths=T.maximum(self.ktop,z/2+1) # dynamic k-max pooling ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input=debug_print(x.reshape((self.batch_size, 1, ishape[0], ishape[1])),'layer0_input') # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) ''' layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=kmax) ''' layer0 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer0_input, image_shape=(self.batch_size, 1, ishape[0], ishape[1]), filter_shape=(self.nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=dynamic_lengths, unifiedWidth=self.kmax, left=left_after_conv, right=right_after_conv, firstLayer=True) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) ''' layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0], kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop) ''' ''' left_after_conv=T.maximum(0, layer0.leftPad-filter_size2[1]+1) right_after_conv=T.maximum(0, layer0.rightPad-filter_size2[1]+1) ''' left_after_conv=layer0.leftPad right_after_conv=layer0.rightPad dynamic_lengths=T.repeat([self.ktop],self.batch_size) # dynamic k-max pooling layer1_input=debug_print(layer0.output, 'layer0_output') ''' layer1 = ConvFoldPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0]/2, kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop, left=left_after_conv, right=right_after_conv) ''' layer1 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer1_input, image_shape=(self.batch_size, self.nkerns[0], ishape[0]/2, self.kmax), filter_shape=(self.nkerns[1], self.nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=dynamic_lengths, unifiedWidth=self.ktop, left=left_after_conv, right=right_after_conv, firstLayer=False) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer1_output = debug_print(layer1.output.flatten(2), 'layer1_output') #layer2_input=theano.printing.Print('layer2_input')(layer2_input) #produce sentence embeddings #layer2 = HiddenLayer(rng, input=layer2_input, n_in=self.nkerns[1] * (self.context_embedding_size/4) * self.ktop, n_out=self.sentEm_length, activation=T.tanh) #context_matrix, target_matrix=self.extract_contexts_targets(indices_matrix=x_index, sentLengths=z, leftPad=left) target_matrix=t_index context_matrix=c_index #note that context indices might be zero embeddings h_indices=debug_print(context_matrix[:, self.context_size*iteration:self.context_size*(iteration+1)],'h_indices') w_indices=debug_print(target_matrix[:, iteration:(iteration+1)],'w_indices') #r_h is the concatenation of context embeddings r_h=debug_print(self.embed_context(h_indices), 'embedded_context') #(batch_size, context_size*embedding_size) q_w=debug_print(self.embed_target(w_indices), 'embedded_target') #q_hat: concatenate sentence embeddings and context embeddings #q_hat=self.concatenate_sent_context(layer2.output, r_h) q_hat=self.concatenate_sent_context(layer1_output, r_h) layer3 = HiddenLayer(rng, input=q_hat, n_in=self.nkerns[1] * (self.context_embedding_size/4) * self.ktop+self.context_size*self.context_embedding_size, n_out=self.target_embedding_size, activation=T.tanh) layer3_output=debug_print(layer3.output, 'layer3.output') noise_indices, p_n_noise=self.get_noise() noise_indices=debug_print(noise_indices, 'noise_indices') #noise_indices=theano.printing.Print('noise_indices')(noise_indices) s_theta_data=debug_print(T.sum(layer3_output * q_w, axis=1).reshape((self.batch_size,1)) + self.bias[w_indices] , 's_theta_data') #s_theta_data=theano.printing.Print('s_theta_data')(s_theta_data) p_n_data = debug_print(self.p_n[w_indices],'p_n_data') #p_n[0] indicates the probability of word indexed 1 delta_s_theta_data = debug_print(s_theta_data - T.log(self.k * p_n_data),'delta_s_theta_data') log_sigm_data = debug_print(T.log(T.nnet.sigmoid(delta_s_theta_data)),'log_sigm_data') #create the noise, q_noise has shape(self.batch_size, self.k, self.embedding_size ) q_noise = debug_print(self.embed_noise(noise_indices),'embed_noise') q_hat_res = layer3_output.reshape((self.batch_size, 1, self.target_embedding_size)) s_theta_noise = debug_print(T.sum(q_hat_res * q_noise, axis=2) + self.bias[noise_indices],'s_theta_noise') #(batch_size, k) delta_s_theta_noise = debug_print(s_theta_noise - T.log(self.k * p_n_noise), 'delta_s_theta_noise') # it should be matrix (batch_size, k) log_sigm_noise = debug_print(T.log(1 - T.nnet.sigmoid(delta_s_theta_noise)), 'log_sigm_noise') sum_noise_per_example =debug_print(T.sum(log_sigm_noise, axis=1), 'sum_noise_per_example') #(batch_size, 1) # Calc objective function J = debug_print(-T.mean(log_sigm_data) - T.mean(sum_noise_per_example),'J') L2_reg = (layer3.W** 2).sum()+ (layer1.W** 2).sum()+(layer0.W** 2).sum()+(self.embeddings_R**2).sum()#+( self.embeddings_Q**2).sum() self.cost = J + self.L2_weight*L2_reg ''' validate_model = theano.function([index,iteration], self.cost, givens={ x_index: indices_dev_theano[index: index + self.batch_size], z: devLengths[index: index + self.batch_size], left: devLeftPad[index: index + self.batch_size], right: devRightPad[index: index + self.batch_size]}) ''' # create a list of all model parameters to be fit by gradient descent self.params = layer3.params+layer1.params + layer0.params+[self.embeddings_R]#, self.embeddings_Q] #params = 
layer3.params + layer2.params + layer0.params+[embeddings] accumulator=[] for para_i in self.params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(self.cost, self.params) updates = [] for param_i, grad_i, acc_i in zip(self.params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) if param_i == self.embeddings_R:# or param_i == self.embeddings_Q: updates.append((param_i, T.set_subtensor((param_i - self.ini_learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(self.context_embedding_size))))) #AdaGrad else: updates.append((param_i, param_i - self.ini_learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,iteration], [self.cost], updates=updates, givens={ x_index: indices_train_theano[index: index + self.batch_size], z: trainLengths[index: index + self.batch_size], left: trainLeftPad[index: index + self.batch_size], right: trainRightPad[index: index + self.batch_size], t_index: indices_target_theano[index: index + self.batch_size], c_index: indices_context_theano[index: index + self.batch_size]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(10, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False vali_loss_list=[] train_loss_list=[] while (epoch < self.n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #print 'batch_start: '+str(batch_start) total_iteration=min(max(self.target_lengths[batch_start: batch_start + self.batch_size]), 60) # total iteration is not allowed to surpass 60 # we only care the last cost within those iterations cost_of_end_batch=0.0 costs_in_batch=[] for iteration in range(total_iteration): #print 'iteration: '+str(iteration)+'/'+str(total_iteration)+' in iter '+str(iter) #if iteration==3: # exit(0) cost_of_end_batch = train_model(batch_start, iteration) ''' print 'updated self.embeddings_R:' print self.embeddings_R.get_value()[:37,:] print self.embeddings_R.get_value()[37:,:] print 'updated layer0 W: ' print layer0.W.get_value()[0:1,0:1,0:1,:] print 'updated layer1 W:' print layer1.W.get_value()[0:1,0:1,0:1,:] print 'updated layer2 W: ' print layer2.W.get_value() print 'updated layer3 W:' print layer3.W.get_value() ''' costs_in_batch.append(cost_of_end_batch) #print 'cost_of_each_iteration: '+str(cost_of_end_batch) average_cost_per_batch=numpy.mean(costs_in_batch) #print 'cost_of_batch: '+str(average_cost_per_batch) if iter % validation_frequency == 0: print 'training @ iter = '+str(iter)+' cost: '+str(average_cost_per_batch)# +' error: '+str(error_ij) #print batch_embs #store sentence embeddings #for row in range(batch_start, batch_start + self.batch_size): # sentences_embs[row]=batch_embs[row-batch_start] if average_cost_per_batch<minimal_of_list(train_loss_list): del train_loss_list[:] train_loss_list.append(average_cost_per_batch) self.best_params=self.params elif len(train_loss_list)<self.vali_cost_list_length: train_loss_list.append(average_cost_per_batch) if len(train_loss_list)==self.vali_cost_list_length: self.store_model_to_file() #self.store_sentence_embeddings(sentences_embs) self.store_embeddings() print 'Training over, best model got at train_cost:'+str(train_loss_list[0]) exit(0) #print 'sentence embeddings:' #print sentences_embs[:6,:] #if iter ==1: # exit(0) ''' if iter % validation_frequency == 0: print 'training @ iter = '+str(iter)+' cost: '+str(cost_of_end_batch)# +' error: '+str(error_ij) if iter % validation_frequency == 0: #print '\t iter: '+str(iter) # compute zero-one loss on validation set #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] validation_losses=[] for batch_start in dev_batch_start: #print '\t\t batch_start: '+str(batch_start) total_iteration=max(self.dev_lengths[batch_start: batch_start + self.batch_size]) #for validate, we need the cost among all the iterations in that batch for iteration in range(total_iteration): vali_loss_i=validate_model(batch_start, iteration) #print vali_loss_i validation_losses.append(vali_loss_i) this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation cost %f ' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss)) if this_validation_loss < minimal_of_list(vali_loss_list): del vali_loss_list[:] vali_loss_list.append(this_validation_loss) #store params self.best_params=self.params #fake elif len(vali_loss_list)<self.vali_cost_list_length: vali_loss_list.append(this_validation_loss) if 
len(vali_loss_list)==self.vali_cost_list_length: self.store_model_to_file() self.store_sentence_embeddings(sentences_embs) print 'Training over, best model got at vali_cost:'+str(vali_loss_list[0]) exit(0) ''' if patience <= iter: done_looping = True break end_time = time.clock() ''' print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) ''' print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
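A hedged sketch of the noise-contrastive estimation objective assembled above (log-sigmoid of the data score minus log(k * p_n), plus the summed noise term), with assumed inputs standing in for the model's scores:

import numpy as np
import theano
import theano.tensor as T

k = 5                              # number of noise samples per example (assumed)
s_data = T.vector('s_data')        # (batch,)   score of the true target word
s_noise = T.matrix('s_noise')      # (batch, k) scores of the sampled noise words
p_n_data = T.vector('p_n_data')    # unigram noise probability of the true word
p_n_noise = T.matrix('p_n_noise')  # unigram noise probabilities of the noise words

delta_data = s_data - T.log(k * p_n_data)
delta_noise = s_noise - T.log(k * p_n_noise)
J = -T.mean(T.log(T.nnet.sigmoid(delta_data))) \
    - T.mean(T.sum(T.log(1 - T.nnet.sigmoid(delta_noise)), axis=1))
nce_cost = theano.function([s_data, s_noise, p_n_data, p_n_noise], J)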
def onestep_attend_copy(): i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi) i_t_shape = T.shape(i_t) bi_reshape = T.repeat(bi, i_t_shape[0], 0) bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1) bf_reshape = T.repeat(bf, i_t_shape[0], 0) bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1) bc_reshape = T.repeat(bc, i_t_shape[0], 0) bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1) bo_reshape = T.repeat(bo, i_t_shape[0], 0) bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1) i_t_new= sigmoid(i_t + bi_reshape_2x) f_t= sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x) o_t= sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x) c_th = tanh(T.dot(x_t, Wc) + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x) c_t = f_t*pre_c + i_t_new*c_th h_t = o_t*T.tanh(c_t) #shape (1, N, h_dim) h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis = 0) #new shape (No_region, N, h_dim) image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2)) #compute non-linear correlation between h_t(current text) to image_feature_region (64 for 128*128 and 196 for 224*224) # pdb.set_trace() m_t = T.tanh(T.dot(h_t_context, Hcontext) + T.dot(image_feature_reshape, Zcontext)) #shape (No_region, N, context_dim) e = T.dot(m_t, Va) #No_region, N, 1 e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:]))) e_softmax = softmax_along_axis(e_reshape, axis = 0) #shape No_region, N e_t = T.transpose(e_softmax, (1,0)) #shape N, No_region e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]]) #3D tensor 1, N, No_region e_t_r_t = T.transpose(e_t_r, (1,0, 2)) # shape N, 1, No_region e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis = 1) #shape N, No_region, No_region image_feature_region.shape[1] e_3D_t = T.transpose(e_3D, (1,2,0)) #No_region, No_region, N identity_2D = T.identity_like(e_3D_t)# shape No_region, No_region identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]]) # shape 1, No_region, No_region identity_3D_t = T.repeat(identity_3D, image_feature_region.shape[0], axis = 0) e_3D_diagonal = e_3D*identity_3D_t #diagonal tensor 3D (N, No_region, No_region) out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply, outputs_info=[weight_y], sequences=[e_3D_diagonal, image_feature_region], non_sequences=[]) z_t = T.sum(out_weight_y, axis = 1) #shape (N, feature_dim) z_t_r = z_t.reshape((-1,z_t.shape[0],z_t.shape[1])) return [h_t, c_t, z_t_r]
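A hedged sketch of the final step of the attention block above: once the per-region weights are softmax-normalised, the context vector is the weighted sum of region features, which broadcasting can express without the diagonal-tensor scan. Names and shapes are assumptions:

import numpy as np
import theano
import theano.tensor as T

alpha = T.matrix('alpha')       # (N, No_region) softmaxed attention weights
features = T.tensor3('feat')    # (N, No_region, feature_dim) image region features

z_t = T.sum(alpha.dimshuffle(0, 1, 'x') * features, axis=1)  # (N, feature_dim)
f = theano.function([alpha, features], z_t)
a = np.full((2, 4), 0.25, dtype=theano.config.floatX)
v = np.ones((2, 4, 3), dtype=theano.config.floatX)
print(f(a, v).shape)   # (2, 3)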
def logdet(self): return tt.repeat(tt.sum(tt.log(self.scale)), self.z0.shape[0])
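A hedged sketch of how a logdet of this form is typically used: for an elementwise scaling flow with positive scale, the Jacobian log-determinant is sum(log(scale)), broadcast to one value per sample with tt.repeat. Toy values assumed:

import numpy as np
import theano
import theano.tensor as tt

scale = tt.vector('scale')   # (dim,) positive elementwise scale
z0 = tt.matrix('z0')         # (n_samples, dim) flow inputs
logdet = tt.repeat(tt.sum(tt.log(scale)), z0.shape[0])

f = theano.function([scale, z0], logdet)
print(f(np.array([2.0, 2.0], dtype=theano.config.floatX),
        np.zeros((3, 2), dtype=theano.config.floatX)))   # three copies of 2*log(2) ~ 1.386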